# Source code for textdescriptives.components.coherence

from typing import Callable, List

import numpy as np
from spacy.language import Language
from spacy.tokens import Doc


def n_order_coherence(doc: Doc, order: int) -> List[float]:
    """Calculate coherence for a `Doc` for a given order.

    Args:
        doc: A `Doc` object on which sentence boundaries have been set.
        order: The order of coherence to calculate; must be at least 1. For
            example, order=1 will calculate the semantic similarity between
            consecutive sentences. And order=2 will calculate the semantic
            similarity between sentences that are two sentences apart.

    Returns:
        A list of floats with one similarity for every pair of sentences that
        are `order` sentences apart, in document order. If the `Doc` contains
        fewer than `order + 1` sentences there is no pair to compare and
        `[nan]` is returned instead.

    Raises:
        ValueError: If `order` is smaller than 1, if no sentence boundaries
            have been set on the `Doc`, or if `doc.vector` is empty.
    """
    # An order below 1 has no meaning here: 0 would compare each sentence with
    # itself and a negative value would wrap around through negative indexing.
    if order < 1:
        raise ValueError(f"order must be a positive integer, got {order}.")

    if not doc.has_annotation("SENT_START"):
        raise ValueError(
            "A sentence boundary detector has not been run on this Doc, which is "
            "required to calculate coherence. Have you added a model with a "
            "sentencizer and word vectors to the pipeline?",
        )

    sents = list(doc.sents)
    # Too few sentences to form a single pair `order` apart.
    if len(sents) <= order:
        return [np.nan]

    if doc.vector.size == 0:
        raise ValueError(
            "Sentence vectors are not available. Thus it is not possible to "
            "calculate the coherence between sentences. Please add a component "
            "that includes word vectors or sentence embeddings. "
            "See https://spacy.io/usage/vectors-similarity for more details.",
        )

    # Pair every sentence with the one `order` positions later; zip stops at
    # the shorter (shifted) sequence, so the last `order` sentences only ever
    # appear as the right-hand side of a pair.
    return [
        sent.similarity(later_sent)
        for sent, later_sent in zip(sents, sents[order:])
    ]


class Coherence:
    """spaCy pipeline component that stores sentence-coherence measures on a
    `Doc` through custom extension attributes."""

    def __init__(self, nlp: Language):
        """Register the `Doc` extensions this component writes to.

        Extensions that already exist are left untouched, so the component can
        be instantiated more than once.
        """
        for attribute in (
            "first_order_coherence_values",
            "second_order_coherence_values",
            "coherence",
        ):
            if Doc.has_extension(attribute):
                continue
            Doc.set_extension(attribute, default=None)

    @staticmethod
    def _first_order_coherence(doc: Doc) -> List[float]:
        """Return the similarities between each sentence and the one directly
        after it."""
        return n_order_coherence(doc=doc, order=1)

    @staticmethod
    def _second_order_coherence(doc: Doc) -> List[float]:
        """Return the similarities between each sentence and the one two
        positions after it."""
        return n_order_coherence(doc, order=2)

    def coherence(self, doc: Doc) -> None:
        """Compute first and second order coherence and store them on `doc`.

        The per-pair similarities are written to
        `doc._.first_order_coherence_values` and
        `doc._.second_order_coherence_values`; their means are written to
        `doc._.coherence` as a dict with the keys `first_order_coherence` and
        `second_order_coherence`.
        """
        first_values = self._first_order_coherence(doc)
        second_values = self._second_order_coherence(doc)

        summary = {}
        for key, values in (
            ("first_order_coherence", first_values),
            ("second_order_coherence", second_values),
        ):
            # A list with fewer than two entries is passed through as-is
            # rather than averaged with nanmean.
            summary[key] = values[0] if len(values) < 2 else np.nanmean(values)

        doc._.first_order_coherence_values = first_values
        doc._.second_order_coherence_values = second_values
        doc._.coherence = summary

    def __call__(self, doc: Doc):
        """Apply the component to `doc` and hand the same `doc` back."""
        self.coherence(doc)
        return doc


@Language.factory(
    "textdescriptives/coherence",
    assigns=[
        "doc._.first_order_coherence_values",
        "doc._.second_order_coherence_values",
        "doc._.coherence",
    ],
)
def create_coherence_component(nlp: Language, name: str) -> Callable[[Doc], Doc]:
    """Allows Coherence to be added to a spaCy pipe using
    nlp.add_pipe("textdescriptives/coherence").

    Adding this component to a pipeline sets the following attributes:
        - doc._.first_order_coherence_values
        - doc._.second_order_coherence_values
        - doc._.coherence

    Args:
        nlp (Language): spaCy language object, does not need to be specified in the
            nlp.add_pipe call.
        name (str): name of the component. Can be optionally specified in the
            nlp.add_pipe call, using the name argument.

    Returns:
        Callable[[Doc], Doc]: The Coherence component to be added to the pipe.

    Examples:
        >>> import spacy
        >>> import textdescriptives as td
        >>> nlp = spacy.load("en_core_web_md")
        >>> nlp.add_pipe("textdescriptives/coherence")
        >>> # apply the pipeline to a text
        >>> doc = nlp("This is a sentence. This is another sentence.")
        >>> # get coherence values
        >>> doc._.coherence
    """
    # `name` is part of the factory signature but is not used by Coherence.
    return Coherence(nlp)