# Source code for textdescriptives.components.dependency_distance

"""Calculation of statistics related to dependency distance."""

from typing import Callable

import numpy as np
from spacy.language import Language
from spacy.tokens import Doc, Span, Token


class DependencyDistance:
    """spaCy v.3.0 component that adds attributes to `Doc`, `Span`, and `Token`
    objects relating to dependency distance.

    Dependency distance can be used as a measure of syntactic complexity, and
    measures the distance from a word to its head word. For `Doc` objects,
    dependency distance is calculated on the sentence level.

    All statistics are exposed through extension getters, so they are computed
    when the `._.dependency_distance` attribute is accessed; calling the
    component on a `Doc` returns the `Doc` unchanged.
    """

    def __init__(self, nlp: Language):
        """Register the `dependency_distance` extension getters.

        Each extension is registered only if it does not exist yet, so the
        getters stay bound to the first instance that registered them.

        Args:
            nlp (Language): spaCy language object. It is not used by this
                constructor.
        """
        if not Token.has_extension("dependency_distance"):
            Token.set_extension("dependency_distance", getter=self.token_dependency)
        if not Span.has_extension("dependency_distance"):
            Span.set_extension("dependency_distance", getter=self.span_dependency)
        if not Doc.has_extension("dependency_distance"):
            Doc.set_extension("dependency_distance", getter=self.doc_dependency)

    def token_dependency(self, token: Token) -> dict:
        """Calculate token level dependency distance, i.e. the distance from a
        token to its head token. Also returns a boolean indicating whether the
        dependency relation is adjacent to the token.

        Tokens whose dependency label is "ROOT" get a distance of 0 and are
        never counted as adjacent.

        Args:
            token (Token): The token to compute the distance for.

        Returns:
            dict: Dictionary with the following keys:
                - dependency_distance: Dependency distance
                - adjacent_dependency: Boolean indicating whether the dependency
                  relation is adjacent to the token
        """
        dep_dist = 0
        adj_dep = False
        if token.dep_ != "ROOT":
            # Distance is measured in token positions, regardless of direction.
            dep_dist = abs(token.head.i - token.i)
            adj_dep = dep_dist == 1
        return {"dependency_distance": dep_dist, "adjacent_dependency": adj_dep}

    def span_dependency(self, span: Span) -> dict:
        """Aggregates token level dependency distance on the span level by
        taking the mean of the dependency distance and the proportion of
        adjacent dependency relations.

        Args:
            span (Span): The span whose tokens are aggregated.

        Returns:
            dict: Dictionary with the following keys:
                - dependency_distance_mean: Mean dependency distance
                - prop_adjacent_dependency_relation: Proportion of adjacent
                  dependency relations
            Both values are NaN when the span contains no tokens.
        """
        token_stats = [token._.dependency_distance for token in span]
        if not token_stats:
            # An empty span has nothing to average; mirror the NaN result that
            # doc_dependency gives for an empty document instead of raising.
            return {
                "dependency_distance_mean": np.nan,
                "prop_adjacent_dependency_relation": np.nan,
            }
        # Read the statistics by key rather than relying on dict value order.
        dep_dists = [stats["dependency_distance"] for stats in token_stats]
        adj_deps = [stats["adjacent_dependency"] for stats in token_stats]
        return {
            "dependency_distance_mean": np.mean(dep_dists),
            "prop_adjacent_dependency_relation": np.mean(adj_deps),
        }

    def doc_dependency(self, doc: Doc) -> dict:
        """Aggregates token level dependency distance on the document level by
        taking the mean of the dependency distance and the proportion of
        adjacent dependency relations on the sentence level.

        Args:
            doc (Doc): The document whose sentences (`doc.sents`) are
                aggregated.

        Returns:
            dict: Dictionary with the following keys:
                - dependency_distance_mean: Mean dependency distance on the sentence
                  level
                - dependency_distance_std: Standard deviation of dependency distance on
                  the sentence level
                - prop_adjacent_dependency_relation_mean: Mean proportion of adjacent
                  dependency relations on the sentence level
                - prop_adjacent_dependency_relation_std: Standard deviation of
                  proportion of adjacent dependency relations on the sentence level
            All values are NaN when the document is empty or yields no
            sentences.
        """
        empty_result = {
            "dependency_distance_mean": np.nan,
            "dependency_distance_std": np.nan,
            "prop_adjacent_dependency_relation_mean": np.nan,
            "prop_adjacent_dependency_relation_std": np.nan,
        }
        # Check the length first so `doc.sents` is never touched on an empty doc.
        if len(doc) == 0:
            return empty_result
        sent_stats = [sent._.dependency_distance for sent in doc.sents]
        if not sent_stats:
            return empty_result
        # Read the statistics by key rather than relying on dict value order.
        dep_dists = [stats["dependency_distance_mean"] for stats in sent_stats]
        adj_deps = [
            stats["prop_adjacent_dependency_relation"] for stats in sent_stats
        ]
        return {
            "dependency_distance_mean": np.mean(dep_dists),
            "dependency_distance_std": np.std(dep_dists),
            "prop_adjacent_dependency_relation_mean": np.mean(adj_deps),
            "prop_adjacent_dependency_relation_std": np.std(adj_deps),
        }

    def __call__(self, doc: Doc):
        """Run the pipeline component.

        The document is returned unchanged; the statistics are provided by the
        extension getters registered in `__init__`.
        """
        return doc


@Language.factory(
    "textdescriptives/dependency_distance",
    assigns=[
        "token._.dependency_distance",
        "span._.dependency_distance",
        "doc._.dependency_distance",
    ],
)
def create_dependency_distance_component(
    nlp: Language,
    name: str,
) -> Callable[[Doc], Doc]:
    """Create spaCy language factory that allows DependencyDistance attributes
    to be added to a pipe using
    nlp.add_pipe("textdescriptives/dependency_distance").

    Adding this component to a pipeline sets the following attributes:
        - `token._.dependency_distance`
        - `span._.dependency_distance`
        - `doc._.dependency_distance`

    Args:
        nlp (Language): spaCy language object, does not need to be specified in
            the nlp.add_pipe call.
        name (str): name of the component. Can be optionally specified in the
            nlp.add_pipe call, using the name argument.

    Returns:
        Callable[[Doc], Doc]: The DependencyDistance component

    Example:
        >>> import spacy
        >>> import textdescriptives as td
        >>> nlp = spacy.load("en_core_web_sm")
        >>> nlp.add_pipe("textdescriptives/dependency_distance")
        >>> # apply the pipeline to a text
        >>> doc = nlp("This is a sentence.")
        >>> # access the dependency distance attributes
        >>> doc._.dependency_distance
    """
    return DependencyDistance(nlp)