# Source code for textdescriptives.components.pos_proportions

"""Calculation of statistics that require a pos-tagger in the pipeline."""

from typing import Callable, Counter, List, Union

import numpy as np
from spacy.language import Language
from spacy.tokens import Doc, Span

from textdescriptives.components.utils import all_upos_tags


class POSProportions:
    """spaCy v.3.0 component that adds attributes for POS statistics to `Doc`
    and `Span` objects."""

    def __init__(self, nlp: Language, use_pos: bool, add_all_tags: bool):
        """Initialise the component and register the `pos_proportions`
        extension on `Doc` and `Span`.

        Args:
            nlp: spaCy language object. Only read when `use_pos` is False, to
                look up the tagger labels in `nlp.meta["labels"]["tagger"]`.
                That lookup is not guarded, so a pipeline without such an
                entry fails here.
            use_pos: If True, counts the coarse-grained `token.pos_` values and
                uses `all_upos_tags` as the tag inventory. If False, counts the
                fine-grained `token.tag_` values and uses the tagger's labels
                as the tag inventory.
            add_all_tags: If True, returns proportions of all possible POS tags.
                If False, only returns proportions for the POS tags present in
                the text.
        """
        self.use_pos: bool = use_pos
        self.add_all_tags: bool = add_all_tags
        self.model_tags: List[str] = (
            all_upos_tags if use_pos else nlp.meta["labels"]["tagger"]
        )

        # Registration is skipped when the extension already exists, so the
        # getter of the first instance that registered stays in place and
        # later instances (even with different settings) do not replace it.
        for target in (Doc, Span):
            if not target.has_extension("pos_proportions"):
                target.set_extension("pos_proportions", getter=self.pos_proportions)

    def pos_proportions(self, text: Union[Doc, Span]) -> dict:
        """Calculates the proportion of tokens in a `Doc`|`Span` that are tagged
        with each POS tag.

        Args:
            text: The `Doc` or `Span` whose tokens are counted.

        Returns:
            Dict mapping `pos_prop_<TAG>` to the number of tokens tagged `TAG`
            divided by the total number of tokens in `text`. For an empty
            `text` every reported value is `np.nan`. When `add_all_tags` is
            True the keys are exactly the tags in `self.model_tags` (unseen
            tags get 0, tags outside that list are dropped); otherwise the
            keys are the tags observed in `text`.
        """
        if self.use_pos:
            observed_tags = (token.pos_ for token in text)
        else:
            observed_tags = (token.tag_ for token in text)
        observed: Counter = Counter(observed_tags)

        if self.add_all_tags:
            # Indexing a Counter with a missing key yields 0 without inserting
            # it, so this both zero-fills unseen tags and drops observed tags
            # that are not part of the tag inventory.
            reported = {tag: observed[tag] for tag in self.model_tags}
        else:
            reported = dict(observed)

        # The denominator is always the full token count, including tokens
        # whose tag was dropped above.
        n_tokens = len(text)
        if n_tokens == 0:
            return {f"pos_prop_{tag}": np.nan for tag in reported}
        return {f"pos_prop_{tag}": count / n_tokens for tag, count in reported.items()}

    def __call__(self, doc: Doc) -> Doc:
        """Run the pipeline component.

        The document is returned unchanged; the statistics are exposed through
        the `pos_proportions` extension getter rather than computed here.
        """
        return doc


@Language.factory(
    "textdescriptives/pos_proportions",
    assigns=["doc._.pos_proportions", "span._.pos_proportions"],
    default_config={"use_pos": True, "add_all_tags": True},
)
def create_pos_proportions_component(
    nlp: Language,
    name: str,
    use_pos: bool,
    add_all_tags: bool,
) -> Callable[[Doc], Doc]:
    """Allows POSProportions to be added to a spaCy pipe using
    nlp.add_pipe("textdescriptives/pos_proportions")

    Adding this component to a pipeline sets the following attributes:
        - `doc._.pos_proportions`
        - `span._.pos_proportions`

    Args:
        nlp (Language): spaCy language object, does not need to be specified in the
            nlp.add_pipe call.
        name (str): name of the component. Can be optionally specified in the
            nlp.add_pipe call, using the name argument.
        use_pos: If True, uses the simple token.pos attribute. If False, uses the
            detailed token.tag attribute. Defaults to True via the factory's
            default config.
        add_all_tags: If True, returns proportions of all possible POS tags. If
            False, only returns proportions for the POS tags present in the
            text. Defaults to True via the factory's default config.

    Returns:
        Callable[[Doc], Doc]: The POSProportions component to be added to the pipe.

    Raises:
        ValueError: If the pipeline contains neither a component named
            "tagger" nor one named "attribute_ruler".

    Example:
        >>> import spacy
        >>> import textdescriptives as td
        >>> nlp = spacy.load("en_core_web_sm")
        >>> nlp.add_pipe("textdescriptives/pos_proportions")
        >>> # apply the component to a document
        >>> doc = nlp("This is a test sentence.")
        >>> doc._.pos_proportions
    """
    # At least one of these component names must be present in the pipeline.
    pos_providers = {"tagger", "attribute_ruler"}
    if pos_providers.isdisjoint(nlp.pipe_names):
        raise ValueError(
            "The pipeline does not contain a component for POS tagging. Please load "
            "a spaCy model which includes a 'tagger' or an 'attribute ruler' "
            "component.",
        )
    return POSProportions(nlp, use_pos=use_pos, add_all_tags=add_all_tags)