"""Calculation of descriptive statistics."""
from typing import Callable, Dict, Union
import numpy as np
from pyphen import Pyphen
from spacy.language import Language
from spacy.tokens import Doc, Span
from wasabi import msg
from .utils import filter_tokens, n_sentences, n_syllables, n_tokens


def language_exists_in_pyphen(lang: str) -> bool:
    """Check whether Pyphen ships a hyphenation dictionary for `lang`."""
    try:
        _ = Pyphen(lang=lang)
        return True
    except KeyError:
        return False
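

# Illustrative sketch (not part of the library): the exact set of supported
# language codes depends on the installed Pyphen version, but the check
# behaves roughly as follows:
#
#     language_exists_in_pyphen("en")  # -> True (English dictionary bundled)
#     language_exists_in_pyphen("xx")  # -> False (assuming no "xx" dictionary)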


class DescriptiveStatistics:
    """spaCy v3.0 component that adds attributes with descriptive statistics
    to `Doc` and `Span` objects.

    The attributes relate to token and sentence length, number of syllables,
    and counts of tokens and sentences.
    """

    def __init__(self, nlp: Language, verbose: bool):
        """Initialise the component and register the extensions."""
        self.can_calculate_syllables = language_exists_in_pyphen(lang=nlp.lang)
        if not self.can_calculate_syllables and verbose:
            msg.warn(
                f"Could not load syllable counter for language {nlp.lang}. "
                + "The following extensions will be set to np.nan: "
                + "syllables, flesch_reading_ease, flesch_kincaid_grade, "
                + "smog, gunning_fog.",
            )
        extensions: Dict[str, Callable] = {
            "_n_sentences": n_sentences,
            "_n_tokens": n_tokens,
            "_n_syllables": n_syllables,
            "token_length": self.token_length,
            "sentence_length": self.sentence_length,
            "syllables": self.syllables,
            "counts": self.counts,
            "descriptive_stats": self.descriptive_stats,
        }
        # `_n_sentences` and `sentence_length` require sentence boundaries
        # (`Doc.sents`), and `syllables` is likewise only exposed on `Doc`;
        # everything else is registered on both `Doc` and `Span`.
        for extension_name, getter_fun in extensions.items():
            if extension_name not in [
                "_n_sentences",
                "sentence_length",
                "syllables",
            ] and not Span.has_extension(extension_name):
                Span.set_extension(extension_name, getter=getter_fun)
            if not Doc.has_extension(extension_name):
                Doc.set_extension(extension_name, getter=getter_fun)
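
        # Note: `set_extension(..., getter=...)` registers lazy attributes, so
        # nothing is computed until an attribute is first accessed. A rough
        # sketch of what an access triggers (illustrative, not library code):
        #
        #     doc._.token_length   # calls self.token_length(doc)
        #     doc[0:3]._.counts    # calls self.counts(span)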

    def token_length(self, doc: Union[Doc, Span]) -> dict:
        """Calculate mean, median and std of token length for a `Doc` or `Span`.

        Returns:
            dict: token_length_mean, token_length_median, token_length_std
        """
        token_lengths = [len(token) for token in filter_tokens(doc)]
        if not token_lengths:
            return {
                "token_length_mean": np.nan,
                "token_length_median": np.nan,
                "token_length_std": np.nan,
            }
        return {
            "token_length_mean": np.mean(token_lengths),
            "token_length_median": np.median(token_lengths),
            "token_length_std": np.std(token_lengths),
        }
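
    # Illustrative example (assumes an English pipeline with this component
    # added; not a doctest): for the text "Hello world" the filtered token
    # lengths are [5, 5], so `doc._.token_length` would be
    #
    #     {"token_length_mean": 5.0, "token_length_median": 5.0,
    #      "token_length_std": 0.0}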

    def sentence_length(self, doc: Doc) -> dict:
        """Calculate mean, median and std of sentence length for a `Doc`.

        Returns:
            dict: sentence_length_mean, sentence_length_median, sentence_length_std
        """
        # Sentence length is measured in filtered tokens: punctuation and
        # tokens containing an apostrophe (e.g. clitics such as "'s") are
        # excluded.
        tokenized_sentences = [
            [
                token.text
                for token in sent
                if not token.is_punct and "'" not in token.text
            ]
            for sent in doc.sents
        ]
        len_sentences = [len(sentence) for sentence in tokenized_sentences]
        if not len_sentences:
            return {
                "sentence_length_mean": np.nan,
                "sentence_length_median": np.nan,
                "sentence_length_std": np.nan,
            }
        return {
            "sentence_length_mean": np.mean(len_sentences),
            "sentence_length_median": np.median(len_sentences),
            "sentence_length_std": np.std(len_sentences),
        }
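
    # Illustrative example (not a doctest): with a sentencizer in the pipeline,
    # "This is a sentence. This is another sentence." yields two sentences of
    # 4 filtered tokens each (the periods are punctuation and are dropped), so
    # `doc._.sentence_length` would be
    #
    #     {"sentence_length_mean": 4.0, "sentence_length_median": 4.0,
    #      "sentence_length_std": 0.0}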

    def syllables(self, doc: Doc) -> dict:
        """Calculate mean, median and std of syllables per token for a `Doc`.

        Uses `Pyphen` for hyphenation.

        Returns:
            dict: syllables_per_token_mean, syllables_per_token_median,
                syllables_per_token_std
        """
        nan_output = {
            "syllables_per_token_mean": np.nan,
            "syllables_per_token_median": np.nan,
            "syllables_per_token_std": np.nan,
        }
        if not self.can_calculate_syllables:
            return nan_output
        if n_syllables := doc._._n_syllables:
            return {
                "syllables_per_token_mean": np.mean(n_syllables),
                "syllables_per_token_median": np.median(n_syllables),
                "syllables_per_token_std": np.std(n_syllables),
            }
        else:
            return nan_output
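
    # Illustrative example (hyphenation counts depend on the Pyphen dictionary,
    # so treat the numbers as an assumption): if the per-token syllable counts
    # for "The elephant walked" were [1, 3, 1], `doc._.syllables` would be
    #
    #     {"syllables_per_token_mean": 1.67, "syllables_per_token_median": 1.0,
    #      "syllables_per_token_std": 0.94}  # values rounded here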

    def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True) -> dict:
        """Calculate counts of tokens, unique tokens, and characters for a `Doc`
        or `Span`. Adds number of sentences for `Doc` objects.

        Args:
            ignore_whitespace: if True, whitespace is not counted as a character
                when counting number of characters.

        Returns:
            dict: n_tokens, n_unique_tokens, proportion_unique_tokens, n_characters,
                (n_sentences)
        """
        n_tokens = doc._._n_tokens
        n_types = len({tok.lower_ for tok in filter_tokens(doc)})
        if ignore_whitespace:
            # strip all whitespace (spaces, tabs, newlines), not just spaces,
            # to match the docstring's promise
            n_chars = len("".join(doc.text.split()))
        else:
            n_chars = len(doc.text)
        prop_unique_tokens = np.nan if n_tokens == 0 else n_types / n_tokens
        out = {
            "n_tokens": n_tokens,
            "n_unique_tokens": n_types,
            "proportion_unique_tokens": prop_unique_tokens,
            "n_characters": n_chars,
        }
        if isinstance(doc, Doc):
            out["n_sentences"] = doc._._n_sentences
        return out
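
    # Illustrative example (not a doctest; assumes tokens are counted after
    # filtering, as `utils.n_tokens` does here): for "This is a sentence. This
    # is another sentence." the counts would be roughly
    #
    #     {"n_tokens": 8, "n_unique_tokens": 5,
    #      "proportion_unique_tokens": 0.625, "n_characters": 38,
    #      "n_sentences": 2}
    #
    # Punctuation is excluded from the token counts, and unique tokens are
    # compared case-insensitively.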

    def descriptive_stats(self, doc: Union[Doc, Span]) -> dict:
        """Get all descriptive statistics in a single dict."""
        out = {**doc._.counts, **doc._.token_length}
        if isinstance(doc, Span):
            # sentence- and syllable-level stats are only defined for `Doc`s
            return out
        return {**out, **doc._.sentence_length, **doc._.syllables}

    def __call__(self, doc):
        """Run the pipeline component.

        All attributes are registered as lazy getters, so the component itself
        does not modify the `Doc`.
        """
        return doc


@Language.factory(
"textdescriptives/descriptive_stats",
assigns=[
"doc._._n_sentences",
"doc._._n_tokens",
"doc._._n_syllables",
"doc._.token_length",
"doc._.sentence_length",
"doc._.syllables",
"doc._.counts",
"doc._.descriptive_stats",
"span._._n_tokens",
"span._._n_syllables",
"span._.token_length",
"span._.counts",
"span._.descriptive_stats",
],
default_config={"verbose": True},
)
def create_descriptive_stats_component(
    nlp: Language,
    name: str,
    verbose: bool,
) -> Callable[[Doc], Doc]:
    """Allows DescriptiveStatistics to be added to a spaCy pipe using
    nlp.add_pipe("textdescriptives/descriptive_stats").

    Adding the component to the pipe will add the following attributes to
    `Doc` and `Span` objects:

    - `doc._._n_sentences`
    - `doc._._n_tokens`
    - `doc._._n_syllables`
    - `doc._.token_length`
    - `doc._.sentence_length`
    - `doc._.syllables`
    - `doc._.counts`
    - `doc._.descriptive_stats`
    - `span._._n_tokens`
    - `span._._n_syllables`
    - `span._.token_length`
    - `span._.counts`
    - `span._.descriptive_stats`

    Args:
        nlp (Language): spaCy language object, does not need to be specified in
            the nlp.add_pipe call.
        name (str): name of the component. Can be optionally specified in the
            nlp.add_pipe call, using the name argument.
        verbose (bool): if True, warn when no syllable counter can be loaded
            for the pipeline's language.

    Returns:
        Callable[[Doc], Doc]: DescriptiveStatistics component

    Example:
        >>> import spacy
        >>> import textdescriptives as td
        >>> nlp = spacy.blank("en")
        >>> # add sentencizer
        >>> nlp.add_pipe("sentencizer")
        >>> # add descriptive stats
        >>> nlp.add_pipe("textdescriptives/descriptive_stats")
        >>> # apply to a document
        >>> doc = nlp("This is a sentence. This is another sentence.")
        >>> doc._.descriptive_stats
    """
    sentencizers = {"sentencizer", "parser"}
    if not sentencizers.intersection(set(nlp.pipe_names)):
        # sentence boundaries are needed for the sentence-level statistics
        nlp.add_pipe("sentencizer")
    return DescriptiveStatistics(nlp, verbose=verbose)
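

# A minimal usage sketch (illustrative; assumes `textdescriptives` is installed
# so that importing it registers the factory above):
#
#     import spacy
#     import textdescriptives  # noqa: F401 -- registers the component
#
#     nlp = spacy.blank("en")
#     nlp.add_pipe("textdescriptives/descriptive_stats")  # auto-adds a sentencizer
#     doc = nlp("The quick brown fox jumps over the lazy dog.")
#     doc._.counts["n_tokens"]      # Doc-level count
#     span = doc[0:4]
#     span._.descriptive_stats      # Span-level stats: no sentence/syllable keys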