# Source code for textdescriptives.components.quality_data_classes

"""Data classes used for the quality component."""

from typing import Any, Dict, Optional, Tuple, Union

from pydantic import ConfigDict, BaseModel, Field

# A (lower, upper) pair of optional floats used as the threshold type for
# range-based metrics. ThresholdsOutput.passed treats a None bound as "no limit
# on that side" and compares both bounds inclusively.
Interval = Tuple[Optional[float], Optional[float]]


class ThresholdsOutput(BaseModel):
    """Outcome of checking one metric value against its threshold.

    The model stores two fields and derives a third:

    - ``threshold``: an ``Interval`` ``(lower, upper)``, an expected boolean
      value, or ``None`` when there is nothing to check against.
    - ``value``: the value of the metric, or ``None``.
    - ``passed`` (property): whether ``value`` satisfies ``threshold``.

    Example:
        >>> t_out = ThresholdsOutput(threshold=(0, 2), value=2)
        >>> t_out
        ThresholdsOutput(value=2.0, passed=True, threshold=(0.0, 2.0))
        >>> t_out.passed
        True
    """

    model_config = ConfigDict(extra="forbid")

    threshold: Optional[Union[Interval, bool]]
    value: Optional[float]

    @property
    def passed(self) -> Optional[bool]:
        """Tell whether ``value`` satisfies ``threshold``.

        Returns:
            ``None`` when ``value`` is ``None``; ``True`` when ``threshold`` is
            ``None``; the result of ``threshold == value`` for a boolean
            threshold; otherwise whether ``value`` lies inside the interval,
            both bounds inclusive and a ``None`` bound meaning "unbounded".
        """
        current, limit = self.value, self.threshold
        if current is None:
            return None
        if limit is None:
            return True
        if isinstance(limit, bool):
            return limit == current

        low, high = limit
        # Keep the `low <= current` form (rather than `current < low`) so that
        # incomparable values such as NaN are rejected.
        if low is not None and not (low <= current):
            return False
        return high is None or current <= high

    def __repr_str__(self, join_str: str) -> str:
        """Build the repr body: value (floats rounded to 2 decimals), passed
        and threshold, joined by ``join_str``."""
        shown = self.value
        if isinstance(shown, float):
            shown = round(shown, 2)
        parts = [
            f"value={shown!r}",
            f"passed={self.passed!r}",
            f"threshold={self.threshold!r}",
        ]
        return join_str.join(parts)

    def __eq__(self, other: Any) -> bool:
        """Compare value and threshold against another ``ThresholdsOutput``;
        any other object is compared against ``value`` alone."""
        if not isinstance(other, ThresholdsOutput):
            return self.value == other
        return self.value == other.value and self.threshold == other.threshold


class QualityThresholds(BaseModel):
    """Thresholds for quality metrics.

    Fields typed ``Interval`` hold a ``(lower, upper)`` pair in which ``None``
    stands for "no limit" on that side. Dict-valued fields map a key (a symbol,
    a substring, or an n-gram length given as a string) to its own threshold.
    Unknown field names are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    n_stop_words: Interval = Field(
        (2, None),
        description=(
            "A Range for the number of stop words. Default: (2, None), i.e. "
            "at least 2 stop words, but no upper limit."
        ),
    )
    alpha_ratio: Interval = Field(
        (0.7, None),
        description=(
            "A Range for the alpha ratio. Default: (0.7, None), i.e. at "
            "least 70% of tokens contain at least one alphabetic character, but no "
            "upper limit. Note this is lowered from the original 0.8 to account for "
            "a different definition of word boundaries. E.g. in spaCy a punctuation "
            "is not a part of a word."
        ),
    )
    mean_word_length: Interval = Field(
        (3, 10),
        description=(
            "A Range for the mean word length. Default: (3, 10), i.e. between"
            " 3 and 10 characters."
        ),
    )
    doc_length: Interval = Field(
        (10, 100_000),
        description=(
            "A Range for the document length. Default: (10, 100_000), i.e."
            " between 10 and 100_000 words (spacy tokens)."
        ),
    )
    symbol_to_word_ratio: Dict[str, Interval] = Field(
        {"#": (None, 0.1)},
        description=(
            "A dict of symbols and the allowed range for the "
            "symbol-to-word-ratio. The symbol-to-word-ratio is the ratio between "
            "symbol occurrence and word occurrence. Defaults to {'#': (None, 0.1)} "
            "i.e. no lower limit, but there must at most be a ratio of 0.1 between "
            "the number of words and hashtags. i.e. if we have 100 words the symbol "
            "should appear no more than 10 times. Values not in the dict are not "
            "checked."
        ),
    )
    proportion_ellipsis: Interval = Field(
        (None, 0.3),
        description=(
            "A Range for the proportion of lines which end with ellipsis. "
            "Default: (None, 0.3), "
            "i.e. no lower limit, but at most 30% of lines end with an ellipsis."
        ),
    )
    proportion_bullet_points: Interval = Field(
        (None, 0.8),
        description=(
            "A Range for the proportion lines which start with a bullet "
            "points. Default: (None, 0.8), i.e. no lower limit, but at most 80% of "
            "lines start with a bullet point."
        ),
    )
    contains: Dict[str, bool] = Field(
        {"lorem ipsum": False},
        description=(
            "A dictionary of strings and whether they should be contained in "
            "the document. Default: {'lorem ipsum': False}, i.e. the document should "
            "not contain the string 'lorem ipsum'."
        ),
    )
    duplicate_line_chr_fraction: Interval = Field(
        (None, 0.2),
        description=(
            "A Range for the duplicate line character fraction. Default: "
            "(None, 0.2), i.e. no lower limit, but at most 20% of characters are"
            " duplicates."
        ),
    )
    duplicate_paragraph_chr_fraction: Interval = Field(
        (None, 0.2),
        description=(
            "A Range for the duplicate paragraph character fraction. Default:"
            " (None, 0.2), i.e. no lower limit, but at most 20% of characters are "
            "duplicates."
        ),
    )
    # Keys are n-gram lengths written as strings, matching Dict[str, Interval];
    # the description quotes them accordingly.
    duplicate_ngram_chr_fraction: Dict[str, Interval] = Field(
        {
            "5": (None, 0.15),
            "6": (None, 0.14),
            "7": (None, 0.13),
            "8": (None, 0.12),
            "9": (None, 0.11),
            "10": (None, 0.1),
        },
        description=(
            "A dictionary of n-gram lengths and the allowed range for the "
            "duplicate n-gram character fraction. Default: {'5': (None, 0.15), "
            "'6': (None, 0.14), '7': (None, 0.13), '8': (None, 0.12), "
            "'9': (None, 0.11), '10': (None, 0.1)}, "
            "i.e. no lower limit, but at most 15% of characters are duplicates for "
            "5-grams, 14% for 6-grams, 13% for 7-grams, 12% for 8-grams, 11% for "
            "9-grams and 10% for 10-grams."
        ),
    )
    # Same string-key convention as duplicate_ngram_chr_fraction.
    top_ngram_chr_fraction: Dict[str, Interval] = Field(
        {
            "2": (None, 0.2),
            "3": (None, 0.18),
            "4": (None, 0.16),
        },
        description=(
            "A dictionary of n-gram lengths and the allowed range for the "
            "top n-gram character fraction. Default: {'2': (None, 0.2), "
            "'3': (None, 0.18), '4': (None, 0.16)}, i.e. no lower limit, but at "
            "most 20% of characters "
            "are contained within a duplicate for 2-grams, 18% for 3-grams and 16% "
            "for 4-grams."
        ),
    )
    oov_ratio: Interval = Field(
        (None, 0.2),
        description=(
            "A range for the out-of-vocabulary ratio. Default: (None, 0.2)"
            " i.e. no lower limit, but at most 20% of words are out-of-vocabulary."
        ),
    )


class QualityOutput(BaseModel):
    """The output of the quality function.

    Every field is either a single ``ThresholdsOutput`` or a dict mapping a key
    to a ``ThresholdsOutput``. Unknown field names are rejected
    (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    n_stop_words: ThresholdsOutput = Field(
        ...,
        description="The thresholds output for the number of stop words.",
    )
    alpha_ratio: ThresholdsOutput = Field(
        ...,
        description="The thresholds output for the alpha ratio.",
    )
    mean_word_length: ThresholdsOutput = Field(
        ...,
        description="The thresholds output for the mean word length.",
    )
    doc_length: ThresholdsOutput = Field(
        ...,
        description="The thresholds output for the document length.",
    )
    symbol_to_word_ratio: Dict[str, ThresholdsOutput] = Field(
        ...,
        description="The thresholds output for the symbol-to-word-ratio.",
    )
    proportion_ellipsis: ThresholdsOutput = Field(
        ...,
        description=(
            "The thresholds output for the proportion of lines ending with "
            "ellipsis."
        ),
    )
    proportion_bullet_points: ThresholdsOutput = Field(
        ...,
        description=(
            "The thresholds output for the proportion of lines starting with "
            "bullet points."
        ),
    )
    contains: Dict[str, ThresholdsOutput] = Field(
        ...,
        description="The thresholds output for the presence of strings.",
    )
    duplicate_line_chr_fraction: ThresholdsOutput = Field(
        ...,
        description="The thresholds output for the duplicate line character fraction.",
    )
    duplicate_paragraph_chr_fraction: ThresholdsOutput = Field(
        ...,
        description=(
            "The thresholds output for the duplicate paragraph character "
            "fraction."
        ),
    )
    duplicate_ngram_chr_fraction: Dict[str, ThresholdsOutput] = Field(
        ...,
        description=(
            "The thresholds output for the duplicate n-gram character "
            "fraction."
        ),
    )
    top_ngram_chr_fraction: Dict[str, ThresholdsOutput] = Field(
        ...,
        description="The thresholds output for the top n-gram character fraction.",
    )
    oov_ratio: ThresholdsOutput = Field(
        ...,
        description="The thresholds output for the out-of-vocabulary ratio.",
    )

    @property
    def passed(self) -> bool:
        """Whether every evaluated threshold has been passed.

        A metric whose ``passed`` is ``None`` is treated as not failing, both
        for single metrics and for entries of the dict-valued metrics.

        Returns:
            bool: ``False`` if any metric's ``passed`` is ``False``, otherwise
            ``True``.
        """
        single_metrics = [
            self.n_stop_words,
            self.alpha_ratio,
            self.mean_word_length,
            self.doc_length,
            self.proportion_ellipsis,
            self.proportion_bullet_points,
            self.duplicate_line_chr_fraction,
            self.duplicate_paragraph_chr_fraction,
            self.oov_ratio,
        ]
        keyed_metrics = [
            self.symbol_to_word_ratio,
            self.contains,
            self.duplicate_ngram_chr_fraction,
            self.top_ngram_chr_fraction,
        ]

        passed_or_none = [metric.passed for metric in single_metrics]
        # Flatten the dict entries instead of pre-reducing each dict with all():
        # all() would count a None entry as a failure, unlike the rule applied
        # to the single metrics below.
        passed_or_none.extend(
            metric.passed for group in keyed_metrics for metric in group.values()
        )

        return all(outcome is None or outcome for outcome in passed_or_none)

    def __repr_str__(self, join_str: str) -> str:
        """Build the repr body: the overall ``passed`` flag followed by the
        model's repr args, each named entry on its own tab-indented line."""
        entries = [("passed", self.passed), *self.__repr_args__()]
        return join_str.join(
            repr(val) if name is None else f"\n\t{name}={val!r}"
            for name, val in entries
        )

    def to_flat_value_dict(self) -> Dict[str, Any]:
        """Create a flat dictionary representation of the object to allow for
        easy conversion to a pandas DataFrame.

        Returns:
            A dict holding ``passed_quality_check`` plus one entry per metric
            value. Dict-valued metrics are expanded to ``"<field>_<key>"``
            entries; only each metric's ``value`` is included.
        """
        flat_dict: Dict[str, Any] = {"passed_quality_check": self.passed}

        for name, output in self.__dict__.items():
            if isinstance(output, dict):
                for key, entry in output.items():
                    flat_dict[f"{name}_{key}"] = entry.value
            else:
                flat_dict[name] = output.value

        return flat_dict