"""
|
|
Tokenizer of Unicode characters, grapheme clusters and tailored grapheme clusters
|
|
(of orthographies) given an orthography profile.
|
|
"""
|
|
import typing
|
|
import pathlib
|
|
import unicodedata
|
|
|
|
import regex
|
|
from csvw.dsv import reader
|
|
|
|
from segments.util import nfd, grapheme_pattern
|
|
from segments import errors
|
|
from segments.profile import Profile
|
|
|
|
|
|


def iterlines(p: typing.Union[pathlib.Path, str]) -> typing.Generator[str, None, None]:
    with pathlib.Path(p).open(encoding='utf-8') as fp:
        for line in fp.readlines():
            line = line.strip()
            if line and not line.startswith('#'):
                yield unicodedata.normalize('NFD', line)


class Rules:
    """
    Rules are given in tuple format, comma delimited.
    Regular expressions are given in Python syntax.
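
    A minimal doctest-style sketch (a hypothetical inline rule, not one read
    from a rules file):

    >>> Rules(('aa', 'a')).apply('haai')
    'hai'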
"""
|
|
def __init__(self, *rules: typing.Tuple[str, str]):
|
|
self._rules = [(regex.compile(rule), replacement) for rule, replacement in rules]
|
|
|
|
@classmethod
|
|
def from_file(cls, fname) -> 'Rules':
|
|
return cls(*list(reader(list(iterlines(fname)))))
|
|
|
|
def apply(self, s):
|
|
for rule, replacement in self._rules:
|
|
s = rule.sub(replacement, s)
|
|
return s
|
|
|
|
|
|


class Tokenizer:
    """
    Class for Unicode character and grapheme tokenization.

    This class provides extended functionality for
    orthography-specific tokenization with orthography profiles.

    Parameters
    ----------

    profile : string or pathlib.Path or Profile instance (default = None)
        Specifies an orthography profile to use.

    rules : string (default = None)
        Filename of a rules file.

    Notes
    -----

    The tokenizer can be used for pure Unicode character and grapheme
    tokenization, i.e. it uses the Unicode standard grapheme parsing rules, as
    implemented in the Python regex package by Matthew Barnett, to do basic tokenization
    with the "\\X" grapheme regular expression match. This grapheme match
    combines one or more Combining Diacritical Marks with their base character.
    These are called "grapheme clusters" in Unicode parlance. With these functions
    the Tokenizer is meant to do basic parsing for things like generating
    unigram models (segments and their counts) from input data.
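
    Without an orthography profile, calling the Tokenizer amounts to plain
    grapheme-cluster tokenization (a minimal sketch):

    >>> Tokenizer()('abc def')
    'a b c # d e f'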

    When a profile is passed, the Tokenizer reads the orthography profile and calls a helper
    class to build a tree data structure, which stores the possible Unicode
    character combinations that are specified in the orthography profile
    (i.e. tailored grapheme clusters) that appear in the data source.

    For example, an orthography profile might specify that in source X
    <uu> is a single grapheme (Unicode parlance: tailored grapheme) and
    therefore it should be chunked as such. Given an orthography profile and
    some data to tokenize, the process would look like this:

    input string example: uubo uubo
    output string example: uu b o # uu b o

    >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
    >>> t = Tokenizer(profile=prf)
    >>> t('uubo uubo')
    'uu b o # uu b o'

    See also the test orthography profile and rules in the test directory.

    An additional method "combine_modifiers" handles the case where there are
    Unicode Spacing Modifier Letters, which are not explicitly
    combined with their base character in the Unicode Standard. These graphemes
    are called "Tailored grapheme clusters" in Unicode. For more information
    see the Unicode Standard Annex #29: Unicode Text Segmentation:

    * http://www.unicode.org/reports/tr29/

    Additionally, the Tokenizer provides functionality to transform graphemes
    into associated character(s) specified in additional columns in the orthography
    profile. A dictionary is created that keeps a mapping between source-specific
    graphemes and their counterparts (e.g. an IPA column in the orthography profile).
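
    A minimal sketch of such a transformation (assuming a hypothetical profile
    with an additional "IPA" column):

    >>> prf = Profile({'Grapheme': 'uu', 'IPA': 'uː'}, {'Grapheme': 'b', 'IPA': 'b'})
    >>> Tokenizer(profile=prf)('uub', column='IPA')
    'uː b'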

    Lastly, the Tokenizer can be used to transform text as specified in an
    orthography rules file. These transformations are specified in a separate
    file from the orthography profile (which specifies the document-specific graphemes,
    and possibly their IPA counterparts) and the orthography rules should
    be applied to the output of a grapheme tokenization.

    In an orthography rules file, rules are given in order as regular
    expressions, e.g. this rule replaces a vowel followed by an <n>
    followed by <space> followed by a second vowel with the first vowel,
    <space>, <n>, <space> and the second vowel, e.g.::

        $ (a|á|e|é|i|í|o|ó|u|ú)(n)(\\s)(a|á|e|é|i|í|o|ó|u|ú), \\1 \\2 \\4

    """
    def __init__(self,
                 profile=None,
                 rules=None,
                 errors_strict: typing.Callable[[str], typing.Optional[str]] = errors.strict,
                 errors_replace: typing.Callable[[str], typing.Optional[str]] = errors.replace,
                 errors_ignore: typing.Callable[[str], typing.Optional[str]] = errors.ignore):
        self.op = None
        if isinstance(profile, Profile):
            self.op = profile
        elif profile is not None:
            self.op = Profile.from_file(profile)
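        # If no rules file is given explicitly, look for a sibling file of the
        # profile with a ".rules" extension.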
        if not rules and self.op and self.op.fname:
            _rules = self.op.fname.parent / (self.op.fname.stem + '.rules')
            if _rules.exists():
                rules = _rules
        self._rules = Rules.from_file(rules) if rules else None
        self._errors = {
            'strict': errors_strict,
            'replace': errors_replace,
            'ignore': errors_ignore,
        }

    def __call__(self,
                 string: str,
                 column: str = Profile.GRAPHEME_COL,
                 form: typing.Optional[typing.Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None,
                 ipa: bool = False,
                 segment_separator=' ',
                 separator=' # ',
                 errors: typing.Literal['replace', 'strict', 'ignore'] = 'replace') -> str:
        """
        The main task of a Tokenizer is tokenizing! This is what happens when called.

        This function determines what to do for any combination of
        orthography profile and rules: profile only, rules only, both,
        or neither.

        Parameters
        ----------
        string : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The column label for the transformation, if specified.

        form : None or unicode normalization form
            Normalize return value if form is not None.

        ipa : bool
            Tokenize IPA (work in progress)

        segment_separator : str
            String used to separate segments within a word.

        separator : str
            String used to separate words.

        errors : str
            Error handling strategy ('replace', 'strict' or 'ignore') used when
            a profile cannot parse part of the input.

        Returns
        -------
        result : str
            Result of the tokenization.

        """
        res = []
        for word in string.split():
            if ipa:
                res.append(self.combine_modifiers(self.grapheme_clusters(nfd(word))))
            else:
                if self.op:
                    res.append(
                        self.transform(word, column=column, error=self._errors[errors]))
                else:
                    res.append(self.grapheme_clusters(nfd(word)))
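
        # Post-process each tokenized word: join its segments, apply any
        # orthography rules, then normalize if a form was requested.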
        def pp(word):
            res = segment_separator.join(word).strip()
            res = self._rules.apply(res) if self._rules else res
            return unicodedata.normalize(form, res) if form else res

        return separator.join(pp(word) for word in res)

    def characters(self, string, segment_separator=' ', separator=' # ') -> str:
        """
        Given a string as input, return a space-delimited string of Unicode characters
        (code points rendered as glyphs).

        Parameters
        ----------
        string : str
            A Unicode string to be tokenized into graphemes.

        Returns
        -------
        result : str
            String returned is space-delimited on Unicode characters and contains "#" to
            mark word boundaries.
            The string is in NFD.

        Notes
        -----
        Input is first normalized according to Normalization Form D(ecomposition).
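
        A minimal sketch:

        >>> Tokenizer().characters('abc def')
        'a b c # d e f'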
"""
|
|
return separator.join(segment_separator.join(word) for word in nfd(string).split())
|
|
|
|

    def grapheme_clusters(self, word):
        """
        See: Unicode Standard Annex #29: UNICODE TEXT SEGMENTATION
        http://www.unicode.org/reports/tr29/

        Given a string as input, return a list of Unicode graphemes using the
        "\\X" regular expression.

        Parameters
        ----------
        word : str
            A Unicode string to be tokenized into graphemes.

        Returns
        -------
        result : list
            List of Unicode graphemes in NFD.
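
        A minimal sketch:

        >>> Tokenizer().grapheme_clusters('uubo')
        ['u', 'u', 'b', 'o']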
        """
        # Match the word against the precompiled "\X" grapheme cluster pattern.
        return grapheme_pattern.findall(word)

    def transform(self, word, column=Profile.GRAPHEME_COL, error=errors.replace):
        """
        Transform a string's graphemes into the mappings given in a different column
        in the orthography profile.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The label of the column to transform to. The default tokenizes with
            the orthography profile's Grapheme column.

        Returns
        -------
        result : list
            Result of the transformation.
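
        A minimal sketch, reusing the profile from the class docstring:

        >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
        >>> Tokenizer(profile=prf).transform('uubo')
        ['uu', 'b', 'o']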
        """
        assert self.op, 'method can only be called with orthography profile.'

        if column != Profile.GRAPHEME_COL and column not in self.op.column_labels:
            raise ValueError("Column {0} not found in profile.".format(column))

        word = self.op.tree.parse(word, error)
        if column == Profile.GRAPHEME_COL:
            return word
        out = []
        for token in word:
            try:
                target = self.op.graphemes[token][column]
            except KeyError:
                target = self._errors['replace'](token)
            if target is not None:
                if isinstance(target, (tuple, list)):
                    out.extend(target)
                else:
                    out.append(target)
        return out

    def rules(self, word):
        """
        Function to tokenize the input string and return the output str with
        orthography rules applied.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        Returns
        -------
        result : str
            Result of the orthography rules applied to the input str.

        """
        return self._rules.apply(word) if self._rules else word

    def combine_modifiers(self, graphemes):
        """
        Given a list of Unicode grapheme clusters, group Unicode modifier
        letters with their preceding base characters, deal with tie bars, etc.

        Parameters
        ----------
        graphemes : list
            A list of Unicode grapheme clusters (e.g. as returned by
            grapheme_clusters) to be tokenized into simple IPA.
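
        A minimal sketch (ʰ is a Spacing Modifier Letter, category Lm, so it is
        grouped with its base character):

        >>> Tokenizer().combine_modifiers(['t', 'ʰ', 'a'])
        ['tʰ', 'a']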
        """
        result = []
        temp = ""
        count = len(graphemes)
        for grapheme in reversed(graphemes):
            count -= 1
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm" \
                    and not ord(grapheme) in [712, 716] and len(graphemes) > 1:
                temp = grapheme + temp
                # hack for the cases where a space modifier is the first character in the
                # string
                if count == 0:
                    result[-1] = temp + result[-1]  # pragma: no cover
                continue
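
            # 712 and 716 are the IPA stress marks U+02C8 (ˈ) and U+02CC (ˌ);
            # since we iterate in reverse, prepending attaches the mark to the
            # segment that follows it in the original order.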
            # catch and repair stress marks
            if len(grapheme) == 1 and (ord(grapheme) in [712, 716]) and result:
                # If result == [], there's nothing to combine with ...
                result[-1] = grapheme + result[-1]
                temp = ""
                continue

            # combine contour tone marks (non-accents)
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Sk":
                if len(result) == 0:
                    result.append(grapheme)
                    temp = ""
                    continue
                else:
                    if unicodedata.category(result[-1][0]) == "Sk":
                        result[-1] = grapheme + temp + result[-1]
                        temp = ""
                        continue

            result.append(grapheme + temp)
            temp = ""

        # last check for tie bars
        segments = result[::-1]
        i = 0
        r = []
        while i < len(segments):
            # tie bars
            if ord(segments[i][-1]) in [865, 860]:
                r.append(segments[i] + segments[i + 1])
                i += 2
            else:
                r.append(segments[i])
                i += 1
        return r