"""
|
||||
Tokenizer of Unicode characters, grapheme clusters and tailored grapheme clusters
|
||||
(of orthographies) given an orthography profile.
|
||||
"""
|
||||
import typing
import pathlib
import unicodedata

import regex
from csvw.dsv import reader

from segments.util import nfd, grapheme_pattern
from segments import errors
from segments.profile import Profile


def iterlines(p: typing.Union[pathlib.Path, str]) -> typing.Generator[str, None, None]:
    """Yield the non-empty, non-comment lines of a file, normalized to NFD."""
    with pathlib.Path(p).open(encoding='utf-8') as fp:
        for line in fp.readlines():
            line = line.strip()
            if line and not line.startswith('#'):
                yield unicodedata.normalize('NFD', line)


class Rules:
    """
    Rules are given as (pattern, replacement) pairs, one comma-delimited pair per line.
    Regular expressions are given in Python syntax.
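
    An illustrative example with a hypothetical rule:

    >>> Rules(('aa', 'a')).apply('baab')
    'bab'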
    """
    def __init__(self, *rules: typing.Tuple[str, str]):
        self._rules = [(regex.compile(rule), replacement) for rule, replacement in rules]

    @classmethod
    def from_file(cls, fname) -> 'Rules':
        return cls(*list(reader(list(iterlines(fname)))))

    def apply(self, s):
        for rule, replacement in self._rules:
            s = rule.sub(replacement, s)
        return s


class Tokenizer:
    """
    Class for Unicode character and grapheme tokenization.

    This class provides extended functionality for
    orthography-specific tokenization with orthography profiles.

    Parameters
    ----------
    profile : string or pathlib.Path or Profile instance (default = None)
        Specifies an orthography profile to use.

    rules : string or pathlib.Path (default = None)
        Filename of a rules file.

    Notes
    -----
    The tokenizer can be used for pure Unicode character and grapheme
    tokenization, i.e. it uses the Unicode standard grapheme parsing rules, as
    implemented in the Python regex package by Matthew Barnett, to do basic
    tokenization with the "\\X" grapheme regular expression match. This grapheme
    match combines one or more Combining Diacritical Marks with their base
    character. These are called "grapheme clusters" in Unicode parlance. With
    these functions the Tokenizer is meant to do basic parsing for things like
    generating unigram models (segments and their counts) from input data.

    When a profile is passed, the Tokenizer reads the orthography profile and
    calls a helper class to build a tree data structure, which stores the
    possible Unicode character combinations that are specified in the
    orthography profile (i.e. tailored grapheme clusters) that appear in the
    data source.

    For example, an orthography profile might specify that in source X
    <uu> is a single grapheme (Unicode parlance: tailored grapheme cluster) and
    should therefore be chunked as such. Given an orthography profile and
    some data to tokenize, the process looks like this:

    input string example: uubo uubo
    output string example: uu b o # uu b o

    >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
    >>> t = Tokenizer(profile=prf)
    >>> t('uubo uubo')
    'uu b o # uu b o'

    See also the test orthography profile and rules in the test directory.

    An additional method "combine_modifiers" handles the case where there are
    Unicode Spacing Modifier Letters, which are not explicitly
    combined with their base character in the Unicode Standard. These graphemes
    are called "tailored grapheme clusters" in Unicode. For more information
    see the Unicode Standard Annex #29: Unicode Text Segmentation:

    * http://www.unicode.org/reports/tr29/

    Additionally, the Tokenizer provides functionality to transform graphemes
    into associated character(s) specified in additional columns in the
    orthography profile. A dictionary is created that keeps a mapping between
    source-specific graphemes and their counterparts (e.g. an IPA column in
    the orthography profile).

    Lastly, the Tokenizer can be used to transform text as specified in an
    orthography rules file. These transformations are specified in a separate
    file from the orthography profile (which specifies the document-specific
    graphemes, and possibly their IPA counterparts), and the orthography rules
    are applied to the output of a grapheme tokenization.

    In an orthography rules file, rules are given in order as regular
    expressions, e.g. this rule replaces a vowel followed by an <n>
    followed by <space> followed by a second vowel with first vowel
    <space> <n> <space> second vowel, e.g.::

        $ (a|á|e|é|i|í|o|ó|u|ú)(n)(\\s)(a|á|e|é|i|í|o|ó|u|ú), \\1 \\2 \\4
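
    A simplified, literal variant of such a rule (illustrative only):

    >>> Rules(('an a', 'a n a')).apply('an a')
    'a n a'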
    """
    def __init__(self,
                 profile=None,
                 rules=None,
                 errors_strict: typing.Callable[[str], typing.Optional[str]] = errors.strict,
                 errors_replace: typing.Callable[[str], typing.Optional[str]] = errors.replace,
                 errors_ignore: typing.Callable[[str], typing.Optional[str]] = errors.ignore):
        self.op = None
        if isinstance(profile, Profile):
            self.op = profile
        elif profile is not None:
            self.op = Profile.from_file(profile)
        if not rules and self.op and self.op.fname:
            # If no rules file is specified explicitly, look for one next to
            # the profile, with a ".rules" suffix.
            _rules = self.op.fname.parent / (self.op.fname.stem + '.rules')
            if _rules.exists():
                rules = _rules
        self._rules = Rules.from_file(rules) if rules else None
        self._errors = {
            'strict': errors_strict,
            'replace': errors_replace,
            'ignore': errors_ignore,
        }

    def __call__(self,
                 string: str,
                 column: str = Profile.GRAPHEME_COL,
                 form: typing.Optional[typing.Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None,
                 ipa: bool = False,
                 segment_separator=' ',
                 separator=' # ',
                 errors: typing.Literal['replace', 'strict', 'ignore'] = 'replace') -> str:
        """
        The main task of a Tokenizer is tokenizing! This is what happens when called.

        This method decides what to do based on which combination of
        orthography profile and rules (if any) was supplied at instantiation.

        Parameters
        ----------
        string : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The column label for the transformation, if specified.

        form : None or unicode normalization form
            Normalize return value if form is not None.

        ipa : bool
            Tokenize IPA (work in progress).

        segment_separator : str
            String used to join the segments of a word.

        separator : str
            String used to join words, i.e. to mark word boundaries.

        errors : str
            Error handling strategy for unmatched input: 'replace' (default),
            'strict' or 'ignore'.

        Returns
        -------
        result : str
            Result of the tokenization.
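
        Examples
        --------
        Without a profile, tokenization falls back to Unicode grapheme
        clusters (a minimal illustration):

        >>> Tokenizer()('abcd efg')
        'a b c d # e f g'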

        """
        res = []
        for word in string.split():
            if ipa:
                res.append(self.combine_modifiers(self.grapheme_clusters(nfd(word))))
            else:
                if self.op:
                    res.append(
                        self.transform(word, column=column, error=self._errors[errors]))
                else:
                    res.append(self.grapheme_clusters(nfd(word)))

        def pp(word):
            res = segment_separator.join(word).strip()
            res = self._rules.apply(res) if self._rules else res
            return unicodedata.normalize(form, res) if form else res

        return separator.join(pp(word) for word in res)

    def characters(self, string, segment_separator=' ', separator=' # ') -> str:
        """
        Given a string as input, return a space-delimited string of Unicode
        characters (code points rendered as glyphs).

        Parameters
        ----------
        string : str
            A Unicode string to be tokenized into characters.

        Returns
        -------
        result : str
            String returned is space-delimited on Unicode characters and
            contains "#" to mark word boundaries. The string is in NFD.

        Notes
        -----
        Input is first normalized according to Normalization Form
        D(ecomposition).
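
        Examples
        --------
        A minimal illustration (ASCII input, so NFD normalization is a no-op):

        >>> Tokenizer().characters('abc def')
        'a b c # d e f'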
        """
        return separator.join(segment_separator.join(word) for word in nfd(string).split())

    def grapheme_clusters(self, word):
        """
        See: Unicode Standard Annex #29: UNICODE TEXT SEGMENTATION
        http://www.unicode.org/reports/tr29/

        Given a string as input, return a list of Unicode grapheme clusters
        using the "\\X" regular expression.

        Parameters
        ----------
        word : str
            A Unicode string to be tokenized into grapheme clusters.

        Returns
        -------
        result : list
            List of Unicode grapheme clusters, in NFD provided the input was
            NFD-normalized (as it is for all calls within this module).
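
        Examples
        --------
        A base character followed by a combining mark counts as a single
        cluster (illustrative; the input is o + COMBINING DIAERESIS + "ma"):

        >>> len(Tokenizer().grapheme_clusters('o\\u0308ma'))
        3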
        """
        # match Unicode grapheme clusters with the precompiled "\X" pattern
        return grapheme_pattern.findall(word)

    def transform(self, word, column=Profile.GRAPHEME_COL, error=errors.replace):
        """
        Transform a string's graphemes into the mappings given in a different
        column of the orthography profile.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The label of the column to transform to. The default tokenizes
            against the orthography profile's Grapheme column.

        Returns
        -------
        result : list
            Result of the transformation.
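
        Examples
        --------
        Assuming a profile with an additional IPA column (hypothetical data,
        for illustration only):

        >>> prf = Profile({'Grapheme': 'uu', 'IPA': 'u:'}, {'Grapheme': 'b', 'IPA': 'b'})
        >>> Tokenizer(profile=prf).transform('uub', column='IPA')
        ['u:', 'b']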
        """
        assert self.op, 'method can only be called with orthography profile.'

        if column != Profile.GRAPHEME_COL and column not in self.op.column_labels:
            raise ValueError("Column {0} not found in profile.".format(column))

        word = self.op.tree.parse(word, error)
        if column == Profile.GRAPHEME_COL:
            return word
        out = []
        for token in word:
            try:
                target = self.op.graphemes[token][column]
            except KeyError:
                # tokens with no entry in the requested column are handled
                # with the 'replace' strategy
                target = self._errors['replace'](token)
            if target is not None:
                if isinstance(target, (tuple, list)):
                    out.extend(target)
                else:
                    out.append(target)
        return out

    def rules(self, word):
        """
        Apply the orthography rules (if any) to the input string and return
        the result.

        Parameters
        ----------
        word : str
            The input string.

        Returns
        -------
        result : str
            Result of the orthography rules applied to the input str.
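
        Examples
        --------
        Without a rules file, the input is returned unchanged (illustrative):

        >>> Tokenizer().rules('a n a')
        'a n a'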
        """
        return self._rules.apply(word) if self._rules else word

    def combine_modifiers(self, graphemes):
        """
        Given a list of Unicode grapheme clusters, group Unicode modifier
        letters with their preceding base characters, deal with tie bars, etc.

        Parameters
        ----------
        graphemes : list
            A list of Unicode grapheme clusters, as returned by
            grapheme_clusters, to be tokenized into simple IPA.

        Returns
        -------
        result : list
            List of segments with modifier letters, stress marks and tie bars
            attached to their base characters.
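
        Examples
        --------
        A modifier letter (here MODIFIER LETTER SMALL H, U+02B0) is attached
        to the preceding base character (illustrative):

        >>> Tokenizer().combine_modifiers(['t', 'ʰ', 'a'])
        ['tʰ', 'a']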
        """
        result = []
        temp = ""
        count = len(graphemes)
        for grapheme in reversed(graphemes):
            count -= 1
            # combine modifier letters (category Lm), except the stress marks
            # U+02C8 (712) and U+02CC (716), with their base character
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm" \
                    and not ord(grapheme) in [712, 716] and len(graphemes) > 1:
                temp = grapheme + temp
                # hack for the cases where a space modifier is the first
                # character in the string
                if count == 0:
                    result[-1] = temp + result[-1]
                continue  # pragma: no cover

            # catch and repair stress marks
            if len(grapheme) == 1 and (ord(grapheme) in [712, 716]) and result:
                # If result == [], there's nothing to combine with ...
                result[-1] = grapheme + result[-1]
                temp = ""
                continue

            # combine contour tone marks (non-accents, category Sk)
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Sk":
                if len(result) == 0:
                    result.append(grapheme)
                    temp = ""
                    continue
                else:
                    if unicodedata.category(result[-1][0]) == "Sk":
                        result[-1] = grapheme + temp + result[-1]
                        temp = ""
                        continue

            result.append(grapheme + temp)
            temp = ""

        # last check for tie bars: U+0361 (865) and U+035C (860); a tie bar
        # is assumed to be followed by another segment to combine with
        segments = result[::-1]
        i = 0
        r = []
        while i < len(segments):
            # tie bars
            if ord(segments[i][-1]) in [865, 860]:
                r.append(segments[i] + segments[i + 1])
                i += 2
            else:
                r.append(segments[i])
                i += 1
        return r