2025-12-01

2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,6 @@
+from segments.tokenizer import Tokenizer, Rules  # noqa: F401
+from segments.profile import Profile  # noqa: F401
+from segments.util import REPLACEMENT_MARKER  # noqa: F401
+
+# Version string of the package.
+__version__ = '2.3.0'
+# Names exported by ``from segments import *``. REPLACEMENT_MARKER is imported above and
+# therefore importable from the package, but it is not listed here.
+__all__ = ['Tokenizer', 'Profile', 'Rules']
@@ -0,0 +1,80 @@
+import sys
+import logging
+import pathlib
+import argparse
+
+from segments import Tokenizer, Profile
+
+
+class ParserError(Exception):
+    pass
+
+
+def tokenize(args):
+    """
+    Tokenize a string (passed as argument or read from stdin)
+
+    segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
+    """
+    if args.profile and not pathlib.Path(args.profile).exists():  # pragma: no cover
+        raise ParserError('--profile must be a path for an existing file')
+    print(Tokenizer(profile=args.profile)(_read(args), column=args.mapping))
+
+
+def profile(args):
+    """
+    Create an orthography profile for a string (passed as argument or read from stdin)
+
+    segments profile [STRING]
+    """
+    print(Profile.from_text(_read(args)))
+
+
+def _read(args):
+    string = args.args[0] if args.args else sys.stdin.read()
+    if not isinstance(string, str):
+        string = string.decode(args.encoding)
+    return string.strip()
+
+
+def main(parsed_args=None):
+    commands = {'tokenize': tokenize, 'profile': profile}
+    logging.basicConfig()
+    parser = argparse.ArgumentParser(
+        description="Main command line interface of the segments package.",
+        epilog="Use '%(prog)s help <cmd>' to get help about individual commands.")
+    parser.add_argument("--verbosity", help="increase output verbosity")
+    parser.add_argument('command', help=' | '.join(commands))
+    parser.add_argument('args', nargs=argparse.REMAINDER)
+    parser.add_argument("--encoding", help='input encoding', default="utf8")
+    parser.add_argument("--profile", help='path to an orthography profile', default=None)
+    parser.add_argument(
+        "--mapping",
+        help='column name in ortho profile to map graphemes',
+        default=Profile.GRAPHEME_COL)
+
+    args = parsed_args or parser.parse_args()
+    if args.command == 'help' and len(args.args):
+        # As help text for individual commands we simply re-use the docstrings of the
+        # callables registered for the command:
+        print(commands[args.args[0]].__doc__.strip()
+              if args.args[0] in commands else "Invalid command: '{}'".format(args.args[0]))
+    else:
+        if args.command not in commands:
+            print('invalid command')
+            parser.print_help()
+            sys.exit(64)
+        try:
+            commands[args.command](args)
+        except ParserError as e:
+            print(e)
+            print(commands[args.command].__doc__.strip())
+            sys.exit(64)
+        except Exception as e:  # pragma: no cover
+            print(e)
+            sys.exit(1)
+    sys.exit(0)
+
+
+# Run the command line interface when this module is executed as a script.
+if __name__ == '__main__':  # pragma: no cover
+    main()
@@ -0,0 +1,23 @@
+"""
+Default implementations for error handlers
+"""
+import logging
+
+from segments.util import REPLACEMENT_MARKER
+
+# Module-level logger; the handlers below report each offending grapheme at debug level.
+log = logging.getLogger(__name__)
+
+
+def strict(c):
+    log.debug('invalid grapheme: {0}'.format(c))
+    raise ValueError('invalid grapheme')
+
+
+def replace(c):
+    log.debug('replacing grapheme: {0}'.format(c))
+    return REPLACEMENT_MARKER
+
+
+def ignore(c):
+    log.debug('ignoring grapheme: {0}'.format(c))
+    return ''
@@ -0,0 +1,165 @@
+import copy
+import typing
+import logging
+import pathlib
+import warnings
+import collections
+import unicodedata
+import json.decoder
+
+from csvw import TableGroup, Column
+
+from segments.tree import Tree
+from segments.util import grapheme_pattern
+
+
+class Profile:
+    """
+    An Orthography Profile as specified by Moran and Cysouw 2018.
+    """
+    GRAPHEME_COL = 'Grapheme'
+    NULL = "NULL"
+    MD = {
+        "tables": [
+            {
+                "dialect": {
+                    "delimiter": "\t",
+                    "header": True,
+                    "encoding": "utf-8"
+                },
+                "tableSchema": {
+                    "columns": [
+                        {
+                            "name": GRAPHEME_COL,
+                            "datatype": "string",
+                            "required": True
+                        }
+                    ],
+                    "primaryKey": GRAPHEME_COL
+                }
+            }
+        ]
+    }
+
+    @classmethod
+    def default_metadata(cls, fname=None) -> dict:
+        md = copy.copy(cls.MD)
+        md['tables'][0]['url'] = str(fname or '')
+        return md
+
+    def __init__(self, *specs: dict, **kw):
+        """
+
+        Parameters
+        ----------
+        specs : list of dict
+            A list of grapheme specifications.
+        kw :
+            The following keyword arguments are recognized:
+            - fname: Path of the profile or profile metadata.
+            - form: Unicode normalization to apply to the data in the profile before use.
+            - remaining keyword arguments are assigned as dict to `Profile.metadata`.
+        """
+        self.graphemes = collections.OrderedDict()
+        self.column_labels = set()
+        self.fname = kw.pop('fname', None)
+        self.form = kw.pop('form', None)
+        self.metadata = kw
+
+        log = logging.getLogger(__name__)
+        for i, spec in enumerate(specs):
+            if self.GRAPHEME_COL not in spec:
+                raise ValueError('invalid grapheme specification')
+
+            if self.form:
+                spec = {
+                    unicodedata.normalize(self.form, k):
+                        None if v is None else unicodedata.normalize(self.form, v)
+                    for k, v in spec.items()}
+
+            grapheme = spec.pop(self.GRAPHEME_COL)
+            if not grapheme:
+                raise ValueError('Grapheme must not be empty')
+
+            self.column_labels = self.column_labels.union(spec.keys())
+
+            # check for duplicates in the orthography profile (fail if dups)
+            if grapheme not in self.graphemes:
+                self.graphemes[grapheme] = spec
+            else:
+                log.warning(
+                    'line {0}:duplicate grapheme in profile: {1}'.format(i + 2, grapheme))
+        self.tree = Tree(list(self.graphemes.keys()))
+
+    def iteritems(self) -> typing.Generator[dict, None, None]:
+        for grapheme, spec in self.graphemes.items():
+            res = {self.GRAPHEME_COL: grapheme}
+            res.update({k: None for k in self.column_labels})
+            res.update({k: v for k, v in spec.items()})
+            yield res
+
+    @classmethod
+    def from_file(cls, fname, form=None) -> 'Profile':
+        """
+        Read an orthography profile from a metadata file or a default tab-separated profile file.
+        """
+        try:
+            tg = TableGroup.from_file(fname)
+            opfname = None
+        except json.decoder.JSONDecodeError:
+            tg = TableGroup.fromvalue(cls.default_metadata(fname))
+            opfname = fname
+        if len(tg.tables) != 1:  # pragma: no cover
+            raise ValueError('profile description must contain exactly one table')
+        metadata = tg.common_props
+        metadata.update(fname=pathlib.Path(fname), form=form)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            res = cls(
+                *[{k: None if (k != cls.GRAPHEME_COL and v == cls.NULL) else v
+                   for k, v in d.items()}
+                  for d in tg.tables[0].iterdicts(fname=opfname)],
+                **metadata)
+        return res
+
+    @classmethod
+    def from_text(cls, text: str, mapping='mapping') -> 'Profile':
+        """
+        Create a Profile instance from the Unicode graphemes found in `text`.
+
+        Parameters
+        ----------
+        text
+        mapping
+
+        Returns
+        -------
+        A Profile instance.
+
+        """
+        graphemes = collections.Counter(grapheme_pattern.findall(text))
+        specs = [
+            collections.OrderedDict([
+                (cls.GRAPHEME_COL, grapheme),
+                ('frequency', frequency),
+                (mapping, grapheme)])
+            for grapheme, frequency in graphemes.most_common()]
+        return cls(*specs)
+
+    @classmethod
+    def from_textfile(cls, fname, mapping='mapping') -> 'Profile':
+        with pathlib.Path(fname).open(encoding='utf-8') as fp:
+            lines = fp.readlines()
+            return cls.from_text(' '.join(lines), mapping=mapping)
+
+    def __str__(self):
+        """
+        A Profile is represented as tab-separated lines of grapheme specifications.
+        """
+        tg = TableGroup.fromvalue(self.default_metadata())
+        for col in self.column_labels:
+            if col != self.GRAPHEME_COL:
+                tg.tables[0].tableSchema.columns.append(
+                    Column.fromvalue({"name": col, "null": self.NULL}))
+
+        return tg.tables[0].write(self.iteritems(), fname=None).decode('utf8').strip()
@@ -0,0 +1,355 @@
+"""
+Tokenizer of Unicode characters, grapheme clusters and tailored grapheme clusters
+(of orthographies) given an orthography profile.
+"""
+import typing
+import pathlib
+import unicodedata
+
+import regex
+from csvw.dsv import reader
+
+from segments.util import nfd, grapheme_pattern
+from segments import errors
+from segments.profile import Profile
+
+
+def iterlines(p: typing.Union[pathlib.Path, str]) -> typing.Generator[str, None, None]:
+    with pathlib.Path(p).open(encoding='utf-8') as fp:
+        for line in fp.readlines():
+            line = line.strip()
+            if line and not line.startswith('#'):
+                yield unicodedata.normalize('NFD', line)
+
+
+class Rules:
+    """
+    Rules are given in tuple format, comma delimited.
+    Regular expressions are given in Python syntax.
+    """
+    def __init__(self, *rules: typing.Tuple[str, str]):
+        self._rules = [(regex.compile(rule), replacement) for rule, replacement in rules]
+
+    @classmethod
+    def from_file(cls, fname) -> 'Rules':
+        return cls(*list(reader(list(iterlines(fname)))))
+
+    def apply(self, s):
+        for rule, replacement in self._rules:
+            s = rule.sub(replacement, s)
+        return s
+
+
+class Tokenizer:
+    """
+    Class for Unicode character and grapheme tokenization.
+
+    This class provides extended functionality for
+    orthography-specific tokenization with orthography profiles.
+
+    Parameters
+    ----------
+
+    profile : string or pathlib.Path or Profile instance (default = None)
+        Specifies an orthography profile to use.
+
+    rules : string (default = None)
+        Filename of a rules file.
+
+    Notes
+    -----
+    The tokenizer can be used for pure Unicode character and grapheme
+    tokenization, i.e. it uses the Unicode standard grapheme parsing rules, as
+    implemented in the Python regex package by Matthew Barnett, to do basic tokenization
+    with the "\\X" grapheme regular expression match. This grapheme match
+    combines one or more Combining Diacritical Marks to their base character.
+    These are called "grapheme clusters" in Unicode parlance. With these functions
+    the Tokenizer is meant to do basic rudimentary parsing for things like generating
+    unigram models (segments and their counts) from input data.
+
+    When a profile is passed, the Tokenizer reads the orthography profile and calls a helper
+    class to build a tree data structure, which stores the possible Unicode
+    character combinations that are specified in the orthography profile
+    (i.e. tailored grapheme clusters) that appear in the data source.
+
+    For example, an orthography profile might specify that in source X
+    <uu> is a single grapheme (Unicode parlance: tailored grapheme) and
+    therefore it should be chunked as so. Given an orthography profile and
+    some data to tokenize, the process would look like this:
+
+    input string example: uubo uubo
+    output string example: uu b o # uu b o
+
+    >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
+    >>> t = Tokenizer(profile=prf)
+    >>> t('uubo uubo')
+    'uu b o # uu b o'
+
+    See also the test orthography profile and rules in the test directory.
+
+    An additional method "combine_modifiers" handles the case where there are
+    Unicode Spacing Modifier Letters, which are not explicitly
+    combined to their base character in the Unicode Standard. These graphemes
+    are called "Tailored grapheme clusters" in Unicode. For more information
+    see the Unicode Standard Annex #29: Unicode Text Segmentation:
+
+    * http://www.unicode.org/reports/tr29/
+
+    Additionally, the Tokenizer provides functionality to transform graphemes
+    into associated character(s) specified in additional columns in the orthography
+    profile. A dictionary is created that keeps a mapping between source-specific
+    graphemes and their counterparts (e.g. an IPA column in the orthography profile).
+
+    Lastly, the Tokenizer can be used to transform text as specified in an
+    orthography rules file. These transformations are specified in a separate
+    file from the orthography profile (that specifics the document specific graphemes,
+    and possibly their IPA counterparts) and the orthography rules should
+    be applied to the output of a grapheme tokenization.
+
+    In an orthography rules file, rules are given in order as regular
+    expressions, e.g. this rule replaces a vowel followed by an <n>
+    followed by <space> followed by a second vowel with first vowel
+    <space> <n> <space> second vowel, e.g.::
+
+        $ (a|á|e|é|i|í|o|ó|u|ú)(n)(\\s)(a|á|e|é|i|í|o|ó|u|ú), \\1 \\2 \\4
+
+    """
+    def __init__(self,
+                 profile=None,
+                 rules=None,
+                 errors_strict: typing.Callable[[str], typing.Optional[str]] = errors.strict,
+                 errors_replace: typing.Callable[[str], typing.Optional[str]] = errors.replace,
+                 errors_ignore: typing.Callable[[str], typing.Optional[str]] = errors.ignore):
+        self.op = None
+        if isinstance(profile, Profile):
+            self.op = profile
+        elif profile is not None:
+            self.op = Profile.from_file(profile)
+        if not rules and self.op and self.op.fname:
+            _rules = self.op.fname.parent / (self.op.fname.stem + '.rules')
+            if _rules.exists():
+                rules = _rules
+        self._rules = Rules.from_file(rules) if rules else None
+        self._errors = {
+            'strict': errors_strict,
+            'replace': errors_replace,
+            'ignore': errors_ignore,
+        }
+
+    def __call__(self,
+                 string: str,
+                 column: str = Profile.GRAPHEME_COL,
+                 form: typing.Optional[typing.Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None,
+                 ipa: bool = False,
+                 segment_separator=' ',
+                 separator=' # ',
+                 errors: typing.Literal['replace', 'strict', 'ignore'] = 'replace') -> str:
+        """
+        The main task of a Tokenizer is tokenizing! This is what happens when called.
+
+        This function determines what to do given any combination
+        of orthography profile and rules or not orthography profile
+        or rules.
+
+        Parameters
+        ----------
+        string : str
+            The input string to be tokenized.
+
+        column : str (default = "graphemes")
+            The column label for the transformation, if specified.
+
+        form : None or unicode normalization form
+            Normalize return value if form is not None.
+
+        ipa : bool
+            Tokenize IPA (work in progress)
+
+        Returns
+        -------
+        result : str
+            Result of the tokenization.
+
+        """
+        res = []
+        for word in string.split():
+            if ipa:
+                res.append(self.combine_modifiers(self.grapheme_clusters(nfd(word))))
+            else:
+                if self.op:
+                    res.append(
+                        self.transform(word, column=column, error=self._errors[errors]))
+                else:
+                    res.append(self.grapheme_clusters(nfd(word)))
+
+        def pp(word):
+            res = segment_separator.join(word).strip()
+            res = self._rules.apply(res) if self._rules else res
+            return unicodedata.normalize(form, res) if form else res
+
+        return separator.join(pp(word) for word in res)
+
+    def characters(self, string, segment_separator=' ', separator=' # ',) -> str:
+        """
+        Given a string as input, return a space-delimited string of Unicode characters
+        (code points rendered as glyphs).
+        Parameters
+        ----------
+        string : str
+            A Unicode string to be tokenized into graphemes.
+        Returns
+        -------
+        result : str
+            String returned is space-delimited on Unicode characters and contains "#" to
+            mark word boundaries.
+            The string is in NFD.
+        Notes
+        -----
+        Input is first normalized according to Normalization Ford D(ecomposition).
+        String returned contains "#" to mark word boundaries.
+        """
+        return separator.join(segment_separator.join(word) for word in nfd(string).split())
+
+    def grapheme_clusters(self, word):
+        """
+        See: Unicode Standard Annex #29: UNICODE TEXT SEGMENTATION
+        http://www.unicode.org/reports/tr29/
+
+        Given a string as input, return a list of Unicode graphemes using the
+        "\\X" regular expression.
+
+        Parameters
+        ----------
+        word : str
+            A Unicode string to be tokenized into graphemes.
+
+        Returns
+        -------
+        result : list
+            List of Unicode graphemes in NFD.
+
+        """
+        # init the regex Unicode grapheme cluster match
+        return grapheme_pattern.findall(word)
+
+    def transform(self, word, column=Profile.GRAPHEME_COL, error=errors.replace):
+        """
+        Transform a string's graphemes into the mappings given in a different column
+        in the orthography profile.
+
+        Parameters
+        ----------
+        word : str
+            The input string to be tokenized.
+
+        column : str (default = "Grapheme")
+            The label of the column to transform to. Default it to tokenize with
+            orthography profile.
+
+        Returns
+        -------
+        result : list of lists
+            Result of the transformation.
+
+        """
+        assert self.op, 'method can only be called with orthography profile.'
+
+        if column != Profile.GRAPHEME_COL and column not in self.op.column_labels:
+            raise ValueError("Column {0} not found in profile.".format(column))
+
+        word = self.op.tree.parse(word, error)
+        if column == Profile.GRAPHEME_COL:
+            return word
+        out = []
+        for token in word:
+            try:
+                target = self.op.graphemes[token][column]
+            except KeyError:
+                target = self._errors['replace'](token)
+            if target is not None:
+                if isinstance(target, (tuple, list)):
+                    out.extend(target)
+                else:
+                    out.append(target)
+        return out
+
+    def rules(self, word):
+        """
+        Function to tokenize input string and return output of str with ortho rules
+        applied.
+
+        Parameters
+        ----------
+        word : str
+            The input string to be tokenized.
+
+        Returns
+        -------
+        result : str
+            Result of the orthography rules applied to the input str.
+
+        """
+        return self._rules.apply(word) if self._rules else word
+
+    def combine_modifiers(self, graphemes):
+        """
+        Given a string that is space-delimited on Unicode grapheme clusters,
+        group Unicode modifier letters with their preceding base characters,
+        deal with tie bars, etc.
+
+        Parameters
+        ----------
+        string : str
+            A Unicode string tokenized into grapheme clusters to be tokenized into simple
+            IPA.
+
+        """
+        result = []
+        temp = ""
+        count = len(graphemes)
+        for grapheme in reversed(graphemes):
+            count -= 1
+            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm" \
+                    and not ord(grapheme) in [712, 716] and len(graphemes) > 1:
+                temp = grapheme + temp
+                # hack for the cases where a space modifier is the first character in the
+                # string
+                if count == 0:
+                    result[-1] = temp + result[-1]
+                continue  # pragma: no cover
+
+            # catch and repair stress marks
+            if len(grapheme) == 1 and (ord(grapheme) in [712, 716]) and result:
+                # If result == [], there's nothing to combine with ...
+                result[-1] = grapheme + result[-1]
+                temp = ""
+                continue
+
+            # combine contour tone marks (non-accents)
+            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Sk":
+                if len(result) == 0:
+                    result.append(grapheme)
+                    temp = ""
+                    continue
+                else:
+                    if unicodedata.category(result[-1][0]) == "Sk":
+                        result[-1] = grapheme + temp + result[-1]
+                        temp = ""
+                        continue
+
+            result.append(grapheme + temp)
+            temp = ""
+
+        # last check for tie bars
+        segments = result[::-1]
+        i = 0
+        r = []
+        while i < len(segments):
+            # tie bars
+            if ord(segments[i][-1]) in [865, 860]:
+                r.append(segments[i] + segments[i + 1])
+                i += 2
+            else:
+                r.append(segments[i])
+                i += 1
+        return r
@@ -0,0 +1,68 @@
+from segments.errors import replace
+
+
+class TreeNode:
+    """
+    Private class that creates the tree data structure from the orthography profile for
+    parsing.
+    """
+
+    def __init__(self, char, sentinel=False):
+        self.char = char
+        self.children = {}
+        self.sentinel = sentinel
+
+
+class Tree:
+    def __init__(self, graphemes):
+        def _multigraph(node, line):
+            # Internal function to add a multigraph starting at node.
+            for char in line:
+                node = node.children.setdefault(char, TreeNode(char))
+            node.sentinel = True
+
+        self.root = TreeNode('', sentinel=True)
+        for grapheme in graphemes:
+            _multigraph(self.root, grapheme)
+
+    def parse(self, line, error=replace):
+        res, idx = self._parse(self.root, line, 0)
+        rem = line[idx:]
+        while rem:
+            # Chop off one character and try parsing the remainder:
+            res.append(error(rem[0]))
+            rem = rem[1:]
+            r, i = self._parse(self.root, rem, 0)
+            res.extend(r)
+            rem = rem[i:]
+        return res
+
+    def _parse(self, root, line, idx):
+        """
+        :param root: Tree node.
+        :param line: String to parse.
+        :param idx: Global counter of characters parsed.
+        :return: (list of parsed graphemes, incremented character count)
+        """
+        # Base (or degenerate..) case.
+        if len(line) == 0:
+            return [], idx
+
+        parse = []
+        curr = 0
+        node = root
+        cidx = idx
+        while curr < len(line):
+            node = node.children.get(line[curr])
+            curr += 1
+            if not node:
+                break
+            if node.sentinel:
+                subparse, cidx = self._parse(root, line[curr:], idx + curr)
+                # Always keep the latest valid parse, which will be
+                # the longest-matched (greedy match) graphemes.
+                parse = [line[:curr]]
+                parse.extend(subparse)
+        if parse:
+            idx = cidx
+        return parse, idx
@@ -0,0 +1,8 @@
+import functools
+import unicodedata
+
+import regex
+
+# Marker which the `replace` error handler of `segments.errors` returns in place of a
+# grapheme.
+REPLACEMENT_MARKER = '�'
+# Shortcut for Unicode normalization of a string to form NFD.
+nfd = functools.partial(unicodedata.normalize, 'NFD')
+# Compiled pattern for "\X", which the tokenizer uses to match Unicode grapheme clusters.
+grapheme_pattern = regex.compile(r"\X", regex.UNICODE)