# blender-portable-repo/extensions/.local/lib/python3.11/site-packages/segments/profile.py

import copy
import typing
import logging
import pathlib
import warnings
import collections
import unicodedata
import json.decoder

from csvw import TableGroup, Column

from segments.tree import Tree
from segments.util import grapheme_pattern


class Profile:
    """
    An Orthography Profile as specified by Moran and Cysouw 2018.

    A profile maps graphemes (keys of `Profile.graphemes`) to a dict of additional
    column values; the set of all additional column names is kept in
    `Profile.column_labels`, and `Profile.tree` is a `Tree` built from the graphemes.
    """
    # Name of the mandatory column holding the grapheme.
    GRAPHEME_COL = 'Grapheme'
    # Marker used in profile files for "no value".
    NULL = "NULL"
    # Template of the CSVW metadata describing a default tab-separated profile.
    # Never hand this object out directly - use `default_metadata`, which copies it.
    MD = {
        "tables": [
            {
                "dialect": {
                    "delimiter": "\t",
                    "header": True,
                    "encoding": "utf-8"
                },
                "tableSchema": {
                    "columns": [
                        {
                            "name": GRAPHEME_COL,
                            "datatype": "string",
                            "required": True
                        }
                    ],
                    "primaryKey": GRAPHEME_COL
                }
            }
        ]
    }

    @classmethod
    def default_metadata(cls, fname=None) -> dict:
        """
        Return a fresh copy of the default metadata with the table URL set to `fname`.

        Parameters
        ----------
        fname :
            Path of the profile file; an empty URL is used if omitted.

        Returns
        -------
        A metadata dict that is independent of `Profile.MD`.
        """
        # A deep copy is required: a shallow copy would share the nested table dict
        # with the class-level template, so setting 'url' would leak into `cls.MD`.
        md = copy.deepcopy(cls.MD)
        md['tables'][0]['url'] = str(fname or '')
        return md

    def __init__(self, *specs: dict, **kw):
        """

        Parameters
        ----------
        specs : list of dict
            A list of grapheme specifications.
        kw :
            The following keyword arguments are recognized:
            - fname: Path of the profile or profile metadata.
            - form: Unicode normalization to apply to the data in the profile before use.
            - remaining keyword arguments are assigned as dict to `Profile.metadata`.

        Raises
        ------
        ValueError
            If a specification lacks the grapheme column or its grapheme is empty.
        """
        self.graphemes = collections.OrderedDict()
        self.column_labels = set()
        self.fname = kw.pop('fname', None)
        self.form = kw.pop('form', None)
        self.metadata = kw

        log = logging.getLogger(__name__)
        for i, spec in enumerate(specs):
            if self.GRAPHEME_COL not in spec:
                raise ValueError('invalid grapheme specification')

            if self.form:
                # Normalize keys and string values; None and other non-string
                # values (e.g. integer frequencies) are passed through unchanged.
                spec = {
                    unicodedata.normalize(self.form, k):
                        unicodedata.normalize(self.form, v) if isinstance(v, str) else v
                    for k, v in spec.items()}
            else:
                # Work on a copy, so popping the grapheme below does not modify
                # the dict passed in by the caller.
                spec = dict(spec)

            grapheme = spec.pop(self.GRAPHEME_COL)
            if not grapheme:
                raise ValueError('Grapheme must not be empty')

            self.column_labels.update(spec)

            # Duplicate graphemes are not fatal: the first specification wins and
            # a warning is logged for each later one.
            if grapheme not in self.graphemes:
                self.graphemes[grapheme] = spec
            else:
                # i + 2 presumably maps the 0-based spec index to a 1-based file
                # line number after a header row.
                log.warning(
                    'line {0}:duplicate grapheme in profile: {1}'.format(i + 2, grapheme))
        self.tree = Tree(list(self.graphemes.keys()))

    def iteritems(self) -> typing.Generator[dict, None, None]:
        """
        Yield one dict per grapheme, with a key for the grapheme column and for every
        column label; columns not specified for a grapheme have the value `None`.
        """
        for grapheme, spec in self.graphemes.items():
            res = {self.GRAPHEME_COL: grapheme}
            res.update(dict.fromkeys(self.column_labels))
            res.update(spec)
            yield res

    @classmethod
    def from_file(cls, fname, form=None) -> 'Profile':
        """
        Read an orthography profile from a metadata file or a default tab-separated profile file.

        Parameters
        ----------
        fname :
            Path of a CSVW metadata file or of a tab-separated profile file.
        form :
            Unicode normalization form to apply to the profile data.

        Raises
        ------
        ValueError
            If the metadata describes a number of tables other than one.
        """
        try:
            tg = TableGroup.from_file(fname)
            opfname = None
        except json.decoder.JSONDecodeError:
            # Not a JSON metadata file: treat `fname` as the profile itself and
            # describe it with the default metadata.
            tg = TableGroup.fromvalue(cls.default_metadata(fname))
            opfname = fname
        if len(tg.tables) != 1:  # pragma: no cover
            raise ValueError('profile description must contain exactly one table')
        metadata = tg.common_props
        metadata.update(fname=pathlib.Path(fname), form=form)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # The NULL marker is turned into None for all columns but the grapheme
            # column, where "NULL" is kept as literal grapheme.
            res = cls(
                *[{k: None if (k != cls.GRAPHEME_COL and v == cls.NULL) else v
                   for k, v in d.items()}
                  for d in tg.tables[0].iterdicts(fname=opfname)],
                **metadata)
        return res

    @classmethod
    def from_text(cls, text: str, mapping='mapping') -> 'Profile':
        """
        Create a Profile instance from the Unicode graphemes found in `text`.

        Parameters
        ----------
        text :
            The text to extract graphemes from.
        mapping :
            Name of the column in which each grapheme is mapped to itself.

        Returns
        -------
        A Profile instance, with graphemes ordered by descending frequency and an
        additional `frequency` column.

        """
        graphemes = collections.Counter(grapheme_pattern.findall(text))
        specs = [
            collections.OrderedDict([
                (cls.GRAPHEME_COL, grapheme),
                ('frequency', frequency),
                (mapping, grapheme)])
            for grapheme, frequency in graphemes.most_common()]
        return cls(*specs)

    @classmethod
    def from_textfile(cls, fname, mapping='mapping') -> 'Profile':
        """
        Create a Profile instance from the graphemes found in the UTF-8 encoded text
        file `fname`; see `from_text`.
        """
        with pathlib.Path(fname).open(encoding='utf-8') as fp:
            lines = fp.readlines()
            return cls.from_text(' '.join(lines), mapping=mapping)

    def __str__(self):
        """
        A Profile is represented as tab-separated lines of grapheme specifications.
        """
        tg = TableGroup.fromvalue(self.default_metadata())
        # `column_labels` is a set; sort it so the column order of the output is
        # deterministic and does not depend on string hash randomization.
        for col in sorted(self.column_labels):
            if col != self.GRAPHEME_COL:
                tg.tables[0].tableSchema.columns.append(
                    Column.fromvalue({"name": col, "null": self.NULL}))

        return tg.tables[0].write(self.iteritems(), fname=None).decode('utf8').strip()