Files
blender-portable-repo/extensions/.local/lib/python3.11/site-packages/segments/profile.py
T
2026-03-17 14:58:51 -06:00

166 lines
5.4 KiB
Python

import copy
import typing
import logging
import pathlib
import warnings
import collections
import unicodedata
import json.decoder
from csvw import TableGroup, Column
from segments.tree import Tree
from segments.util import grapheme_pattern
class Profile:
    """
    An Orthography Profile as specified by Moran and Cysouw 2018.

    A profile maps graphemes to a dict of additional column values. Instances expose:

    - `graphemes`: ordered mapping of grapheme -> dict of the remaining columns.
    - `column_labels`: set of all non-grapheme column names seen in the specs.
    - `tree`: a `segments.tree.Tree` built from the list of graphemes.
    - `fname`, `form`, `metadata`: taken from the keyword arguments of `__init__`.
    """
    GRAPHEME_COL = 'Grapheme'
    NULL = "NULL"
    # Template for the CSVW metadata describing a default, tab-separated profile.
    # Must never be mutated; use `default_metadata` to get a private copy.
    MD = {
        "tables": [
            {
                "dialect": {
                    "delimiter": "\t",
                    "header": True,
                    "encoding": "utf-8"
                },
                "tableSchema": {
                    "columns": [
                        {
                            "name": GRAPHEME_COL,
                            "datatype": "string",
                            "required": True
                        }
                    ],
                    "primaryKey": GRAPHEME_COL
                }
            }
        ]
    }

    @classmethod
    def default_metadata(cls, fname=None) -> dict:
        """
        Return a fresh copy of the default metadata with the table `url` set.

        Parameters
        ----------
        fname :
            Path of the profile file; stored as `str(fname)`, or `''` if falsy.

        Returns
        -------
        A dict that is independent of `cls.MD` and of previously returned dicts.
        """
        # A deep copy is required: the `url` is written into a nested dict, so a
        # shallow copy would leak the value into the shared class-level template.
        md = copy.deepcopy(cls.MD)
        md['tables'][0]['url'] = str(fname or '')
        return md

    def __init__(self, *specs: dict, **kw):
        """
        Parameters
        ----------
        specs : list of dict
            A list of grapheme specifications. The passed dicts are not modified.
        kw :
            The following keyword arguments are recognized:
            - fname: Path of the profile or profile metadata.
            - form: Unicode normalization to apply to the data in the profile before use.
            - remaining keyword arguments are assigned as dict to `Profile.metadata`.

        Raises
        ------
        ValueError
            If a specification lacks the grapheme column or its grapheme is empty.
        """
        self.graphemes = collections.OrderedDict()
        self.column_labels = set()
        self.fname = kw.pop('fname', None)
        self.form = kw.pop('form', None)
        self.metadata = kw

        log = logging.getLogger(__name__)
        for i, spec in enumerate(specs):
            if self.GRAPHEME_COL not in spec:
                raise ValueError('invalid grapheme specification')

            if self.form:
                spec = {
                    self._normalize(k): self._normalize(v) for k, v in spec.items()}
            else:
                # Work on a copy so that popping the grapheme below does not
                # strip the key from the caller's dict.
                spec = copy.copy(spec)

            grapheme = spec.pop(self.GRAPHEME_COL)
            if not grapheme:
                raise ValueError('Grapheme must not be empty')

            self.column_labels.update(spec)

            # Duplicate graphemes do not fail: the first occurrence wins and a
            # warning is logged for each later one.
            if grapheme not in self.graphemes:
                self.graphemes[grapheme] = spec
            else:
                # NOTE(review): `i + 2` presumably maps the 0-based spec index to a
                # 1-based file line after a header row -- confirm.
                log.warning(
                    'line {0}:duplicate grapheme in profile: {1}'.format(i + 2, grapheme))
        self.tree = Tree(list(self.graphemes.keys()))

    def _normalize(self, value):
        """
        Return `value` normalized to `self.form` if it is a string, else unchanged.

        Non-string values (e.g. `None` or integer frequencies) cannot be passed to
        `unicodedata.normalize` and are therefore returned as they are.
        """
        if isinstance(value, str):
            return unicodedata.normalize(self.form, value)
        return value

    def iteritems(self) -> typing.Generator[dict, None, None]:
        """
        Yield one dict per grapheme, holding the grapheme column and every known column label.

        Columns which a grapheme's specification does not provide are set to `None`.
        """
        for grapheme, spec in self.graphemes.items():
            res = {self.GRAPHEME_COL: grapheme}
            res.update({k: None for k in self.column_labels})
            res.update(spec)
            yield res

    @classmethod
    def from_file(cls, fname, form=None) -> 'Profile':
        """
        Read an orthography profile from a metadata file or a default tab-separated profile file.

        Parameters
        ----------
        fname :
            Path of a CSVW metadata (JSON) file or of a tab-separated profile.
        form :
            Unicode normalization form passed on to `Profile.__init__`.

        Raises
        ------
        ValueError
            If the profile description does not contain exactly one table.
        """
        try:
            tg = TableGroup.from_file(fname)
            opfname = None
        except json.decoder.JSONDecodeError:
            # Not a JSON metadata file: read `fname` itself as the data file,
            # described by the default metadata.
            tg = TableGroup.fromvalue(cls.default_metadata(fname))
            opfname = fname
        if len(tg.tables) != 1:  # pragma: no cover
            raise ValueError('profile description must contain exactly one table')
        metadata = tg.common_props
        metadata.update(fname=pathlib.Path(fname), form=form)
        # Warnings raised while reading the table are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # The literal NULL marker is turned into `None`, except in the grapheme column.
            res = cls(
                *[{k: None if (k != cls.GRAPHEME_COL and v == cls.NULL) else v
                   for k, v in d.items()}
                  for d in tg.tables[0].iterdicts(fname=opfname)],
                **metadata)
        return res

    @classmethod
    def from_text(cls, text: str, mapping='mapping') -> 'Profile':
        """
        Create a Profile instance from the Unicode graphemes found in `text`.

        Parameters
        ----------
        text
            The text to collect graphemes from.
        mapping
            Name of the column which gets the grapheme itself as value.

        Returns
        -------
        A Profile instance with graphemes ordered by decreasing frequency, each
        with a `frequency` column and a `mapping` column.
        """
        graphemes = collections.Counter(grapheme_pattern.findall(text))
        specs = [
            collections.OrderedDict([
                (cls.GRAPHEME_COL, grapheme),
                ('frequency', frequency),
                (mapping, grapheme)])
            for grapheme, frequency in graphemes.most_common()]
        return cls(*specs)

    @classmethod
    def from_textfile(cls, fname, mapping='mapping') -> 'Profile':
        """
        Create a Profile instance from the graphemes found in the UTF-8 encoded text file `fname`.

        The lines of the file (including their line endings) are joined with a
        space and passed to `Profile.from_text`.
        """
        with pathlib.Path(fname).open(encoding='utf-8') as fp:
            lines = fp.readlines()
        return cls.from_text(' '.join(lines), mapping=mapping)

    def __str__(self):
        """
        A Profile is represented as tab-separated lines of grapheme specifications.

        Additional columns are written in sorted order of their labels, so the
        representation is stable across interpreter runs.
        """
        tg = TableGroup.fromvalue(self.default_metadata())
        # `column_labels` is a set; iterating it directly would make the column
        # order depend on string hash randomization.
        for col in sorted(self.column_labels):
            if col != self.GRAPHEME_COL:
                tg.tables[0].tableSchema.columns.append(
                    Column.fromvalue({"name": col, "null": self.NULL}))
        return tg.tables[0].write(self.iteritems(), fname=None).decode('utf8').strip()