166 lines
5.4 KiB
Python
166 lines
5.4 KiB
Python
import copy
|
|
import typing
|
|
import logging
|
|
import pathlib
|
|
import warnings
|
|
import collections
|
|
import unicodedata
|
|
import json.decoder
|
|
|
|
from csvw import TableGroup, Column
|
|
|
|
from segments.tree import Tree
|
|
from segments.util import grapheme_pattern
|
|
|
|
|
|
class Profile:
    """
    An Orthography Profile as specified by Moran and Cysouw 2018.

    A profile maps graphemes to per-grapheme specifications (arbitrary extra columns).

    Attributes
    ----------
    graphemes : collections.OrderedDict
        Maps each grapheme to a dict of its remaining columns, in insertion order.
    column_labels : set
        Names of all non-grapheme columns seen in any specification.
    fname :
        Value of the `fname` keyword argument (or None).
    form :
        Unicode normalization form applied to the specifications (or None).
    metadata : dict
        All remaining keyword arguments passed to the constructor.
    tree : Tree
        A `segments.tree.Tree` built from the list of graphemes.
    """
    GRAPHEME_COL = 'Grapheme'
    NULL = "NULL"
    # Template for the CSVW metadata describing a bare tab-separated profile.
    # Treat as read-only; use `default_metadata` to obtain a private copy.
    MD = {
        "tables": [
            {
                "dialect": {
                    "delimiter": "\t",
                    "header": True,
                    "encoding": "utf-8"
                },
                "tableSchema": {
                    "columns": [
                        {
                            "name": GRAPHEME_COL,
                            "datatype": "string",
                            "required": True
                        }
                    ],
                    "primaryKey": GRAPHEME_COL
                }
            }
        ]
    }

    @classmethod
    def default_metadata(cls, fname=None) -> dict:
        """
        Return a fresh copy of the default metadata with the table `url` set to `fname`.

        The copy is deep, so neither setting `url` here nor later changes made by the
        caller leak into the shared class-level `MD` template.
        """
        md = copy.deepcopy(cls.MD)
        md['tables'][0]['url'] = str(fname or '')
        return md

    @staticmethod
    def _normalize(form, value):
        """Apply Unicode normalization `form` to `value` if it is a string; else return it as is."""
        if isinstance(value, str):
            return unicodedata.normalize(form, value)
        return value

    def __init__(self, *specs: dict, **kw):
        """

        Parameters
        ----------
        specs : list of dict
            A list of grapheme specifications. The passed dicts are not modified.
        kw :
            The following keyword arguments are recognized:
            - fname: Path of the profile or profile metadata.
            - form: Unicode normalization to apply to the data in the profile before use.
            - remaining keyword arguments are assigned as dict to `Profile.metadata`.

        Raises
        ------
        ValueError
            If a specification lacks the grapheme column or its grapheme is empty.
        """
        self.graphemes = collections.OrderedDict()
        self.column_labels = set()
        self.fname = kw.pop('fname', None)
        self.form = kw.pop('form', None)
        self.metadata = kw

        log = logging.getLogger(__name__)
        for i, spec in enumerate(specs):
            if self.GRAPHEME_COL not in spec:
                raise ValueError('invalid grapheme specification')

            if self.form:
                # Normalize textual keys and values; None and other non-string
                # values (e.g. integer frequencies) are passed through unchanged.
                spec = {
                    self._normalize(self.form, k): self._normalize(self.form, v)
                    for k, v in spec.items()}
            else:
                # Work on a copy so that popping the grapheme below does not
                # alter the dict owned by the caller.
                spec = dict(spec)

            grapheme = spec.pop(self.GRAPHEME_COL)
            if not grapheme:
                raise ValueError('Grapheme must not be empty')

            self.column_labels = self.column_labels.union(spec.keys())

            # Duplicate graphemes are not fatal: the first specification wins and
            # later ones are skipped with a warning. `i + 2` is the reported line
            # number (presumably 1-based counting plus a header line).
            if grapheme not in self.graphemes:
                self.graphemes[grapheme] = spec
            else:
                log.warning(
                    'line {0}:duplicate grapheme in profile: {1}'.format(i + 2, grapheme))
        self.tree = Tree(list(self.graphemes.keys()))

    def iteritems(self) -> typing.Generator[dict, None, None]:
        """
        Yield one dict per grapheme, containing the grapheme column and every known
        column label; columns a grapheme does not specify are set to None.
        """
        for grapheme, spec in self.graphemes.items():
            res = {self.GRAPHEME_COL: grapheme}
            res.update(dict.fromkeys(self.column_labels))
            res.update(spec)
            yield res

    @classmethod
    def from_file(cls, fname, form=None) -> 'Profile':
        """
        Read an orthography profile from a metadata file or a default tab-separated profile file.

        `fname` is first read as a CSVW metadata (JSON) file. If it is not valid JSON, it
        is treated as the profile table itself and described by the default metadata.
        Cells equal to "NULL" in any column other than the grapheme column become None.

        Raises
        ------
        ValueError
            If the metadata does not describe exactly one table.
        """
        try:
            tg = TableGroup.from_file(fname)
            opfname = None
        except json.decoder.JSONDecodeError:
            tg = TableGroup.fromvalue(cls.default_metadata(fname))
            opfname = fname
        if len(tg.tables) != 1:  # pragma: no cover
            raise ValueError('profile description must contain exactly one table')
        metadata = tg.common_props
        metadata.update(fname=pathlib.Path(fname), form=form)
        # Warnings raised while reading the rows and building the profile are
        # deliberately suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            res = cls(
                *[{k: None if (k != cls.GRAPHEME_COL and v == cls.NULL) else v
                   for k, v in d.items()}
                  for d in tg.tables[0].iterdicts(fname=opfname)],
                **metadata)
        return res

    @classmethod
    def from_text(cls, text: str, mapping='mapping') -> 'Profile':
        """
        Create a Profile instance from the Unicode graphemes found in `text`.

        Parameters
        ----------
        text
            The text to collect graphemes from (using `grapheme_pattern`).
        mapping
            Name of the column which receives the grapheme itself as value.

        Returns
        -------
        A Profile instance with one specification per distinct grapheme, ordered by
        descending frequency, with the columns `frequency` and `mapping`.

        """
        graphemes = collections.Counter(grapheme_pattern.findall(text))
        specs = [
            collections.OrderedDict([
                (cls.GRAPHEME_COL, grapheme),
                ('frequency', frequency),
                (mapping, grapheme)])
            for grapheme, frequency in graphemes.most_common()]
        return cls(*specs)

    @classmethod
    def from_textfile(cls, fname, mapping='mapping') -> 'Profile':
        """
        Create a Profile instance from the graphemes found in the UTF-8 text file `fname`.

        The lines of the file are joined with a single space before being passed to
        `from_text`.
        """
        with pathlib.Path(fname).open(encoding='utf-8') as fp:
            lines = fp.readlines()
        return cls.from_text(' '.join(lines), mapping=mapping)

    def __str__(self):
        """
        A Profile is represented as tab-separated lines of grapheme specifications.

        The grapheme column comes first; the remaining columns are written in sorted
        order so that the output is the same from run to run.
        """
        tg = TableGroup.fromvalue(self.default_metadata())
        # `column_labels` is a set, whose iteration order is not stable across
        # interpreter runs; sort to get a deterministic column order.
        for col in sorted(self.column_labels):
            if col != self.GRAPHEME_COL:
                tg.tables[0].tableSchema.columns.append(
                    Column.fromvalue({"name": col, "null": self.NULL}))

        return tg.tables[0].write(self.iteritems(), fname=None).decode('utf8').strip()
|