2025-12-01
This commit is contained in:
@@ -0,0 +1,6 @@
|
||||
from segments.tokenizer import Tokenizer, Rules # noqa: F401
|
||||
from segments.profile import Profile # noqa: F401
|
||||
from segments.util import REPLACEMENT_MARKER # noqa: F401
|
||||
|
||||
__version__ = '2.3.0'
|
||||
__all__ = ['Tokenizer', 'Profile', 'Rules']
|
||||
@@ -0,0 +1,80 @@
|
||||
import sys
|
||||
import logging
|
||||
import pathlib
|
||||
import argparse
|
||||
|
||||
from segments import Tokenizer, Profile
|
||||
|
||||
|
||||
class ParserError(Exception):
    """Signals that a sub-command was invoked with unusable command line arguments."""
|
||||
|
||||
|
||||
def tokenize(args):
    """
    Tokenize a string (passed as argument or read from stdin)

    segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
    """
    # NOTE: the docstring above is printed verbatim as the help text of this command,
    # so it must stay in usage-message form.
    profile_path = args.profile
    if profile_path:
        if not pathlib.Path(profile_path).exists():  # pragma: no cover
            raise ParserError('--profile must be a path for an existing file')

    # Build the tokenizer first, then read the input, then print the tokenized result.
    tokenizer = Tokenizer(profile=profile_path)
    text = _read(args)
    print(tokenizer(text, column=args.mapping))
|
||||
|
||||
|
||||
def profile(args):
    """
    Create an orthography profile for a string (passed as argument or read from stdin)

    segments profile [STRING]
    """
    # NOTE: the docstring above is printed verbatim as the help text of this command.
    text = _read(args)
    generated = Profile.from_text(text)
    print(generated)
|
||||
|
||||
|
||||
def _read(args):
|
||||
string = args.args[0] if args.args else sys.stdin.read()
|
||||
if not isinstance(string, str):
|
||||
string = string.decode(args.encoding)
|
||||
return string.strip()
|
||||
|
||||
|
||||
def main(parsed_args=None):
    """
    Entry point of the command line interface.

    Parses the command line (unless an already parsed namespace is passed as `parsed_args`),
    dispatches to the selected command and always terminates via `sys.exit`:

    - 0 on success (including `help <cmd>`),
    - 64 for an unknown command or a `ParserError` raised by the command,
    - 1 for any other exception raised by the command.
    """
    commands = {'tokenize': tokenize, 'profile': profile}
    logging.basicConfig()
    parser = argparse.ArgumentParser(
        description="Main command line interface of the segments package.",
        epilog="Use '%(prog)s help <cmd>' to get help about individual commands.")

    # Arguments are registered in this exact order, since the order shows up in the usage text.
    argument_specs = [
        (("--verbosity",), dict(help="increase output verbosity")),
        (('command',), dict(help=' | '.join(commands))),
        (('args',), dict(nargs=argparse.REMAINDER)),
        (("--encoding",), dict(help='input encoding', default="utf8")),
        (("--profile",), dict(help='path to an orthography profile', default=None)),
        (("--mapping",), dict(
            help='column name in ortho profile to map graphemes',
            default=Profile.GRAPHEME_COL)),
    ]
    for names, options in argument_specs:
        parser.add_argument(*names, **options)

    args = parsed_args or parser.parse_args()

    if args.command == 'help' and len(args.args):
        # As help text for individual commands we simply re-use the docstrings of the
        # callables registered for the command:
        topic = args.args[0]
        if topic in commands:
            print(commands[topic].__doc__.strip())
        else:
            print("Invalid command: '{}'".format(topic))
        sys.exit(0)

    if args.command not in commands:
        print('invalid command')
        parser.print_help()
        sys.exit(64)

    handler = commands[args.command]
    try:
        handler(args)
    except ParserError as e:
        # Bad arguments for the command: show the problem followed by the command's usage.
        print(e)
        print(handler.__doc__.strip())
        sys.exit(64)
    except Exception as e:  # pragma: no cover
        print(e)
        sys.exit(1)
    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__': # pragma: no cover
|
||||
main()
|
||||
@@ -0,0 +1,23 @@
|
||||
"""
|
||||
Default implementations for error handlers
|
||||
"""
|
||||
import logging
|
||||
|
||||
from segments.util import REPLACEMENT_MARKER
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def strict(c):
    """Error handler that rejects the invalid grapheme `c` by raising `ValueError`."""
    message = 'invalid grapheme: {0}'.format(c)
    log.debug(message)
    raise ValueError('invalid grapheme')
|
||||
|
||||
|
||||
def replace(c):
    """Error handler that substitutes `REPLACEMENT_MARKER` for the invalid grapheme `c`."""
    message = 'replacing grapheme: {0}'.format(c)
    log.debug(message)
    return REPLACEMENT_MARKER
|
||||
|
||||
|
||||
def ignore(c):
    """Error handler that drops the invalid grapheme `c` by returning an empty string."""
    message = 'ignoring grapheme: {0}'.format(c)
    log.debug(message)
    return ''
|
||||
@@ -0,0 +1,165 @@
|
||||
import copy
|
||||
import typing
|
||||
import logging
|
||||
import pathlib
|
||||
import warnings
|
||||
import collections
|
||||
import unicodedata
|
||||
import json.decoder
|
||||
|
||||
from csvw import TableGroup, Column
|
||||
|
||||
from segments.tree import Tree
|
||||
from segments.util import grapheme_pattern
|
||||
|
||||
|
||||
class Profile:
    """
    An Orthography Profile as specified by Moran and Cysouw 2018.

    A profile keeps grapheme specifications in insertion order (`graphemes`), records the
    labels of all additional columns (`column_labels`) and builds a `Tree` of all graphemes
    (`tree`) which is used for parsing.
    """
    GRAPHEME_COL = 'Grapheme'
    NULL = "NULL"
    # Default table description: a tab-separated, UTF-8 encoded file with a header row and a
    # single required string column holding the grapheme, which is also the primary key.
    MD = {
        "tables": [
            {
                "dialect": {
                    "delimiter": "\t",
                    "header": True,
                    "encoding": "utf-8"
                },
                "tableSchema": {
                    "columns": [
                        {
                            "name": GRAPHEME_COL,
                            "datatype": "string",
                            "required": True
                        }
                    ],
                    "primaryKey": GRAPHEME_COL
                }
            }
        ]
    }

    @classmethod
    def default_metadata(cls, fname=None) -> dict:
        """
        Return a fresh copy of the default metadata with the table URL set to `fname`.

        Parameters
        ----------
        fname :
            Path of the profile file; an empty URL is used if it is falsy.

        Returns
        -------
        A new dict which can be modified without affecting `Profile.MD` or the result of
        any other call.
        """
        # A deep copy is required: the URL is written into a nested dict, which a shallow
        # copy would share with the class-level template (and with earlier results).
        md = copy.deepcopy(cls.MD)
        md['tables'][0]['url'] = str(fname or '')
        return md

    def __init__(self, *specs: dict, **kw):
        """

        Parameters
        ----------
        specs : list of dict
            A list of grapheme specifications. The passed dicts are not modified.
        kw :
            The following keyword arguments are recognized:
            - fname: Path of the profile or profile metadata.
            - form: Unicode normalization to apply to the data in the profile before use.
            - remaining keyword arguments are assigned as dict to `Profile.metadata`.

        Raises
        ------
        ValueError
            If a specification lacks the grapheme column or has an empty grapheme.
        """
        self.graphemes = collections.OrderedDict()
        self.column_labels = set()
        self.fname = kw.pop('fname', None)
        self.form = kw.pop('form', None)
        self.metadata = kw

        log = logging.getLogger(__name__)
        for i, spec in enumerate(specs):
            if self.GRAPHEME_COL not in spec:
                raise ValueError('invalid grapheme specification')

            if self.form:
                spec = {
                    unicodedata.normalize(self.form, k):
                        None if v is None else unicodedata.normalize(self.form, v)
                    for k, v in spec.items()}
            else:
                # Work on a copy, so popping the grapheme below does not alter the dict
                # owned by the caller (which would make the specs unusable a second time).
                spec = dict(spec)

            grapheme = spec.pop(self.GRAPHEME_COL)
            if not grapheme:
                raise ValueError('Grapheme must not be empty')

            self.column_labels.update(spec)

            # Duplicate graphemes are not an error: the first specification wins and a
            # warning is logged for every later one.
            if grapheme not in self.graphemes:
                self.graphemes[grapheme] = spec
            else:
                # `i + 2`: 1-based line number, plus one for the header line of a profile file.
                log.warning(
                    'line {0}:duplicate grapheme in profile: {1}'.format(i + 2, grapheme))
        self.tree = Tree(list(self.graphemes.keys()))

    def iteritems(self) -> typing.Generator[dict, None, None]:
        """
        Yield one dict per grapheme, with a key for the grapheme column and for every
        column label of the profile (`None` where the grapheme has no value).
        """
        for grapheme, spec in self.graphemes.items():
            res = {self.GRAPHEME_COL: grapheme}
            res.update(dict.fromkeys(self.column_labels))
            res.update(spec)
            yield res

    @classmethod
    def from_file(cls, fname, form=None) -> 'Profile':
        """
        Read an orthography profile from a metadata file or a default tab-separated profile file.

        Parameters
        ----------
        fname :
            Path of a metadata (JSON) file or of a tab-separated profile.
        form :
            Unicode normalization form to apply to the profile data.
        """
        try:
            tg = TableGroup.from_file(fname)
            opfname = None
        except json.decoder.JSONDecodeError:
            # Not JSON, so `fname` is taken to be the profile itself, described by the
            # default metadata.
            tg = TableGroup.fromvalue(cls.default_metadata(fname))
            opfname = fname
        if len(tg.tables) != 1:  # pragma: no cover
            raise ValueError('profile description must contain exactly one table')
        metadata = tg.common_props
        metadata.update(fname=pathlib.Path(fname), form=form)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # The NULL marker is turned into None for all columns except the grapheme column.
            res = cls(
                *[{k: None if (k != cls.GRAPHEME_COL and v == cls.NULL) else v
                   for k, v in d.items()}
                  for d in tg.tables[0].iterdicts(fname=opfname)],
                **metadata)
        return res

    @classmethod
    def from_text(cls, text: str, mapping='mapping') -> 'Profile':
        """
        Create a Profile instance from the Unicode graphemes found in `text`.

        Parameters
        ----------
        text
            The text to collect graphemes from.
        mapping
            Label of the column which is initialized with the grapheme itself.

        Returns
        -------
        A Profile instance with the graphemes ordered by descending frequency and the
        additional columns "frequency" and `mapping`.

        """
        graphemes = collections.Counter(grapheme_pattern.findall(text))
        specs = [
            collections.OrderedDict([
                (cls.GRAPHEME_COL, grapheme),
                ('frequency', frequency),
                (mapping, grapheme)])
            for grapheme, frequency in graphemes.most_common()]
        return cls(*specs)

    @classmethod
    def from_textfile(cls, fname, mapping='mapping') -> 'Profile':
        """
        Create a Profile instance from the graphemes found in the UTF-8 encoded text file
        `fname`; the lines of the file are joined with a space before analysis.
        """
        with pathlib.Path(fname).open(encoding='utf-8') as fp:
            lines = fp.readlines()
        return cls.from_text(' '.join(lines), mapping=mapping)

    def __str__(self):
        """
        A Profile is represented as tab-separated lines of grapheme specifications.
        """
        tg = TableGroup.fromvalue(self.default_metadata())
        for col in self.column_labels:
            if col != self.GRAPHEME_COL:
                tg.tables[0].tableSchema.columns.append(
                    Column.fromvalue({"name": col, "null": self.NULL}))

        return tg.tables[0].write(self.iteritems(), fname=None).decode('utf8').strip()
|
||||
@@ -0,0 +1,355 @@
|
||||
"""
|
||||
Tokenizer of Unicode characters, grapheme clusters and tailored grapheme clusters
|
||||
(of orthographies) given an orthography profile.
|
||||
"""
|
||||
import typing
|
||||
import pathlib
|
||||
import unicodedata
|
||||
|
||||
import regex
|
||||
from csvw.dsv import reader
|
||||
|
||||
from segments.util import nfd, grapheme_pattern
|
||||
from segments import errors
|
||||
from segments.profile import Profile
|
||||
|
||||
|
||||
def iterlines(p: typing.Union[pathlib.Path, str]) -> typing.Generator[str, None, None]:
    """
    Yield the meaningful lines of the UTF-8 encoded text file `p`.

    Lines are stripped of surrounding whitespace; empty lines and lines starting with "#"
    are skipped. Each yielded line is normalized to Unicode NFD.
    """
    with pathlib.Path(p).open(encoding='utf-8') as fp:
        # Iterate the file object directly instead of loading all lines up front.
        for line in fp:
            line = line.strip()
            if line and not line.startswith('#'):
                yield unicodedata.normalize('NFD', line)
|
||||
|
||||
|
||||
class Rules:
    """
    An ordered list of (regular expression, replacement) pairs.

    In a rules file, each rule is given as a comma-delimited pair; the regular expressions
    use Python syntax.
    """
    def __init__(self, *rules: typing.Tuple[str, str]):
        compiled = []
        for pattern, replacement in rules:
            compiled.append((regex.compile(pattern), replacement))
        self._rules = compiled

    @classmethod
    def from_file(cls, fname) -> 'Rules':
        """Create a `Rules` object from the rules file `fname` (see `iterlines`)."""
        lines = list(iterlines(fname))
        rows = reader(lines)
        return cls(*rows)

    def apply(self, s):
        """Return `s` after applying all rules, one after the other, in order."""
        result = s
        for pattern, replacement in self._rules:
            result = pattern.sub(replacement, result)
        return result
|
||||
|
||||
|
||||
class Tokenizer:
    """
    Class for Unicode character and grapheme tokenization.

    This class provides extended functionality for
    orthography-specific tokenization with orthography profiles.

    Parameters
    ----------

    profile : string or pathlib.Path or Profile instance (default = None)
        Specifies an orthography profile to use.

    rules : string (default = None)
        Filename of a rules file. If no rules are given, but the profile has a file name,
        a file with the same stem and the suffix ".rules" next to the profile is used if it
        exists.

    errors_strict, errors_replace, errors_ignore : callable
        The error handlers which are selected by the `errors` argument when calling the
        tokenizer. Each one is called with an unparsable character.

    Notes
    -----
    The tokenizer can be used for pure Unicode character and grapheme
    tokenization, i.e. it uses the Unicode standard grapheme parsing rules, as
    implemented in the Python regex package by Matthew Barnett, to do basic tokenization
    with the "\\X" grapheme regular expression match. This grapheme match
    combines one or more Combining Diacritical Marks to their base character.
    These are called "grapheme clusters" in Unicode parlance. With these functions
    the Tokenizer is meant to do basic rudimentary parsing for things like generating
    unigram models (segments and their counts) from input data.

    When a profile is passed, the Tokenizer reads the orthography profile and calls a helper
    class to build a tree data structure, which stores the possible Unicode
    character combinations that are specified in the orthography profile
    (i.e. tailored grapheme clusters) that appear in the data source.

    For example, an orthography profile might specify that in source X
    <uu> is a single grapheme (Unicode parlance: tailored grapheme) and
    therefore it should be chunked as so. Given an orthography profile and
    some data to tokenize, the process would look like this:

    input string example: uubo uubo
    output string example: uu b o # uu b o

    >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
    >>> t = Tokenizer(profile=prf)
    >>> t('uubo uubo')
    'uu b o # uu b o'

    See also the test orthography profile and rules in the test directory.

    An additional method "combine_modifiers" handles the case where there are
    Unicode Spacing Modifier Letters, which are not explicitly
    combined to their base character in the Unicode Standard. These graphemes
    are called "Tailored grapheme clusters" in Unicode. For more information
    see the Unicode Standard Annex #29: Unicode Text Segmentation:

    * http://www.unicode.org/reports/tr29/

    Additionally, the Tokenizer provides functionality to transform graphemes
    into associated character(s) specified in additional columns in the orthography
    profile. A dictionary is created that keeps a mapping between source-specific
    graphemes and their counterparts (e.g. an IPA column in the orthography profile).

    Lastly, the Tokenizer can be used to transform text as specified in an
    orthography rules file. These transformations are specified in a separate
    file from the orthography profile (that specifies the document specific graphemes,
    and possibly their IPA counterparts) and the orthography rules should
    be applied to the output of a grapheme tokenization.

    In an orthography rules file, rules are given in order as regular
    expressions, e.g. this rule replaces a vowel followed by an <n>
    followed by <space> followed by a second vowel with first vowel
    <space> <n> <space> second vowel, e.g.::

        $ (a|á|e|é|i|í|o|ó|u|ú)(n)(\\s)(a|á|e|é|i|í|o|ó|u|ú), \\1 \\2 \\4

    """
    def __init__(self,
                 profile=None,
                 rules=None,
                 errors_strict: typing.Callable[[str], typing.Optional[str]] = errors.strict,
                 errors_replace: typing.Callable[[str], typing.Optional[str]] = errors.replace,
                 errors_ignore: typing.Callable[[str], typing.Optional[str]] = errors.ignore):
        self.op = None
        if isinstance(profile, Profile):
            self.op = profile
        elif profile is not None:
            self.op = Profile.from_file(profile)
        if not rules and self.op and self.op.fname:
            # Look for "<profile stem>.rules" next to the profile. The file name is wrapped
            # in a Path, because a Profile may have been created with a plain string.
            fname = pathlib.Path(self.op.fname)
            _rules = fname.parent / (fname.stem + '.rules')
            if _rules.exists():
                rules = _rules
        self._rules = Rules.from_file(rules) if rules else None
        self._errors = {
            'strict': errors_strict,
            'replace': errors_replace,
            'ignore': errors_ignore,
        }

    def __call__(self,
                 string: str,
                 column: str = Profile.GRAPHEME_COL,
                 form: typing.Optional[typing.Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None,
                 ipa: bool = False,
                 segment_separator=' ',
                 separator=' # ',
                 errors: typing.Literal['replace', 'strict', 'ignore'] = 'replace') -> str:
        """
        The main task of a Tokenizer is tokenizing! This is what happens when called.

        This function determines what to do given any combination
        of orthography profile and rules or not orthography profile
        or rules.

        Parameters
        ----------
        string : str
            The input string to be tokenized. It is split into words on whitespace.

        column : str (default = "Grapheme")
            The column label for the transformation, if specified.

        form : None or unicode normalization form
            Normalize return value if form is not None.

        ipa : bool
            Tokenize IPA (work in progress)

        segment_separator : str
            String used to join the segments of a word.

        separator : str
            String used to join the tokenized words.

        errors : str
            Name of the error handler to use for unparsable characters when tokenizing
            with a profile.

        Returns
        -------
        result : str
            Result of the tokenization.

        """
        res = []
        for word in string.split():
            if ipa:
                res.append(self.combine_modifiers(self.grapheme_clusters(nfd(word))))
            else:
                if self.op:
                    res.append(
                        self.transform(word, column=column, error=self._errors[errors]))
                else:
                    res.append(self.grapheme_clusters(nfd(word)))

        def pp(word):
            # Join the segments of one word, then apply rules and normalization (if any).
            joined = segment_separator.join(word).strip()
            joined = self._rules.apply(joined) if self._rules else joined
            return unicodedata.normalize(form, joined) if form else joined

        return separator.join(pp(word) for word in res)

    def characters(self, string, segment_separator=' ', separator=' # ') -> str:
        """
        Given a string as input, return a space-delimited string of Unicode characters
        (code points rendered as glyphs).

        Parameters
        ----------
        string : str
            A Unicode string to be tokenized into characters.

        Returns
        -------
        result : str
            String returned is space-delimited on Unicode characters and contains "#" to
            mark word boundaries.
            The string is in NFD.

        Notes
        -----
        Input is first normalized according to Normalization Form D(ecomposition).
        String returned contains "#" to mark word boundaries.
        """
        return separator.join(segment_separator.join(word) for word in nfd(string).split())

    def grapheme_clusters(self, word):
        """
        See: Unicode Standard Annex #29: UNICODE TEXT SEGMENTATION
        http://www.unicode.org/reports/tr29/

        Given a string as input, return a list of Unicode graphemes using the
        "\\X" regular expression.

        Parameters
        ----------
        word : str
            A Unicode string to be tokenized into graphemes.

        Returns
        -------
        result : list
            List of the Unicode grapheme clusters found in `word`.

        """
        return grapheme_pattern.findall(word)

    def transform(self, word, column=Profile.GRAPHEME_COL, error=errors.replace):
        """
        Transform a string's graphemes into the mappings given in a different column
        in the orthography profile.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The label of the column to transform to. Default is to tokenize with the
            orthography profile.

        error : callable
            Error handler called with each character which cannot be parsed.

        Returns
        -------
        result : list
            The graphemes of `word`, or their counterparts in `column`. Graphemes without
            a value in `column` are dropped; list- or tuple-valued entries are flattened.

        Raises
        ------
        ValueError
            If `column` is not a column of the profile.

        """
        assert self.op, 'method can only be called with orthography profile.'

        if column != Profile.GRAPHEME_COL and column not in self.op.column_labels:
            raise ValueError("Column {0} not found in profile.".format(column))

        word = self.op.tree.parse(word, error)
        if column == Profile.GRAPHEME_COL:
            return word
        out = []
        for token in word:
            try:
                target = self.op.graphemes[token][column]
            except KeyError:
                # Tokens which are not graphemes of the profile (e.g. output of the error
                # handler) are passed to the "replace" handler.
                target = self._errors['replace'](token)
            if target is not None:
                if isinstance(target, (tuple, list)):
                    out.extend(target)
                else:
                    out.append(target)
        return out

    def rules(self, word):
        """
        Function to tokenize input string and return output of str with ortho rules
        applied.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        Returns
        -------
        result : str
            Result of the orthography rules applied to the input str; the unchanged input
            if the tokenizer has no rules.

        """
        return self._rules.apply(word) if self._rules else word

    def combine_modifiers(self, graphemes):
        """
        Given a list of Unicode grapheme clusters,
        group Unicode modifier letters with their preceding base characters,
        deal with tie bars, etc.

        Parameters
        ----------
        graphemes : list of str
            A Unicode string tokenized into grapheme clusters to be tokenized into simple
            IPA.

        Returns
        -------
        result : list of str
            The combined segments, in the order of the input.

        """
        # Code points of the stress marks U+02C8 and U+02CC, and of the tie bars
        # U+0361 and U+035C.
        stress_marks = (712, 716)
        tie_bars = (865, 860)

        result = []
        temp = ""
        count = len(graphemes)
        # The graphemes are processed back to front, so that modifiers are collected in
        # `temp` before the base character they attach to is seen.
        for grapheme in reversed(graphemes):
            count -= 1
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm" \
                    and ord(grapheme) not in stress_marks and len(graphemes) > 1:
                temp = grapheme + temp
                # hack for the cases where a space modifier is the first character in the
                # string
                if count == 0:
                    if result:
                        result[-1] = temp + result[-1]
                    else:
                        # The input consists of modifier letters only: there is nothing to
                        # attach them to, so they are kept as a segment of their own.
                        result.append(temp)
                continue  # pragma: no cover

            # catch and repair stress marks
            if len(grapheme) == 1 and (ord(grapheme) in stress_marks) and result:
                # If result == [], there's nothing to combine with ...
                result[-1] = grapheme + result[-1]
                temp = ""
                continue

            # combine contour tone marks (non-accents)
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Sk":
                if len(result) == 0:
                    result.append(grapheme)
                    temp = ""
                    continue
                else:
                    if unicodedata.category(result[-1][0]) == "Sk":
                        result[-1] = grapheme + temp + result[-1]
                        temp = ""
                        continue

            result.append(grapheme + temp)
            temp = ""

        # last check for tie bars
        segments = result[::-1]
        i = 0
        r = []
        while i < len(segments):
            # A segment ending in a tie bar is merged with the following segment. A tie bar
            # at the very end has nothing to tie to and is left as is.
            if ord(segments[i][-1]) in tie_bars and i + 1 < len(segments):
                r.append(segments[i] + segments[i + 1])
                i += 2
            else:
                r.append(segments[i])
                i += 1
        return r
|
||||
@@ -0,0 +1,68 @@
|
||||
from segments.errors import replace
|
||||
|
||||
|
||||
class TreeNode:
    """
    Node of the tree built from the graphemes of an orthography profile.

    Attributes: `char` is the character the node stands for, `children` maps the next
    character to the child node, and `sentinel` tells whether a grapheme ends at this node.
    """

    def __init__(self, char, sentinel=False):
        self.char, self.children, self.sentinel = char, {}, sentinel
|
||||
|
||||
|
||||
class Tree:
    """
    Tree (trie) of graphemes, used to split a string into the longest matching graphemes.
    """

    def __init__(self, graphemes):
        """
        :param graphemes: Iterable of the graphemes (strings) to store in the tree.
        """
        self.root = TreeNode('', sentinel=True)
        for grapheme in graphemes:
            node = self.root
            for char in grapheme:
                # Only create a node if the character has not been seen at this position.
                child = node.children.get(char)
                if child is None:
                    child = node.children[char] = TreeNode(char)
                node = child
            # A grapheme ends here.
            node.sentinel = True

    def parse(self, line, error=replace):
        """
        Split `line` into graphemes, using the longest match at each position.

        :param line: String to parse.
        :param error: Callable which is called with each character at which no grapheme \
        matches; its return value is put into the result in place of the character.
        :return: list of graphemes (and error handler results)
        """
        res, idx = self._parse(self.root, line, 0)
        rem = line[idx:]
        while rem:
            # Chop off one character and try parsing the remainder:
            res.append(error(rem[0]))
            rem = rem[1:]
            r, i = self._parse(self.root, rem, 0)
            res.extend(r)
            rem = rem[i:]
        return res

    def _parse(self, root, line, idx):
        """
        Parse graphemes from the start of `line` until no grapheme matches anymore.

        :param root: Tree node.
        :param line: String to parse.
        :param idx: Global counter of characters parsed.
        :return: (list of parsed graphemes, incremented character count)
        """
        parse = []
        pos = 0
        size = len(line)
        # Iterative greedy matching: each position is visited once per matching attempt,
        # instead of re-parsing the remainder for every shorter grapheme recursively.
        while pos < size:
            node = root
            longest = 0
            cur = pos
            while cur < size:
                node = node.children.get(line[cur])
                if node is None:
                    break
                cur += 1
                if node.sentinel:
                    # Always keep the latest valid match, which will be
                    # the longest-matched (greedy match) grapheme.
                    longest = cur - pos
            if not longest:
                # No grapheme starts at this position: stop and let the caller decide.
                break
            parse.append(line[pos:pos + longest])
            pos += longest
        return parse, idx + pos
|
||||
@@ -0,0 +1,8 @@
|
||||
import functools
|
||||
import unicodedata
|
||||
|
||||
import regex
|
||||
|
||||
# Marker put in place of a grapheme by the `replace` error handler.
REPLACEMENT_MARKER = '�'
|
||||
# Shortcut for normalizing a string to Unicode Normalization Form D.
nfd = functools.partial(unicodedata.normalize, 'NFD')
|
||||
# Compiled pattern for the "\X" (grapheme cluster) match of the `regex` package.
grapheme_pattern = regex.compile(r"\X", regex.UNICODE)
|
||||
Reference in New Issue
Block a user