2025-12-01

This commit is contained in:
2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,6 @@
from segments.tokenizer import Tokenizer, Rules # noqa: F401
from segments.profile import Profile # noqa: F401
from segments.util import REPLACEMENT_MARKER # noqa: F401
# Version string of this package.
__version__ = '2.3.0'
# Names exported by a star-import of this module.
__all__ = ['Tokenizer', 'Profile', 'Rules']
@@ -0,0 +1,80 @@
import sys
import logging
import pathlib
import argparse
from segments import Tokenizer, Profile
class ParserError(Exception):
    """Signals that a command was invoked with unusable command line arguments."""
def tokenize(args):
    """
    Tokenize a string (passed as argument or read from stdin)
    segments [--profile=PATH/TO/PROFILE] tokenize [STRING]
    """
    # NOTE: the docstring above is printed verbatim as the help text of this command,
    # so it is kept in usage-message form.
    profile_path = args.profile
    if profile_path and not pathlib.Path(profile_path).exists():  # pragma: no cover
        raise ParserError('--profile must be a path for an existing file')
    # Build the tokenizer first, then read the input and map it to the requested column.
    tokenizer = Tokenizer(profile=profile_path)
    print(tokenizer(_read(args), column=args.mapping))
def profile(args):
    """
    Create an orthography profile for a string (passed as argument or read from stdin)
    segments profile [STRING]
    """
    # NOTE: the docstring above is printed verbatim as the help text of this command.
    text = _read(args)
    print(Profile.from_text(text))
def _read(args):
string = args.args[0] if args.args else sys.stdin.read()
if not isinstance(string, str):
string = string.decode(args.encoding)
return string.strip()
def main(parsed_args=None):
    """
    Entry point of the command line interface.

    Parses the command line (unless pre-parsed arguments are passed in), dispatches to
    the requested command and terminates the process via `sys.exit`:
    0 on success, 64 for an unknown command or a `ParserError`, 1 for any other error.
    """
    commands = {'tokenize': tokenize, 'profile': profile}
    logging.basicConfig()

    parser = argparse.ArgumentParser(
        description="Main command line interface of the segments package.",
        epilog="Use '%(prog)s help <cmd>' to get help about individual commands.")
    parser.add_argument("--verbosity", help="increase output verbosity")
    parser.add_argument('command', help=' | '.join(commands))
    parser.add_argument('args', nargs=argparse.REMAINDER)
    parser.add_argument("--encoding", help='input encoding', default="utf8")
    parser.add_argument("--profile", help='path to an orthography profile', default=None)
    parser.add_argument(
        "--mapping",
        help='column name in ortho profile to map graphemes',
        default=Profile.GRAPHEME_COL)
    args = parsed_args or parser.parse_args()

    if args.command == 'help' and len(args.args):
        # As help text for individual commands we simply re-use the docstrings of the
        # callables registered for the command:
        topic = args.args[0]
        if topic in commands:
            message = commands[topic].__doc__.strip()
        else:
            message = "Invalid command: '{}'".format(topic)
        print(message)
        sys.exit(0)

    if args.command not in commands:
        print('invalid command')
        parser.print_help()
        sys.exit(64)

    handler = commands[args.command]
    try:
        handler(args)
    except ParserError as e:
        # Bad arguments for this command: show the problem and the command's usage.
        print(e)
        print(handler.__doc__.strip())
        sys.exit(64)
    except Exception as e:  # pragma: no cover
        print(e)
        sys.exit(1)
    sys.exit(0)
# Run the command line interface when this module is executed as a script.
if __name__ == '__main__':  # pragma: no cover
    main()
@@ -0,0 +1,23 @@
"""
Default implementations for error handlers
"""
import logging
from segments.util import REPLACEMENT_MARKER
# Module-level logger shared by the error handlers below.
log = logging.getLogger(__name__)


def strict(c):
    """
    Error handler that rejects an invalid grapheme.

    Parameters
    ----------
    c : the offending grapheme (only used for the debug log message).

    Raises
    ------
    ValueError
        Always.
    """
    # Lazy %-style arguments: the message is only formatted if DEBUG logging is enabled.
    log.debug('invalid grapheme: %s', c)
    raise ValueError('invalid grapheme')


def replace(c):
    """
    Error handler that substitutes `REPLACEMENT_MARKER` for an invalid grapheme.

    Parameters
    ----------
    c : the offending grapheme (only used for the debug log message).

    Returns
    -------
    The module-level `REPLACEMENT_MARKER`.
    """
    log.debug('replacing grapheme: %s', c)
    return REPLACEMENT_MARKER


def ignore(c):
    """
    Error handler that drops an invalid grapheme.

    Parameters
    ----------
    c : the offending grapheme (only used for the debug log message).

    Returns
    -------
    The empty string.
    """
    log.debug('ignoring grapheme: %s', c)
    return ''
@@ -0,0 +1,165 @@
import copy
import typing
import logging
import pathlib
import warnings
import collections
import unicodedata
import json.decoder
from csvw import TableGroup, Column
from segments.tree import Tree
from segments.util import grapheme_pattern
class Profile:
    """
    An Orthography Profile as specified by Moran and Cysouw 2018.

    A profile maps graphemes to a dict of further column values. Graphemes are kept in
    insertion order in `self.graphemes`; `self.tree` is a `Tree` built from them.
    """
    # Name of the mandatory column holding the grapheme itself.
    GRAPHEME_COL = 'Grapheme'
    # Cell value that is read as (and written for) a missing value in non-grapheme columns.
    NULL = "NULL"
    # Metadata template describing a bare tab-separated profile file.
    # Treat as read-only; use `default_metadata()` to obtain a private copy.
    MD = {
        "tables": [
            {
                "dialect": {
                    "delimiter": "\t",
                    "header": True,
                    "encoding": "utf-8"
                },
                "tableSchema": {
                    "columns": [
                        {
                            "name": GRAPHEME_COL,
                            "datatype": "string",
                            "required": True
                        }
                    ],
                    "primaryKey": GRAPHEME_COL
                }
            }
        ]
    }

    @classmethod
    def default_metadata(cls, fname=None) -> dict:
        """
        Return a fresh copy of the default metadata with the table url set to `fname`.

        The copy must be deep: the url is written into a nested dict, and a shallow copy
        would leak that write into the shared class-level `MD` template.
        """
        md = copy.deepcopy(cls.MD)
        md['tables'][0]['url'] = str(fname or '')
        return md

    def __init__(self, *specs: dict, **kw):
        """
        Parameters
        ----------
        specs : list of dict
            A list of grapheme specifications.
        kw :
            The following keyword arguments are recognized:
            - fname: Path of the profile or profile metadata.
            - form: Unicode normalization to apply to the data in the profile before use.
            - remaining keyword arguments are assigned as dict to `Profile.metadata`.

        Raises
        ------
        ValueError
            If a specification lacks the grapheme column or has an empty grapheme.
        """
        self.graphemes = collections.OrderedDict()
        self.column_labels = set()
        self.fname = kw.pop('fname', None)
        self.form = kw.pop('form', None)
        self.metadata = kw
        log = logging.getLogger(__name__)
        for i, spec in enumerate(specs):
            if self.GRAPHEME_COL not in spec:
                raise ValueError('invalid grapheme specification')
            if self.form:
                # Only text can be normalized; None and other non-string values pass through.
                spec = {
                    unicodedata.normalize(self.form, k):
                        unicodedata.normalize(self.form, v) if isinstance(v, str) else v
                    for k, v in spec.items()}
            else:
                # Work on a copy so that popping the grapheme below does not mutate the
                # caller's dict.
                spec = dict(spec)
            grapheme = spec.pop(self.GRAPHEME_COL)
            if not grapheme:
                raise ValueError('Grapheme must not be empty')
            self.column_labels.update(spec)
            # Duplicate graphemes are not fatal: the first specification wins and a
            # warning is logged. `i + 2` accounts for 0-based counting plus a header line.
            if grapheme not in self.graphemes:
                self.graphemes[grapheme] = spec
            else:
                log.warning(
                    'line {0}:duplicate grapheme in profile: {1}'.format(i + 2, grapheme))
        self.tree = Tree(list(self.graphemes.keys()))

    def iteritems(self) -> typing.Generator[dict, None, None]:
        """
        Yield one dict per grapheme, with the grapheme column plus every known column label.

        Columns a grapheme has no value for are filled with None.
        """
        for grapheme, spec in self.graphemes.items():
            res = {self.GRAPHEME_COL: grapheme}
            res.update(dict.fromkeys(self.column_labels))
            res.update(spec)
            yield res

    @classmethod
    def from_file(cls, fname, form=None) -> 'Profile':
        """
        Read an orthography profile from a metadata file or a default tab-separated profile file.

        Parameters
        ----------
        fname : path of a JSON metadata file or of the profile table itself.
        form : optional Unicode normalization form applied to the profile data.

        Raises
        ------
        ValueError
            If the metadata does not describe exactly one table.
        """
        try:
            tg = TableGroup.from_file(fname)
            opfname = None
        except json.decoder.JSONDecodeError:
            # Not JSON metadata: treat `fname` as the table and describe it with the defaults.
            tg = TableGroup.fromvalue(cls.default_metadata(fname))
            opfname = fname
        if len(tg.tables) != 1:  # pragma: no cover
            raise ValueError('profile description must contain exactly one table')
        metadata = tg.common_props
        metadata.update(fname=pathlib.Path(fname), form=form)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # The NULL marker is mapped to None in every column except the grapheme column.
            res = cls(
                *[{k: None if (k != cls.GRAPHEME_COL and v == cls.NULL) else v
                   for k, v in d.items()}
                  for d in tg.tables[0].iterdicts(fname=opfname)],
                **metadata)
        return res

    @classmethod
    def from_text(cls, text: str, mapping='mapping') -> 'Profile':
        """
        Create a Profile instance from the Unicode graphemes found in `text`.

        Parameters
        ----------
        text
            The text to collect graphemes from.
        mapping
            Name of the column that maps each grapheme to itself.

        Returns
        -------
        A Profile instance with a 'frequency' and a `mapping` column, graphemes ordered by
        descending frequency.
        """
        graphemes = collections.Counter(grapheme_pattern.findall(text))
        specs = [
            collections.OrderedDict([
                (cls.GRAPHEME_COL, grapheme),
                ('frequency', frequency),
                (mapping, grapheme)])
            for grapheme, frequency in graphemes.most_common()]
        return cls(*specs)

    @classmethod
    def from_textfile(cls, fname, mapping='mapping') -> 'Profile':
        """
        Create a Profile from the graphemes in the UTF-8 encoded text file `fname`.

        The lines of the file are joined with a space before being passed to `from_text`.
        """
        with pathlib.Path(fname).open(encoding='utf-8') as fp:
            lines = fp.readlines()
        return cls.from_text(' '.join(lines), mapping=mapping)

    def __str__(self):
        """
        A Profile is represented as tab-separated lines of grapheme specifications.
        """
        tg = TableGroup.fromvalue(self.default_metadata())
        for col in self.column_labels:
            if col != self.GRAPHEME_COL:
                tg.tables[0].tableSchema.columns.append(
                    Column.fromvalue({"name": col, "null": self.NULL}))
        return tg.tables[0].write(self.iteritems(), fname=None).decode('utf8').strip()
@@ -0,0 +1,355 @@
"""
Tokenizer of Unicode characters, grapheme clusters and tailored grapheme clusters
(of orthographies) given an orthography profile.
"""
import typing
import pathlib
import unicodedata
import regex
from csvw.dsv import reader
from segments.util import nfd, grapheme_pattern
from segments import errors
from segments.profile import Profile
def iterlines(p: typing.Union[pathlib.Path, str]) -> typing.Generator[str, None, None]:
    """
    Yield the meaningful lines of the UTF-8 encoded text file `p`.

    Lines are stripped of surrounding whitespace; empty lines and lines starting with
    '#' are skipped. Each yielded line is normalized to Unicode NFD.
    """
    with pathlib.Path(p).open(encoding='utf-8') as fp:
        # Iterate the file object directly so lines are read lazily instead of
        # loading the whole file into a list first.
        for line in fp:
            line = line.strip()
            if line and not line.startswith('#'):
                yield unicodedata.normalize('NFD', line)
class Rules:
    """
    Rules are given in tuple format, comma delimited.
    Regular expressions are given in Python syntax.

    Each rule is a (pattern, replacement) pair; rules are applied in the order given.
    """

    def __init__(self, *rules: typing.Tuple[str, str]):
        compiled = []
        for pattern, replacement in rules:
            compiled.append((regex.compile(pattern), replacement))
        self._rules = compiled

    @classmethod
    def from_file(cls, fname) -> 'Rules':
        """Build a `Rules` instance from the delimited rules file `fname`."""
        lines = list(iterlines(fname))
        rows = list(reader(lines))
        return cls(*rows)

    def apply(self, s):
        """Return `s` after substituting every rule in turn."""
        result = s
        for pattern, replacement in self._rules:
            result = pattern.sub(replacement, result)
        return result
class Tokenizer:
    """
    Class for Unicode character and grapheme tokenization.

    This class provides extended functionality for
    orthography-specific tokenization with orthography profiles.

    Parameters
    ----------
    profile : string or pathlib.Path or Profile instance (default = None)
        Specifies an orthography profile to use.
    rules : string (default = None)
        Filename of a rules file.
    errors_strict, errors_replace, errors_ignore : callable
        Handlers called with an unparsable character; selected by the `errors`
        argument of `__call__` ('strict', 'replace' or 'ignore').

    Notes
    -----
    The tokenizer can be used for pure Unicode character and grapheme
    tokenization, i.e. it uses the Unicode standard grapheme parsing rules, as
    implemented in the Python regex package by Matthew Barnett, to do basic tokenization
    with the "\\X" grapheme regular expression match. This grapheme match
    combines one or more Combining Diacritical Marks to their base character.
    These are called "grapheme clusters" in Unicode parlance. With these functions
    the Tokenizer is meant to do basic rudimentary parsing for things like generating
    unigram models (segments and their counts) from input data.

    When a profile is passed, the Tokenizer reads the orthography profile and calls a helper
    class to build a tree data structure, which stores the possible Unicode
    character combinations that are specified in the orthography profile
    (i.e. tailored grapheme clusters) that appear in the data source.

    For example, an orthography profile might specify that in source X
    <uu> is a single grapheme (Unicode parlance: tailored grapheme) and
    therefore it should be chunked as so. Given an orthography profile and
    some data to tokenize, the process would look like this:

    input string example: uubo uubo
    output string example: uu b o # uu b o

    >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
    >>> t = Tokenizer(profile=prf)
    >>> t('uubo uubo')
    'uu b o # uu b o'

    See also the test orthography profile and rules in the test directory.

    An additional method "combine_modifiers" handles the case where there are
    Unicode Spacing Modifier Letters, which are not explicitly
    combined to their base character in the Unicode Standard. These graphemes
    are called "Tailored grapheme clusters" in Unicode. For more information
    see the Unicode Standard Annex #29: Unicode Text Segmentation:

    * http://www.unicode.org/reports/tr29/

    Additionally, the Tokenizer provides functionality to transform graphemes
    into associated character(s) specified in additional columns in the orthography
    profile. A dictionary is created that keeps a mapping between source-specific
    graphemes and their counterparts (e.g. an IPA column in the orthography profile).

    Lastly, the Tokenizer can be used to transform text as specified in an
    orthography rules file. These transformations are specified in a separate
    file from the orthography profile (that specifies the document specific graphemes,
    and possibly their IPA counterparts) and the orthography rules should
    be applied to the output of a grapheme tokenization.

    In an orthography rules file, rules are given in order as regular
    expressions, e.g. this rule replaces a vowel followed by an <n>
    followed by <space> followed by a second vowel with first vowel
    <space> <n> <space> second vowel, e.g.::

        $ (a|á|e|é|i|í|o|ó|u|ú)(n)(\\s)(a|á|e|é|i|í|o|ó|u|ú), \\1 \\2 \\4
    """

    def __init__(self,
                 profile=None,
                 rules=None,
                 errors_strict: typing.Callable[[str], typing.Optional[str]] = errors.strict,
                 errors_replace: typing.Callable[[str], typing.Optional[str]] = errors.replace,
                 errors_ignore: typing.Callable[[str], typing.Optional[str]] = errors.ignore):
        self.op = None
        if isinstance(profile, Profile):
            self.op = profile
        elif profile is not None:
            self.op = Profile.from_file(profile)
        if not rules and self.op and self.op.fname:
            # No explicit rules: pick up "<profile stem>.rules" next to the profile, if any.
            _rules = self.op.fname.parent / (self.op.fname.stem + '.rules')
            if _rules.exists():
                rules = _rules
        self._rules = Rules.from_file(rules) if rules else None
        self._errors = {
            'strict': errors_strict,
            'replace': errors_replace,
            'ignore': errors_ignore,
        }

    def __call__(self,
                 string: str,
                 column: str = Profile.GRAPHEME_COL,
                 form: typing.Optional[typing.Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None,
                 ipa: bool = False,
                 segment_separator=' ',
                 separator=' # ',
                 errors: typing.Literal['replace', 'strict', 'ignore'] = 'replace') -> str:
        """
        The main task of a Tokenizer is tokenizing! This is what happens when called.

        This function determines what to do given any combination
        of orthography profile and rules or not orthography profile
        or rules.

        Parameters
        ----------
        string : str
            The input string to be tokenized. It is split into words on whitespace.
        column : str (default = "Grapheme")
            The column label for the transformation, if specified.
        form : None or unicode normalization form
            Normalize return value if form is not None.
        ipa : bool
            Tokenize IPA (work in progress). When set, the profile is not consulted.
        segment_separator : str
            String placed between the segments of a word.
        separator : str
            String placed between words.
        errors : str
            Key of the error handler passed to the profile-based transformation.

        Returns
        -------
        result : str
            Result of the tokenization.
        """
        res = []
        for word in string.split():
            if ipa:
                res.append(self.combine_modifiers(self.grapheme_clusters(nfd(word))))
            else:
                if self.op:
                    res.append(
                        self.transform(word, column=column, error=self._errors[errors]))
                else:
                    res.append(self.grapheme_clusters(nfd(word)))

        def pp(word):
            # Join the segments of one word, then apply rules and output normalization.
            res = segment_separator.join(word).strip()
            res = self._rules.apply(res) if self._rules else res
            return unicodedata.normalize(form, res) if form else res

        return separator.join(pp(word) for word in res)

    def characters(self, string, segment_separator=' ', separator=' # ',) -> str:
        """
        Given a string as input, return a space-delimited string of Unicode characters
        (code points rendered as glyphs).

        Parameters
        ----------
        string : str
            A Unicode string to be tokenized into graphemes.

        Returns
        -------
        result : str
            String returned is space-delimited on Unicode characters and contains "#" to
            mark word boundaries.
            The string is in NFD.

        Notes
        -----
        Input is first normalized according to Normalization Form D(ecomposition).
        String returned contains "#" to mark word boundaries.
        """
        return separator.join(segment_separator.join(word) for word in nfd(string).split())

    def grapheme_clusters(self, word):
        """
        See: Unicode Standard Annex #29: UNICODE TEXT SEGMENTATION
        http://www.unicode.org/reports/tr29/

        Given a string as input, return a list of Unicode graphemes using the
        "\\X" regular expression.

        Parameters
        ----------
        word : str
            A Unicode string to be tokenized into graphemes.

        Returns
        -------
        result : list
            List of the grapheme clusters found in `word`; no normalization is applied
            here.
        """
        return grapheme_pattern.findall(word)

    def transform(self, word, column=Profile.GRAPHEME_COL, error=errors.replace):
        """
        Transform a string's graphemes into the mappings given in a different column
        in the orthography profile.

        Parameters
        ----------
        word : str
            The input string to be tokenized.
        column : str (default = "Grapheme")
            The label of the column to transform to. Default is to tokenize with
            orthography profile.
        error : callable
            Handler called for each character the profile cannot parse.

        Returns
        -------
        result : list
            The segments of `word`, mapped to `column` unless it is the grapheme column.

        Raises
        ------
        ValueError
            If `column` is not a column of the profile.
        """
        assert self.op, 'method can only be called with orthography profile.'
        if column != Profile.GRAPHEME_COL and column not in self.op.column_labels:
            raise ValueError("Column {0} not found in profile.".format(column))
        word = self.op.tree.parse(word, error)
        if column == Profile.GRAPHEME_COL:
            return word
        out = []
        for token in word:
            try:
                target = self.op.graphemes[token][column]
            except KeyError:
                # The token is not a profile grapheme (e.g. it is the output of an error
                # handler), so it has no mapping.
                target = self._errors['replace'](token)
            # A None mapping (NULL in the profile) removes the segment from the output.
            if target is not None:
                if isinstance(target, (tuple, list)):
                    out.extend(target)
                else:
                    out.append(target)
        return out

    def rules(self, word):
        """
        Function to tokenize input string and return output of str with ortho rules
        applied.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        Returns
        -------
        result : str
            Result of the orthography rules applied to the input str; the input itself
            when no rules are loaded.
        """
        return self._rules.apply(word) if self._rules else word

    def combine_modifiers(self, graphemes):
        """
        Given a list of Unicode grapheme clusters, group Unicode modifier letters with
        their preceding base characters, deal with tie bars, etc.

        Parameters
        ----------
        graphemes : list of str
            Grapheme clusters (as returned by `grapheme_clusters`) to be grouped into
            simple IPA segments.

        Returns
        -------
        result : list of str
            The combined segments, in input order.
        """
        result = []
        temp = ""  # modifier letters collected so far, waiting for their base character
        count = len(graphemes)
        # Walk backwards so that modifiers are seen before the base they attach to.
        for grapheme in reversed(graphemes):
            count -= 1
            # Modifier letters (category Lm) other than the stress marks U+02C8/U+02CC.
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm" \
                    and ord(grapheme) not in [712, 716] and len(graphemes) > 1:
                temp = grapheme + temp
                # hack for the cases where a space modifier is the first character in the
                # string
                if count == 0:
                    if result:
                        result[-1] = temp + result[-1]
                    else:
                        # The input consisted of modifier letters only: there is no
                        # segment to attach to, so keep them as a segment of their own.
                        result.append(temp)
                continue  # pragma: no cover
            # catch and repair stress marks
            if len(grapheme) == 1 and (ord(grapheme) in [712, 716]) and result:
                # If result == [], there's nothing to combine with ...
                # NOTE(review): modifiers pending in `temp` are discarded here.
                result[-1] = grapheme + result[-1]
                temp = ""
                continue
            # combine contour tone marks (non-accents)
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Sk":
                if len(result) == 0:
                    # NOTE(review): modifiers pending in `temp` are discarded here.
                    result.append(grapheme)
                    temp = ""
                    continue
                else:
                    if unicodedata.category(result[-1][0]) == "Sk":
                        result[-1] = grapheme + temp + result[-1]
                        temp = ""
                        continue
            result.append(grapheme + temp)
            temp = ""
        # last check for tie bars
        segments = result[::-1]
        i = 0
        r = []
        while i < len(segments):
            # A segment ending in a tie bar (U+0361 or U+035C) is joined with the next
            # one. A tie bar on the very last segment has nothing to join with and is
            # kept as is.
            if segments[i].endswith(('\u0361', '\u035c')) and i + 1 < len(segments):
                r.append(segments[i] + segments[i + 1])
                i += 2
            else:
                r.append(segments[i])
                i += 1
        return r
@@ -0,0 +1,68 @@
from segments.errors import replace
class TreeNode:
    """
    Private class that creates the tree data structure from the orthography profile for
    parsing.

    A node stores one character, its child nodes keyed by character, and a flag telling
    whether the path from the root to this node spells a complete grapheme.
    """

    def __init__(self, char, sentinel=False):
        # True when a grapheme ends at this node.
        self.sentinel = sentinel
        # Child nodes, keyed by their character.
        self.children = {}
        self.char = char
class Tree:
    """
    Prefix tree over the graphemes of an orthography profile, used to split strings
    into the longest matching graphemes.
    """

    def __init__(self, graphemes):
        def _multigraph(node, line):
            # Internal function to add a multigraph starting at node.
            for char in line:
                child = node.children.get(char)
                if child is None:
                    # Only allocate a node when the character is new at this level.
                    child = node.children[char] = TreeNode(char)
                node = child
            node.sentinel = True

        self.root = TreeNode('', sentinel=True)
        for grapheme in graphemes:
            _multigraph(self.root, grapheme)

    def parse(self, line, error=replace):
        """
        Split `line` into graphemes.

        :param line: String to parse.
        :param error: Callable invoked with each character at which no grapheme \
        matches; its return value takes that character's place in the result.
        :return: list of parsed graphemes (and error handler results).
        """
        res, pos = self._scan(self.root, line, 0)
        while pos < len(line):
            # Hand the unparsable character to the error handler and resume after it:
            res.append(error(line[pos]))
            graphemes, pos = self._scan(self.root, line, pos + 1)
            res.extend(graphemes)
        return res

    @staticmethod
    def _scan(root, line, start):
        """
        Greedily match graphemes in `line`, beginning at index `start`.

        At each position the longest grapheme in the tree is taken. Scanning stops at
        the first position where no grapheme matches, or at the end of `line`.

        :return: (list of parsed graphemes, index just past the last parsed grapheme)
        """
        parse = []
        pos = start
        size = len(line)
        while pos < size:
            node = root
            end = pos  # end of the longest grapheme found at `pos` so far
            cur = pos
            while cur < size:
                node = node.children.get(line[cur])
                if node is None:
                    break
                cur += 1
                if node.sentinel:
                    end = cur
            if end == pos:
                # Nothing matches here; the caller decides how to proceed.
                break
            parse.append(line[pos:end])
            pos = end
        return parse, pos

    def _parse(self, root, line, idx):
        """
        :param root: Tree node.
        :param line: String to parse.
        :param idx: Global counter of characters parsed.
        :return: (list of parsed graphemes, incremented character count)
        """
        # Iterative rather than recursive: no recursion-depth limit on long input and no
        # repeated parsing of the remainder for every shorter grapheme that also matches.
        parse, consumed = self._scan(root, line, 0)
        return parse, idx + consumed
@@ -0,0 +1,8 @@
import functools
import unicodedata
import regex
# Marker put in place of input that cannot be tokenized: U+FFFD REPLACEMENT CHARACTER.
# Written as an escape so the value cannot be lost to encoding-unaware tooling; an empty
# marker would make replacing indistinguishable from dropping the input.
REPLACEMENT_MARKER = '\ufffd'
# Shortcut: nfd(s) is unicodedata.normalize('NFD', s).
nfd = functools.partial(unicodedata.normalize, 'NFD')
# Compiled `\X` pattern of the `regex` package, used to find grapheme clusters.
grapheme_pattern = regex.compile(r"\X", regex.UNICODE)