"""
|
||||
Tokenizer of Unicode characters, grapheme clusters and tailored grapheme clusters
|
||||
(of orthographies) given an orthography profile.
|
||||
"""
|
||||
import typing
import pathlib
import unicodedata

import regex
from csvw.dsv import reader

from segments.util import nfd, grapheme_pattern
from segments import errors
from segments.profile import Profile


def iterlines(p: typing.Union[pathlib.Path, str]) -> typing.Generator[str, None, None]:
    """Yield the non-empty, non-comment lines of a file, normalized to NFD."""
    with pathlib.Path(p).open(encoding='utf-8') as fp:
        for line in fp.readlines():
            line = line.strip()
            if line and not line.startswith('#'):
                yield unicodedata.normalize('NFD', line)


class Rules:
    """
    Rules are given as (pattern, replacement) pairs, one comma-delimited pair per line.
    Regular expressions are given in Python syntax.
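
    An illustrative example with a hypothetical rule:

    >>> Rules(('aa', 'a')).apply('baab')
    'bab'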
    """
    def __init__(self, *rules: typing.Tuple[str, str]):
        self._rules = [(regex.compile(rule), replacement) for rule, replacement in rules]

    @classmethod
    def from_file(cls, fname) -> 'Rules':
        return cls(*list(reader(list(iterlines(fname)))))

    def apply(self, s):
        for rule, replacement in self._rules:
            s = rule.sub(replacement, s)
        return s


class Tokenizer:
    """
    Class for Unicode character and grapheme tokenization.

    This class provides extended functionality for
    orthography-specific tokenization with orthography profiles.

    Parameters
    ----------
    profile : string or pathlib.Path or Profile instance (default = None)
        Specifies an orthography profile to use.

    rules : string or pathlib.Path (default = None)
        Filename of a rules file.

    Notes
    -----
    The tokenizer can be used for pure Unicode character and grapheme
    tokenization, i.e. it uses the Unicode standard grapheme parsing rules, as
    implemented in the Python regex package by Matthew Barnett, to do basic
    tokenization with the "\\X" grapheme regular expression match. This grapheme
    match combines one or more Combining Diacritical Marks with their base
    character. These are called "grapheme clusters" in Unicode parlance. With
    these functions the Tokenizer is meant to do basic parsing for things like
    generating unigram models (segments and their counts) from input data.

    When a profile is passed, the Tokenizer reads the orthography profile and
    calls a helper class to build a tree data structure, which stores the
    possible Unicode character combinations that are specified in the
    orthography profile (i.e. tailored grapheme clusters) that appear in the
    data source.

    For example, an orthography profile might specify that in source X
    <uu> is a single grapheme (Unicode parlance: tailored grapheme cluster) and
    should therefore be chunked as such. Given an orthography profile and
    some data to tokenize, the process looks like this:

    input string example: uubo uubo
    output string example: uu b o # uu b o

    >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
    >>> t = Tokenizer(profile=prf)
    >>> t('uubo uubo')
    'uu b o # uu b o'

    See also the test orthography profile and rules in the test directory.

    An additional method "combine_modifiers" handles the case where there are
    Unicode Spacing Modifier Letters, which are not explicitly
    combined with their base character in the Unicode Standard. These graphemes
    are called "tailored grapheme clusters" in Unicode. For more information
    see the Unicode Standard Annex #29: Unicode Text Segmentation:

    * http://www.unicode.org/reports/tr29/

    Additionally, the Tokenizer provides functionality to transform graphemes
    into associated character(s) specified in additional columns in the
    orthography profile. A dictionary is created that keeps a mapping between
    source-specific graphemes and their counterparts (e.g. an IPA column in
    the orthography profile).

    Lastly, the Tokenizer can be used to transform text as specified in an
    orthography rules file. These transformations are specified in a separate
    file from the orthography profile (which specifies the document-specific
    graphemes, and possibly their IPA counterparts), and the orthography rules
    are applied to the output of a grapheme tokenization.

    In an orthography rules file, rules are given in order as regular
    expressions, e.g. this rule replaces a vowel followed by an <n>
    followed by <space> followed by a second vowel with first vowel
    <space> <n> <space> second vowel, e.g.::

        $ (a|á|e|é|i|í|o|ó|u|ú)(n)(\\s)(a|á|e|é|i|í|o|ó|u|ú), \\1 \\2 \\4
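
    A simplified, literal variant of such a rule (illustrative only):

    >>> Rules(('an a', 'a n a')).apply('an a')
    'a n a'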
    """
    def __init__(self,
                 profile=None,
                 rules=None,
                 errors_strict: typing.Callable[[str], typing.Optional[str]] = errors.strict,
                 errors_replace: typing.Callable[[str], typing.Optional[str]] = errors.replace,
                 errors_ignore: typing.Callable[[str], typing.Optional[str]] = errors.ignore):
        self.op = None
        if isinstance(profile, Profile):
            self.op = profile
        elif profile is not None:
            self.op = Profile.from_file(profile)
        if not rules and self.op and self.op.fname:
            # If no rules file is specified explicitly, look for one next to
            # the profile, with a ".rules" suffix.
            _rules = self.op.fname.parent / (self.op.fname.stem + '.rules')
            if _rules.exists():
                rules = _rules
        self._rules = Rules.from_file(rules) if rules else None
        self._errors = {
            'strict': errors_strict,
            'replace': errors_replace,
            'ignore': errors_ignore,
        }

    def __call__(self,
                 string: str,
                 column: str = Profile.GRAPHEME_COL,
                 form: typing.Optional[typing.Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None,
                 ipa: bool = False,
                 segment_separator=' ',
                 separator=' # ',
                 errors: typing.Literal['replace', 'strict', 'ignore'] = 'replace') -> str:
        """
        The main task of a Tokenizer is tokenizing! This is what happens when called.

        This method decides what to do based on which combination of
        orthography profile and rules (if any) was supplied at instantiation.

        Parameters
        ----------
        string : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The column label for the transformation, if specified.

        form : None or unicode normalization form
            Normalize return value if form is not None.

        ipa : bool
            Tokenize IPA (work in progress).

        segment_separator : str
            String used to join the segments of a word.

        separator : str
            String used to join words, i.e. to mark word boundaries.

        errors : str
            Error handling strategy for unmatched input: 'replace' (default),
            'strict' or 'ignore'.

        Returns
        -------
        result : str
            Result of the tokenization.
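
        Examples
        --------
        Without a profile, tokenization falls back to Unicode grapheme
        clusters (a minimal illustration):

        >>> Tokenizer()('abcd efg')
        'a b c d # e f g'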

        """
        res = []
        for word in string.split():
            if ipa:
                res.append(self.combine_modifiers(self.grapheme_clusters(nfd(word))))
            else:
                if self.op:
                    res.append(
                        self.transform(word, column=column, error=self._errors[errors]))
                else:
                    res.append(self.grapheme_clusters(nfd(word)))

        def pp(word):
            res = segment_separator.join(word).strip()
            res = self._rules.apply(res) if self._rules else res
            return unicodedata.normalize(form, res) if form else res

        return separator.join(pp(word) for word in res)

    def characters(self, string, segment_separator=' ', separator=' # ') -> str:
        """
        Given a string as input, return a space-delimited string of Unicode
        characters (code points rendered as glyphs).

        Parameters
        ----------
        string : str
            A Unicode string to be tokenized into characters.

        Returns
        -------
        result : str
            String returned is space-delimited on Unicode characters and
            contains "#" to mark word boundaries. The string is in NFD.

        Notes
        -----
        Input is first normalized according to Normalization Form
        D(ecomposition).
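
        Examples
        --------
        A minimal illustration (ASCII input, so NFD normalization is a no-op):

        >>> Tokenizer().characters('abc def')
        'a b c # d e f'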
        """
        return separator.join(segment_separator.join(word) for word in nfd(string).split())

    def grapheme_clusters(self, word):
        """
        See: Unicode Standard Annex #29: UNICODE TEXT SEGMENTATION
        http://www.unicode.org/reports/tr29/

        Given a string as input, return a list of Unicode grapheme clusters
        using the "\\X" regular expression.

        Parameters
        ----------
        word : str
            A Unicode string to be tokenized into grapheme clusters.

        Returns
        -------
        result : list
            List of Unicode grapheme clusters, in NFD provided the input was
            NFD-normalized (as it is for all calls within this module).
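
        Examples
        --------
        A base character followed by a combining mark counts as a single
        cluster (illustrative; the input is o + COMBINING DIAERESIS + "ma"):

        >>> len(Tokenizer().grapheme_clusters('o\\u0308ma'))
        3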
        """
        # match Unicode grapheme clusters with the precompiled "\X" pattern
        return grapheme_pattern.findall(word)

    def transform(self, word, column=Profile.GRAPHEME_COL, error=errors.replace):
        """
        Transform a string's graphemes into the mappings given in a different
        column of the orthography profile.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The label of the column to transform to. The default tokenizes
            against the orthography profile's Grapheme column.

        Returns
        -------
        result : list
            Result of the transformation.
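
        Examples
        --------
        Assuming a profile with an additional IPA column (hypothetical data,
        for illustration only):

        >>> prf = Profile({'Grapheme': 'uu', 'IPA': 'u:'}, {'Grapheme': 'b', 'IPA': 'b'})
        >>> Tokenizer(profile=prf).transform('uub', column='IPA')
        ['u:', 'b']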
        """
        assert self.op, 'method can only be called with orthography profile.'

        if column != Profile.GRAPHEME_COL and column not in self.op.column_labels:
            raise ValueError("Column {0} not found in profile.".format(column))

        word = self.op.tree.parse(word, error)
        if column == Profile.GRAPHEME_COL:
            return word
        out = []
        for token in word:
            try:
                target = self.op.graphemes[token][column]
            except KeyError:
                # tokens with no entry in the requested column are handled
                # with the 'replace' strategy
                target = self._errors['replace'](token)
            if target is not None:
                if isinstance(target, (tuple, list)):
                    out.extend(target)
                else:
                    out.append(target)
        return out

    def rules(self, word):
        """
        Apply the orthography rules (if any) to the input string and return
        the result.

        Parameters
        ----------
        word : str
            The input string.

        Returns
        -------
        result : str
            Result of the orthography rules applied to the input str.
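
        Examples
        --------
        Without a rules file, the input is returned unchanged (illustrative):

        >>> Tokenizer().rules('a n a')
        'a n a'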
        """
        return self._rules.apply(word) if self._rules else word

    def combine_modifiers(self, graphemes):
        """
        Given a list of Unicode grapheme clusters, group Unicode modifier
        letters with their preceding base characters, deal with tie bars, etc.

        Parameters
        ----------
        graphemes : list
            A list of Unicode grapheme clusters, as returned by
            grapheme_clusters, to be tokenized into simple IPA.

        Returns
        -------
        result : list
            List of segments with modifier letters, stress marks and tie bars
            attached to their base characters.
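
        Examples
        --------
        A modifier letter (here MODIFIER LETTER SMALL H, U+02B0) is attached
        to the preceding base character (illustrative):

        >>> Tokenizer().combine_modifiers(['t', 'ʰ', 'a'])
        ['tʰ', 'a']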
        """
        result = []
        temp = ""
        count = len(graphemes)
        for grapheme in reversed(graphemes):
            count -= 1
            # combine modifier letters (category Lm), except the stress marks
            # U+02C8 (712) and U+02CC (716), with their base character
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm" \
                    and not ord(grapheme) in [712, 716] and len(graphemes) > 1:
                temp = grapheme + temp
                # hack for the cases where a space modifier is the first
                # character in the string
                if count == 0:
                    result[-1] = temp + result[-1]
                continue  # pragma: no cover

            # catch and repair stress marks
            if len(grapheme) == 1 and (ord(grapheme) in [712, 716]) and result:
                # If result == [], there's nothing to combine with ...
                result[-1] = grapheme + result[-1]
                temp = ""
                continue

            # combine contour tone marks (non-accents, category Sk)
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Sk":
                if len(result) == 0:
                    result.append(grapheme)
                    temp = ""
                    continue
                else:
                    if unicodedata.category(result[-1][0]) == "Sk":
                        result[-1] = grapheme + temp + result[-1]
                        temp = ""
                        continue

            result.append(grapheme + temp)
            temp = ""

        # last check for tie bars: U+0361 (865) and U+035C (860); a tie bar
        # is assumed to be followed by another segment to combine with
        segments = result[::-1]
        i = 0
        r = []
        while i < len(segments):
            # tie bars
            if ord(segments[i][-1]) in [865, 860]:
                r.append(segments[i] + segments[i + 1])
                i += 2
            else:
                r.append(segments[i])
                i += 1
        return r