"""
|
|
Tokenizer of Unicode characters, grapheme clusters and tailored grapheme clusters
|
|
(of orthographies) given an orthography profile.
|
|
"""
|
|
import typing
|
|
import pathlib
|
|
import unicodedata
|
|
|
|
import regex
|
|
from csvw.dsv import reader
|
|
|
|
from segments.util import nfd, grapheme_pattern
|
|
from segments import errors
|
|
from segments.profile import Profile
|
|
|
|
|
|


def iterlines(p: typing.Union[pathlib.Path, str]) -> typing.Generator[str, None, None]:
    with pathlib.Path(p).open(encoding='utf-8') as fp:
        for line in fp.readlines():
            line = line.strip()
            if line and not line.startswith('#'):
                yield unicodedata.normalize('NFD', line)


class Rules:
    """
    Rules are given in tuple format, comma delimited.
    Regular expressions are given in Python syntax.
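
    A minimal doctest-style sketch (a hypothetical inline rule, not one read
    from a rules file):

    >>> Rules(('aa', 'a')).apply('haai')
    'hai'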
"""
|
|
def __init__(self, *rules: typing.Tuple[str, str]):
|
|
self._rules = [(regex.compile(rule), replacement) for rule, replacement in rules]
|
|
|
|
@classmethod
|
|
def from_file(cls, fname) -> 'Rules':
|
|
return cls(*list(reader(list(iterlines(fname)))))
|
|
|
|
def apply(self, s):
|
|
for rule, replacement in self._rules:
|
|
s = rule.sub(replacement, s)
|
|
return s
|
|
|
|
|
|


class Tokenizer:
    """
    Class for Unicode character and grapheme tokenization.

    This class provides extended functionality for
    orthography-specific tokenization with orthography profiles.

    Parameters
    ----------

    profile : string or pathlib.Path or Profile instance (default = None)
        Specifies an orthography profile to use.

    rules : string (default = None)
        Filename of a rules file.

    Notes
    -----

    The tokenizer can be used for pure Unicode character and grapheme
    tokenization, i.e. it uses the Unicode standard grapheme parsing rules, as
    implemented in the Python regex package by Matthew Barnett, to do basic tokenization
    with the "\\X" grapheme regular expression match. This grapheme match
    combines one or more Combining Diacritical Marks with their base character.
    These are called "grapheme clusters" in Unicode parlance. With these functions
    the Tokenizer is meant to do basic parsing for things like generating
    unigram models (segments and their counts) from input data.
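
    Without an orthography profile, calling the Tokenizer amounts to plain
    grapheme-cluster tokenization (a minimal sketch):

    >>> Tokenizer()('abc def')
    'a b c # d e f'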

    When a profile is passed, the Tokenizer reads the orthography profile and calls a helper
    class to build a tree data structure, which stores the possible Unicode
    character combinations that are specified in the orthography profile
    (i.e. tailored grapheme clusters) that appear in the data source.

    For example, an orthography profile might specify that in source X
    <uu> is a single grapheme (Unicode parlance: tailored grapheme) and
    therefore it should be chunked as such. Given an orthography profile and
    some data to tokenize, the process would look like this:

    input string example: uubo uubo
    output string example: uu b o # uu b o

    >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
    >>> t = Tokenizer(profile=prf)
    >>> t('uubo uubo')
    'uu b o # uu b o'

    See also the test orthography profile and rules in the test directory.

    An additional method "combine_modifiers" handles the case where there are
    Unicode Spacing Modifier Letters, which are not explicitly
    combined with their base character in the Unicode Standard. These graphemes
    are called "Tailored grapheme clusters" in Unicode. For more information
    see the Unicode Standard Annex #29: Unicode Text Segmentation:

    * http://www.unicode.org/reports/tr29/

    Additionally, the Tokenizer provides functionality to transform graphemes
    into associated character(s) specified in additional columns in the orthography
    profile. A dictionary is created that keeps a mapping between source-specific
    graphemes and their counterparts (e.g. an IPA column in the orthography profile).
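
    A minimal sketch of such a transformation (assuming a hypothetical profile
    with an additional "IPA" column):

    >>> prf = Profile({'Grapheme': 'uu', 'IPA': 'uː'}, {'Grapheme': 'b', 'IPA': 'b'})
    >>> Tokenizer(profile=prf)('uub', column='IPA')
    'uː b'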

    Lastly, the Tokenizer can be used to transform text as specified in an
    orthography rules file. These transformations are specified in a separate
    file from the orthography profile (which specifies the document-specific graphemes,
    and possibly their IPA counterparts) and the orthography rules should
    be applied to the output of a grapheme tokenization.

    In an orthography rules file, rules are given in order as regular
    expressions, e.g. this rule replaces a vowel followed by an <n>
    followed by <space> followed by a second vowel with the first vowel,
    <space>, <n>, <space> and the second vowel, e.g.::

        $ (a|á|e|é|i|í|o|ó|u|ú)(n)(\\s)(a|á|e|é|i|í|o|ó|u|ú), \\1 \\2 \\4

    """
    def __init__(self,
                 profile=None,
                 rules=None,
                 errors_strict: typing.Callable[[str], typing.Optional[str]] = errors.strict,
                 errors_replace: typing.Callable[[str], typing.Optional[str]] = errors.replace,
                 errors_ignore: typing.Callable[[str], typing.Optional[str]] = errors.ignore):
        self.op = None
        if isinstance(profile, Profile):
            self.op = profile
        elif profile is not None:
            self.op = Profile.from_file(profile)
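        # If no rules file is given explicitly, look for a sibling file of the
        # profile with a ".rules" extension.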
        if not rules and self.op and self.op.fname:
            _rules = self.op.fname.parent / (self.op.fname.stem + '.rules')
            if _rules.exists():
                rules = _rules
        self._rules = Rules.from_file(rules) if rules else None
        self._errors = {
            'strict': errors_strict,
            'replace': errors_replace,
            'ignore': errors_ignore,
        }

    def __call__(self,
                 string: str,
                 column: str = Profile.GRAPHEME_COL,
                 form: typing.Optional[typing.Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None,
                 ipa: bool = False,
                 segment_separator=' ',
                 separator=' # ',
                 errors: typing.Literal['replace', 'strict', 'ignore'] = 'replace') -> str:
        """
        The main task of a Tokenizer is tokenizing! This is what happens when called.

        This function determines what to do for any combination of
        orthography profile and rules: profile only, rules only, both,
        or neither.

        Parameters
        ----------
        string : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The column label for the transformation, if specified.

        form : None or unicode normalization form
            Normalize return value if form is not None.

        ipa : bool
            Tokenize IPA (work in progress)

        segment_separator : str
            String used to separate segments within a word.

        separator : str
            String used to separate words.

        errors : str
            Error handling strategy ('replace', 'strict' or 'ignore') used when
            a profile cannot parse part of the input.

        Returns
        -------
        result : str
            Result of the tokenization.

        """
        res = []
        for word in string.split():
            if ipa:
                res.append(self.combine_modifiers(self.grapheme_clusters(nfd(word))))
            else:
                if self.op:
                    res.append(
                        self.transform(word, column=column, error=self._errors[errors]))
                else:
                    res.append(self.grapheme_clusters(nfd(word)))
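
        # Post-process each tokenized word: join its segments, apply any
        # orthography rules, then normalize if a form was requested.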
        def pp(word):
            res = segment_separator.join(word).strip()
            res = self._rules.apply(res) if self._rules else res
            return unicodedata.normalize(form, res) if form else res

        return separator.join(pp(word) for word in res)

    def characters(self, string, segment_separator=' ', separator=' # ') -> str:
        """
        Given a string as input, return a space-delimited string of Unicode characters
        (code points rendered as glyphs).

        Parameters
        ----------
        string : str
            A Unicode string to be tokenized into graphemes.

        Returns
        -------
        result : str
            String returned is space-delimited on Unicode characters and contains "#" to
            mark word boundaries.
            The string is in NFD.

        Notes
        -----
        Input is first normalized according to Normalization Form D(ecomposition).
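
        A minimal sketch:

        >>> Tokenizer().characters('abc def')
        'a b c # d e f'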
"""
|
|
return separator.join(segment_separator.join(word) for word in nfd(string).split())
|
|
|
|

    def grapheme_clusters(self, word):
        """
        See: Unicode Standard Annex #29: UNICODE TEXT SEGMENTATION
        http://www.unicode.org/reports/tr29/

        Given a string as input, return a list of Unicode graphemes using the
        "\\X" regular expression.

        Parameters
        ----------
        word : str
            A Unicode string to be tokenized into graphemes.

        Returns
        -------
        result : list
            List of Unicode graphemes in NFD.
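
        A minimal sketch:

        >>> Tokenizer().grapheme_clusters('uubo')
        ['u', 'u', 'b', 'o']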
        """
        # Match the word against the precompiled "\X" grapheme cluster pattern.
        return grapheme_pattern.findall(word)

    def transform(self, word, column=Profile.GRAPHEME_COL, error=errors.replace):
        """
        Transform a string's graphemes into the mappings given in a different column
        in the orthography profile.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        column : str (default = "Grapheme")
            The label of the column to transform to. The default tokenizes with
            the orthography profile's Grapheme column.

        Returns
        -------
        result : list
            Result of the transformation.
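
        A minimal sketch, reusing the profile from the class docstring:

        >>> prf = Profile({'Grapheme': 'uu'}, {'Grapheme': 'b'}, {'Grapheme': 'o'})
        >>> Tokenizer(profile=prf).transform('uubo')
        ['uu', 'b', 'o']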
        """
        assert self.op, 'method can only be called with orthography profile.'

        if column != Profile.GRAPHEME_COL and column not in self.op.column_labels:
            raise ValueError("Column {0} not found in profile.".format(column))

        word = self.op.tree.parse(word, error)
        if column == Profile.GRAPHEME_COL:
            return word
        out = []
        for token in word:
            try:
                target = self.op.graphemes[token][column]
            except KeyError:
                target = self._errors['replace'](token)
            if target is not None:
                if isinstance(target, (tuple, list)):
                    out.extend(target)
                else:
                    out.append(target)
        return out

    def rules(self, word):
        """
        Function to tokenize the input string and return the output str with
        orthography rules applied.

        Parameters
        ----------
        word : str
            The input string to be tokenized.

        Returns
        -------
        result : str
            Result of the orthography rules applied to the input str.

        """
        return self._rules.apply(word) if self._rules else word

    def combine_modifiers(self, graphemes):
        """
        Given a list of Unicode grapheme clusters, group Unicode modifier
        letters with their preceding base characters, deal with tie bars, etc.

        Parameters
        ----------
        graphemes : list
            A list of Unicode grapheme clusters (e.g. as returned by
            grapheme_clusters) to be tokenized into simple IPA.
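
        A minimal sketch (ʰ is a Spacing Modifier Letter, category Lm, so it is
        grouped with its base character):

        >>> Tokenizer().combine_modifiers(['t', 'ʰ', 'a'])
        ['tʰ', 'a']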
        """
        result = []
        temp = ""
        count = len(graphemes)
        for grapheme in reversed(graphemes):
            count -= 1
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Lm" \
                    and not ord(grapheme) in [712, 716] and len(graphemes) > 1:
                temp = grapheme + temp
                # hack for the cases where a space modifier is the first character in the
                # string
                if count == 0:
                    result[-1] = temp + result[-1]  # pragma: no cover
                continue
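
            # 712 and 716 are the IPA stress marks U+02C8 (ˈ) and U+02CC (ˌ);
            # since we iterate in reverse, prepending attaches the mark to the
            # segment that follows it in the original order.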
            # catch and repair stress marks
            if len(grapheme) == 1 and (ord(grapheme) in [712, 716]) and result:
                # If result == [], there's nothing to combine with ...
                result[-1] = grapheme + result[-1]
                temp = ""
                continue

            # combine contour tone marks (non-accents)
            if len(grapheme) == 1 and unicodedata.category(grapheme) == "Sk":
                if len(result) == 0:
                    result.append(grapheme)
                    temp = ""
                    continue
                else:
                    if unicodedata.category(result[-1][0]) == "Sk":
                        result[-1] = grapheme + temp + result[-1]
                        temp = ""
                        continue

            result.append(grapheme + temp)
            temp = ""

        # last check for tie bars
        segments = result[::-1]
        i = 0
        r = []
        while i < len(segments):
            # tie bars
            if ord(segments[i][-1]) in [865, 860]:
                r.append(segments[i] + segments[i + 1])
                i += 2
            else:
                r.append(segments[i])
                i += 1
        return r