2025-12-01
This commit is contained in:
@@ -0,0 +1,68 @@
|
||||
from segments.errors import replace
|
||||
|
||||
|
||||
class TreeNode:
|
||||
"""
|
||||
Private class that creates the tree data structure from the orthography profile for
|
||||
parsing.
|
||||
"""
|
||||
|
||||
def __init__(self, char, sentinel=False):
|
||||
self.char = char
|
||||
self.children = {}
|
||||
self.sentinel = sentinel
|
||||
|
||||
|
||||
class Tree:
|
||||
def __init__(self, graphemes):
|
||||
def _multigraph(node, line):
|
||||
# Internal function to add a multigraph starting at node.
|
||||
for char in line:
|
||||
node = node.children.setdefault(char, TreeNode(char))
|
||||
node.sentinel = True
|
||||
|
||||
self.root = TreeNode('', sentinel=True)
|
||||
for grapheme in graphemes:
|
||||
_multigraph(self.root, grapheme)
|
||||
|
||||
def parse(self, line, error=replace):
|
||||
res, idx = self._parse(self.root, line, 0)
|
||||
rem = line[idx:]
|
||||
while rem:
|
||||
# Chop off one character and try parsing the remainder:
|
||||
res.append(error(rem[0]))
|
||||
rem = rem[1:]
|
||||
r, i = self._parse(self.root, rem, 0)
|
||||
res.extend(r)
|
||||
rem = rem[i:]
|
||||
return res
|
||||
|
||||
def _parse(self, root, line, idx):
|
||||
"""
|
||||
:param root: Tree node.
|
||||
:param line: String to parse.
|
||||
:param idx: Global counter of characters parsed.
|
||||
:return: (list of parsed graphemes, incremented character count)
|
||||
"""
|
||||
# Base (or degenerate..) case.
|
||||
if len(line) == 0:
|
||||
return [], idx
|
||||
|
||||
parse = []
|
||||
curr = 0
|
||||
node = root
|
||||
cidx = idx
|
||||
while curr < len(line):
|
||||
node = node.children.get(line[curr])
|
||||
curr += 1
|
||||
if not node:
|
||||
break
|
||||
if node.sentinel:
|
||||
subparse, cidx = self._parse(root, line[curr:], idx + curr)
|
||||
# Always keep the latest valid parse, which will be
|
||||
# the longest-matched (greedy match) graphemes.
|
||||
parse = [line[:curr]]
|
||||
parse.extend(subparse)
|
||||
if parse:
|
||||
idx = cidx
|
||||
return parse, idx
|
||||
Reference in New Issue
Block a user