Files
blender-portable-repo/extensions/.local/lib/python3.11/site-packages/segments/tree.py
T
2026-03-17 14:58:51 -06:00

69 lines
2.0 KiB
Python

from segments.errors import replace
class TreeNode:
"""
Private class that creates the tree data structure from the orthography profile for
parsing.
"""
def __init__(self, char, sentinel=False):
self.char = char
self.children = {}
self.sentinel = sentinel
class Tree:
def __init__(self, graphemes):
def _multigraph(node, line):
# Internal function to add a multigraph starting at node.
for char in line:
node = node.children.setdefault(char, TreeNode(char))
node.sentinel = True
self.root = TreeNode('', sentinel=True)
for grapheme in graphemes:
_multigraph(self.root, grapheme)
def parse(self, line, error=replace):
res, idx = self._parse(self.root, line, 0)
rem = line[idx:]
while rem:
# Chop off one character and try parsing the remainder:
res.append(error(rem[0]))
rem = rem[1:]
r, i = self._parse(self.root, rem, 0)
res.extend(r)
rem = rem[i:]
return res
def _parse(self, root, line, idx):
"""
:param root: Tree node.
:param line: String to parse.
:param idx: Global counter of characters parsed.
:return: (list of parsed graphemes, incremented character count)
"""
# Base (or degenerate..) case.
if len(line) == 0:
return [], idx
parse = []
curr = 0
node = root
cidx = idx
while curr < len(line):
node = node.children.get(line[curr])
curr += 1
if not node:
break
if node.sentinel:
subparse, cidx = self._parse(root, line[curr:], idx + curr)
# Always keep the latest valid parse, which will be
# the longest-matched (greedy match) graphemes.
parse = [line[:curr]]
parse.extend(subparse)
if parse:
idx = cidx
return parse, idx