Files
blender-portable-repo/extensions/.local/lib/python3.11/site-packages/phonemizer/backend/segments.py
T
2026-03-17 14:58:51 -06:00

144 lines
5.3 KiB
Python

# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Segments backend for the phonemizer"""
import pathlib
from logging import Logger
from typing import Optional, Dict, List, Union, Pattern
import segments
from phonemizer.backend.base import BaseBackend
from phonemizer.separator import Separator
from phonemizer.utils import get_package_resource, version_as_tuple
class SegmentsBackend(BaseBackend):
    """Segments backend for the phonemizer.

    Text is phonemized with a ``segments.Tokenizer`` built from a grapheme
    to phoneme mapping (a ``g2p`` file: one ``grapheme phoneme`` pair per
    line).

    The phonemize method will raise a ValueError when parsing an
    unknown morpheme.

    """
    def __init__(self, language: str,
                 punctuation_marks: Optional[Union[str, Pattern]] = None,
                 preserve_punctuation: bool = False,
                 logger: Optional[Logger] = None):
        """Initializes the backend for the given `language`.

        Parameters
        ----------
        language : either the name of a language returned by
            `supported_languages()` or the path to a custom g2p file.
        punctuation_marks, preserve_punctuation, logger : forwarded
            unchanged to `BaseBackend.__init__`.

        """
        # will be initialized in _init_language() from super().__init__()
        self._tokenizer: Optional[segments.Tokenizer] = None

        super().__init__(
            language,
            punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            logger=logger)

    def _init_language(self, language):
        """Builds the tokenizer for `language` and returns the language code.

        The returned code is the stem of `language` seen as a path, so a
        custom ``/some/dir/foo.g2p`` file yields ``foo`` and a bundled
        language name is returned unchanged.

        """
        # load the grapheme to phoneme mapping
        profile = self._load_g2p_profile(language)
        self._tokenizer = segments.Tokenizer(profile=profile)

        # this is the language code
        return pathlib.Path(language).stem

    @staticmethod
    def name():
        """Returns the name of the backend: 'segments'"""
        return 'segments'

    @classmethod
    def version(cls):
        """Returns the version of the segments module as a tuple"""
        return version_as_tuple(segments.__version__)

    @classmethod
    def is_available(cls):
        """Always True: segments is imported along with this module"""
        return True

    @staticmethod
    def supported_languages():
        """Returns a dict of language: file supported by the segments backend

        The supported languages have a grapheme to phoneme conversion file
        bundled with phonemizer. Users can also use their own file as
        parameter of the phonemize() function.

        """
        # directory phonemizer/share/segments
        directory = get_package_resource('segments')

        # supported languages are files with the 'g2p' extension
        return {g2p.stem: g2p
                for g2p in directory.iterdir() if g2p.suffix == '.g2p'}

    @classmethod
    def is_supported_language(cls, language: str) -> bool:
        """Returns True if `language` can be used with this backend.

        When `language` is the path to an existing file, it is supported
        only if that file can be loaded as a g2p profile. Otherwise
        `language` must be one of the `supported_languages()`.

        """
        if pathlib.Path(language).is_file():
            try:
                cls._load_g2p_profile(language)
            except RuntimeError:
                # the file exists but is not a well formatted g2p file
                return False
            else:
                return True

        return language in cls.supported_languages()

    @classmethod
    def _load_g2p_profile(cls, language: str) -> segments.Profile:
        """Returns a segments profile from a `language`

        `language` is either the path to a g2p file or the name of one of
        the `supported_languages()`.

        Raises a RuntimeError if the g2p file is not found or if one of
        its lines is not made of exactly 2 whitespace-separated elements.

        """
        # make sure the g2p file exists
        if not pathlib.Path(language).is_file():
            try:
                language = cls.supported_languages()[language]
            except KeyError:
                raise RuntimeError(
                    f'grapheme to phoneme file not found: '
                    f'{language}') from None

        # load the mapping grapheme -> phoneme from the file, make sure all
        # lines are well formatted
        g2p: Dict[str, str] = {}
        with open(language, 'r', encoding='utf8') as flang:
            # lines are numbered from 1 in the error message
            for num, line in enumerate(flang, start=1):
                elts = line.strip().split()
                if len(elts) != 2:
                    raise RuntimeError(
                        f'grapheme to phoneme file, line {num} must have '
                        f'2 rows but have {len(elts)}: {language}')

                g2p[elts[0]] = elts[1]

        # build the segments profile from the g2p mapping
        return segments.Profile(
            *[{'Grapheme': k, 'mapping': v} for k, v in g2p.items()])

    # pylint: disable=unused-argument
    def _phonemize_aux(self, text: List[str], offset: int, separator: Separator, strip: bool) -> List[str]:
        """Phonemizes each utterance in `text` with the segments tokenizer.

        Parameters
        ----------
        text : the utterances to phonemize, one per element.
        offset : not used by this backend.
        separator : provides the `phone` and `word` separators substituted
            to the tokenizer's default ones (' ' and ' # ').
        strip : when False, a phone separator then a word separator are
            appended at the end of each word, including the last one of
            the utterance. When True no trailing separator is added.

        Returns the phonemized utterances, in the same order as `text`.

        """
        phonemized = []
        for line in text:
            # tokenize the input text per utterance
            utterance = self._tokenizer(
                line, column='mapping', errors='strict')

            # the output of segments is always strip, so we need to add
            # token separation at the end when strip is False.
            if not strip:
                # add word separator at end of utterance
                utterance += ' # '
                # add phoneme separator at end of word: the extra leading
                # space survives the ' # ' -> '#' substitution below and is
                # then turned into separator.phone
                utterance = utterance.replace(' # ', '  # ')

            # replace default separators by our custom ones. Word
            # boundaries are first collapsed to a bare '#' so that the
            # remaining spaces are phone boundaries only.
            utterance = utterance.replace(' # ', '#')
            utterance = utterance.replace(' ', separator.phone)
            utterance = utterance.replace('#', separator.word)

            phonemized.append(utterance)

        # return the result as a list of utterances
        return phonemized