144 lines
5.3 KiB
Python
144 lines
5.3 KiB
Python
# Copyright 2015-2021 Mathieu Bernard
|
|
#
|
|
# This file is part of phonemizer: you can redistribute it and/or
|
|
# modify it under the terms of the GNU General Public License as
|
|
# published by the Free Software Foundation, either version 3 of the
|
|
# License, or (at your option) any later version.
|
|
#
|
|
# Phonemizer is distributed in the hope that it will be useful, but
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
|
|
"""Segments backend for the phonemizer"""
|
|
|
|
import pathlib
|
|
from logging import Logger
|
|
from typing import Optional, Dict, List, Union, Pattern
|
|
|
|
import segments
|
|
|
|
from phonemizer.backend.base import BaseBackend
|
|
from phonemizer.separator import Separator
|
|
from phonemizer.utils import get_package_resource, version_as_tuple
|
|
|
|
|
|
class SegmentsBackend(BaseBackend):
    """Segments backend for the phonemizer

    Phonemization is done by a `segments.Tokenizer` built from a grapheme
    to phoneme (g2p) mapping. The mapping is read from a text file with two
    whitespace-separated columns per line: the grapheme and its phoneme.

    The `language` given at construction is either the name of one of the
    g2p files bundled with phonemizer (see `supported_languages()`) or the
    path to a user-provided g2p file.

    The phonemize method will raise a ValueError when parsing an
    unknown morpheme.

    """

    def __init__(self, language: str,
                 punctuation_marks: Optional[Union[str, Pattern]] = None,
                 preserve_punctuation: bool = False,
                 logger: Optional[Logger] = None):
        # will be initialized in _init_language() from super().__init__()
        self._tokenizer: Optional[segments.Tokenizer] = None
        super().__init__(
            language,
            punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            logger=logger)

    def _init_language(self, language):
        """Builds the tokenizer for `language` and returns the language code

        Raises a RuntimeError (from `_load_g2p_profile`) if the g2p file
        cannot be found or is badly formatted.

        """
        # load the grapheme to phoneme mapping
        profile = self._load_g2p_profile(language)
        self._tokenizer = segments.Tokenizer(profile=profile)

        # this is the language code: the bare name for a bundled language,
        # the file name without extension for a user-provided g2p file
        return pathlib.Path(language).stem

    @staticmethod
    def name():
        """Returns the name of the backend"""
        return 'segments'

    @classmethod
    def version(cls):
        """Returns `segments.__version__` converted by `version_as_tuple`"""
        return version_as_tuple(segments.__version__)

    @classmethod
    def is_available(cls):
        """Always True: `segments` is imported at module level"""
        return True

    @staticmethod
    def supported_languages():
        """Returns a dict of language: file supported by the segments backend

        The supported languages have a grapheme to phoneme conversion file
        bundled with phonemizer. Users can also use their own file as
        parameter of the phonemize() function.

        """
        # directory phonemizer/share/segments
        directory = get_package_resource('segments')

        # supported languages are files with the 'g2p' extension
        return {g2p.stem: g2p
                for g2p in directory.iterdir() if g2p.suffix == '.g2p'}

    @classmethod
    def is_supported_language(cls, language: str) -> bool:
        """Returns True if `language` can be used with this backend

        When `language` is the path to an existing file, it is supported
        only if it can be loaded as a g2p profile. Otherwise it must be the
        name of one of the bundled languages.

        """
        if pathlib.Path(language).is_file():
            try:
                cls._load_g2p_profile(language)
                return True
            except RuntimeError:
                return False
        return language in cls.supported_languages()

    @classmethod
    def _load_g2p_profile(cls, language: str) -> segments.Profile:
        """Returns a segments profile from a `language`

        `language` is either the path to an existing g2p file or the name
        of a bundled language.

        Raises a RuntimeError if `language` is neither an existing file nor
        a bundled language, or if a line of the g2p file does not have
        exactly two whitespace-separated fields.

        """
        # make sure the g2p file exists
        if not pathlib.Path(language).is_file():
            try:
                language = cls.supported_languages()[language]
            except KeyError:
                raise RuntimeError(
                    f'grapheme to phoneme file not found: '
                    f'{language}') from None

        # load the mapping grapheme -> phoneme from the file, make sure all
        # lines are well formatted
        g2p: Dict[str, str] = {}
        with open(language, 'r', encoding='utf8') as flang:
            for num, line in enumerate(flang, start=1):
                elts = line.strip().split()
                if len(elts) != 2:
                    raise RuntimeError(
                        f'grapheme to phoneme file, line {num} must have '
                        f'2 rows but have {len(elts)}: {language}')
                # a grapheme defined several times keeps its last mapping
                g2p[elts[0]] = elts[1]

        # build the segments profile from the g2p mapping
        return segments.Profile(
            *[{'Grapheme': k, 'mapping': v} for k, v in g2p.items()])

    # pylint: disable=unused-argument
    def _phonemize_aux(self, text: List[str], offset: int,
                       separator: Separator, strip: bool) -> List[str]:
        """Phonemizes each utterance in `text` with the g2p tokenizer

        The replacements below treat the tokenizer output as phonemes
        separated by ' ' and words separated by ' # ', and substitute
        `separator.phone` and `separator.word` for them. When `strip` is
        False, each word is additionally terminated by the phone separator
        and each utterance by the word separator. `offset` is unused.

        Returns the phonemized utterances as a list, in the order of `text`.

        NOTE(review): the substitutions go through a temporary '#' marker,
        so a '#' in `separator.phone` or in a mapped phoneme is rewritten as
        the word separator.

        """
        # tokenize the input text per utterance
        phonemized = (
            self._tokenizer(line, column='mapping', errors='strict')
            for line in text)

        # the output of segments is always strip, so we need to add
        # token separation at the end when strip is False.
        if not strip:
            # add word separator at end of utterance
            phonemized = (p + ' # ' for p in phonemized)
            # add phoneme separator at end of word: the extra space put
            # before each word boundary survives the ' # ' -> '#'
            # substitution below and is then turned into the phone
            # separator. The literal is built by concatenation so that the
            # doubled space cannot be lost to whitespace normalisation.
            phonemized = (
                p.replace(' # ', ' ' + ' # ') for p in phonemized)

        # replace default separators by our custom ones
        phonemized = (p.replace(' # ', '#') for p in phonemized)
        phonemized = (p.replace(' ', separator.phone) for p in phonemized)
        phonemized = (p.replace('#', separator.word) for p in phonemized)

        # return the result as a list of utterances
        return list(phonemized)
|