blender-portable-repo/extensions/.local/lib/python3.11/site-packages/phonemizer/backend/segments.py

# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Segments backend for the phonemizer"""

import pathlib
from logging import Logger
from typing import Optional, Dict, List, Union, Pattern

import segments

from phonemizer.backend.base import BaseBackend
from phonemizer.separator import Separator
from phonemizer.utils import get_package_resource, version_as_tuple


class SegmentsBackend(BaseBackend):
    """Segments backends for the phonemizer

    Text is phonemized with a `segments.Tokenizer` built from a grapheme to
    phoneme (g2p) mapping file. The `language` is either the name of one of
    the g2p files returned by `supported_languages()` or the path to a
    user-provided g2p file.

    The phonemize method will raise a ValueError when parsing an
    unknown morpheme.

    """

    def __init__(self, language: str,
                 punctuation_marks: Optional[Union[str, Pattern]] = None,
                 preserve_punctuation: bool = False,
                 logger: Optional[Logger] = None):
        # will be initialized in _init_language() from super().__init__()
        self._tokenizer: Optional[segments.Tokenizer] = None
        super().__init__(
            language,
            punctuation_marks=punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            logger=logger)

    def _init_language(self, language):
        """Builds the tokenizer for `language` and returns the language code

        The language code is the stem of `language` interpreted as a path
        (i.e. the file name without directory nor extension).

        Raises a RuntimeError if the g2p file cannot be found or is badly
        formatted.

        """
        # load the grapheme to phoneme mapping
        profile = self._load_g2p_profile(language)
        self._tokenizer = segments.Tokenizer(profile=profile)

        # this is the language code
        return pathlib.Path(language).stem

    @staticmethod
    def name():
        """Returns the name of the backend: 'segments'"""
        return 'segments'

    @classmethod
    def version(cls):
        """Returns the version of the segments package as a tuple"""
        return version_as_tuple(segments.__version__)

    @classmethod
    def is_available(cls):
        """Always returns True

        The segments module is imported at the top of this file, so it is
        necessarily importable when this method can be called.

        """
        return True

    @staticmethod
    def supported_languages():
        """Returns a dict of language: file supported by the segments backend

        The supported languages have a grapheme to phoneme conversion file
        bundled with phonemizer. Users can also use their own file as
        parameter of the phonemize() function.

        """
        # directory phonemizer/share/segments
        directory = get_package_resource('segments')

        # supported languages are files with the 'g2p' extension
        return {g2p.stem: g2p
                for g2p in directory.iterdir() if g2p.suffix == '.g2p'}

    @classmethod
    def is_supported_language(cls, language: str) -> bool:
        """Returns True if `language` can be used by the backend

        When `language` is the path to an existing file, it is supported if
        it can be loaded as a g2p profile. Otherwise it must be one of the
        keys of `supported_languages()`.

        """
        if pathlib.Path(language).is_file():
            try:
                cls._load_g2p_profile(language)
            except (RuntimeError, UnicodeDecodeError):
                # badly formatted file, or file which is not valid UTF-8
                # (the latter would otherwise escape as an uncaught error)
                return False
            return True
        return language in cls.supported_languages()

    @classmethod
    def _load_g2p_profile(cls, language: str) -> segments.Profile:
        """Returns a segments profile from a `language`

        The `language` is either the path to an existing g2p file or a key of
        `supported_languages()`. Each line of the g2p file must be made of
        exactly two whitespace separated fields: a grapheme and the phoneme
        it maps to. If a grapheme appears several times, the last mapping
        wins.

        Raises a RuntimeError if the file is not found or if one of its lines
        is not made of two fields.

        """
        # make sure the g2p file exists (do not rebind `language`, it may be
        # turned into a pathlib.Path here)
        path: Union[str, pathlib.Path] = language
        if not pathlib.Path(language).is_file():
            try:
                path = cls.supported_languages()[language]
            except KeyError:
                raise RuntimeError(
                    f'grapheme to phoneme file not found: '
                    f'{language}') from None

        # load the mapping grapheme -> phoneme from the file, make sure all
        # lines are well formatted
        g2p: Dict[str, str] = {}
        with open(path, 'r', encoding='utf8') as flang:
            for num, line in enumerate(flang, start=1):
                elts = line.split()
                if len(elts) != 2:
                    raise RuntimeError(
                        f'grapheme to phoneme file, line {num} must have '
                        f'2 rows but have {len(elts)}: {path}')
                g2p[elts[0]] = elts[1]

        # build the segments profile from the g2p mapping
        return segments.Profile(
            *[{'Grapheme': k, 'mapping': v} for k, v in g2p.items()])

    # pylint: disable=unused-argument
    def _phonemize_aux(self, text: List[str], offset: int,
                       separator: Separator, strip: bool) -> List[str]:
        """Phonemizes each utterance of `text` with the segments tokenizer

        Parameters
        ----------
        text: the utterances to phonemize, one per element
        offset: unused by this backend
        separator: the phone and word separators to use in the output
        strip: when False, each word is terminated by the phone separator
          followed by the word separator. When True, separators only appear
          between tokens.

        Returns
        -------
        The phonemized utterances, in the same order as in `text`.

        Raises
        ------
        ValueError (from segments, used with errors='strict') when the text
        contains a grapheme absent from the g2p mapping.

        """
        phone, word = separator.phone, separator.word

        phonemized = []
        for line in text:
            # the tokenizer output separates phonemes with ' ' and words
            # with ' # '
            tokenized = self._tokenizer(
                line, column='mapping', errors='strict')

            # Split on the default separators and rejoin with the custom
            # ones. Successive str.replace() calls using '#' as a temporary
            # word marker would corrupt a custom phone separator which itself
            # contains '#'.
            words = [w.replace(' ', phone) for w in tokenized.split(' # ')]

            if strip:
                phonemized.append(word.join(words))
            else:
                # the output of segments is always strip, so we need to add
                # the token separators at the end of each word
                phonemized.append(
                    ''.join(w + phone + word for w in words))

        # return the result as a list of utterances
        return phonemized