2025-12-01

2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,21 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Multilingual text to phones converter"""
+
+from .phonemize import phonemize  # pylint: disable=unused-import
+
+
+__version__ = '3.3.0'
+"""Phonemizer version"""
@@ -0,0 +1,27 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Multilingual text to phonemes converter"""
+
+# pylint: disable=unused-import
+
+from .espeak.espeak import EspeakBackend
+from .espeak.mbrola import EspeakMbrolaBackend
+from .festival.festival import FestivalBackend
+from .segments import SegmentsBackend
+
+
+BACKENDS = {
+    backend.name(): backend
+    for backend in (
+        EspeakBackend,
+        FestivalBackend,
+        SegmentsBackend,
+        EspeakMbrolaBackend,
+    )
+}
+"""The different phonemization backends as a mapping (name, class)"""
@@ -0,0 +1,255 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Abstract base class for phonemization backends"""
+
+import abc
+import itertools
+import re
+from logging import Logger
+from typing import Optional, List, Any, Dict, Tuple, Union, Pattern
+
+import joblib
+
+from phonemizer.logger import get_logger
+from phonemizer.punctuation import Punctuation
+from phonemizer.separator import Separator, default_separator
+from phonemizer.utils import chunks
+
+
+class BaseBackend(abc.ABC):
+    """Abstract base class of all the phonemization backends
+
+    Provides a common interface to all backends. The central method is
+    `phonemize()`
+
+    Parameters
+    ----------
+    language: str
+        The language code of the input text, must be supported by
+        the backend. If ``backend`` is 'segments', the language can be a file with
+        a grapheme to phoneme mapping.
+
+    preserve_punctuation: bool
+        When True, will keep the punctuation in the
+        phonemized output. Not supported by the 'espeak-mbrola' backend. Default
+        to False and remove all the punctuation.
+
+    punctuation_marks: str
+        The punctuation marks to consider when dealing with punctuation, either for removal or preservation.
+        Can be defined as a string or regular expression. Default to Punctuation.default_marks().
+
+    logger: logging.Logger
+        the logging instance where to send
+        messages. If not specified, use the default system logger.
+
+    Raises
+    ------
+    RuntimeError
+        if the backend is not available or if the `language` cannot be initialized.
+
+    """
+
+    def __init__(self, language: str,
+                 punctuation_marks: Optional[Union[str, Pattern]] = None,
+                 preserve_punctuation: bool = False,
+                 logger: Optional[Logger] = None):
+        # the parameters are documented in the class docstring
+        if punctuation_marks is None:
+            punctuation_marks = Punctuation.default_marks()
+
+        if logger is None:
+            logger = get_logger()
+
+        # ensure the backend is installed on the system
+        if not self.is_available():
+            raise RuntimeError(  # pragma: nocover
+                f'{self.name()} not installed on your system')
+
+        self._logger = logger
+        self._logger.info(
+            'initializing backend %s-%s',
+            self.name(), '.'.join(str(v) for v in self.version()))
+
+        # ensure the backend supports the requested language
+        self._language = self._init_language(language)
+
+        # setup punctuation processing
+        self._preserve_punctuation = preserve_punctuation
+        self._punctuator = Punctuation(punctuation_marks)
+
+    @classmethod
+    def _init_language(cls, language):
+        """Language initialization
+
+        This method may be overloaded in child classes (see Segments backend)
+
+        """
+        if not cls.is_supported_language(language):
+            raise RuntimeError(
+                f'language "{language}" is not supported by the '
+                f'{cls.name()} backend')
+        return language
+
+    @property
+    def logger(self):
+        """A logging.Logger instance where to send messages"""
+        return self._logger
+
+    @property
+    def language(self):
+        """The language code configured to be used for phonemization"""
+        return self._language
+
+    @staticmethod
+    @abc.abstractmethod
+    def name():
+        """The name of the backend"""
+
+    @classmethod
+    @abc.abstractmethod
+    def is_available(cls):
+        """Returns True if the backend is installed, False otherwise"""
+
+    @classmethod
+    @abc.abstractmethod
+    def version(cls):
+        """Return the backend version as a tuple (major, minor, patch)"""
+
+    @staticmethod
+    @abc.abstractmethod
+    def supported_languages() -> Dict[str, str]:
+        """Return a dict of language codes -> name supported by the backend"""
+
+    @classmethod
+    def is_supported_language(cls, language: str):
+        """Returns True if `language` is supported by the backend"""
+        return language in cls.supported_languages()
+
+    def phonemize(self, text: List[str],
+                  separator: Optional[Separator] = None,
+                  strip: bool = False,
+                  njobs: int = 1) -> List[str]:
+        """Returns the `text` phonemized for the given language
+
+        Parameters
+        ----------
+        text: list of str
+            The text to be phonemized. Each string in the list
+            is considered as a separated line. Each line is considered as a text
+            utterance. Any empty utterance will be ignored.
+
+        separator: Separator
+            string separators between phonemes, syllables
+            and words, default to separator.default_separator. Syllable separator
+            is considered only for the festival backend. Word separator is
+            ignored by the 'espeak-mbrola' backend.
+
+        strip: bool
+            If True, don't output the last word and phone separators
+            of a token, default to False.
+
+        njobs : int
+            The number of parallel jobs to launch. The input text is
+            split in ``njobs`` parts, phonemized on parallel instances of the
+            backend and the outputs are finally collapsed.
+
+        Returns
+        -------
+        phonemized text: list of str
+            The input ``text`` phonemized for the given ``language`` and ``backend``.
+
+        Raises
+        ------
+        RuntimeError
+            if `text` is a str instead of a list of str, or if something
+            went wrong during the phonemization
+
+        """
+        if isinstance(text, str):
+            # changed in phonemizer-3.0: a bare str is rejected, the input
+            # must be a list of str
+            raise RuntimeError(
+                'input text to phonemize() is str but it must be list of str')
+
+        if separator is None:
+            separator = default_separator
+
+        text, punctuation_marks = self._phonemize_preprocess(text)
+
+        if njobs == 1:
+            # single job: phonemize the whole text in one call, with a line
+            # offset of 0
+            phonemized = self._phonemize_aux(text, 0, separator, strip)
+        else:
+            # NOTE(review): this comment used to say that the log is disabled
+            # for parallel jobs because stderr is not picklable, but nothing
+            # here disables it -- confirm the intended behavior.
+            self.logger.info('running %s on %s jobs', self.name(), njobs)
+
+            # we have here a list of phonemized chunks
+            phonemized = joblib.Parallel(n_jobs=njobs)(
+                joblib.delayed(self._phonemize_aux)(
+                    # chunk[0] is the text, chunk[1] is the offset
+                    chunk[0], chunk[1], separator, strip)
+                for chunk in zip(*chunks(text, njobs)))
+
+            # flatten them in a single list
+            phonemized = self._flatten(phonemized)
+
+        return self._phonemize_postprocess(phonemized, punctuation_marks, separator, strip)
+
+    @staticmethod
+    def _flatten(phonemized: List[List[Any]]):
+        """Concatenates a list of lists into a single flat list
+
+        Used to merge the outputs obtained when running on multiple jobs,
+        for instance [[1, 2], [3], [4]] becomes [1, 2, 3, 4].
+
+        """
+        return list(itertools.chain.from_iterable(phonemized))
+
+    @abc.abstractmethod
+    def _phonemize_aux(self, text: List[str], offset: int, separator: Separator, strip: bool) -> List[str]:
+        """The "concrete" phonemization method
+
+        Must be implemented in child classes. `separator` and `strip`
+        parameters are as given to the phonemize() method. `text` is as
+        returned by _phonemize_preprocess(). `offset` is the line number of
+        the first line in `text` with respect to the original text (this is
+        only useful when running on chunks in multiple jobs. When using a
+        single job the offset is 0).
+
+        """
+
+    def _phonemize_preprocess(self, text: List[str]) -> Tuple[Union[str, List[str]], List]:
+        """Preprocess the text before phonemization
+
+        Removes the punctuation (keep trace of punctuation marks for further
+        restoration if required by the `preserve_punctuation` option).
+
+        """
+        if self._preserve_punctuation:
+            # a tuple (text, punctuation marks)
+            return self._punctuator.preserve(text)
+        return self._punctuator.remove(text), []
+
+    def _phonemize_postprocess(self, phonemized: List[str],
+                               punctuation_marks,
+                               separator: Separator,
+                               strip: bool):
+        """Postprocess the raw phonemized output
+
+        Restores the punctuation as needed.
+
+        """
+        if self._preserve_punctuation:
+            return self._punctuator.restore(phonemized, punctuation_marks, separator, strip)
+        return phonemized
@@ -0,0 +1,15 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Phonemizer module for espeak backend implementation"""
@@ -0,0 +1,275 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Low-level bindings to the espeak API"""
+
+import atexit
+import ctypes
+import pathlib
+import shutil
+import sys
+import tempfile
+import weakref
+from ctypes import CDLL
+from pathlib import Path
+from typing import Union
+
+from phonemizer.backend.espeak.voice import EspeakVoice
+
+if sys.platform != 'win32':
+    # cause a crash on Windows
+    import dlinfo
+
+
+class EspeakAPI:
+    """Exposes the espeak API to the EspeakWrapper
+
+    This class exposes only low-level bindings to the API and should not be
+    used directly.
+
+    """
+
+    def __init__(self, library: Union[str, Path]):
+        """Loads a private copy of the espeak `library` and initializes it
+
+        Parameters
+        ----------
+        library (str or Path) : the espeak shared library to load
+
+        Raises
+        ------
+        RuntimeError if the library cannot be loaded, if its path cannot be
+            retrieved or if espeak fails to initialize
+
+        """
+        # set to None to avoid an AttributeError in _delete if the __init__
+        # method raises, will be properly initialized below
+        self._library = None
+
+        # Because the library is not designed to be wrapped nor to be used in
+        # multithreaded/multiprocess contexts (massive use of global variables)
+        # we need a copy of the original library for each instance of the
+        # wrapper... (see "man dlopen" on Linux/MacOS: we cannot load two times
+        # the same library because a reference is then returned by dlopen). The
+        # tweak is therefore to make a copy of the original library in a
+        # different (temporary) directory.
+        try:
+            # load the original library in order to retrieve its full path.
+            # Forced as str as it is required on Windows.
+            espeak: CDLL = ctypes.cdll.LoadLibrary(str(library))
+            library_path = self._shared_library_path(espeak)
+            del espeak
+        except OSError as error:
+            raise RuntimeError(
+                f'failed to load espeak library: {str(error)}') from None
+
+        # removed by the cleanup registered just below
+        self._tempdir = tempfile.mkdtemp()
+
+        # properly exit when the wrapper object is destroyed (see
+        # https://docs.python.org/3/library/weakref.html#comparing-finalizers-with-del-methods).
+        # But... weakref implementation does not work on windows so we register
+        # the cleanup with atexit. This means that, on Windows, all the
+        # temporary directories created by EspeakAPI instances will remain on
+        # disk until the Python process exit.
+        finalizer = None
+        if sys.platform == 'win32':  # pragma: nocover
+            atexit.register(self._delete_win32)
+        else:
+            # the library copy is not loaded yet: this first finalizer only
+            # removes the temporary directory if the code below raises
+            finalizer = weakref.finalize(
+                self, self._delete, None, self._tempdir)
+
+        espeak_copy = pathlib.Path(self._tempdir) / library_path.name
+        shutil.copy(library_path, espeak_copy, follow_symlinks=False)
+
+        # finally load the library copy and initialize it. 0x02 is
+        # AUDIO_OUTPUT_SYNCHRONOUS in the espeak API
+        self._library = ctypes.cdll.LoadLibrary(str(espeak_copy))
+        try:
+            if self._library.espeak_Initialize(0x02, 0, None, 0) <= 0:
+                raise RuntimeError(  # pragma: nocover
+                    'failed to initialize espeak shared library')
+        except AttributeError:  # pragma: nocover
+            raise RuntimeError(
+                'failed to load espeak library') from None
+
+        if finalizer is not None:
+            # weakref.finalize binds its arguments at registration time, so
+            # the finalizer above would call _delete with library=None and
+            # espeak_Terminate would never run. Now that the copy is loaded
+            # and initialized, replace it by one holding the real library.
+            finalizer.detach()
+            weakref.finalize(
+                self, self._delete, self._library, self._tempdir)
+
+        # the path to the original one (the copy is considered an
+        # implementation detail and is not exposed)
+        self._library_path = library_path
+
+    def _delete_win32(self):  # pragma: nocover
+        # Windows does not support static methods with ctypes libraries
+        # (library == None) so we use a proxy method...
+        self._delete(self._library, self._tempdir)
+
+    @staticmethod
+    def _delete(library, tempdir):
+        """Terminates the espeak `library` and removes `tempdir`
+
+        `library` may be None (no library loaded), in which case only
+        `tempdir` is removed.
+
+        """
+        try:
+            # clean up the espeak library allocated memory
+            library.espeak_Terminate()
+        except AttributeError:  # library not loaded
+            pass
+
+        # on Windows it is required to unload the library or the .dll file
+        # cannot be erased from the temporary directory. Nothing to unload
+        # when the library is None, and reading None._handle would raise
+        # before the tempdir is removed.
+        if sys.platform == 'win32' and library is not None:  # pragma: nocover
+            # pylint: disable=import-outside-toplevel
+            # pylint: disable=protected-access
+            # pylint: disable=no-member
+            import _ctypes
+            _ctypes.FreeLibrary(library._handle)
+
+        # clean up the tempdir containing the copy of the library
+        shutil.rmtree(tempdir)
+
+    @property
+    def library_path(self):
+        """Absolute path to the espeak library being in use"""
+        return self._library_path
+
+    @staticmethod
+    def _shared_library_path(library) -> Path:
+        """Returns the absolute path to `library`
+
+        This function is cross-platform and works for Linux, MacOS and Windows.
+        Raises a RuntimeError if the library path cannot be retrieved
+
+        """
+        # pylint: disable=protected-access
+        path = pathlib.Path(library._name).resolve()
+        if path.is_file():
+            return path
+
+        try:
+            # Linux or MacOS only: dlinfo is not imported on Windows, where
+            # this lookup fails with a NameError
+            return pathlib.Path(dlinfo.DLInfo(library).path).resolve()
+        except Exception:  # pragma: nocover
+            raise RuntimeError(
+                f'failed to retrieve the path to {library} library') from None
+
+    def info(self):
+        """Bindings to espeak_Info
+
+        Returns
+        -------
+        version, data_path: encoded strings containing the espeak version
+            number and data path respectively
+
+        """
+        f_info = self._library.espeak_Info
+        f_info.restype = ctypes.c_char_p
+        data_path = ctypes.c_char_p()
+        version = f_info(ctypes.byref(data_path))
+        return version, data_path.value
+
+    def list_voices(self, name):
+        """Bindings to espeak_ListVoices
+
+        Parameters
+        ----------
+        name: if specified, a filter on voices to be listed.
+            NOTE(review): declared to espeak as a pointer to
+            EspeakVoice.VoiceStruct (or None), not as a str -- confirm
+            against the callers.
+
+        Returns
+        -------
+        voices: a pointer to pointers to EspeakVoice.VoiceStruct instances
+
+        """
+        f_list_voices = self._library.espeak_ListVoices
+        f_list_voices.argtypes = [ctypes.POINTER(EspeakVoice.VoiceStruct)]
+        f_list_voices.restype = ctypes.POINTER(
+            ctypes.POINTER(EspeakVoice.VoiceStruct))
+        return f_list_voices(name)
+
+    def set_voice_by_name(self, name) -> int:
+        """Bindings to espeak_SetVoiceByName
+
+        Parameters
+        ----------
+        name (str) : the voice name to setup
+
+        Returns
+        -------
+        0 on success, non-zero integer on failure
+
+        """
+        f_set_voice_by_name = self._library.espeak_SetVoiceByName
+        f_set_voice_by_name.argtypes = [ctypes.c_char_p]
+        return f_set_voice_by_name(name)
+
+    def get_current_voice(self):
+        """Bindings to espeak_GetCurrentVoice
+
+        Returns
+        -------
+        a EspeakVoice.VoiceStruct instance, the `contents` of the pointer
+        returned by espeak.
+
+        NOTE(review): this docstring used to promise None when no voice has
+        been setup, but the pointer is dereferenced unconditionally. ctypes
+        is expected to raise ValueError on a NULL pointer -- confirm.
+
+        """
+        f_get_current_voice = self._library.espeak_GetCurrentVoice
+        f_get_current_voice.restype = ctypes.POINTER(EspeakVoice.VoiceStruct)
+        return f_get_current_voice().contents
+
+    def text_to_phonemes(self, text_ptr, text_mode, phonemes_mode):
+        """Bindings to espeak_TextToPhonemes
+
+        Parameters
+        ----------
+        text_ptr (pointer): the text to be phonemized, as a pointer to a
+            pointer of chars
+        text_mode (bits field): see espeak sources for details
+        phonemes_mode (bits field): see espeak sources for details
+
+        Returns
+        -------
+        an encoded string containing the computed phonemes
+
+        """
+        f_text_to_phonemes = self._library.espeak_TextToPhonemes
+        f_text_to_phonemes.restype = ctypes.c_char_p
+        f_text_to_phonemes.argtypes = [
+            ctypes.POINTER(ctypes.c_char_p),
+            ctypes.c_int,
+            ctypes.c_int]
+        return f_text_to_phonemes(text_ptr, text_mode, phonemes_mode)
+
+    def set_phoneme_trace(self, mode, file_pointer):
+        """Bindings on espeak_SetPhonemeTrace
+
+        This method must be called before any call to synthetize()
+
+        Parameters
+        ----------
+        mode (bits field): see espeak sources for details
+        file_pointer (FILE*): a pointer to an opened file in which to output
+            the phoneme trace
+
+        """
+        f_set_phoneme_trace = self._library.espeak_SetPhonemeTrace
+        f_set_phoneme_trace.argtypes = [
+            ctypes.c_int,
+            ctypes.c_void_p]
+        f_set_phoneme_trace(mode, file_pointer)
+
+    def synthetize(self, text_ptr, size, mode):
+        """Bindings on espeak_Synth
+
+        The output phonemes are sent to the file specified by a call to
+        set_phoneme_trace().
+
+        Parameters
+        ----------
+        text_ptr (pointer) : a pointer to chars
+        size (int) : number of chars in `text_ptr`
+        mode (bits field) : passed as the `flags` argument of espeak_Synth,
+            see espeak sources for details
+
+        Returns
+        -------
+        0 on success, non-zero integer on failure
+
+        """
+        f_synthetize = self._library.espeak_Synth
+        # one entry per parameter of espeak_Synth, in order. All 8 must be
+        # declared: without the `flags` entry, `mode` is converted against
+        # the `unique_identifier` pointer type instead of an unsigned int.
+        f_synthetize.argtypes = [
+            ctypes.c_void_p,  # text
+            ctypes.c_size_t,  # size
+            ctypes.c_uint,  # position
+            ctypes.c_int,  # position_type
+            ctypes.c_uint,  # end_position
+            ctypes.c_uint,  # flags
+            ctypes.POINTER(ctypes.c_uint),  # unique_identifier
+            ctypes.c_void_p]  # user_data
+        return f_synthetize(text_ptr, size, 0, 1, 0, mode, None, None)
@@ -0,0 +1,113 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Base class of espeak backends for the phonemizer"""
+
+import abc
+from logging import Logger
+from typing import Optional, Union, Pattern
+
+from phonemizer.backend.base import BaseBackend
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+from phonemizer.logger import get_logger
+from phonemizer.punctuation import Punctuation
+from phonemizer.separator import Separator
+
+
+class BaseEspeakBackend(BaseBackend):
+    """Abstract espeak backend for the phonemizer
+
+    Base class of the concrete backends Espeak and EspeakMbrola. It provides
+    facilities to find espeak library and read espeak version.
+
+    """
+    def __init__(self, language: str,
+                 punctuation_marks: Optional[Union[str, Pattern]] = None,
+                 preserve_punctuation: bool = False,
+                 logger: Optional[Logger] = None):
+        super().__init__(
+            language,
+            punctuation_marks=punctuation_marks,
+            preserve_punctuation=preserve_punctuation,
+            logger=logger)
+
+        self._espeak = EspeakWrapper()
+        self.logger.debug('loaded %s', self._espeak.library_path)
+
+
+    @classmethod
+    def set_library(cls, library):
+        """Sets the espeak backend to use `library`
+
+        If this is not set, the backend uses the default espeak shared library
+        from the system installation.
+
+        Parameters
+        ----------
+        library (str or None) : the path to the espeak shared library to use as
+            backend. Set `library` to None to restore the default.
+
+        """
+        EspeakWrapper.set_library(library)
+
+    @classmethod
+    def library(cls):
+        """Returns the espeak library used as backend
+
+        The following precedence rule applies for library lookup:
+
+        1. As specified by BaseEspeakBackend.set_library()
+        2. Or as specified by the environment variable
+           PHONEMIZER_ESPEAK_LIBRARY
+        3. Or the default espeak library found on the system
+
+        Raises
+        ------
+        RuntimeError if the espeak library cannot be found or if the
+            environment variable PHONEMIZER_ESPEAK_LIBRARY is set to a
+            non-readable file
+
+        """
+        return EspeakWrapper.library()
+
+    @classmethod
+    def is_available(cls) -> bool:
+        try:
+            EspeakWrapper()
+        except RuntimeError:  # pragma: nocover
+            return False
+        return True
+
+    @classmethod
+    def is_espeak_ng(cls) -> bool:
+        """Returns True if using espeak-ng, False otherwise"""
+        # espeak-ng starts with version 1.49
+        return cls.version() >= (1, 49)
+
+    @classmethod
+    def version(cls):
+        """Espeak version as a tuple (major, minor, patch)
+
+        Raises
+        ------
+        RuntimeError if BaseEspeakBackend.is_available() is False or if the
+            version cannot be extracted for some reason.
+
+        """
+        return EspeakWrapper().version
+
+    @abc.abstractmethod
+    def _postprocess_line(self, line: str, num: int,
+                          separator: Separator, strip: bool) -> str:
+        """Postprocess a single `line` of raw espeak output
+
+        Must be implemented in child classes. `separator` and `strip` are
+        as given to phonemize().
+
+        NOTE(review): the return annotation is `str` here, but the
+        EspeakBackend implementation is annotated as returning
+        Tuple[str, bool] -- confirm which contract is intended.
+
+        """
+        pass
@@ -0,0 +1,172 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Espeak backend for the phonemizer"""
+
+import itertools
+import re
+from logging import Logger
+from typing import Optional, Tuple, List, Union, Pattern
+
+from phonemizer.backend.espeak.base import BaseEspeakBackend
+from phonemizer.backend.espeak.language_switch import (
+    get_language_switch_processor, LanguageSwitch, BaseLanguageSwitch)
+from phonemizer.backend.espeak.words_mismatch import (
+    get_words_mismatch_processor, WordMismatch, BaseWordsMismatch)
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+from phonemizer.separator import Separator
+
+
+class EspeakBackend(BaseEspeakBackend):
+    """Espeak backend for the phonemizer"""
+    # a regular expression to find phonemes stresses in espeak output
+    _ESPEAK_STRESS_RE = re.compile(r"[ˈˌ'-]+")
+
+    # pylint: disable=too-many-arguments
+    def __init__(self, language: str,
+                 punctuation_marks: Optional[Union[str, Pattern]] = None,
+                 preserve_punctuation: bool = False,
+                 with_stress: bool = False,
+                 tie: Union[bool, str] = False,
+                 language_switch: LanguageSwitch = 'keep-flags',
+                 words_mismatch: WordMismatch = 'ignore',
+                 logger: Optional[Logger] = None):
+        super().__init__(
+            language, punctuation_marks=punctuation_marks,
+            preserve_punctuation=preserve_punctuation, logger=logger)
+
+        self._espeak.set_voice(language)
+        self._with_stress = with_stress
+        self._tie = self._init_tie(tie)
+        self._lang_switch: BaseLanguageSwitch = get_language_switch_processor(
+            language_switch, self.logger, self.language)
+        self._words_mismatch: BaseWordsMismatch = get_words_mismatch_processor(
+            words_mismatch, self.logger)
+
+    @staticmethod
+    def _init_tie(tie) -> Optional[str]:
+        """Returns the tie character to use, or None if ties are disabled
+
+        A false `tie` disables ties, True selects the default U+361 tie
+        character and any other value is converted to str and used as a
+        custom tie character.
+
+        Raises a RuntimeError if a custom tie is not a single character.
+
+        """
+        if not tie:
+            return None
+
+        if tie is True:  # default U+361 tie character
+            return '͡'
+
+        # non default tie character
+        tie = str(tie)
+        if len(tie) != 1:
+            raise RuntimeError(
+                f'explicit tie must be a single character but is {tie}')
+        return tie
+
+    @staticmethod
+    def name():
+        return 'espeak'
+
+    @classmethod
+    def supported_languages(cls):
+        return {
+            voice.language: voice.name
+            for voice in EspeakWrapper().available_voices()}
+
+    def _phonemize_aux(self, text, offset, separator, strip):
+        """Phonemizes each line of `text` with espeak
+
+        Returns a tuple (output, lang_switches): `output` is the list of
+        postprocessed lines and `lang_switches` the list of line numbers
+        (counted from 1 in `text`, then shifted by `offset`) for which
+        _postprocess_line() reported a language switch.
+
+        """
+        if self._tie is not None and separator.phone:
+            self.logger.warning(
+                'cannot use ties AND phone separation, '
+                'ignoring phone separator')
+
+        output = []
+        lang_switches = []
+        for num, line in enumerate(text, start=1):
+            line = self._espeak.text_to_phonemes(line, self._tie)
+            line, has_switch = self._postprocess_line(
+                line, num, separator, strip)
+            output.append(line)
+            if has_switch:
+                # report the line number relative to the original text
+                lang_switches.append(num + offset)
+
+        return output, lang_switches
+
+    def _process_stress(self, word):
+        """Returns `word` without stress marks, unless stresses are kept"""
+        if not self._with_stress:
+            # remove the stresses on phonemes
+            return self._ESPEAK_STRESS_RE.sub('', word)
+        return word
+
+    def _process_tie(self, word: str, separator: Separator):
+        # NOTE a bug in espeak append ties to (en) flags so as (͡e͡n).
+        # We do not correct it here.
+        if self._tie is not None and self._tie != '͡':
+            # replace default '͡' by the requested one
+            return word.replace('͡', self._tie)
+        return word.replace('_', separator.phone)
+
+    def _postprocess_line(self, line: str, num: int,
+                          separator: Separator, strip: bool) -> Tuple[str, bool]:
+        """Cleans up a raw espeak output `line` and applies the separators
+
+        Returns a tuple (line, has_switch) where `has_switch` is the flag
+        returned by the language switch processor for that line. The
+        returned line is empty when that processor returns an empty line.
+
+        """
+        # espeak can split an utterance into several lines because
+        # of punctuation, here we merge the lines into a single one. Runs of
+        # spaces are collapsed with a regex: a single replace('  ', ' ')
+        # pass leaves double spaces behind for runs of 3 spaces or more,
+        # which would then be split into empty words below.
+        line = re.sub(r' {2,}', ' ', line.strip().replace('\n', ' '))
+
+        # due to a bug in espeak-ng, some additional separators can be
+        # added at the end of a word. Here a quick fix to solve that
+        # issue. See https://github.com/espeak-ng/espeak-ng/issues/694
+        line = re.sub(r'_+', '_', line)
+        line = re.sub(r'_ ', ' ', line)
+
+        line, has_switch = self._lang_switch.process(line)
+        if not line:
+            return '', has_switch
+
+        words = []
+        for word in line.split(' '):
+            word = self._process_stress(word.strip())
+            if not strip and self._tie is None:
+                word += '_'
+            words.append(self._process_tie(word, separator))
+
+        out_line = separator.word.join(words)
+        if not strip:
+            # the last word separator is only erased when stripping
+            out_line += separator.word
+
+        return out_line, has_switch
+
+    def _phonemize_preprocess(self, text: List[str]) -> Tuple[Union[str, List[str]], List]:
+        text, punctuation_marks = super()._phonemize_preprocess(text)
+        self._words_mismatch.count_text(text)
+        return text, punctuation_marks
+
+    def _phonemize_postprocess(self, phonemized, punctuation_marks, separator: Separator, strip: bool):
+        text = phonemized[0]
+        switches = phonemized[1]
+
+        self._words_mismatch.count_phonemized(text, separator)
+        self._lang_switch.warning(switches)
+
+        phonemized = super()._phonemize_postprocess(text, punctuation_marks, separator, strip)
+        return self._words_mismatch.process(phonemized)
+
+    @staticmethod
+    def _flatten(phonemized) -> List:
+        """Specialization of BaseBackend._flatten for the espeak backend
+
+        Each chunk is a sequence of lists. The lists found at the same
+        position are concatenated across chunks, the number of positions
+        being given by the first chunk. From [([1, 2], ['a', 'b']),
+        ([3], []), ([4], ['c'])] to [[1, 2, 3, 4], ['a', 'b', 'c']].
+
+        """
+        return [
+            [item for chunk in phonemized for item in chunk[i]]
+            for i in range(len(phonemized[0]))]
@@ -0,0 +1,193 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Manages language switches for the espeak backend
+
+This module is used in phonemizer.backend.EspeakBackend and should be
+considered private.
+
+It manages languages switches that occur during phonemization, where a part of
+a text is phonemized in a language different from the target language. For
+instance the sentence "j'aime le football" in French will be phonemized by
+espeak as "ʒɛm lə (en)fʊtbɔːl(fr)", "football" being pronounced as an English
+word. This may cause two issues to end users. First, it introduces undesirable
+(.) language switch flags. Second, it may introduce extra phones that are not
+present in the target language phoneset.
+
+This module implements 3 alternative solutions the user can choose when
+initializing the espeak backend:
+- 'keep-flags' preserves the language switch flags,
+- 'remove-flags' removes the flags (.) but preserves the words with alternative
+  phoneset,
+- 'remove-utterance' removes the utterances where flags are detected.
+
+"""
+
+import abc
+import re
+from logging import Logger
+from typing import List, Tuple
+from typing_extensions import TypeAlias, Literal
+
+# The language switch policies, as accepted by the `mode` parameter of
+# get_language_switch_processor()
+LanguageSwitch: TypeAlias = Literal['keep-flags', 'remove-flags', 'remove-utterance']
+
+
+def get_language_switch_processor(mode: LanguageSwitch, logger: Logger, language: str) -> 'BaseLanguageSwitch':
+    """Builds the language switch processor matching the requested `mode`
+
+    The supported values for `mode` are:
+    - 'keep-flags': the switch flags are left in the utterance
+    - 'remove-flags': the switch flags are deleted from the utterance
+    - 'remove-utterance': the whole utterance is deleted
+
+    The `logger` and `language` are forwarded to the processor constructor.
+
+    Raises a RuntimeError if the `mode` is unknown.
+
+    """
+    processors = dict((
+        ('keep-flags', KeepFlags),
+        ('remove-flags', RemoveFlags),
+        ('remove-utterance', RemoveUtterances)))
+
+    try:
+        processor = processors[mode](logger, language)
+    except KeyError:
+        # hide the KeyError, which is of no interest for the end user
+        raise RuntimeError(
+            f'mode "{mode}" invalid, must be in {", ".join(processors.keys())}'
+        ) from None
+    else:
+        return processor
+
+
+class BaseLanguageSwitch(abc.ABC):
+    """Abstract interface shared by all the language switch processors
+
+    Parameters
+    ----------
+    logger (logging.Logger) : the logger receiving the warnings emitted when
+        language switches are detected.
+    language (str) : the language code currently in use by the phonemizer,
+        used to customize the content of the warnings
+
+    """
+    # Matches the language switch flags in the espeak output: a pair of
+    # parentheses enclosing at least one character. For instance, a switch
+    # from English to French looks like:
+    # "something (fr)quelque chose(en) another thing".
+    _ESPEAK_FLAGS_RE = re.compile(r'\(.+?\)')
+
+    def __init__(self, logger: Logger, language: str):
+        self._language = language
+        self._logger = logger
+
+    @classmethod
+    def is_language_switch(cls, utterance: str) -> bool:
+        """Returns True if a language switch is present in the `utterance`"""
+        return cls._ESPEAK_FLAGS_RE.search(utterance) is not None
+
+    @classmethod
+    @abc.abstractmethod
+    def process(cls, utterance: str) -> Tuple[str, bool]:
+        """Detects and processes language switches according to the mode
+
+        This method is called on each utterance as a phonemization
+        post-processing step.
+
+        Returns
+        -------
+        processed_utterance (str) : the utterance either preserved, deleted (as
+            '') or with the switch flags removed
+        has_switch (bool): True if a language switch flag is found in the
+            `utterance` and False otherwise
+
+        """
+
+    @abc.abstractmethod
+    def warning(self, switches: List[int]):
+        """Sends warnings to the logger about the recorded language switches
+
+        This method is called a single time at the very end of the
+        phonemization process.
+
+        Parameters
+        ----------
+        switches (list of int) : the line numbers where language switches
+            have been detected during phonemization
+
+        """
+
+
+class KeepFlags(BaseLanguageSwitch):
+    """Leaves the utterances untouched, language switch flags included"""
+
+    @classmethod
+    def process(cls, utterance: str) -> Tuple[str, bool]:
+        """Returns the `utterance` unchanged and True if it contains a switch"""
+        has_switch = cls.is_language_switch(utterance)
+        return utterance, has_switch
+
+    def warning(self, switches: List[int]):
+        """Logs the lines with switches, nothing is logged without switches"""
+        if switches:
+            lines = ', '.join(map(str, sorted(switches)))
+            self._logger.warning(
+                '%s utterances containing language switches '
+                'on lines %s', len(switches), lines)
+            self._logger.warning(
+                'extra phones may appear in the "%s" phoneset', self._language)
+            self._logger.warning(
+                'language switch flags have been kept '
+                '(applying "keep-flags" policy)')
+
+
+class RemoveFlags(BaseLanguageSwitch):
+    """Deletes the language switch flags found in the utterances"""
+
+    @classmethod
+    def process(cls, utterance: str) -> Tuple[str, bool]:
+        """Returns the `utterance` without flags and True if it had a switch"""
+        has_switch = cls.is_language_switch(utterance)
+        if has_switch:
+            # erase every (lang) flag, the text in between is preserved
+            utterance = cls._ESPEAK_FLAGS_RE.sub('', utterance)
+        return utterance, has_switch
+
+    def warning(self, switches: List[int]):
+        """Logs the lines with switches, nothing is logged without switches"""
+        if switches:
+            lines = ', '.join(map(str, sorted(switches)))
+            self._logger.warning(
+                '%s utterances containing language switches '
+                'on lines %s', len(switches), lines)
+            self._logger.warning(
+                'extra phones may appear in the "%s" phoneset', self._language)
+            self._logger.warning(
+                'language switch flags have been removed '
+                '(applying "remove-flags" policy)')
+
+
+class RemoveUtterances(BaseLanguageSwitch):
+    """Drops any utterance in which a language switch flag is detected"""
+
+    @classmethod
+    def process(cls, utterance: str) -> Tuple[str, bool]:
+        """Returns ('', True) on a switch, else the `utterance` and False"""
+        has_switch = cls.is_language_switch(utterance)
+        # an utterance with a switch is replaced by the empty string
+        return ('' if has_switch else utterance), has_switch
+
+    def warning(self, switches: List[int]):
+        """Logs the number of removed utterances, if any"""
+        if switches:
+            self._logger.warning(
+                'removed %s utterances containing language switches '
+                '(applying "remove-utterance" policy)', len(switches))
@@ -0,0 +1,108 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Mbrola backend for the phonemizer"""
+
+import pathlib
+import shutil
+import sys
+from logging import Logger
+from pathlib import Path
+from typing import Union, Optional, List, Dict
+
+from phonemizer.backend.espeak.base import BaseEspeakBackend
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+from phonemizer.separator import Separator
+
+
+class EspeakMbrolaBackend(BaseEspeakBackend):
+    """Espeak-mbrola backend for the phonemizer
+
+    Parameters
+    ----------
+    language (str) : the mbrola voice code to use, it is forwarded to the
+        base class and to the espeak wrapper with `set_voice()`
+    logger (logging.Logger, optional) : forwarded to the base class
+
+    """
+    # this will be initialized once, at the first call to supported_languages()
+    _supported_languages = None
+
+    def __init__(self, language: str, logger: Optional[Logger] = None):
+        super().__init__(language, logger=logger)
+        self._espeak.set_voice(language)
+
+    @staticmethod
+    def name():
+        """The name of the backend, 'espeak-mbrola'"""
+        return 'espeak-mbrola'
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Mbrola backend is available for espeak>=1.49
+
+        Returns True when the espeak backend is available, a `mbrola`
+        executable is found in the PATH and espeak is espeak-ng.
+
+        """
+        # shutil.which() returns a path or None: the whole expression is
+        # converted to bool so as to honor the declared return type
+        return bool(
+            BaseEspeakBackend.is_available() and
+            shutil.which('mbrola') and
+            BaseEspeakBackend.is_espeak_ng())
+
+    @classmethod
+    def _all_supported_languages(cls):
+        """Returns a dict {voice code: voice name} of all the mbrola voices"""
+        # retrieve the mbrola voices. These voices must be installed
+        # separately.
+        voices = EspeakWrapper().available_voices('mbrola')
+        # the first 3 characters of the identifier are dropped
+        return {voice.identifier[3:]: voice.name for voice in voices}
+
+    @classmethod
+    def _is_language_installed(cls, language: str, data_path: Union[str, Path]) \
+            -> bool:
+        """Returns True if the required mbrola voice is installed
+
+        Parameters
+        ----------
+        language (str) : the voice code, prefixed by 'mb-'
+        data_path (str or Path) : the espeak data directory
+
+        """
+        # this is a reimplementation of LoadMbrolaTable from espeak
+        # synth_mbrola.h sources
+        voice = language[3:]  # remove mb- prefix
+
+        # data_path is converted first, so as a plain string is accepted
+        if (pathlib.Path(data_path) / 'mbrola' / voice).is_file():
+            return True  # pragma: nocover
+
+        if sys.platform != 'win32':
+            candidates = (
+                f'/usr/share/mbrola/{voice}',
+                f'/usr/share/mbrola/{voice}/{voice}',
+                f'/usr/share/mbrola/voices/{voice}')
+            return any(
+                pathlib.Path(candidate).is_file() for candidate in candidates)
+
+        return False
+
+    @classmethod
+    def supported_languages(cls) -> Dict[str, str]:  # pragma: nocover
+        """Returns the installed mbrola voices as a dict {code: name}
+
+        The result is computed on the first call and stored at class level.
+
+        """
+        if cls._supported_languages is None:
+            data_path = EspeakWrapper().data_path
+            cls._supported_languages = {
+                k: v for k, v in cls._all_supported_languages().items()
+                if cls._is_language_installed(k, data_path)}
+        return cls._supported_languages
+
+    def _phonemize_aux(self, text: List[str], offset: int,
+                       separator: Separator, strip: bool) -> List[str]:
+        """Phonemizes each line of `text`, returns the list of outputs"""
+        output = []
+        for num, line in enumerate(text, start=1):
+            line = self._espeak.synthetize(line)
+            line = self._postprocess_line(line, offset + num, separator, strip)
+            output.append(line)
+        return output
+
+    def _postprocess_line(self, line: str, num: int,
+                          separator: Separator, strip: bool) -> str:
+        """Joins the phonemes of a synthetized `line` with the phone separator
+
+        The `num` parameter is not used by this implementation.
+
+        """
+        # retrieve the phonemes with the correct SAMPA alphabet (but
+        # without word separation): keep the first tab-separated field of
+        # each non-empty line
+        phonemes = (
+            phn.split('\t')[0] for phn in line.split('\n') if phn.strip())
+        # the '_' entries are discarded
+        phonemes = separator.phone.join(pho for pho in phonemes if pho != '_')
+
+        if not strip:
+            phonemes += separator.phone
+
+        return phonemes
@@ -0,0 +1,81 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Voice struct from Espeak API exposed to Python"""
+
+import ctypes
+
+
+# This class could be a dataclass, but for compatibility with python-3.6 we
+# don't use the dataclasses module.
+class EspeakVoice:
+    """A helper class to expose voice structures within C and Python
+
+    Parameters
+    ----------
+    name (str) : the voice name
+    language (str) : the language code
+    identifier (str) : the path to the voice file wrt espeak data path
+
+    """
+
+    def __init__(self, name: str = '', language: str = '', identifier: str = ''):
+        self._name = name
+        self._language = language
+        self._identifier = identifier
+
+    @property
+    def name(self):
+        """Voice name"""
+        return self._name
+
+    @property
+    def language(self):
+        """Language code"""
+        return self._language
+
+    @property
+    def identifier(self):
+        """Path to the voice file wrt espeak data path"""
+        return self._identifier
+
+    def __repr__(self):
+        return (
+            f'{type(self).__name__}(name={self.name!r}, '
+            f'language={self.language!r}, identifier={self.identifier!r})')
+
+    def __eq__(self, other: object):
+        # let Python handle the comparison with foreign types instead of
+        # failing with an AttributeError
+        if not isinstance(other, EspeakVoice):
+            return NotImplemented
+        return (
+                self.name == other.name and
+                self.language == other.language and
+                self.identifier == other.identifier)
+
+    def __hash__(self):
+        return hash((self.name, self.language, self.identifier))
+
+    class VoiceStruct(ctypes.Structure):  # pylint: disable=too-few-public-methods
+        """A helper class to fetch voices information from the espeak library.
+
+        The espeak_VOICE struct is defined in speak_lib.h from the espeak code.
+        Here we use only name (voice name), languages (language code) and
+        identifier (voice file) information.
+
+        """
+        _fields_ = [
+            ('name', ctypes.c_char_p),
+            ('languages', ctypes.c_char_p),
+            ('identifier', ctypes.c_char_p)]
+
+    def to_ctypes(self):
+        """Converts the Voice instance to an espeak ctypes structure
+
+        The non-empty attributes are encoded to UTF8, the empty ones are
+        converted to None.
+
+        """
+        return self.VoiceStruct(
+            self.name.encode('utf8') if self.name else None,
+            self.language.encode('utf8') if self.language else None,
+            self.identifier.encode('utf8') if self.identifier else None)
+
+    @classmethod
+    def from_ctypes(cls, struct: VoiceStruct):
+        """Returns a Voice instance built from an espeak ctypes structure
+
+        The unset fields of the `struct` are converted to empty strings.
+
+        """
+        return cls(
+            name=(struct.name or b'').decode(),
+            # discard a useless char prepended by espeak. When the field is
+            # unset a single char placeholder is used, so as the slice yields
+            # an empty string
+            language=(struct.languages or b'0').decode()[1:],
+            identifier=(struct.identifier or b'').decode())
@@ -0,0 +1,152 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Manages words count mismatches for the espeak backend"""
+
+import abc
+import re
+from logging import Logger
+from typing import List, Tuple
+
+from typing_extensions import TypeAlias, Literal, Union
+
+from phonemizer.separator import Separator
+
+
+# The words mismatch policies, all of them being supported by
+# get_words_mismatch_processor()
+WordMismatch: TypeAlias = Literal["warn", "ignore", "remove"]
+
+
+def get_words_mismatch_processor(mode: WordMismatch, logger: Logger) -> 'BaseWordsMismatch':
+    """Builds the word count mismatch processor matching the `mode`
+
+    The supported values for `mode` are:
+    - `ignore`: the words mismatches are ignored
+    - `warn`: a warning is displayed for each mismatched utterance
+    - `remove`: the utterances containing a words mismatch are removed
+
+    The `logger` is forwarded to the processor constructor.
+
+    Raises a RuntimeError if the `mode` is unknown.
+
+    """
+    processors = dict((
+        ('ignore', Ignore),
+        ('warn', Warn),
+        ('remove', Remove)))
+
+    try:
+        processor = processors[mode](logger)
+    except KeyError:
+        # hide the KeyError, which is of no interest for the end user
+        raise RuntimeError(
+            f'mode {mode} invalid, must be in {", ".join(processors.keys())}'
+        ) from None
+    else:
+        return processor
+
+
+class BaseWordsMismatch(abc.ABC):
+    """The base class of all word count mismatch processors
+
+    Parameters
+    ----------
+    logger (logging.Logger) : the logger receiving the mismatch warnings
+
+    """
+    _RE_SPACES = re.compile(r'\s+')
+
+    def __init__(self, logger: Logger):
+        self._logger = logger
+        # number of words in each input and output line, respectively filled
+        # by count_text() and count_phonemized()
+        self._count_txt = []
+        self._count_phn = []
+
+    @classmethod
+    def _count_words(
+            cls,
+            text: List[str],
+            wordsep: Union[str, re.Pattern] = _RE_SPACES) -> List[int]:
+        """Return the number of words contained in each line of `text`
+
+        The `wordsep` is either a compiled regular expression or a plain
+        string, which is escaped so as to be matched literally. Empty words
+        are not counted.
+
+        """
+        # NOTE(review): an empty `wordsep` string makes re.split() cut the
+        # line between each character on Python>=3.7 - confirm the word
+        # separator is never empty here
+        if not isinstance(wordsep, re.Pattern):
+            wordsep = re.escape(wordsep)
+
+        return [
+            len([w for w in re.split(wordsep, line.strip()) if w])
+            for line in text]
+
+    def _mismatched_lines(self) -> List[Tuple[int, int, int]]:
+        """Returns a list of (num_line, nwords_input, nwords_output)
+
+        Consider only the lines where nwords_input != nwords_output. The line
+        numbers start at 0. Raises a RuntimeError if input and output do not
+        have the same number of lines.
+
+        """
+        if len(self._count_txt) != len(self._count_phn):
+            raise RuntimeError(  # pragma: nocover
+                f'number of lines in input and output must be equal, '
+                f'we have: input={len(self._count_txt)}, '
+                f'output={len(self._count_phn)}')
+
+        return [
+            (n, t, p) for n, (t, p) in
+            enumerate(zip(self._count_txt, self._count_phn))
+            if t != p]
+
+    def _resume(self, nmismatch: int, nlines: int):
+        """Logs a high level undetailed warning
+
+        Nothing is logged when `nmismatch` is 0. The percentage of mismatched
+        lines is rounded to the nearest integer.
+
+        """
+        if nmismatch:
+            # the rounding is the last operation: multiplying a rounded ratio
+            # by 100 leaks floating point artifacts in the message (0.29 * 100
+            # is 28.999999999999996). The float conversion preserves the
+            # original 'xx.0%' formatting.
+            percent = float(round(100 * nmismatch / nlines))
+            self._logger.warning(
+                'words count mismatch on %s%% of the lines (%s/%s)',
+                percent, nmismatch, nlines)
+
+    def count_text(self, text: List[str]):
+        """Stores the number of words in each input line
+
+        The words are separated by any run of whitespace.
+
+        """
+        self._count_txt = self._count_words(text)
+
+    def count_phonemized(self, text: List[str], separator: Separator):
+        """Stores the number of words in each output line
+
+        The words are separated by the word separator of `separator`.
+
+        """
+        self._count_phn = self._count_words(text, separator.word)
+
+    @abc.abstractmethod
+    def process(self, text: List[str]) -> List[str]:
+        """Detects and processes word count mismatches according to the mode
+
+        This method is called at the very end of phonemization, during
+        post-processing.
+
+        """
+
+
+class Ignore(BaseWordsMismatch):
+    """Does not act on word count mismatches, only logs a summary"""
+
+    def process(self, text: List[str]) -> List[str]:
+        """Logs the global mismatch summary and returns `text` unchanged"""
+        nmismatch = len(self._mismatched_lines())
+        self._resume(nmismatch, len(text))
+        return text
+
+
+class Warn(BaseWordsMismatch):
+    """Emits a warning for each mismatched line"""
+
+    def process(self, text: List[str]) -> List[str]:
+        """Logs each mismatch, then the summary, and returns `text` unchanged"""
+        mismatched = self._mismatched_lines()
+        for index, expected, observed in mismatched:
+            # the line numbers are displayed starting from 1
+            self._logger.warning(
+                'words count mismatch on line %s '
+                '(expected %s words but get %s)',
+                index + 1, expected, observed)
+
+        self._resume(len(mismatched), len(text))
+        return text
+
+
+class Remove(BaseWordsMismatch):
+    """Blanks any utterance containing a word count mismatch"""
+
+    def process(self, text: List[str]) -> List[str]:
+        """Replaces the mismatched lines of `text` by '', in place
+
+        The summary and a removal warning are logged, the removal warning
+        being sent even if there is no mismatched line.
+
+        """
+        indexes = [index for index, _, _ in self._mismatched_lines()]
+        self._resume(len(indexes), len(text))
+        self._logger.warning('removing the mismatched lines')
+
+        for index in indexes:
+            text[index] = ''
+        return text
@@ -0,0 +1,370 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Wrapper on espeak-ng library"""
+
+import ctypes
+import ctypes.util
+import functools
+import os
+import pathlib
+import sys
+import tempfile
+import weakref
+from typing import Tuple, Dict
+
+from phonemizer.backend.espeak.api import EspeakAPI
+from phonemizer.backend.espeak.voice import EspeakVoice
+
+
+class EspeakWrapper:
+    """Wrapper on espeak shared library
+
+    The aim of this wrapper is not to be exhaustive but to encapsulate the
+    espeak functions required for phonemization. It relies on a espeak shared
+    library (*.so on Linux, *.dylib on Mac and *.dll on Windows) that must be
+    installed on the system.
+
+    Use the function `EspeakWrapper.set_library()` before instantiation to
+    customize the library to use.
+
+    Raises
+    ------
+    RuntimeError if the espeak shared library cannot be loaded
+
+    """
+    # a static variable used to overload the default espeak library installed
+    # on the system. The user can choose an alternative espeak library with
+    # the method EspeakWrapper.set_library().
+    _ESPEAK_LIBRARY = None
+
+    def __init__(self):
+        # the following attributes are accessed through properties and are
+        # lazily initialized
+        self._version: Tuple[int, ...] = None
+        self._data_path = None
+        self._voice = None
+
+        # the voices already listed by available_voices(), indexed by the
+        # `name` they have been requested with. The cache is owned by the
+        # instance so as it is released along with the wrapper.
+        self._voices_cache = {}
+
+        # load the espeak API
+        self._espeak = EspeakAPI(self.library())
+
+        # lazy loading of attributes only required for the synthetize method
+        self._libc_ = None
+        self._tempfile_ = None
+
+    @property
+    def _libc(self):
+        """The C standard library, loaded on first access"""
+        if self._libc_ is None:
+            self._libc_ = (
+                ctypes.windll.msvcrt if sys.platform == 'win32' else
+                ctypes.cdll.LoadLibrary(ctypes.util.find_library('c')))
+        return self._libc_
+
+    @property
+    def _tempfile(self):
+        """The temporary file used by synthetize, created on first access"""
+        if self._tempfile_ is None:
+            # this will be automatically removed at exit
+            # pylint: disable=consider-using-with
+            self._tempfile_ = tempfile.NamedTemporaryFile()
+            # close the file when the wrapper is collected or at exit. The
+            # finalizer is attached to the wrapper and not to the file: a
+            # callback bound to the object it is attached to owns a reference
+            # to that object, which would then never be collected.
+            weakref.finalize(self, self._tempfile_.close)
+        return self._tempfile_
+
+    def __getstate__(self):
+        """For pickling, when phonemizing on multiple jobs"""
+        return {
+            'version': self._version,
+            'data_path': self._data_path,
+            'voice': self._voice}
+
+    def __setstate__(self, state: Dict):
+        """For unpickling, when phonemizing on multiple jobs"""
+        # reload the espeak library, then restore the pickled attributes
+        self.__init__()
+        self._version = state['version']
+        self._data_path = state['data_path']
+        self._voice = state['voice']
+        if self._voice:
+            if 'mb' in self._voice.identifier:  # mbrola voice
+                self.set_voice(self._voice.identifier[3:])
+            else:
+                self.set_voice(self._voice.language)
+
+    @classmethod
+    def set_library(cls, library: str):
+        """Sets the espeak backend to use `library`
+
+        If this is not set, the backend uses the default espeak shared library
+        from the system installation.
+
+        Parameters
+        ----------
+        library (str or None) : the path to the espeak shared library to use as
+          backend. Set `library` to None to restore the default.
+
+        """
+        cls._ESPEAK_LIBRARY = library
+
+    @classmethod
+    def library(cls):
+        """Returns the espeak library used as backend
+
+        The following precedence rule applies for library lookup:
+
+        1. As specified by BaseEspeakBackend.set_library()
+        2. Or as specified by the environment variable
+           PHONEMIZER_ESPEAK_LIBRARY
+        3. Or the default espeak library found on the system
+
+        Raises
+        ------
+        RuntimeError if the espeak library cannot be found or if the
+          environment variable PHONEMIZER_ESPEAK_LIBRARY is set to a
+          non-readable file
+
+        """
+        if cls._ESPEAK_LIBRARY:
+            return cls._ESPEAK_LIBRARY
+
+        if 'PHONEMIZER_ESPEAK_LIBRARY' in os.environ:
+            library = pathlib.Path(os.environ['PHONEMIZER_ESPEAK_LIBRARY'])
+            if not (library.is_file() and os.access(library, os.R_OK)):
+                raise RuntimeError(  # pragma: nocover
+                    f'PHONEMIZER_ESPEAK_LIBRARY={library} '
+                    f'is not a readable file')
+            return library.resolve()
+
+        library = (
+                ctypes.util.find_library('espeak-ng') or
+                ctypes.util.find_library('espeak'))
+        if not library:  # pragma: nocover
+            raise RuntimeError(
+                'failed to find espeak library')
+        return library
+
+    def _fetch_version_and_path(self):
+        """Initializes version and data path from the espeak library"""
+        version, data_path = self._espeak.info()
+
+        # pylint: disable=no-member
+        self._data_path = pathlib.Path(data_path.decode())
+        if not self._data_path.is_dir():  # pragma: nocover
+            raise RuntimeError('failed to retrieve espeak data directory')
+
+        # espeak-1.48 appends the release date to version number, here we
+        # simply ignore it
+        version = version.decode().strip().split(' ')[0].replace('-dev', '')
+        self._version = tuple(int(v) for v in version.split('.'))
+
+    @property
+    def version(self) -> Tuple[int, ...]:
+        """The espeak version as a tuple of integers (major, minor, patch)"""
+        if self._version is None:
+            self._fetch_version_and_path()
+        return self._version
+
+    @property
+    def library_path(self):
+        """The espeak library as a pathlib.Path instance"""
+        return self._espeak.library_path
+
+    @property
+    def data_path(self):
+        """The espeak data directory as a pathlib.Path instance"""
+        if self._data_path is None:
+            self._fetch_version_and_path()
+        return self._data_path
+
+    @property
+    def voice(self):
+        """The configured voice as an EspeakVoice instance
+
+        If `set_voice` has not been called, returns None
+
+        """
+        return self._voice
+
+    def available_voices(self, name=None):
+        """Voices available for phonemization, as a list of `EspeakVoice`
+
+        Parameters
+        ----------
+        name (str, optional) : when specified, it is used as the language of
+          the voice specification sent to espeak to select the voices to
+          list. By default all the voices are listed.
+
+        The result is cached by the instance for each requested `name`, the
+        same list being returned by successive calls.
+
+        """
+        # the cache belongs to the instance: a functools.lru_cache on the
+        # method would keep a reference to each wrapper, and to the library it
+        # has loaded, for the whole life of the program
+        try:
+            return self._voices_cache[name]
+        except KeyError:
+            pass
+
+        query = EspeakVoice(language=name).to_ctypes() if name else None
+        voices = self._espeak.list_voices(query)
+
+        index = 0
+        available_voices = []
+        # voices is an array of pointers, terminated by None
+        while voices[index]:
+            voice = voices[index].contents
+            available_voices.append(EspeakVoice(
+                name=os.fsdecode(voice.name).replace('_', ' '),
+                # discard the first char of the language code
+                language=os.fsdecode(voice.languages)[1:],
+                identifier=os.fsdecode(voice.identifier)))
+            index += 1
+
+        self._voices_cache[name] = available_voices
+        return available_voices
+
+    def set_voice(self, voice_code):
+        """Setup the voice to use for phonemization
+
+        Parameters
+        ----------
+        voice_code (str) : Must be a valid language code that is actually
+          supported by espeak
+
+        Raises
+        ------
+        RuntimeError if the required voice cannot be initialized
+
+        """
+        if 'mb' in voice_code:
+            # this is an mbrola voice code. Select the voice by using
+            # identifier in the format 'mb/{voice_code}'
+            available = {
+                voice.identifier[3:]: voice.identifier
+                for voice in self.available_voices('mbrola')}
+        else:
+            # these are espeak voices. Select the voice using its attached
+            # language code. Consider only the first voice of a given code as
+            # they are sorted by relevancy
+            available = {}
+            for voice in self.available_voices():
+                if voice.language not in available:
+                    available[voice.language] = voice.identifier
+
+        try:
+            voice_name = available[voice_code]
+        except KeyError:
+            raise RuntimeError(f'invalid voice code "{voice_code}"') from None
+
+        if self._espeak.set_voice_by_name(voice_name.encode('utf8')) != 0:
+            raise RuntimeError(  # pragma: nocover
+                f'failed to load voice "{voice_code}"')
+
+        voice = self._get_voice()
+        if not voice:  # pragma: nocover
+            raise RuntimeError(f'failed to load voice "{voice_code}"')
+        self._voice = voice
+
+    def _get_voice(self):
+        """Returns the current voice used for phonemization
+
+        If no voice has been set up, returns None.
+
+        """
+        voice = self._espeak.get_current_voice()
+        if voice.name:
+            return EspeakVoice.from_ctypes(voice)
+        return None  # pragma: nocover
+
+    def text_to_phonemes(self, text: str, tie: bool = False) -> str:
+        """Translates a text into phonemes, must call set_voice() first.
+
+        This method is used by the Espeak backend. Wrapper on the
+        espeak_TextToPhonemes function.
+
+        Parameters
+        ----------
+        text (str) : the text to phonemize
+
+        tie (bool, optional) : When True use a '͡' character between
+          consecutive characters of a single phoneme. Else separate phoneme
+          with '_'. This option requires espeak>=1.49. Default to False.
+
+        Returns
+        -------
+        phonemes (str) : the phonemes for the text encoded in IPA, with '_' as
+          phonemes separator (except if ``tie`` is True) and ' ' as word
+          separator.
+
+        Raises
+        ------
+        RuntimeError if no voice has been set up or if `tie` is requested on
+          an espeak version that does not support it
+
+        """
+        if self.voice is None:  # pragma: nocover
+            raise RuntimeError('no voice specified')
+
+        if tie and self.version <= (1, 48, 3):
+            raise RuntimeError(  # pragma: nocover
+                'tie option only compatible with espeak>=1.49')
+
+        # from Python string to C void** (a pointer to a pointer to chars)
+        text_ptr = ctypes.pointer(ctypes.c_char_p(text.encode('utf8')))
+
+        # input text is encoded as UTF8
+        text_mode = 1
+
+        # output phonemes in IPA and separated by _, or with a tie character if
+        # required. See comments for the function espeak_TextToPhonemes in
+        # speak_lib.h of the espeak sources for details.
+        if self.version <= (1, 48, 3):  # pragma: nocover
+            phonemes_mode = 0x03 | 0x01 << 4
+        elif tie:
+            phonemes_mode = 0x02 | 0x01 << 7 | ord('͡') << 8
+        else:
+            phonemes_mode = ord('_') << 8 | 0x02
+
+        # the text pointer is updated by each call, the loop stops once it
+        # holds no more text
+        result = []
+        while text_ptr.contents.value is not None:
+            phonemes = self._espeak.text_to_phonemes(
+                text_ptr, text_mode, phonemes_mode)
+            if phonemes:
+                result.append(phonemes.decode())
+        return ' '.join(result)
+
+    def synthetize(self, text: str):
+        """Translates a text into phonemes, must call set_voice() first.
+
+        Only compatible with espeak>=1.49. This method is used by the
+        EspeakMbrola backend. Wrapper on the espeak_Synthesize function.
+
+        Parameters
+        ----------
+        text (str) : the text to phonemize
+
+        Returns
+        -------
+        phonemes (str) : the phonemes for the text encoded in SAMPA, with '_'
+          as phonemes separator and no word separation.
+
+        Raises
+        ------
+        RuntimeError if espeak is older than 1.49, if no voice has been set
+          up, if the temporary file cannot be opened or if the synthesis fails
+
+        """
+
+        if self.version < (1, 49):  # pragma: nocover
+            raise RuntimeError('not compatible with espeak<=1.48')
+        if self.voice is None:  # pragma: nocover
+            raise RuntimeError('no voice specified')
+
+        # init libc fopen and fclose functions
+        self._libc.fopen.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
+        self._libc.fopen.restype = ctypes.c_void_p
+        self._libc.fclose.argtypes = [ctypes.c_void_p]
+        self._libc.fclose.restype = ctypes.c_int
+
+        # the size sent to espeak is expressed in bytes: it must be computed
+        # on the encoded text, which is longer than the Python string as soon
+        # as it contains non-ASCII characters
+        encoded = text.encode('utf8')
+
+        # output phonemes in SAMPA and separated by _. Write the result to a
+        # tempfile which is read back after phonemization (seems not possible
+        # to redirect to stdout). See comments for the function
+        # espeak_SetPhonemeTrace in speak_lib.h of the espeak sources for
+        # details.
+        self._tempfile.truncate(0)
+        file_p = self._libc.fopen(
+            self._tempfile.name.encode(),
+            self._tempfile.mode.encode())
+        if not file_p:  # pragma: nocover
+            # fopen returned a NULL pointer, which must not be sent to fclose
+            raise RuntimeError('failed to open the temporary file')
+
+        try:
+            self._espeak.set_phoneme_trace(0x01 << 4 | ord('_') << 8, file_p)
+            status = self._espeak.synthetize(
+                ctypes.c_char_p(encoded),
+                ctypes.c_size_t(len(encoded) + 1),
+                ctypes.c_uint(0x01))
+        finally:
+            # always close the C stream, even if the synthesis raised
+            self._libc.fclose(file_p)  # because flush does not work...
+
+        if status != 0:  # pragma: nocover
+            raise RuntimeError('failed to synthetize')
+
+        self._tempfile.seek(0)
+        phonemized = self._tempfile.read().decode().strip()
+        return phonemized
@@ -0,0 +1,15 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Phonemizer module for festival backend implementation"""
@@ -0,0 +1,334 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Festival backend for the phonemizer"""
+
+import os
+import pathlib
+import re
+import shlex
+import shutil
+import subprocess
+import sys
+import tempfile
+from logging import Logger
+from pathlib import Path
+from typing import Optional, Dict, List, IO, Union, Pattern
+
+from phonemizer.backend.base import BaseBackend
+from phonemizer.backend.festival import lispy
+from phonemizer.separator import Separator
+from phonemizer.utils import get_package_resource, version_as_tuple
+
+
+class FestivalBackend(BaseBackend):
+    """Festival backend for the phonemizer"""
+    # a static variable used to overload the default festival binary installed
+    # on the system. The user can choose an alternative festival binary with
+    # the method FestivalBackend.set_executable().
+    _FESTIVAL_EXECUTABLE = None
+
+    def __init__(self, language: str,
+                 punctuation_marks: Optional[Union[str, Pattern]] = None,
+                 preserve_punctuation: bool = False,
+                 logger: Optional[Logger] = None):
+        super().__init__(
+            language,
+            punctuation_marks=punctuation_marks,
+            preserve_punctuation=preserve_punctuation,
+            logger=logger)
+
+        self.logger.debug('festival executable is %s', self.executable())
+
+        # the Scheme script to be send to festival
+        script_file = get_package_resource('festival/phonemize.scm')
+        with open(script_file, 'r') as fscript:
+            self._script = fscript.read()
+        self.logger.debug('loaded %s', script_file)
+
+    @staticmethod
+    def name():
+        return 'festival'
+
+    @classmethod
+    def set_executable(cls, executable: str):
+        """Sets the festival backend to use `executable`
+
+        If this is not set, the backend uses the default festival executable
+        from the system installation.
+
+        Parameters
+        ----------
+        executable (str) : the path to the festival executable to use as
+            backend. Set `executable` to None to restore the default.
+
+        Raises
+        ------
+        RuntimeError if `executable` is not an executable file.
+
+        """
+        if executable is None:
+            cls._FESTIVAL_EXECUTABLE = None
+            return
+
+        executable = pathlib.Path(executable)
+        if not (executable.is_file() and os.access(executable, os.X_OK)):
+            raise RuntimeError(
+                f'{executable} is not an executable file')
+
+        cls._FESTIVAL_EXECUTABLE = executable.resolve()
+
+    @classmethod
+    def executable(cls) -> Path:
+        """Returns the absolute path to the festival executable used as backend
+
+        The following precedence rule applies for executable lookup:
+
+        1. As specified by FestivalBackend.set_executable()
+        2. Or as specified by the environment variable
+           PHONEMIZER_FESTIVAL_EXECUTABLE
+        3. Or the default 'festival' binary found on the system with ``shutil.which('festival')``
+
+
+        Raises
+        ------
+        RuntimeError
+            if the festival executable cannot be found or if the
+            environment variable PHONEMIZER_FESTIVAL_EXECUTABLE is set to a
+            non-executable file
+
+        """
+        if cls._FESTIVAL_EXECUTABLE:
+            return cls._FESTIVAL_EXECUTABLE
+
+        if 'PHONEMIZER_FESTIVAL_EXECUTABLE' in os.environ:
+            executable = pathlib.Path(os.environ[
+                                          'PHONEMIZER_FESTIVAL_EXECUTABLE'])
+            if not (
+                    executable.is_file()
+                    and os.access(executable, mode=os.X_OK)
+            ):
+                raise RuntimeError(
+                    f'PHONEMIZER_FESTIVAL_EXECUTABLE={executable} '
+                    f'is not an executable file')
+            return executable.resolve()
+
+        executable = shutil.which('festival')
+        if not executable:  # pragma: nocover
+            raise RuntimeError(
+                'failed to find festival executable')
+        return Path(executable).resolve()
+
+    @classmethod
+    def is_available(cls):
+        """True if the festival executable is available, False otherwise"""
+        try:
+            cls.executable()
+        except RuntimeError:  # pragma: nocover
+            return False
+        return True
+
+    @classmethod
+    def version(cls):
+        """Festival version as a tupe of integers (major, minor, patch)
+
+        Raises
+        ------
+        RuntimeError if FestivalBackend.is_available() is False or if the
+            version cannot be extracted for some reason.
+
+        """
+
+        festival = cls.executable()
+
+        # the full version version string includes extra information
+        # we don't need
+        long_version = subprocess.check_output(
+            [festival, '--version']).decode('latin1').strip()
+
+        # extract the version number with a regular expression
+        festival_version_re = r'.* ([0-9\.]+[0-9]):'
+        try:
+            version = re.match(festival_version_re, long_version).group(1)
+        except AttributeError:
+            raise RuntimeError(
+                f'cannot extract festival version from {festival}') from None
+
+        return version_as_tuple(version)
+
+    @staticmethod
+    def supported_languages() -> Dict[str, str]:
+        """A dictionnary of language codes -> name supported by festival
+
+        Actually only en-us (American English) is supported.
+
+        """
+        return {'en-us': 'english-us'}
+
+    # pylint: disable=unused-argument
+    def _phonemize_aux(self, text: List[str], offset: int, separator: Separator, strip: bool) -> List[str]:
+        """Return a phonemized version of `text` with festival
+
+        This function is a wrapper on festival, a text to speech
+        program, allowing simple phonemization of some English
+        text. The US phoneset we use is the default one in festival,
+        as described at http://www.festvox.org/bsv/c4711.html
+
+        Any opening and closing parenthesis in `text` are removed, as
+        they interfer with the Scheme expression syntax. Moreover
+        double quotes are replaced by simple quotes because double
+        quotes denotes utterances boundaries in festival.
+
+        Parsing a ill-formed Scheme expression during post-processing
+        (typically with unbalanced parenthesis) raises an IndexError.
+
+        """
+        text = self._preprocess(text)
+        if len(text) == 0:
+            return []
+        text = self._process(text)
+        text = self._postprocess(text, separator, strip)
+        return text
+
+    @staticmethod
+    def _double_quoted(line: str) -> str:
+        """Return the string `line` surrounded by double quotes"""
+        return '"' + line + '"'
+
+    @staticmethod
+    def _cleaned(line: str):
+        """Remove 'forbidden' characters from the line"""
+        # special case (very unlikely but causes a crash in festival)
+        # where a line is only made of '
+        if set(line) == set("'"):
+            line = ''
+
+        # remove forbidden characters (reserved for scheme, ie festival
+        # scripting language)
+        return line.replace('"', '').replace('(', '').replace(')', '').strip()
+
+    @classmethod
+    def _preprocess(cls, text: List[str]):
+        """Returns the contents of `text` formatted for festival input
+
+        This function adds double quotes to begining and end of each
+        line in text, if not already presents. The returned result is
+        a multiline string. Empty lines in inputs are ignored.
+
+        """
+        cleaned_text = (
+            cls._cleaned(line) for line in text if line != '')
+
+        return '\n'.join(
+            cls._double_quoted(line) for line in cleaned_text if line != '')
+
+    def _process(self, text: str):
+        """Return the raw phonemization of `text`
+
+        This function delegates to festival the text analysis and
+        syllabic structure extraction.
+
+        Return a string containing the "SylStructure" relation tree of
+        the text, as a scheme expression.
+
+        """
+        with tempfile.NamedTemporaryFile('w+', delete=False) as data:
+            try:
+                # save the text as a tempfile
+                data.write(text)
+                data.close()
+
+                # fix the path name for windows
+                name = data.name
+                if sys.platform == 'win32':  # pragma: nocover
+                    name = name.replace('\\', '\\\\')
+
+                with tempfile.NamedTemporaryFile('w+', delete=False) as scm:
+                    try:
+                        scm.write(self._script.format(name))
+                        scm.close()
+
+                        cmd = f'{self.executable()} -b {scm.name}'
+                        if self.logger:
+                            self.logger.debug('running %s', cmd)
+
+                        # redirect stderr to a tempfile and displaying it only
+                        # on errors. Messages are something like: "UniSyn:
+                        # using default diphone ax-ax for y-pau". This is
+                        # related to wave synthesis (done by festival during
+                        # phonemization).
+                        with tempfile.TemporaryFile('w+') as fstderr:
+                            return self._run_festival(cmd, fstderr)
+                    finally:
+                        os.remove(scm.name)
+            finally:
+                os.remove(data.name)
+
+    @staticmethod
+    def _run_festival(cmd: str, fstderr: IO) -> str:
+        """Runs the festival command for phonemization
+
+        Returns the raw phonemized output (need to be postprocesses). Raises a
+        RuntimeError if festival fails.
+
+        """
+        try:
+            output = subprocess.check_output(
+                shlex.split(cmd, posix=False), stderr=fstderr)
+
+            # festival seems to use latin1 and not utf8
+            return re.sub(' +', ' ', output.decode('latin1'))
+
+        except subprocess.CalledProcessError as err:  # pragma: nocover
+            fstderr.seek(0)
+            raise RuntimeError(
+                f'Command "{cmd}" returned exit status {err.returncode}, '
+                f'output is:\n{fstderr.read()}') from None
+
+    @staticmethod
+    def _postprocess_syll(syll: List[str], separator: Separator, strip: bool) -> str:
+        """Parse a syllable from festival to phonemized output"""
+        sep = separator.phone
+        out = (phone[0][0].replace('"', '') for phone in syll[1:])
+        out = sep.join(o for o in out if o != '')
+        return out if strip else out + sep
+
+    @classmethod
+    def _postprocess_word(cls, word: List[List[str]], separator: Separator, strip: bool) -> str:
+        """Parse a word from festival to phonemized output"""
+        sep = separator.syllable
+        out = sep.join(
+            cls._postprocess_syll(syll, separator, strip)
+            for syll in word[1:])
+        return out if strip else out + sep
+
+    @classmethod
+    def _postprocess_line(cls, line: str, separator, strip: bool) -> str:
+        """Parse a line from festival to phonemized output"""
+        sep = separator.word
+        out = []
+        for word in lispy.parse(line):
+            word = cls._postprocess_word(word, separator, strip)
+            if word != '':
+                out.append(word)
+        out = sep.join(out)
+
+        return out if strip else out + sep
+
+    @classmethod
+    def _postprocess(cls, tree: str, separator: Separator, strip: bool) -> List[str]:
+        """Conversion from festival syllable tree to desired format"""
+        return [cls._postprocess_line(line, separator, strip)
+                for line in tree.split('\n')
+                if line not in ['', '(nil nil nil)']]
@@ -0,0 +1,66 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Parse a Scheme expression as a nested list
+
+The main function of this module is lispy.parse, other ones should be
+considered private. This module is a dependency of the festival
+backend.
+
+From http://www.norvig.com/lispy.html
+
+"""
+from typing import List, Union
+
+
+def parse(program: str):
+    """Read a Scheme expression from a string
+
+    Return a nested list
+
+    Raises an IndexError if the expression is not valid scheme
+    (unbalanced parenthesis).
+
+    >>> parse('(+ 2 (* 5 2))')
+    ['+', '2', ['*', '5', '2']]
+
+    """
+    return _read_from_tokens(_tokenize(program))
+
+
+def _tokenize(chars: str) -> List[str]:
+    """Convert a string of characters into a list of tokens."""
+    return chars.replace('(', ' ( ').replace(')', ' ) ').split()
+
+
+# a parsed Scheme expression: either a single token kept as a string, or a
+# (possibly nested) list of expressions
+Expr = Union[str, List['Expr']]
+
+
+def _read_from_tokens(tokens: List[str]) -> Expr:
+    """Read an expression from a sequence of tokens"""
+    if len(tokens) == 0:  # pragma: nocover
+        raise SyntaxError('unexpected EOF while reading')
+
+    token = tokens.pop(0)
+    if token == '(':
+        expr = []
+        while tokens[0] != ')':
+            expr.append(_read_from_tokens(tokens))
+        tokens.pop(0)  # pop off ')'
+        return expr
+
+    if token == ')':  # pragma: nocover
+        raise SyntaxError('unexpected )')
+
+    return token
@@ -0,0 +1,143 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Segments backend for the phonemizer"""
+
+import pathlib
+from logging import Logger
+from typing import Optional, Dict, List, Union, Pattern
+
+import segments
+
+from phonemizer.backend.base import BaseBackend
+from phonemizer.separator import Separator
+from phonemizer.utils import get_package_resource, version_as_tuple
+
+
+class SegmentsBackend(BaseBackend):
+    """Segments backends for the phonemizer
+
+    The phonemize method will raise a ValueError when parsing an
+    unknown morpheme.
+
+    """
+
+    def __init__(self, language: str,
+                 punctuation_marks: Optional[Union[str, Pattern]] = None,
+                 preserve_punctuation: bool = False,
+                 logger: Optional[Logger] = None):
+        # will be initialized in _init_language() from super().__init__()
+        self._tokenizer: Optional[segments.Tokenizer] = None
+        super().__init__(
+            language,
+            punctuation_marks=punctuation_marks,
+            preserve_punctuation=preserve_punctuation,
+            logger=logger)
+
+    def _init_language(self, language):
+        # load the grapheme to phoneme mapping
+        profile = self._load_g2p_profile(language)
+        self._tokenizer = segments.Tokenizer(profile=profile)
+
+        # this is the language code
+        return pathlib.Path(language).stem
+
+    @staticmethod
+    def name():
+        return 'segments'
+
+    @classmethod
+    def version(cls):
+        return version_as_tuple(segments.__version__)
+
+    @classmethod
+    def is_available(cls):
+        return True
+
+    @staticmethod
+    def supported_languages():
+        """Returns a dict of language: file supported by the segments backend
+
+        The supported languages have a grapheme to phoneme conversion file
+        bundled with phonemizer. Users can also use their own file as
+        parameter of the phonemize() function.
+
+        """
+        # directory phonemizer/share/segments
+        directory = get_package_resource('segments')
+
+        # supported languages are files with the 'g2p' extension
+        return {g2p.stem: g2p
+                for g2p in directory.iterdir() if g2p.suffix == '.g2p'}
+
+    @classmethod
+    def is_supported_language(cls, language: str) -> bool:
+        if pathlib.Path(language).is_file():
+            try:
+                cls._load_g2p_profile(language)
+                return True
+            except RuntimeError:
+                return False
+        return language in cls.supported_languages()
+
+    @classmethod
+    def _load_g2p_profile(cls, language: str) -> segments.Profile:
+        """Returns a segments profile from a `language`"""
+        # make sure the g2p file exists
+        if not pathlib.Path(language).is_file():
+            try:
+                language = cls.supported_languages()[language]
+            except KeyError:
+                raise RuntimeError(
+                    f'grapheme to phoneme file not found: '
+                    f'{language}') from None
+
+        # load the mapping grapheme -> phoneme from the file, make sure all
+        # lines are well formatted
+        g2p: Dict[str, str] = {}
+        with open(language, 'r', encoding='utf8') as flang:
+            for num, line in enumerate(flang):
+                elts = line.strip().split()
+                if not len(elts) == 2:
+                    raise RuntimeError(
+                        'grapheme to phoneme file, line {} must have 2 rows '
+                        'but have {}: {}'.format(num + 1, len(elts), language))
+                g2p[elts[0]] = elts[1]
+
+        # build the segments profile from the g2p mapping
+        return segments.Profile(
+            *[{'Grapheme': k, 'mapping': v} for k, v in g2p.items()])
+
+    # pylint: disable=unused-argument
+    def _phonemize_aux(self, text: List[str], offset: int, separator: Separator, strip: bool) -> List[str]:
+        # tokenize the input text per utterance
+        phonemized = (
+            self._tokenizer(line, column='mapping', errors='strict')
+            for line in text)
+
+        # the output of segments is always strip, so we need to add
+        # token separation at the end when strip is False.
+        if not strip:
+            # add word separator at end of utterance
+            phonemized = (p + ' # ' for p in phonemized)
+            # add phoneme separator at end of word
+            phonemized = (p.replace(' # ', '  # ') for p in phonemized)
+
+        # replace default separators by our custom ones
+        phonemized = (p.replace(' # ', '#') for p in phonemized)
+        phonemized = (p.replace(' ', separator.phone) for p in phonemized)
+        phonemized = (p.replace('#', separator.word) for p in phonemized)
+
+        # return the result as a list of utterances
+        return list(phonemized)
@@ -0,0 +1,63 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Logging facilities for the phonemizer"""
+
+import logging
+import sys
+from logging import Logger
+
+
+def get_logger(verbosity: str = 'quiet', name: str = 'phonemizer') -> Logger:
+    """Returns a configured logging.Logger instance
+
+    The logger is configured to output messages on the standard error stream
+    (stderr).
+
+    Parameters
+    ----------
+    verbosity (str) : The level of verbosity, must be 'verbose' (displays
+      debug/info and warning messages), 'normal' (warnings only) or 'quiet' (do
+      not display anything).
+    name (str) : The logger name, default to 'phonemizer'
+
+    Raises
+    ------
+    RuntimeError if `verbosity` is not 'normal', 'verbose', or 'quiet'.
+
+    """
+    # make sure the verbosity argument is valid
+    valid_verbosity = ['normal', 'verbose', 'quiet']
+    if verbosity not in valid_verbosity:
+        raise RuntimeError(
+            f'verbosity is {verbosity} but must be in '
+            f'{", ".join(valid_verbosity)}')
+
+    logger = logging.getLogger(name)
+
+    # setup output to stderr
+    logger.handlers = []
+    handler = logging.StreamHandler(sys.stderr)
+
+    # setup verbosity level
+    logger.setLevel(logging.WARNING)
+    if verbosity == 'verbose':
+        logger.setLevel(logging.DEBUG)
+    elif verbosity == 'quiet':
+        handler = logging.NullHandler()
+
+    # setup messages format
+    handler.setFormatter(logging.Formatter('[%(levelname)s] %(message)s'))
+    logger.addHandler(handler)
+    return logger
@@ -0,0 +1,428 @@
+#!/usr/bin/env python
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Command-line phonemizer tool, run 'phonemize --help' for usage details"""
+
+import argparse
+import os
+import sys
+import re
+
+from phonemizer import phonemize, separator, version, logger, punctuation
+from phonemizer.backend import BACKENDS
+
+
+class CatchExceptions:  # pragma: nocover
+    """Decorator wrapping a function in a try/except block
+
+    When an exception occurs, display a user friendly message on
+    standard output before exiting with error code 1.
+
+    The detected exceptions are ValueError, OSError, RuntimeError,
+    AssertionError and KeyboardInterrupt.
+
+    Parameters
+    ----------
+    function :
+        The function to wrap in a try/except block
+
+    """
+    def __init__(self, function):
+        self.function = function
+
+    def __call__(self):
+        """Executes the wrapped function and catch common exceptions"""
+        try:
+            self.function()
+
+        except (IOError, ValueError, OSError,
+                RuntimeError, AssertionError) as err:
+            self.exit('fatal error: {}'.format(err))
+
+        except KeyboardInterrupt:
+            self.exit('keyboard interruption, exiting')
+
+    @staticmethod
+    def exit(msg):
+        """Write `msg` on stderr and exit with error code 1"""
+        sys.stderr.write(msg.strip() + '\n')
+        sys.exit(1)
+
+
+def parse_args():
+    """Argument parser for the phonemization script"""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description='''Multilingual text to phonemes converter
+
+The 'phonemize' program allows simple phonemization of words and texts
+in many language using four backends: espeak, espeak-mbrola, festival
+and segments.
+
+- espeak is a text-to-speech software supporting multiple languages
+  and IPA (International Phonetic Alphabet) output. See
+  http://espeak.sourceforge.net or
+  https://github.com/espeak-ng/espeak-ng
+
+- espeak-mbrola uses the SAMPA phonetic alphabet, it requires mbrola to be
+  installed as well as additional mbrola voices. It does not support word or
+  syllable tokenization. See
+  https://github.com/espeak-ng/espeak-ng/blob/master/docs/mbrola.md
+
+- festival is also a text-to-speech software. Currently only American
+  English is supported and festival uses a custom phoneset
+  (http://www.festvox.org/bsv/c4711.html), but festival is the only
+  backend supporting tokenization at the syllable
+  level. See http://www.cstr.ed.ac.uk/projects/festival
+
+- segments is a Unicode tokenizer that build a phonemization from a
+  grapheme to phoneme mapping provided as a file by the user. See
+  https://github.com/cldf/segments.
+
+See the '--list-languages' option below for details on the languages
+supported by each backend.
+
+''',
+        epilog='''
+Examples:
+
+* Phonemize a US English text with espeak
+
+   $ echo 'hello world' | phonemize -l en-us -b espeak
+   həloʊ wɜːld
+
+* Phonemize a US English text with festival
+
+   $ echo 'hello world' | phonemize -l en-us -b festival
+   hhaxlow werld
+
+* Phonemize a Japanese text with segments
+
+  $ echo 'konnichiwa tsekai' | phonemize -l japanese -b segments
+  konnitʃiwa t͡sekai
+
+* Add a separator between phones
+
+  $ echo 'hello world' | phonemize -l en-us -b festival -p '-' --strip
+  hh-ax-l-ow w-er-l-d
+
+* Phonemize some French text file using espeak
+
+  $ phonemize -l fr-fr -b espeak text.txt -o phones.txt
+        ''')
+
+    # general arguments
+    parser.add_argument(
+        '-V', '--version',
+        action='store_true',
+        help='show version information and exit.')
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '-v', '--verbose',
+        action='store_true',
+        help='write all log messages to stderr '
+        '(displays only warnings by default).')
+    group.add_argument(
+        '-q', '--quiet',
+        action='store_true',
+        help='do not display any log message, even warnings.')
+
+    parser.add_argument(
+        '-j', '--njobs',
+        type=int, metavar='<int>', default=1,
+        help='number of parallel jobs, default is %(default)s.')
+
+    # input/output arguments
+    group = parser.add_argument_group('input/output')
+    group.add_argument(
+        'input',
+        default=sys.stdin, nargs='?', metavar='<file>',
+        help='input text file to phonemize, if not specified read from stdin.')
+
+    group.add_argument(
+        '-o', '--output',
+        default=sys.stdout, metavar='<file>',
+        help='output text file to write, if not specified write to stdout.')
+
+    group.add_argument(
+        '--prepend-text',
+        default=False, const=True, nargs='?', metavar='<str>',
+        help='''prepend each line of the phonemized output text with its
+        matching input text. If a string is specified as option value, use it
+        as field separator, else use one of "|", "||", "|||", "||||" by
+        selecting the first one that is not configured as a token separator
+        (see -p/-s/-w options).''')
+
+    group.add_argument(
+        '--preserve-empty-lines',
+        action='store_true',
+        help='''preserve the empty lines in the phonemized output, default is
+        to remove them.''')
+
+    group = parser.add_argument_group('backends')
+    group.add_argument(
+        '-b', '--backend',
+        metavar='<str>', default=None,
+        choices=['espeak', 'espeak-mbrola', 'festival', 'segments'],
+        help="""the phonemization backend, must be 'espeak', 'espeak-mbrola',
+        'festival' or 'segments'. Default is 'espeak'.""")
+
+    group.add_argument(
+        '-L', '--list-languages',
+        action='store_true',
+        help="""list available languages (and exit) for the specified backend,
+        or for all backends if none selected.""")
+
+    group = parser.add_argument_group('language')
+    group.add_argument(
+        '-l', '--language',
+        metavar='<str|file>', default='en-us',
+        help='''the language code of the input text, use '--list-languages'
+        for a list of supported languages. Default is %(default)s.''')
+
+    group = parser.add_argument_group('token separators')
+    group.add_argument(
+        '-p', '--phone-separator',
+        metavar='<str>', default=separator.default_separator.phone,
+        help='phone separator, default is "%(default)s".')
+
+    group.add_argument(
+        '-w', '--word-separator',
+        metavar='<str>', default=separator.default_separator.word,
+        help='''word separator, not valid for espeak-mbrola backend,
+        default is "%(default)s".''')
+
+    group.add_argument(
+        '-s', '--syllable-separator',
+        metavar='<str>', default=separator.default_separator.syllable,
+        help='''syllable separator, only valid for festival backend,
+        this option has no effect if another backend is used.
+        Default is "%(default)s".''')
+
+    group.add_argument(
+        '--strip',
+        action='store_true',
+        help='removes the end separators in phonemized tokens.')
+
+    group = parser.add_argument_group('specific to espeak backend')
+    try:
+        espeak_library = BACKENDS['espeak'].library()
+    except RuntimeError:  # pragma: nocover
+        espeak_library = None
+
+    group.add_argument(
+        '--espeak-library',
+        default=None, type=str, metavar='<library>',
+        help=f'''the path to the espeak shared library to use (*.so on Linux,
+        *.dylib on Mac and *.dll on Windows, useful to overload the default
+        espeak version installed on the system). Default to
+        {espeak_library}. This path can also be specified
+        using the PHONEMIZER_ESPEAK_LIBRARY environment variable.''')
+    group.add_argument(
+        '--tie',
+        nargs='?', default=False, const=True, metavar='<chr>',
+        help='''when the option is set, use a tie character within multi-letter
+        phoneme names, default to U+361 (as in d͡ʒ), 'z' means ZWJ character,
+        only compatible with espeak>1.48 and incompatible with the
+        -p/--phone-separator option''')
+    group.add_argument(
+        '--with-stress',
+        action='store_true',
+        help='''when the option is set, the stresses on phonemes are present
+        (stresses characters are ˈ'ˌ). By default stresses are removed.''')
+    group.add_argument(
+        '--language-switch',
+        default='keep-flags',
+        choices=['keep-flags', 'remove-flags', 'remove-utterance'],
+        help="""espeak can pronounce some words in another language (typically
+        English) when phonemizing a text. This option setups the policy to use
+        when such a language switch occurs. Three values are available:
+        'keep-flags' (the default), 'remove-flags' or 'remove-utterance'. The
+        'keep-flags' policy keeps the language switching flags, for example
+        (en) or (jp), in the output. The 'remove-flags' policy removes them and
+        the 'remove-utterance' policy removes the whole line of text including
+        a language switch.""")
+    group.add_argument(
+        '--words-mismatch',
+        default='ignore', choices=['ignore', 'warn', 'remove'],
+        help="""espeak can join two consecutive words or drop some words,
+        yielding a word count mismatch between orthographic and phonemized
+        text. This option setups the policy to use when such a words count
+        mismatch occurs. Three values are available: 'ignore' (the default)
+        which do nothing, 'warn' which issue a warning for each mismatched
+        line, and 'remove' which remove the mismatched lines from the
+        output.""")
+
+    group = parser.add_argument_group('specific to festival backend')
+    try:
+        festival_executable = BACKENDS['festival'].executable()
+    except RuntimeError:  # pragma: nocover
+        festival_executable = None
+
+    group.add_argument(
+        '--festival-executable',
+        default=None, type=str, metavar='<executable>',
+        help=f'''the path to the festival executable to use (useful to
+        overload the default festival installed on the system). Default to
+        {festival_executable}. This path can also be specified using the
+        PHONEMIZER_FESTIVAL_EXECUTABLE environment variable.''')
+
+    group = parser.add_argument_group(
+        'punctuation processing',
+        description='not available for espeak-mbrola backend')
+    group.add_argument(
+        '--preserve-punctuation',
+        action='store_true',
+        help='''preserve the punctuation marks in the phonemized output,
+        default is to remove them.''')
+    group.add_argument(
+        '--punctuation-marks',
+        type=str, metavar='<str>',
+        default=punctuation.Punctuation.default_marks(),
+        help='''the marks to consider during punctuation processing (either
+        for removal or preservation). Default is %(default)s.''')
+    group.add_argument(
+        '--punctuation-marks-is-regex',
+        action='store_true',
+        help="""interpret the '--punctuation-marks' parameter as a regex.
+        Default is to interpret as a string.""")
+
+    return parser.parse_args()
+
+
+def list_languages(args_backend):
+    """Returns the available languages for the given `backend` as a str
+
+    Parameters
+    ----------
+    args_backend: str or None
+        The name of the backend to list the languages for, must be a key of
+        BACKENDS. If empty or None, the languages of all the registered
+        backends are listed.
+
+    Returns
+    -------
+    A multiline str made, for each backend, of a header line followed by
+    one tab-indented `code -> name` line per supported language, sorted by
+    language code. The text is returned and not printed: it is up to the
+    caller to write it where needed.
+
+    """
+    backends = [args_backend] if args_backend else list(BACKENDS)
+    return '\n'.join(
+        f'supported languages for {backend} are:\n' +
+        '\n'.join(
+            f'\t{code}\t->\t{name}' for code, name in sorted(
+                BACKENDS[backend].supported_languages().items()))
+        for backend in backends)
+
+
+def get_logger(verbose, quiet):
+    """Returns a logger configured from the --verbose/--quiet options
+
+    The `verbose` flag takes precedence over `quiet`. When none of them is
+    set the logger is built with the 'normal' verbosity.
+
+    """
+    if verbose:
+        return logger.get_logger(verbosity='verbose')
+    if quiet:
+        return logger.get_logger(verbosity='quiet')
+    return logger.get_logger(verbosity='normal')
+
+
+def setup_stream(stream, mode):
+    """Returns `stream` as a stream object, opening it if it is a filename
+
+    When `stream` is a str it is taken as a path and opened with the given
+    `mode` and utf8 encoding (the caller is responsible for closing it). Any
+    other object is returned unchanged.
+
+    """
+    if not isinstance(stream, str):
+        return stream  # pragma: nocover
+
+    # the file must outlive this function, so no `with` statement here
+    # pylint: disable=consider-using-with
+    opened = open(stream, mode, encoding='utf8')
+    return opened
+
+
+@CatchExceptions
+def main():
+    """Phonemize a text from command-line arguments
+
+    Parses the command line, applies the optional custom espeak library and
+    festival executable, then either prints the version, prints the
+    supported languages, or phonemizes the whole input stream and writes the
+    result to the output stream, one utterance per line.
+
+    Raises
+    ------
+    ValueError
+        if --punctuation-marks-is-regex is set but the punctuation marks
+        cannot be compiled as a regular expression.
+
+    """
+    args = parse_args()
+
+    # setup a custom path to espeak and festival if required (this must be done
+    # before generating the version message)
+    if args.espeak_library:
+        BACKENDS['espeak'].set_library(args.espeak_library)
+    if args.festival_executable:
+        BACKENDS['festival'].set_executable(args.festival_executable)
+
+    # display version information and exit
+    if args.version:
+        print(version.version())
+        return
+
+    # list supported languages and exit
+    if args.list_languages:
+        print(list_languages(args.backend))
+        return
+
+    # set default backend as espeak if not specified
+    args.backend = args.backend or 'espeak'
+
+    # configure logging according to --verbose/--quiet options
+    log = get_logger(args.verbose, args.quiet)
+
+    # configure input:output as a readable/writable streams
+    # NOTE(review): apart from the regex error path below, these streams are
+    # not explicitly closed in this function
+    streamin = setup_stream(args.input, 'r')
+    log.debug('reading from %s', streamin.name)
+    streamout = setup_stream(args.output, 'w')
+    log.debug('writing to %s', streamout.name)
+
+    # configure the separator for phonemes, syllables and words.
+    if args.backend == 'espeak-mbrola':
+        # only the phone separator is forwarded for espeak-mbrola
+        log.debug('using espeak-mbrola backend: ignoring word separator')
+        sep = separator.Separator(
+            phone=args.phone_separator,
+            syllable=None,
+            word=None)
+    else:
+        sep = separator.Separator(
+            phone=args.phone_separator,
+            syllable=args.syllable_separator,
+            word=args.word_separator)
+    log.debug('separator is %s', sep)
+
+    # when prepending the input text to the output, choose the token written
+    # between the two on each output line (False means no prepending)
+    if args.prepend_text:
+        input_output_separator = sep.input_output_separator(args.prepend_text)
+        log.debug(
+            'prepend input text to output, separator is "%s"',
+            input_output_separator)
+    else:
+        input_output_separator = False
+
+    # compile the punctuation marks as a regular expression when requested
+    if args.punctuation_marks_is_regex:
+        try:
+            log.debug('punctuation marks is regex %s', args.punctuation_marks)
+            args.punctuation_marks = re.compile(args.punctuation_marks)
+        except re.error:
+            # manually close the open streams for windows
+            streamin.close()
+            streamout.close()
+            raise ValueError(f"can't compile regex pattern from {args.punctuation_marks}")
+
+    # phonemize the input text
+    out = phonemize(
+        streamin.readlines(),
+        language=args.language,
+        backend=args.backend,
+        separator=sep,
+        strip=args.strip,
+        prepend_text=args.prepend_text,
+        preserve_empty_lines=args.preserve_empty_lines,
+        preserve_punctuation=args.preserve_punctuation,
+        punctuation_marks=args.punctuation_marks,
+        with_stress=args.with_stress,
+        tie=args.tie,
+        language_switch=args.language_switch,
+        words_mismatch=args.words_mismatch,
+        njobs=args.njobs,
+        logger=log)
+
+    # write the result, nothing is written when the output is empty. When
+    # prepending, each element of `out` is indexed as a pair: line[0] is
+    # written before the input/output separator and line[1] after it.
+    if out and input_output_separator:
+        streamout.write(
+            os.linesep.join(
+                f'{line[0]} {input_output_separator} {line[1]}'
+                for line in out)
+            + os.linesep)
+    elif out:
+        streamout.write(os.linesep.join(out) + os.linesep)
+
+
+# run the command-line interface when this module is executed as a script
+if __name__ == '__main__':  # pragma: nocover
+    main()
@@ -0,0 +1,328 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Provides the phonemize function
+
+To use it in your own code, type:
+
+    from phonemizer import phonemize
+
+"""
+
+import os
+import sys
+from logging import Logger
+from typing import Optional, Union, List, Pattern
+
+from typing_extensions import Literal
+
+from phonemizer.backend import BACKENDS
+from phonemizer.backend.base import BaseBackend
+from phonemizer.backend.espeak.language_switch import LanguageSwitch
+from phonemizer.backend.espeak.words_mismatch import WordMismatch
+from phonemizer.logger import get_logger
+from phonemizer.punctuation import Punctuation
+from phonemizer.separator import default_separator, Separator
+from phonemizer.utils import list2str, str2list
+
+# The backend names accepted by the `backend` parameter of phonemize()
+Backend = Literal['espeak', 'espeak-mbrola', 'festival', 'segments']
+
+
+def phonemize(  # pylint: disable=too-many-arguments
+        text,
+        language: str = 'en-us',
+        backend: Backend = 'espeak',
+        separator: Optional[Separator] = default_separator,
+        strip: bool = False,
+        prepend_text: bool = False,
+        preserve_empty_lines: bool = False,
+        preserve_punctuation: bool = False,
+        punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
+        with_stress: bool = False,
+        tie: Union[bool, str] = False,
+        language_switch: LanguageSwitch = 'keep-flags',
+        words_mismatch: WordMismatch = 'ignore',
+        njobs: int = 1,
+        logger: Logger = get_logger()):
+    """Multilingual text to phonemes converter
+
+    Return a phonemized version of an input `text`, given its `language` and a
+    phonemization `backend`.
+
+    Note
+    ----
+
+    To improve the processing speed it is better to minimize the calls to this
+    function: provide the input text as a list and call phonemize() a single
+    time is much more efficient than calling it on each element of the list.
+    Indeed the initialization of the phonemization backend can be expensive,
+    especially for espeak.
+
+    Do this:
+
+    >>> text = [line1, line2, ...]
+    >>> phonemize(text, ...)
+
+    Not this:
+
+    >>> for line in text:
+    >>>     phonemize(line, ...)
+
+    Parameters
+    ----------
+
+    text: str or list of str
+        The text to be phonemized. Empty lines are ignored unless
+        ``preserve_empty_lines`` is True. If ``text`` is an str, it can be
+        multiline (lines being separated by ``\\n``). If ``text`` is a list,
+        each element is considered as a separated line. Each line is
+        considered as a text utterance.
+
+    language: str
+        The language code of the input text, must be supported by
+        the backend. If ``backend`` is 'segments', the language can be a file with
+        a grapheme to phoneme mapping.
+
+    backend: str, optional
+        The software backend to use for phonemization,
+        must be 'festival' (US English only is supported, coded 'en-us'),
+        'espeak', 'espeak-mbrola' or 'segments'.
+
+    separator: Separator
+        string separators between phonemes, syllables and
+        words, default to separator.default_separator (also used when None is
+        given). Syllable separator is considered only for the festival
+        backend. Word separator is ignored by the 'espeak-mbrola' backend.
+        Initialize it as follows:
+            >>> from phonemizer.separator import Separator
+            >>> separator = Separator(phone='-', word=' ')
+
+    strip: bool, optional
+        If True, don't output the last word and phone
+        separators of a token, default to False.
+
+    prepend_text: bool, optional
+        When True, returns a pair (input utterance,
+        phonemized utterance) for each line of the input text. When False,
+        returns only the phonemized utterances. Default to False
+
+    preserve_empty_lines: bool, optional
+        When True, will keep the empty lines
+        in the phonemized output. Default to False and remove all empty lines.
+
+    preserve_punctuation: bool, optional
+        When True, will keep the punctuation
+        in the phonemized output. Not supported by the 'espeak-mbrola' backend.
+        Default to False and remove all the punctuation.
+
+    punctuation_marks: str or re.Pattern, optional
+        The punctuation marks to consider when dealing with punctuation,
+        either for removal or preservation.  Can be defined as a string or regular expression.
+        Default to Punctuation.default_marks().
+
+    with_stress: bool, optional
+        This option is only valid for the 'espeak'
+        backend. When True the stresses on phonemes are present (stresses
+        characters are ˈ'ˌ). When False stresses are removed. Default to False.
+
+    tie: bool or char, optional
+        This option is only valid for the 'espeak'
+        backend with espeak>=1.49. It is incompatible with phone separator. When
+        not False, use a tie character within multi-letter phoneme names. When
+        True, the char 'U+361' is used (as in d͡ʒ), 'z' means ZWJ character,
+        default to False.
+
+    language_switch: str, optional
+        Espeak can output some words in another
+        language (typically English) when phonemizing a text. This option setups
+        the policy to use when such a language switch occurs. Three values are
+        available : 'keep-flags' (the default), 'remove-flags' or
+        'remove-utterance'. The 'keep-flags' policy keeps the language switching
+        flags, for example "(en) or (jp)", in the output. The 'remove-flags'
+        policy removes them and the 'remove-utterance' policy removes the whole
+        line of text including a language switch. This option is only valid for
+        the 'espeak' backend.
+
+    words_mismatch: str, optional
+        Espeak can join two consecutive words or
+        drop some words, yielding a word count mismatch between orthographic and
+        phonemized text. This option setups the policy to use when such a words
+        count mismatch occurs. Three values are available: 'ignore' (the default)
+        which do nothing, 'warn' which issue a warning for each mismatched line,
+        and 'remove' which remove the mismatched lines from the output.
+
+    njobs: int
+        The number of parallel jobs to launch. The input text is split
+        in ``njobs`` parts, phonemized on parallel instances of the backend and the
+        outputs are finally collapsed.
+
+    logger: logging.Logger
+        the logging instance where to send messages. If
+        not specified, use the default system logger.
+
+    Returns
+    -------
+    phonemized text: str or list of str
+        The input ``text`` phonemized for the
+        given ``language`` and ``backend``. The returned value has the same type of
+        the input text (either a list or a string), except if ``prepend_text``
+        is True where the output is forced as a list of pairs (input_text,
+        phonemized text).
+
+    Raises
+    ------
+    RuntimeError
+        if the ``backend`` is not valid or is valid but not installed,
+        if the ``language`` is not supported by the ``backend``, if any incompatible options are used.
+
+    """
+    # ensure we are using a compatible Python version
+    if sys.version_info < (3, 6):  # pragma: nocover
+        # sys.version_info holds integers: convert them before joining,
+        # str.join() raises a TypeError on non-string items
+        logger.error(
+            'Your are using python-%s which is unsupported by the phonemizer, '
+            'please update to python>=3.6',
+            '.'.join(str(number) for number in sys.version_info[:3]))
+
+    # the signature allows a None separator: fall back to the default one
+    # instead of failing later on attribute access
+    if separator is None:
+        separator = default_separator
+
+    # ensure the arguments are valid
+    _check_arguments(
+        backend, with_stress, tie, separator, language_switch, words_mismatch)
+
+    # preserve_punctuation and word separator not valid for espeak-mbrola
+    if backend == 'espeak-mbrola' and preserve_punctuation:
+        logger.warning('espeak-mbrola backend cannot preserve punctuation')
+    if backend == 'espeak-mbrola' and separator.word:
+        logger.warning('espeak-mbrola backend cannot preserve word separation')
+
+    # initialize the phonemization backend, each backend class accepts a
+    # different subset of the options
+    if backend == 'espeak':
+        phonemizer = BACKENDS[backend](
+            language,
+            punctuation_marks=punctuation_marks,
+            preserve_punctuation=preserve_punctuation,
+            with_stress=with_stress,
+            tie=tie,
+            language_switch=language_switch,
+            words_mismatch=words_mismatch,
+            logger=logger)
+    elif backend == 'espeak-mbrola':
+        phonemizer = BACKENDS[backend](
+            language,
+            logger=logger)
+    else:  # festival or segments
+        phonemizer = BACKENDS[backend](
+            language,
+            punctuation_marks=punctuation_marks,
+            preserve_punctuation=preserve_punctuation,
+            logger=logger)
+
+    # do the phonemization
+    return _phonemize(phonemizer, text, separator, strip, njobs, prepend_text, preserve_empty_lines)
+
+
+def _check_arguments(  # pylint: disable=too-many-arguments
+        backend: Backend,
+        with_stress: bool,
+        tie: Union[bool, str],
+        separator: Separator,
+        language_switch: LanguageSwitch,
+        words_mismatch: WordMismatch):
+    """Auxiliary function to phonemize()
+
+    Checks that the parameters are compatible with each other and raises a
+    RuntimeError on the first incompatibility found. The checks are done in
+    that order: backend name, then the espeak-only options `with_stress` and
+    `tie`, then `tie` against the phone separator, and finally the
+    espeak-only options `language_switch` and `words_mismatch`.
+
+    """
+    supported = ('espeak', 'espeak-mbrola', 'festival', 'segments')
+
+    def espeak_only(option):
+        # shared error for the options implemented by the espeak backend only
+        raise RuntimeError(
+            f'the "{option}" option is available for espeak backend only, '
+            f'but you are using {backend} backend')
+
+    # the backend must be one of the known ones
+    if backend not in supported:
+        raise RuntimeError(
+            f'{backend} is not a supported backend, '
+            f'choose in {", ".join(supported)}.')
+
+    other_backend = backend != 'espeak'
+
+    if with_stress and other_backend:
+        espeak_only('with_stress')
+
+    if tie and other_backend:
+        espeak_only('tie')
+
+    # a tie character cannot be combined with a phone separator
+    if tie and separator.phone:
+        raise RuntimeError(
+            'the "tie" option is incompatible with phone separator '
+            f'(which is "{separator.phone}")')
+
+    # any value other than the default one counts as using the option
+    if language_switch != 'keep-flags' and other_backend:
+        espeak_only('language_switch')
+
+    if words_mismatch != 'ignore' and other_backend:
+        espeak_only('words_mismatch')
+
+
+def _phonemize(  # pylint: disable=too-many-arguments
+        backend: BaseBackend,
+        text: Union[str, List[str]],
+        separator: Separator,
+        strip: bool,
+        njobs: int,
+        prepend_text: bool,
+        preserve_empty_lines: bool):
+    """Auxiliary function to phonemize()
+
+    Runs the `backend` on the non-empty lines of `text` and formats the
+    result: a list of (input, phonemized) pairs when `prepend_text` is True,
+    else a str if `text` was given as a str, else a list of str. When
+    `preserve_empty_lines` is True, empty strings are put back at the
+    positions of the empty input lines.
+
+    """
+    # the output is a string only if the input is exactly of type str
+    as_string = type(text) == str  # noqa
+
+    # work on a list of lines, without their end of line characters
+    lines = [line.strip(os.linesep) for line in str2list(text)]
+
+    # positions of the empty lines, needed only when they must be restored
+    empty_indices = (
+        [index for index, line in enumerate(lines) if not line.strip()]
+        if preserve_empty_lines else [])
+
+    # the backend receives the non-empty lines only, and is not called at
+    # all when there is nothing to phonemize
+    utterances = [line for line in lines if line.strip()]
+    phonemized = (
+        backend.phonemize(
+            utterances, separator=separator, strip=strip, njobs=njobs)
+        if utterances else [])
+
+    # put the empty lines back, in the input text as well when it is part
+    # of the output
+    for index in empty_indices:
+        if prepend_text:
+            utterances.insert(index, '')
+        phonemized.insert(index, '')
+
+    # format the list of phonemized lines as requested
+    if prepend_text:
+        return list(zip(utterances, phonemized))
+    return list2str(phonemized) if as_string else phonemized
@@ -0,0 +1,220 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Implementation of punctuation processing"""
+
+import collections
+import re
+from typing import List, Union, Tuple, Pattern
+
+from phonemizer.utils import str2list
+from phonemizer.separator import Separator
+
+# The punctuation marks considered by default.
+_DEFAULT_MARKS = ';:,.!?¡¿—…"«»“”(){}[]'
+
+# Record of a punctuation match as built by Punctuation._preserve_line():
+# `index` is the number of the line where the match was found, `mark` is the
+# matched text and `position` is one of 'B' (begin), 'E' (end), 'I' (in the
+# middle) or 'A' (alone on the line).
+_MarkIndex = collections.namedtuple(
+    '_mark_index', ['index', 'mark', 'position'])
+
+
+class Punctuation:
+    """Preserve or remove the punctuation during phonemization
+
+    Backends behave differently with punctuation: festival and espeak ignore it
+    and remove it silently whereas segments will raise an error. The
+    Punctuation class solves that issue by "hiding" the punctuation to the
+    phonemization backend and restoring it afterwards.
+
+    Parameters
+    ----------
+    marks (str or re.Pattern) : The punctuation marks to consider for processing
+        (either removal or preservation). If a string, each mark must be made of
+        a single character. Default to Punctuation.default_marks().
+
+    """
+
+    def __init__(self, marks: Union[str, Pattern] = _DEFAULT_MARKS):
+        # both attributes are filled by the `marks` setter below
+        self._marks: str = None  # noqa
+        self._marks_re: Pattern[str] = None  # noqa
+        self.marks = marks
+
+    @staticmethod
+    def default_marks():
+        """Returns the default punctuation marks as a string"""
+        return _DEFAULT_MARKS
+
+    @property
+    def marks(self):
+        """The punctuation marks as a string
+
+        Raises a ValueError when no string is stored, which is the case
+        when the marks have been defined from a regular expression.
+
+        """
+        if self._marks:
+            return self._marks
+        raise ValueError('punctuation initialized from regex, cannot access marks as a string')
+
+    @marks.setter
+    def marks(self, value: Union[str, Pattern]):
+        """Defines the marks from a string or a compiled regular expression
+
+        Builds the regular expression used to locate the punctuation in a
+        text. Raises a ValueError if `value` is neither a str nor a Pattern.
+
+        """
+        if isinstance(value, Pattern):
+            # catch the pattern surrounded by zero or more spaces on either side
+            # NOTE(review): as written, the expression also matches a run made
+            # of whitespace only
+            self._marks_re = re.compile(r'((' + value.pattern + r')|\s)+')
+            self._marks = None
+        elif isinstance(value, str):
+            # remove the duplicated marks (the resulting order is arbitrary)
+            self._marks = ''.join(set(value))
+
+            # catching all the marks in one regular expression: zero or more spaces
+            # + one or more marks + zero or more spaces.
+            self._marks_re = re.compile(fr'(\s*[{re.escape(self._marks)}]+\s*)+')
+        else:
+            raise ValueError('punctuation marks must be defined as a string or re.Pattern')
+
+    def remove(self, text: Union[str, List[str]]) -> Union[str, List[str]]:
+        """Returns the `text` with all punctuation marks replaced by spaces
+
+        The input `text` can be a string or a list and is returned with the
+        same type and punctuation removed. Each match of the marks expression
+        is replaced by a single space and the result is stripped.
+
+        """
+
+        def aux(text: str) -> str:
+            # process a single string
+            return re.sub(self._marks_re, ' ', text).strip()
+
+        if isinstance(text, str):
+            return aux(text)
+        return [aux(line) for line in text]
+
+    def preserve(self, text: Union[List[str], str]) -> Tuple[List[List[str]], List[_MarkIndex]]:
+        """Removes punctuation from `text`, allowing for further restoration
+
+        This method returns the text as a list of punctuated chunks, along with
+        a list of punctuation marks for further restoration:
+
+            'hello, my world!' -> ['hello', 'my world'], [',', '!']
+
+        The chunks of all the lines are gathered in a single flat list from
+        which the empty chunks are removed. Each mark is stored as a
+        _MarkIndex recording the number of the line it comes from.
+
+        """
+        text: List[str] = str2list(text)
+        preserved_text = []
+        preserved_marks = []
+
+        for num, line in enumerate(text):
+            line, marks = self._preserve_line(line, num)
+            preserved_text += line
+            preserved_marks += marks
+        return [line for line in preserved_text if line], preserved_marks
+
+    def _preserve_line(self, line: str, num: int) -> Tuple[List[str], List[_MarkIndex]]:
+        """Auxiliary method for Punctuation.preserve()
+
+        Splits a single `line` on its punctuation marks. Returns the list of
+        text chunks (some can be empty) and the list of marks found, each
+        tagged with the line number `num` and its position in the line.
+
+        """
+        matches = list(re.finditer(self._marks_re, line))
+        if not matches:
+            return [line], []
+
+        # the line is made only of punctuation marks
+        if len(matches) == 1 and matches[0].group() == line:
+            return [], [_MarkIndex(num, line, 'A')]
+
+        # build the list of mark indexes required to restore the punctuation
+        marks = []
+        for match in matches:
+            # find the position of the punctuation mark in the utterance:
+            # begin (B), end (E), in the middle (I) or alone (A)
+            position = 'I'
+            if match == matches[0] and line.startswith(match.group()):
+                position = 'B'
+            elif match == matches[-1] and line.endswith(match.group()):
+                position = 'E'
+            marks.append(_MarkIndex(num, match.group(), position))
+
+        # split the line into sublines, each separated by a punctuation mark
+        preserved_line = []
+        for mark in marks:
+            # cut on the first occurrence of the mark only: the text before
+            # it is a chunk, the text after it is processed at next iteration
+            split = line.split(mark.mark)
+            prefix, suffix = split[0], mark.mark.join(split[1:])
+            preserved_line.append(prefix)
+            line = suffix
+
+        # append any trailing text to the preserved line
+        return preserved_line + [line], marks
+
+    @classmethod
+    def restore(cls, text: Union[str, List[str]],
+                marks: List[_MarkIndex],
+                sep: Separator,
+                strip: bool) -> List[str]:
+        """Restore punctuation in a text.
+
+        This is the reverse operation of Punctuation.preserve(). It takes a
+        list of punctuated chunks and a list of punctuation marks, as well as
+        the separator and strip parameters used by phonemize. It returns the
+        punctuated text as a list:
+
+            ['hello', 'my world'], [',', '!'] -> ['hello, my world!']
+
+        The spaces found in a mark are replaced by the word separator. The
+        chunks and marks are consumed from their first element, `pos` being
+        the number of the output line under construction.
+
+        """
+        text = str2list(text)
+        punctuated_text = []
+        pos = 0
+
+        while text or marks:
+
+            if not marks:
+                # no more marks: the remaining chunks are complete lines
+                for line in text:
+                    # if strip is False, ensure the final word ends with a word separator
+                    if not strip and sep.word and not line.endswith(sep.word):
+                        line = line + sep.word
+                    punctuated_text.append(line)
+                text = []
+            elif not text:
+                # nothing has been phonemized, returns the marks alone, with internal
+                # spaces replaced by the word separator
+                punctuated_text.append(re.sub(' ', sep.word, ''.join(m.mark for m in marks)))
+                marks = []
+
+            else:
+                current_mark = marks[0]
+                if current_mark.index == pos:
+
+                    # place the current mark here
+                    mark = marks[0]
+                    marks = marks[1:]
+                    # replace internal spaces in the current mark with the word separator
+                    mark = re.sub(' ', sep.word, mark.mark)
+
+                    # remove the word last separator from the current word
+                    if sep.word and text[0].endswith(sep.word):
+                        text[0] = text[0][:-len(sep.word)]
+
+                    if current_mark.position == 'B':
+                        # the mark begins the line: prepend it to the chunk
+                        text[0] = mark + text[0]
+                    elif current_mark.position == 'E':
+                        # the mark ends the line: the line is complete
+                        punctuated_text.append(text[0] + mark + ('' if strip or mark.endswith(sep.word) else sep.word))
+                        text = text[1:]
+                        pos = pos + 1
+                    elif current_mark.position == 'A':
+                        # the line is made of the mark alone, no chunk is consumed
+                        punctuated_text.append(mark + ('' if strip or mark.endswith(sep.word) else sep.word))
+                        pos = pos + 1
+                    else:
+                        # position == 'I'
+                        if len(text) == 1:  # pragma: nocover
+                            # a corner case where the final part of an intermediate
+                            # mark (I) has not been phonemized
+                            text[0] = text[0] + mark
+                        else:
+                            # merge the two chunks surrounding the mark
+                            first_word = text[0]
+                            text = text[1:]
+                            text[0] = first_word + mark + text[0]
+
+                else:
+                    # the next mark belongs to a following line: the current
+                    # chunk is a complete line
+                    punctuated_text.append(text[0])
+                    text = text[1:]
+                    pos = pos + 1
+
+
+        return punctuated_text
@@ -0,0 +1,118 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Provides the Separator tuple and its default value"""
+from typing import Optional, Union
+
+
+class Separator:
+    """Defines phone, syllable and word boundary tokens
+
+    Parameters
+    ----------
+    word: str, optional
+        The word separator, default to a single space.
+    syllable: str, optional
+        The syllable separator, default to None.
+    phone: str, optional
+        The phone separator, default to None.
+
+    A separator defined as None or as an empty string is stored as ''.
+
+    Raises
+    ------
+    ValueError
+        if two non-empty separators are identical.
+
+    """
+
+    def __init__(self, word: str = ' ',
+                 syllable: Optional[str] = None,
+                 phone: Optional[str] = None):
+        # check we have different separators, empty ones excluded
+        defined = [sep for sep in (phone, syllable, word) if sep]
+        if len(defined) != len(set(defined)):
+            # each value is reported next to its own name
+            raise ValueError(
+                f'illegal separator with word="{word}", '
+                f'syllable="{syllable}" and phone="{phone}", '
+                'must be all differents if not empty')
+
+        self._phone = str(phone) if phone else ''
+        self._syllable = str(syllable) if syllable else ''
+        self._word = str(word) if word else ''
+
+    def __eq__(self, other: 'Separator'):
+        try:
+            return (
+                self.phone == other.phone
+                and self.syllable == other.syllable
+                and self.word == other.word)
+        except AttributeError:
+            # `other` is not separator-like: let Python fall back to its
+            # default comparison instead of raising
+            return NotImplemented
+
+    def __hash__(self):
+        # defining __eq__ disables the default hash: restore one that is
+        # consistent with the equality above
+        return hash((self.phone, self.syllable, self.word))
+
+    def __str__(self):
+        return (
+            f'(phone: "{self.phone}", '
+            f'syllable: "{self.syllable}", '
+            f'word: "{self.word}")')
+
+    @property
+    def phone(self):
+        """Phones separator"""
+        return self._phone
+
+    @property
+    def syllable(self):
+        """Syllables separator"""
+        return self._syllable
+
+    @property
+    def word(self):
+        """Words separator"""
+        return self._word
+
+    def __contains__(self, value: str):
+        """Returns True if the separator has `value` as token separation"""
+        return value in {self.phone, self.syllable, self.word}
+
+    def input_output_separator(self, field_separator: Union[str, bool]) \
+            -> Union[str, bool]:
+        """Returns a suitable input/output separator based on token separator
+
+        The input/output separator split orthographic and phonetic texts when
+        using the --prepend-text option from command-line.
+
+        Parameters
+        ----------
+
+        field_separator: bool or str
+            If str, ensures its value is not
+            already defined as a token separator. If True choose one of "|",
+            "||", "|||", "||||" (the first one that is not defined as a token
+            separator)
+
+        Returns
+        -------
+        The input/output separator, or False if ``field_separator`` is False
+        (or any other false value, the empty string included)
+
+        Raises
+        ------
+        RuntimeError
+            if ``field_separator`` is a str but is already registered as token
+            separator, or if it is neither a bool nor a str
+
+        """
+        if not field_separator:
+            return False
+
+        if isinstance(field_separator, str):
+            if field_separator in self:
+                raise RuntimeError(
+                    f'cannot prepend input with "{field_separator}" because '
+                    f'it is already a token separator: {self}')
+            return field_separator
+
+        if field_separator is True:
+            # add pipes until the result is not a token separator
+            field_separator = '|'
+            while field_separator in self:
+                field_separator += '|'
+            return field_separator
+
+        # not a bool nor a str
+        raise RuntimeError(
+            'invalid input/output separator, must be bool or str but is '
+            f'{field_separator}')
+
+
+# phone and syllable separators are empty strings, words are separated by a
+# single space
+default_separator = Separator(phone='', syllable='', word=' ')
+"""The default separation characters for phonemes, syllables and words"""
@@ -0,0 +1,30 @@
+;; Copyright 2015-2021 Mathieu Bernard
+;;
+;; This file is part of phonemizer: you can redistribute it and/or
+;; modify it under the terms of the GNU General Public License as
+;; published by the Free Software Foundation, either version 3 of the
+;; License, or (at your option) any later version.
+;;
+;; Phonemizer is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+
+;; This script is executed by festival for English text phonemization.
+;; Synthesize LINE with festival and print the "SylStructure" relation
+;; tree of the resulting utterance on a single line.
+(define (phonemize line)
+  "(phonemize LINE)
+Extract the phonemes of the string LINE as a tree and write it to stdout."
+  ;; Build a Text utterance from LINE and store it in `utterance`
+  (set! utterance (eval (list 'Utterance 'Text line)))
+  ;; Run the festival synthesis on that utterance
+  (utt.synth utterance)
+  ;; Use of print instead of pprintf to have each utterance on one line
+  (print (utt.relation_tree utterance "SylStructure")))
+
+;; The double braces below must be replaced by the name of the text file
+;; you want to read data from. To be parsed by festival as a single
+;; utterance, each line of that file must begin and end with
+;; double-quotes.
+(set! lines (load "{}" t))
+;; Call phonemize on each loaded line, in order
+(mapcar (lambda (line) (phonemize line)) lines)
@@ -0,0 +1,27 @@
+a ʌ
+â aː
+b b
+ch tʃ
+d d
+e eː
+f f
+g g
+h h
+i ɪ
+î iː
+j dʒ
+k k
+kw kʷ
+l l
+m m
+n n
+o ʊ
+p p
+s s
+sh ʃ
+t t
+th θ
+u ʊ
+û o
+w w
+y j
@@ -0,0 +1,27 @@
+a	ʌ
+â	aː
+b	b
+ch	tʃ
+d	d
+e	eː
+f	f
+g	g
+h	h
+i	ɪ
+î	iː
+j	dʒ
+k	k
+kw	kʷ
+l	l
+m	m
+n	n
+o	ʊ
+p	p
+s	s
+sh	ʃ
+t	t
+th	θ
+u	ʊ
+û	o
+w	w
+y	j
@@ -0,0 +1,20 @@
+a	a
+g	g
+h	h
+i	i
+j	j
+k	k
+l	l
+ll	ɬ
+m	m
+n	n
+ng	ŋ
+nng	ŋŋ
+p	p
+q	q
+r	ʁ
+rng	ɴ
+s	s
+t	t
+u	u
+v	v
@@ -0,0 +1,35 @@
+a	a
+aa	aː
+b	b
+by	bʲ
+ch	tʃ
+d	d
+e	e
+ee	eː
+f	ɸ
+g	g
+gy	gʲ
+h	h
+hy	ç
+i	i
+j	dʒ
+k	k
+ky	kʲ
+m	m
+my	mʲ
+n	n
+ny	ɲ
+o	o
+oo	oː
+p	p
+py	pʲ
+r	r
+ry	rʲ
+sh	ʃ
+t	t
+ts	t͡s
+u	ɯ
+uu	ɯː
+w	w
+y	j
+z	z
@@ -0,0 +1,38 @@
+a	a
+b	b
+ch	tʃʰ
+d	d
+e	e
+f	f
+g	χ
+h	h
+hl	ɬ
+i	i
+j	dʒ
+k	k
+kg	kx
+kh	kʰ
+l	l
+m	m
+n	n
+ng	ŋ
+nq	ǃ̃
+ny	ɲ
+o	o
+p	t
+ph	pʰ
+q	ǃ
+qh	ǃʰ
+r	r
+s	s
+sh	ʃ
+t	t
+th	tʰ
+tj	tʃ
+tl	tɬ
+tlh	tɬʰ
+ts	t͡s
+tsh	t͡sʰ
+u	u
+w	w
+y	j
@@ -0,0 +1,45 @@
+a	a
+aa	aː
+aʼ	a̰
+aʼa	a̰ː
+b	b
+ch	t̠͡ʃ
+chʼ	t̠͡ʃʼ
+e	e
+ee	eː
+eʼ	ḛ
+eʼe	ḛː
+f	f
+h	h
+i	i
+ii	iː
+iʼ	ḭ
+iʼi	ḭː
+j	x
+k	k
+kʼ	kʼ
+l	l
+m	m
+n	n
+ñ	n
+o	o
+oo	oː
+oʼ	o̰
+oʼo	o̰ː
+p	pʼ
+pʼ	pʼ
+qu	k
+r	r
+s	s
+x	ʃ
+t	t
+ts	t͡s
+tsʼ	t͡sʼ
+tʼ	tʼ
+u	u
+uu	uː
+uʼ	ṵ
+uʼu	ṵː
+w	w
+y	j
+z	s
@@ -0,0 +1,131 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Provides utility functions for the phonemizer"""
+
+import os
+from numbers import Number
+from pathlib import Path
+from typing import Union, List, Tuple, Iterable
+
+import importlib
+
+
+def cumsum(iterable: Iterable[Number]) -> List[Number]:
+    """Returns the cumulative sum of the `iterable` as a list"""
+    res = []
+    cumulative = 0
+    for value in iterable:
+        cumulative += value
+        res.append(cumulative)
+    return res
+
+
+def str2list(text: Union[str, List[str]]) -> List[str]:
+    """Returns the string `text` as a list of lines, split by \n"""
+    if isinstance(text, str):
+        return text.strip(os.linesep).split(os.linesep)
+    return text
+
+
+def list2str(text: Union[str, List[str]]) -> str:
+    """Returns the list of lines `text` as a single string separated by \n"""
+    if isinstance(text, str):
+        return text
+    return os.linesep.join(text)
+
+
+def chunks(text: Union[str, List[str]], num: int) \
+        -> Tuple[List[List[str]], List[int]]:
+    """Return a maximum of `num` equally sized chunks of a `text`
+
+    This method is usefull when phonemizing a single text on multiple jobs.
+
+    The exact number of chunks returned is `m = min(num, len(str2list(text)))`.
+    Only the m-1 first chunks have equal size. The last chunk can be longer.
+    The input `text` can be a list or a string. Return a list of `m` strings.
+
+    Parameters
+    ----------
+    text (str or list) : The text to divide in chunks
+
+    num (int) : The number of chunks to build, must be a strictly positive
+    integer.
+
+    Returns
+    -------
+    chunks (list of list of str) : The chunked text with utterances separated
+        by '\n'.
+
+    offsets (list of int) : offset used below to recover the line numbers in
+        the input text wrt the chunks
+
+    """
+    text: List[str] = str2list(text)
+    size = int(max(1, len(text) / num))  # noqa
+    nchunks = min(num, len(text))
+
+    text_chunks = [
+        text[i * size:(i + 1) * size] for i in range(nchunks - 1)]
+
+    last = text[(nchunks - 1) * size:]
+    if last:
+        text_chunks.append(last)
+
+    offsets = [0] + cumsum((len(c) for c in text_chunks[:-1]))
+    return text_chunks, offsets
+
+
+def get_package_resource(path: str) -> Path:
+    """Returns the absolute path to a phonemizer resource file or directory
+
+    The packages resource are stored within the source tree in the
+    'phonemizer/share' directory and, once the package is installed, are moved
+    to another system directory (e.g. /share/phonemizer).
+
+    Parameters
+    ----------
+    path (str) : the file or directory to get, must be relative to
+        'phonemizer/share'.
+
+    Raises
+    ------
+    ValueError if the required `path` is not found
+
+    Returns
+    -------
+    The absolute path to the required resource as a `pathlib.Path`
+
+    """
+    try:
+        # new in python-3.9
+        path = importlib.resources.files('phonemizer') / 'share' / path
+    except AttributeError:  # pragma: nocover
+        with importlib.resources.path('phonemizer', 'share') as share:
+            path = share / path
+
+    if not path.exists():  # pragma: nocover
+        raise ValueError(f'the requested resource does not exist: {path}')
+
+    return path.resolve()
+
+
+def version_as_tuple(version: str) -> Tuple[int, ...]:
+    """Returns a tuple of integers from a version string
+
+    Any '-dev' in version string is ignored. For instance, returns (1, 2, 3)
+    from '1.2.3' or (0, 2) from '0.2-dev'
+
+    """
+    return tuple(int(v) for v in version.replace('-dev', '').split('.'))
@@ -0,0 +1,67 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Phonemizer version description"""
+
+import importlib
+
+from phonemizer.backend import (
+    EspeakBackend, EspeakMbrolaBackend, FestivalBackend, SegmentsBackend)
+
+
+def _version_as_str(vers):
+    """From (1, 49, 3) to '1.49.3'"""
+    return '.'.join(str(v) for v in vers)
+
+
+def version():
+    """Return version information for front and backends"""
+    # version of the phonemizer
+    _version = 'phonemizer-' + importlib.metadata.version('phonemizer')
+
+    # for each backend, check if it is available or not. If so get its version
+    available = []
+    unavailable = []
+
+    if EspeakBackend.is_available():
+        available.append(
+            'espeak-' + ('ng-' if EspeakBackend.is_espeak_ng() else '')
+            + _version_as_str(EspeakBackend.version()))
+    else:  # pragma: nocover
+        unavailable.append('espeak')
+
+    if EspeakMbrolaBackend.is_available():
+        available.append('espeak-mbrola')
+    else:  # pragma: nocover
+        unavailable.append('espeak-mbrola')
+
+    if FestivalBackend.is_available():
+        available.append(
+            'festival-' + _version_as_str(FestivalBackend.version()))
+    else:  # pragma: nocover
+        unavailable.append('festival')
+
+    if SegmentsBackend.is_available():
+        available.append(
+            'segments-' + _version_as_str(SegmentsBackend.version()))
+    else:  # pragma: nocover
+        unavailable.append('segments')
+
+    # resumes the backends status in the final version string
+    if available:
+        _version += '\navailable backends: ' + ', '.join(available)
+    if unavailable:  # pragma: nocover
+        _version += '\nuninstalled backends: ' + ', '.join(unavailable)
+
+    return _version