2025-12-01

2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,225 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the espeak backend"""
+
+# pylint: disable=missing-docstring
+# pylint: disable=redefined-outer-name
+
+import os
+import shutil
+import pytest
+
+from phonemizer.backend import EspeakBackend
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+from phonemizer.separator import Separator, default_separator
+
+
+def test_bad_text():
+    """A bare str input raises, the same text wrapped in a list works."""
+    backend = EspeakBackend('en-us')
+    text = 'hello world'
+
+    with pytest.raises(RuntimeError) as err:
+        backend.phonemize(text, default_separator, True)
+    # look at the raised exception itself: str() of the ExceptionInfo
+    # wrapper is a debugging representation, not the error message
+    assert 'input text to phonemize() is str' in str(err.value)
+
+    assert backend.phonemize(
+        [text], default_separator, True) == ['həloʊ wɜːld']
+
+
+def test_english():
+    """Check the en-us phonemization of several utterances."""
+    utterances = ['hello world', 'goodbye', 'third line', 'yet another']
+    expected = ['həloʊ wɜːld', 'ɡʊdbaɪ', 'θɜːd laɪn', 'jɛt ɐnʌðɚ']
+
+    backend = EspeakBackend('en-us')
+    assert backend.phonemize(utterances, default_separator, True) == expected
+
+
+def test_stress():
+    """The with_stress option controls stress marks in the output."""
+    text = 'hello world'
+
+    unstressed = EspeakBackend('en-us', with_stress=False)
+    out = unstressed.phonemize([text], default_separator, True)
+    assert out == ['həloʊ wɜːld']
+
+    stressed = EspeakBackend('en-us', with_stress=True)
+    out = stressed.phonemize([text], default_separator, True)
+    assert out == ['həlˈoʊ wˈɜːld']
+
+
+def test_french():
+    """Phonemize French with custom word and phone separators."""
+    separator = Separator(word=';eword ', syllable=None, phone=' ')
+    backend = EspeakBackend('fr-fr')
+
+    phonemized = backend.phonemize(['bonjour le monde'], separator, False)
+    assert phonemized == ['b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword ']
+
+
+@pytest.mark.skipif(
+    (
+        not EspeakBackend.is_espeak_ng() or
+        # Arabic is not supported by the Windows msi installer from espeak-ng
+        # github release
+        not EspeakBackend.is_supported_language('ar')),
+    reason='Arabic is not supported')
+def test_arabic():
+    """Phonemize an Arabic utterance, expectation depends on the version."""
+    backend = EspeakBackend('ar')
+
+    # Arabic seems to have changed starting at espeak-ng-1.49.3
+    expected = (
+        ['ʔassalaːm ʕliːkm ']
+        if EspeakBackend.version() >= (1, 49, 3)
+        else ['ʔassalaam ʕaliijkum '])
+
+    assert backend.phonemize(['السلام عليكم'], Separator(), False) == expected
+
+
+# see https://github.com/bootphon/phonemizer/issues/31
+def test_phone_separator_simple():
+    """Check the '_' phone separator both with and without strip."""
+    backend = EspeakBackend('en-us')
+    sep = Separator(phone='_')
+    text = ['The lion and the tiger ran']
+
+    stripped = backend.phonemize(text, separator=sep, strip=True)
+    assert stripped == ['ð_ə l_aɪə_n æ_n_d ð_ə t_aɪ_ɡ_ɚ ɹ_æ_n']
+
+    unstripped = backend.phonemize(text, separator=sep, strip=False)
+    assert unstripped == ['ð_ə_ l_aɪə_n_ æ_n_d_ ð_ə_ t_aɪ_ɡ_ɚ_ ɹ_æ_n_ ']
+
+
+@pytest.mark.parametrize(
+    'text, expected',
+    (('the hello but the', 'ð_ə h_ə_l_oʊ b_ʌ_t ð_ə'),
+     # ('Here there and everywhere', 'h_ɪɹ ð_ɛɹ æ_n_d ɛ_v_ɹ_ɪ_w_ɛɹ'),
+     # ('He was hungry and tired.', 'h_iː w_ʌ_z h_ʌ_ŋ_ɡ_ɹ_i æ_n_d t_aɪɚ_d'),
+     ('He was hungry but tired.', 'h_iː w_ʌ_z h_ʌ_ŋ_ɡ_ɹ_i b_ʌ_t t_aɪɚ_d')))
+def test_phone_separator(text, expected):
+    """Check the '_' phone separator on each parametrized sentence."""
+    backend = EspeakBackend('en-us')
+    phonemized = backend.phonemize(
+        [text], separator=Separator(phone='_'), strip=True)
+    assert phonemized[0] == expected
+
+
+@pytest.mark.skipif(
+    'PHONEMIZER_ESPEAK_LIBRARY' in os.environ,
+    reason='cannot modify environment')
+def test_path_good():
+    """Setting a valid espeak library explicitly keeps the backend usable."""
+    default_library = EspeakBackend.library()
+    try:
+        # resetting to None must give back the same library as before
+        EspeakBackend.set_library(None)
+        assert EspeakBackend.library() == default_library
+
+        # explicitly use the library path reported by the wrapper
+        EspeakBackend.set_library(EspeakWrapper().library_path)
+        test_english()
+    finally:
+        # restore the espeak path to default
+        EspeakBackend.set_library(None)
+
+
+@pytest.mark.skipif(
+    'PHONEMIZER_ESPEAK_LIBRARY' in os.environ,
+    reason='cannot modify environment')
+def test_path_bad():
+    """Pointing the backend to a file which is not espeak must raise."""
+    try:
+        # corrupt the default espeak path, try to use python executable instead
+        EspeakBackend.set_library(shutil.which('python'))
+        with pytest.raises(RuntimeError):
+            EspeakBackend('en-us')
+        with pytest.raises(RuntimeError):
+            EspeakBackend.version()
+
+        # this very test file is not a valid library either
+        EspeakBackend.set_library(__file__)
+        with pytest.raises(RuntimeError):
+            EspeakBackend('en-us')
+    finally:
+        # restore the espeak path to default
+        EspeakBackend.set_library(None)
+
+
+@pytest.mark.skipif(
+    'PHONEMIZER_ESPEAK_LIBRARY' in os.environ,
+    reason='cannot modify environment')
+def test_path_venv():
+    """A wrong library given through the environment variable must raise."""
+    variable = 'PHONEMIZER_ESPEAK_LIBRARY'
+    try:
+        os.environ[variable] = shutil.which('python')
+        with pytest.raises(RuntimeError):
+            EspeakBackend('en-us').phonemize(['hello'])
+        with pytest.raises(RuntimeError):
+            EspeakBackend.version()
+
+        os.environ[variable] = __file__
+        with pytest.raises(RuntimeError):
+            EspeakBackend.version()
+    finally:
+        # leave the environment clean, whether or not the variable is set
+        os.environ.pop(variable, None)
+
+
+@pytest.mark.skipif(
+    not EspeakBackend.is_espeak_ng(),
+    reason='tie only compatible with espeak-ng')
+@pytest.mark.parametrize(
+    'tie, expected', [
+        (False, 'dʒ_æ_k_i_ tʃ_æ_n_ '),
+        (True, 'd͡ʒæki t͡ʃæn '),
+        ('8', 'd8ʒæki t8ʃæn ')])
+def test_tie_simple(caplog, tie, expected):
+    """Check the output for each tie setting and the warning when tie is set."""
+    backend = EspeakBackend('en-us', tie=tie)
+    phonemized = backend.phonemize(
+        ['Jackie Chan'], separator=Separator(word=' ', phone='_'))
+    assert phonemized[0] == expected
+
+    if not tie:
+        return
+
+    # a warning is logged because the phone separator is dropped
+    logged = [record[2] for record in caplog.record_tuples]
+    assert (
+        'cannot use ties AND phone separation, ignoring phone separator'
+        in logged)
+
+
+@pytest.mark.skipif(
+    not EspeakBackend.is_espeak_ng(),
+    reason='tie only compatible with espeak-ng')
+def test_tie_utf8():
+    """Check ties on French text, including (en) language switches."""
+    # NOTE this is a bug in espeak to append ties on (en) language switch
+    # flags. For now phonemizer does not fix it.
+    backend = EspeakBackend('fr-fr', tie=True)
+
+    cases = (
+        # used to be 'bɔ̃͡ʒuʁ '
+        ('bonjour', 'bɔ̃ʒuʁ '),
+        # used to be 'ty ɛm lə (͡e͡n͡)fʊtbɔ͡ːl(͡f͡r͡)'
+        ('tu aimes le football', 'ty ɛm lə (͡e͡n)fʊtbɔːl(͡f͡r) '),
+        ('bonjour apple', 'bɔ̃ʒuʁ (͡e͡n)apə͡l(͡f͡r) '))
+
+    for text, expected in cases:
+        assert backend.phonemize([text]) == [expected]
+
+
+@pytest.mark.skipif(
+    not EspeakBackend.is_espeak_ng(),
+    reason='tie only compatible with espeak-ng')
+def test_tie_bad():
+    """The backend constructor raises on the tie value 'abc'."""
+    invalid_tie = 'abc'
+    with pytest.raises(RuntimeError):
+        EspeakBackend('en-us', tie=invalid_tie)
@@ -0,0 +1,142 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the espeak backend language switch processing"""
+
+# pylint: disable=missing-docstring
+# pylint: disable=redefined-outer-name
+
+import pytest
+
+from phonemizer.backend import EspeakBackend
+from phonemizer.separator import Separator
+
+
+@pytest.fixture
+def langswitch_text():
+    """French utterances used by the language switch tests."""
+    utterances = [
+        "j'aime l'anglais",
+        "j'aime le football",
+        "football",
+        "surtout le real madrid",
+        "n'utilise pas google"]
+    return utterances
+
+
+@pytest.mark.skipif(
+    not EspeakBackend.is_espeak_ng(),
+    reason='language switch only exists for espeak-ng')
+@pytest.mark.parametrize('njobs', [1, 3])
+def test_language_switch_keep_flags(caplog, langswitch_text, njobs):
+    """With 'keep-flags' the (en)/(fr) flags stay in the output."""
+    expected = [
+        'ʒɛm lɑ̃ɡlɛ',
+        'ʒɛm lə (en)fʊtbɔːl(fr)',
+        '(en)fʊtbɔːl(fr)',
+        'syʁtu lə (en)ɹiəl(fr) madʁid',
+        'nytiliz pa (en)ɡuːɡəl(fr)']
+
+    backend = EspeakBackend('fr-fr', language_switch='keep-flags')
+    phonemized = backend.phonemize(
+        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
+    assert phonemized == expected
+
+    logged = [record[2] for record in caplog.record_tuples]
+    for message in (
+            '4 utterances containing language switches on lines 2, 3, 4, 5',
+            'language switch flags have been kept '
+            '(applying "keep-flags" policy)'):
+        assert message in logged
+
+
+@pytest.mark.skipif(
+    not EspeakBackend.is_espeak_ng(),
+    reason='language switch only exists for espeak-ng')
+@pytest.mark.parametrize('njobs', [1, 3])
+def test_language_switch_default(caplog, langswitch_text, njobs):
+    """Without an explicit policy the backend behaves as 'keep-flags'."""
+    expected = [
+        'ʒɛm lɑ̃ɡlɛ',
+        'ʒɛm lə (en)fʊtbɔːl(fr)',
+        '(en)fʊtbɔːl(fr)',
+        'syʁtu lə (en)ɹiəl(fr) madʁid',
+        'nytiliz pa (en)ɡuːɡəl(fr)']
+
+    # default behavior is to keep the flags
+    backend = EspeakBackend('fr-fr')
+    phonemized = backend.phonemize(
+        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
+    assert phonemized == expected
+
+    logged = [record[2] for record in caplog.record_tuples]
+    for message in (
+            '4 utterances containing language switches on lines 2, 3, 4, 5',
+            'language switch flags have been kept '
+            '(applying "keep-flags" policy)'):
+        assert message in logged
+
+
+@pytest.mark.skipif(
+    not EspeakBackend.is_espeak_ng(),
+    reason='language switch only exists for espeak-ng')
+@pytest.mark.parametrize('njobs', [1, 3])
+def test_language_switch_remove_flags(caplog, langswitch_text, njobs):
+    """With 'remove-flags' the flags are dropped, the phones are kept."""
+    expected = [
+        'ʒɛm lɑ̃ɡlɛ',
+        'ʒɛm lə fʊtbɔːl',
+        'fʊtbɔːl',
+        'syʁtu lə ɹiəl madʁid',
+        'nytiliz pa ɡuːɡəl']
+
+    backend = EspeakBackend('fr-fr', language_switch='remove-flags')
+    phonemized = backend.phonemize(
+        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
+    assert phonemized == expected
+
+    logged = [record[2] for record in caplog.record_tuples]
+    for message in (
+            '4 utterances containing language switches on lines 2, 3, 4, 5',
+            'language switch flags have been removed '
+            '(applying "remove-flags" policy)'):
+        assert message in logged
+
+
+@pytest.mark.skipif(
+    not EspeakBackend.is_espeak_ng(),
+    reason='language switch only exists for espeak-ng')
+@pytest.mark.parametrize('njobs', [1, 3])
+def test_language_switch_remove_utterance(caplog, langswitch_text, njobs):
+    """With 'remove-utterance' the switched utterances become empty."""
+    backend = EspeakBackend('fr-fr', language_switch='remove-utterance')
+    phonemized = backend.phonemize(
+        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
+    assert phonemized == ['ʒɛm lɑ̃ɡlɛ', '', '', '', '']
+
+    logged = [record[2] for record in caplog.record_tuples]
+    expected_message = (
+        'removed 4 utterances containing language switches '
+        '(applying "remove-utterance" policy)')
+    assert expected_message in logged
+
+    # an unknown policy is rejected by the constructor
+    with pytest.raises(RuntimeError):
+        EspeakBackend('fr-fr', language_switch='foo')
+
+
+@pytest.mark.skipif(
+    not EspeakBackend.is_espeak_ng(),
+    reason='language switch only exists for espeak-ng')
+@pytest.mark.parametrize(
+    'policy', ('keep-flags', 'remove-flags', 'remove-utterance'))
+def test_no_switch(policy, caplog):
+    """Text without language switch logs nothing, whatever the policy."""
+    backend = EspeakBackend('fr-fr', language_switch=policy)
+    phonemized = backend.phonemize(
+        ["j'aime l'anglais", "tu parles le français"],
+        separator=Separator(),
+        strip=True)
+    assert phonemized == ['ʒɛm lɑ̃ɡlɛ', 'ty paʁl lə fʁɑ̃sɛ']
+
+    logged = [record[2] for record in caplog.record_tuples]
+    assert not logged
@@ -0,0 +1,79 @@
+"""Tests of the phonemizer.backend.espeak.words_mismatch module"""
+
+# pylint: disable=missing-docstring
+# pylint: disable=redefined-outer-name
+
+import pytest
+
+import re
+
+from phonemizer import phonemize
+from phonemizer.backend.espeak.words_mismatch import Ignore
+from phonemizer.separator import Separator, default_separator
+
+
+@pytest.fixture
+def text():
+    """Three English utterances used by the words mismatch tests."""
+    utterances = ["How are you?", "I have been busy", "I won't have time"]
+    return utterances
+
+
+def test_count_words():
+    """Check Ignore._count_words with the default word separator."""
+    # pylint: disable=protected-access
+    def count_words(phn):
+        return Ignore._count_words(phn, wordsep=default_separator.word)
+
+    cases = (
+        ('', 0),
+        ('a', 1),
+        ('aaa', 1),
+        (' aaa  ', 1),
+        (' a  a \taa  ', 3))
+    for line, expected in cases:
+        assert count_words([line]) == [expected]
+
+
+def test_bad():
+    """Invalid words_mismatch usages make phonemize raise."""
+    invalid_options = (
+        # unknown mode
+        {'words_mismatch': 'foo'},
+        # valid mode, but requested on the festival backend
+        {'backend': 'festival', 'words_mismatch': 'remove'})
+
+    for options in invalid_options:
+        with pytest.raises(RuntimeError):
+            phonemize('', **options)
+
+
+@pytest.mark.parametrize('mode', ['ignore', 'warn', 'remove'])
+def test_mismatch(caplog, text, mode):
+    """Check the output and the logged messages for each mismatch mode."""
+    phn = phonemize(
+        text, backend='espeak', language='en-us', words_mismatch=mode)
+    logged = [record[2] for record in caplog.record_tuples]
+    summary = 'words count mismatch on 67.0% of the lines (2/3)'
+
+    if mode == 'remove':
+        # the two mismatched lines are replaced by empty strings
+        assert phn == ['haʊ ɑːɹ juː ', '', '']
+        assert len(logged) == 2
+        assert summary in logged
+        assert 'removing the mismatched lines' in logged
+
+    if mode in ('ignore', 'warn'):
+        # every line is kept in the output
+        assert phn == ['haʊ ɑːɹ juː ', 'aɪ hɐvbɪn bɪzi ', 'aɪ woʊntɐv taɪm ']
+        assert summary in logged
+
+    if mode == 'ignore':
+        assert len(logged) == 1
+
+    if mode == 'warn':
+        # one message per mismatched line, plus the summary
+        assert len(logged) == 3
+        for line in (2, 3):
+            message = (
+                f'words count mismatch on line {line} '
+                '(expected 4 words but get 3)')
+            assert message in logged
+
+
+# from https://github.com/bootphon/phonemizer/issues/169
+def test_custom_separator(caplog):
+    """With a custom separator, 'try' logs no words mismatch message."""
+    separator = Separator(word='|', phone=' ')
+    phn = phonemize(
+        'try',
+        backend='espeak',
+        language='en-us',
+        separator=separator,
+        words_mismatch='warn')
+    assert phn == 't ɹ aɪ |'
+
+    logged = [record[2] for record in caplog.record_tuples]
+    assert len(logged) == 0
@@ -0,0 +1,134 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the EspeakWrapper class"""
+
+# pylint: disable=missing-docstring
+# pylint: disable=redefined-outer-name
+
+import os
+import pathlib
+import pickle
+import sys
+
+import pytest
+
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+from phonemizer.backend import EspeakMbrolaBackend
+
+
+@pytest.fixture
+def wrapper():
+    """Provide a fresh EspeakWrapper instance to each test."""
+    instance = EspeakWrapper()
+    return instance
+
+
+def test_basic(wrapper):
+    """Check the version and the paths exposed by the wrapper."""
+    assert wrapper.version >= (1, 48)
+
+    library = wrapper.library_path
+    assert 'espeak' in str(library)
+    assert os.path.isabs(library)
+
+    # not None, no raise
+    assert os.path.isabs(wrapper.data_path)
+
+
+def test_available_voices(wrapper):
+    """The espeak voices are not empty and do not overlap the mbrola ones."""
+    espeak_voices = set(wrapper.available_voices())
+    assert espeak_voices
+
+    # can be empty if no mbrola voice installed (occurs only on Windows, at
+    # least within the github CI pipeline)
+    mbrola_voices = set(wrapper.available_voices('mbrola'))
+    if not mbrola_voices:
+        return
+
+    assert not espeak_voices.intersection(mbrola_voices)
+
+
+def test_set_get_voice(wrapper):
+    """Check the voice property across valid and invalid set_voice calls."""
+    # no voice is selected on a fresh wrapper
+    assert wrapper.voice is None
+
+    with pytest.raises(RuntimeError) as err:
+        wrapper.set_voice('')
+    # check the message of the raised exception itself, str() of the
+    # ExceptionInfo wrapper is only a debugging representation
+    assert 'invalid voice code ""' in str(err.value)
+
+    wrapper.set_voice('fr-fr')
+    assert wrapper.voice.language == 'fr-fr'
+    assert wrapper.voice.name in (
+        'French (France)',  # >1.48.3
+        'french')           # older espeak
+
+    wrapper.set_voice('en-us')
+    assert wrapper.voice.language == 'en-us'
+    assert wrapper.voice.name in (
+        'English (America)',  # >1.48.3
+        'english-us')         # older espeak
+
+    # no mbrola voices available on Windows by default (at least on the github
+    # CI pipeline)
+    if sys.platform != 'win32':
+        wrapper.set_voice('mb-af1')
+        assert wrapper.voice.language == 'af'
+        assert wrapper.voice.name == 'afrikaans-mbrola-1'
+
+    with pytest.raises(RuntimeError) as err:
+        wrapper.set_voice('some non existant voice code')
+    assert 'invalid voice code' in str(err.value)
+
+
+def _test_pickle(voice):
+    """Round-trip a wrapper set to `voice` through pickle and compare."""
+    # the wrapper is pickled when using espeak backend on multiple jobs
+    original = EspeakWrapper()
+    original.set_voice(voice)
+
+    restored = pickle.loads(pickle.dumps(original))
+
+    for attribute in ('version', 'library_path', 'data_path', 'voice'):
+        assert getattr(original, attribute) == getattr(restored, attribute)
+
+
+def test_pickle_en_us():
+    """Run the pickle round-trip check with the en-us voice."""
+    voice = 'en-us'
+    _test_pickle(voice)
+
+
+@pytest.mark.skipif(
+    not EspeakMbrolaBackend.is_available() or
+    not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
+    reason='mbrola or mb-fr1 voice not installed')
+def test_pickle_mb_fr1():
+    """Run the pickle round-trip check with the mb-fr1 mbrola voice."""
+    voice = 'mb-fr1'
+    _test_pickle(voice)
+
+
+def test_twice():
+    """Two wrappers report the same library but keep separate state."""
+    first, second = EspeakWrapper(), EspeakWrapper()
+
+    for attribute in ('data_path', 'version', 'library_path'):
+        assert getattr(first, attribute) == getattr(second, attribute)
+
+    # setting a voice on the second wrapper must not alter the first one
+    first.set_voice('fr-fr')
+    assert first.voice.language == 'fr-fr'
+    second.set_voice('en-us')
+    assert second.voice.language == 'en-us'
+    assert first.voice.language == 'fr-fr'
+
+    # pylint: disable=protected-access
+    assert first._espeak._tempdir != second._espeak._tempdir
+
+
+@pytest.mark.skipif(sys.platform == 'win32', reason='not supported on Windows')
+def test_deletion():
+    """The wrapper's temporary directory is gone once it is deleted."""
+    # pylint: disable=protected-access
+    wrapper = EspeakWrapper()
+    tempdir = pathlib.Path(wrapper._espeak._tempdir)
+
+    del wrapper
+    assert not tempdir.exists()
@@ -0,0 +1,119 @@
+# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the festival backend"""
+
+# pylint: disable=missing-docstring
+
+
+import os
+import pathlib
+import shutil
+
+import pytest
+
+from phonemizer.separator import Separator
+from phonemizer.backend import FestivalBackend
+
+
+def _test(text, separator=Separator(
+        word=' ', syllable='|', phone='-')):
+    """Phonemize `text` with an en-us festival backend, return the result.
+
+    Calls the private `_phonemize_aux` method directly with `text`, 0,
+    `separator` and True as arguments.
+
+    """
+    # pylint: disable=protected-access
+    festival = FestivalBackend('en-us')
+    phonemized = festival._phonemize_aux(text, 0, separator, True)
+    return phonemized
+
+
+@pytest.mark.skipif(
+    FestivalBackend.version() <= (2, 1),
+    reason='festival-2.1 gives different results than further versions '
+    'for syllable boundaries')
+def test_hello():
+    """Phonemize 'hello world' given as one or as two utterances."""
+    cases = (
+        (['hello world'], ['hh-ax|l-ow w-er-l-d']),
+        (['hello', 'world'], ['hh-ax|l-ow', 'w-er-l-d']))
+
+    for text, expected in cases:
+        assert _test(text) == expected
+
+
+@pytest.mark.parametrize('text', ['', ' ', '  ', '(', '()', '"', "'"])
+def test_bad_input(text):
+    """Each parametrized input yields an empty result."""
+    phonemized = _test(text)
+    assert phonemized == []
+
+
+def test_quote():
+    """Check how quotes and spaces around a final 's' are phonemized."""
+    cases = (
+        ("it's", 'ih-t-s'),
+        ("its", 'ih-t-s'),
+        ("it s", 'ih-t eh-s'),
+        ('it "s', 'ih-t eh-s'))
+
+    for text, expected in cases:
+        assert _test([text]) == [expected]
+
+
+def test_im():
+    """Compare "I'm" and "Im" at the start of the same sentence."""
+    sep = Separator(word=' ', syllable='', phone='')
+
+    with_quote = _test(["I'm looking for an image"], sep)
+    assert with_quote == ['aym luhkaxng faor axn ihmaxjh']
+
+    without_quote = _test(["Im looking for an image"], sep)
+    assert without_quote == ['ihm luhkaxng faor axn ihmaxjh']
+
+
+@pytest.mark.skipif(
+    not shutil.which('festival'), reason='festival not in PATH')
+def test_path_good():
+    """The executable set explicitly is the one reported by the backend."""
+    try:
+        festival = shutil.which('festival')
+        FestivalBackend.set_executable(festival)
+
+        executable = FestivalBackend('en-us').executable()
+        assert executable == pathlib.Path(festival)
+    finally:
+        # restore the festival path to default
+        FestivalBackend.set_executable(None)
+
+
+@pytest.mark.skipif(
+    'PHONEMIZER_FESTIVAL_EXECUTABLE' in os.environ,
+    reason='environment variable precedence')
+def test_path_bad():
+    """Using something else than festival as executable must raise."""
+    try:
+        # corrupt the default festival path, use the python executable instead
+        FestivalBackend.set_executable(shutil.which('python'))
+        with pytest.raises(RuntimeError):
+            FestivalBackend('en-us').phonemize(['hello'])
+        with pytest.raises(RuntimeError):
+            FestivalBackend.version()
+
+        # this test file is rejected as soon as it is set
+        with pytest.raises(RuntimeError):
+            FestivalBackend.set_executable(__file__)
+    finally:
+        # restore the festival path to default
+        FestivalBackend.set_executable(None)
+
+
+@pytest.mark.skipif(
+    'PHONEMIZER_FESTIVAL_EXECUTABLE' in os.environ,
+    reason='cannot modify environment')
+def test_path_venv():
+    """A wrong executable given through the environment must raise."""
+    variable = 'PHONEMIZER_FESTIVAL_EXECUTABLE'
+    try:
+        os.environ[variable] = shutil.which('python')
+        with pytest.raises(RuntimeError):
+            FestivalBackend('en-us').phonemize(['hello'])
+        with pytest.raises(RuntimeError):
+            FestivalBackend.version()
+
+        os.environ[variable] = __file__
+        with pytest.raises(RuntimeError):
+            FestivalBackend.version()
+    finally:
+        # leave the environment clean, whether or not the variable is set
+        os.environ.pop(variable, None)
@@ -0,0 +1,14 @@
+"""Tests to import the phonemize function"""
+
+# pylint: disable=missing-docstring
+# pylint: disable=import-outside-toplevel
+
+
+def test_relative():
+    """phonemize can be imported from the top-level package."""
+    from phonemizer import phonemize
+    phonemized = phonemize('a')
+    assert phonemized == 'eɪ '
+
+
+def test_absolute():
+    """phonemize can be imported from the phonemizer.phonemize module."""
+    from phonemizer.phonemize import phonemize
+    phonemized = phonemize('a')
+    assert phonemized == 'eɪ '
@@ -0,0 +1,158 @@
+# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the command line interface"""
+
+# pylint: disable=missing-docstring
+
+import os
+import pathlib
+import tempfile
+import shlex
+import sys
+
+import pytest
+
+from phonemizer.backend import EspeakMbrolaBackend, EspeakBackend
+from phonemizer import main, backend, logger
+
+
+def _test(text, expected_output, args=''):
+    """Run the command line on `text` and check what it writes.
+
+    `text` is written UTF-8 encoded to a temporary input file, `main.main()`
+    is called with that file, an output file and the extra `args` (a string
+    split with shlex) as command line. The content of the output file must
+    then be equal to `expected_output`.
+
+    The original `sys.argv` is restored once `main.main()` returned or
+    raised, so the fake command line does not leak out of this helper.
+
+    """
+    saved_argv = sys.argv
+    with tempfile.TemporaryDirectory() as tmpdir:
+        input_file = pathlib.Path(tmpdir) / 'input.txt'
+        output_file = pathlib.Path(tmpdir) / 'output.txt'
+        input_file.write_bytes(text.encode('utf8'))
+
+        try:
+            sys.argv = ['unused', f'{input_file}', '-o', f'{output_file}']
+            if args:
+                sys.argv += shlex.split(args)
+            main.main()
+        finally:
+            sys.argv = saved_argv
+
+        output = output_file.read_bytes().decode()
+
+        # silly fix for windows
+        assert output.replace('\r', '').strip(os.linesep) \
+            == expected_output.replace('\r', '')
+
+
+def test_help():
+    """The -h option makes the command line exit."""
+    command_line = ['foo', '-h']
+    sys.argv = command_line
+    with pytest.raises(SystemExit):
+        main.main()
+
+
+def test_version():
+    """The --version option runs without raising."""
+    command_line = ['foo', '--version']
+    sys.argv = command_line
+    main.main()
+
+
+def test_list_languages():
+    """The --list-languages option runs without raising."""
+    command_line = ['foo', '--list-languages']
+    sys.argv = command_line
+    main.main()
+
+
+def test_readme():
+    """Run a series of command lines and check their output."""
+    cases = (
+        ('hello world', 'həloʊ wɜːld ', '--verbose'),
+        ('hello world', 'həloʊ wɜːld ', '--quiet'),
+        ('hello world', 'hello world | həloʊ wɜːld ', '--prepend-text'),
+        ('hello world', 'hhaxlow werld', '-b festival --strip'),
+        ('bonjour le monde', 'bɔ̃ʒuʁ lə mɔ̃d ', '-l fr-fr'),
+        ('bonjour le monde',
+         'b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword ',
+         '-l fr-fr -p " " -w ";eword "'))
+
+    for text, expected, args in cases:
+        _test(text, expected, args)
+
+
+# FestivalBackend.version() is compared to a tuple in the festival backend
+# tests, so test it the same way here: a "'2.1' in version" membership test
+# never matches a tuple of numbers and the skip would never trigger
+@pytest.mark.skipif(
+    backend.FestivalBackend.version() <= (2, 1),
+    reason='festival-2.1 gives different results than further versions '
+    'for syllable boundaries')
+def test_readme_festival_syll():
+    """Check the festival output with custom syllable separators."""
+    _test('hello world',
+          'hh ax ;esyll l ow ;esyll ;eword w er l d ;esyll ;eword ',
+          "-p ' ' -s ';esyll ' -w ';eword ' -b festival -l en-us")
+
+
+@pytest.mark.parametrize('njobs', [1, 6])
+def test_njobs(njobs):
+    """The output lines are the same for each parametrized number of jobs."""
+    input_lines = (
+        'hello world',
+        'goodbye',
+        'third line',
+        'yet another')
+    expected_lines = (
+        'h-ə-l-oʊ w-ɜː-l-d',
+        'ɡ-ʊ-d-b-aɪ',
+        'θ-ɜː-d l-aɪ-n',
+        'j-ɛ-t ɐ-n-ʌ-ð-ɚ')
+    args = f'--strip -j {njobs} -l en-us -b espeak -p "-" -s "|" -w " "'
+
+    _test(
+        os.linesep.join(input_lines),
+        os.linesep.join(expected_lines),
+        args)
+
+
+def test_unicode():
+    """Check a non-ASCII word with the segments backend (yucatec)."""
+    args = '-l yucatec -b segments'
+    _test('untuʼule', 'untṵːle ', args)
+
+
+def test_logger():
+    """get_logger raises a RuntimeError when verbosity is 1."""
+    invalid_verbosity = 1
+    with pytest.raises(RuntimeError):
+        logger.get_logger(verbosity=invalid_verbosity)
+
+
+@pytest.mark.skipif(
+    not EspeakMbrolaBackend.is_available() or
+    not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
+    reason='mbrola or mb-fr1 voice not installed')
+def test_espeak_mbrola():
+    """Run the command line with the espeak-mbrola backend on mb-fr1."""
+    args = '-b espeak-mbrola -l mb-fr1 -p" " --preserve-punctuation'
+    _test('coucou toi!', 'k u k u t w a ', args)
+
+
+def test_espeak_path():
+    """Pass the current espeak library through --espeak-library."""
+    library = pathlib.Path(backend.EspeakBackend.library())
+    if sys.platform == 'win32':
+        # escape backslashes and spaces before the arguments are split
+        library = str(library).replace('\\', '\\\\').replace(' ', '\\ ')
+
+    args = f'--espeak-library={library}'
+    _test('hello world', 'həloʊ wɜːld ', args)
+
+
+def test_festival_path():
+    """Pass the current festival executable through --festival-executable."""
+    executable = pathlib.Path(backend.FestivalBackend.executable())
+    if sys.platform == 'win32':
+        # escape backslashes and spaces before the arguments are split
+        executable = (
+            str(executable).replace('\\', '\\\\').replace(' ', '\\ '))
+
+    args = f'--festival-executable={executable} -b festival'
+    _test('hello world', 'hhaxlow werld ', args)
+
+
+# each case is a set of command line options and the output expected for it
+@pytest.mark.parametrize(
+    'args, expected', [
+        ('',
+         'həloʊ wɜːld θɹiː ziəɹoʊziəɹoʊ ziəɹoʊ ɔːɹ tuː fɪfti həloʊ '),
+        ('--preserve-punctuation',
+         'həloʊ, ,wɜːld? θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. ¿həloʊ? '),
+        ('--preserve-punctuation '
+         '--punctuation-marks-is-regex '
+         '--punctuation-marks "[^a-zA-ZÀ-ÖØ-öø-ÿ0-9\'\\-]"',
+         'həloʊ, ,wɜːld? ‡ θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. ¿həloʊ? '),
+        ('--preserve-punctuation '
+         '--punctuation-marks-is-regex '
+         '--punctuation-marks "[;:\\!?¡¿—…\\\"«»“”]|[,.](?!\\d)"',
+         'həloʊ, ,wɜːld? θɹiː θaʊzənd, ɔːɹ tuː pɔɪnt faɪv ziəɹoʊ. ¿həloʊ? ')])
+def test_punctuation_is_regex(args, expected):
+    """Run the command line on a punctuated sentence with the given options."""
+    # printed so the options in use can be read from the test output
+    print(args)
+    _test("hello, ,world? ‡ 3,000, or 2.50. ¿hello?", expected, args)
+
+
+def test_invalid_punctuation_regex():
+    """The regex '[*,' given as punctuation marks makes the command exit."""
+    args = '--punctuation-marks-is-regex --punctuation-marks "[*,"'
+    with pytest.raises(SystemExit):
+        _test('hello world', None, args)
@@ -0,0 +1,109 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the espeak-mbrola backend"""
+
+# pylint: disable=missing-docstring
+# pylint: disable=redefined-outer-name
+
+import pytest
+
+from phonemizer.backend import EspeakMbrolaBackend
+from phonemizer.separator import Separator
+
+
+@pytest.fixture(scope='session')
+def backend():
+    """Session-scoped espeak-mbrola backend for the mb-fr1 voice."""
+    mbrola = EspeakMbrolaBackend('mb-fr1')
+    return mbrola
+
+
+@pytest.mark.skipif(
+    not EspeakMbrolaBackend.is_available() or
+    not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
+    reason='mbrola or mb-fr1 voice not installed')
+@pytest.mark.parametrize(
+    'text, expected',
+    [
+        # plosives
+        ('pont', 'po~'),
+        ('bon', 'bo~'),
+        ('temps', 'ta~'),
+        ('dans', 'da~'),
+        ('quand', 'ka~'),
+        ('gant', 'ga~'),
+        # fricatives
+        ('femme', 'fam'),
+        ('vent', 'va~'),
+        ('sans', 'sa~'),
+        ('champ', 'Sa~'),
+        ('gens', 'Za~'),
+        ('ion', 'jo~'),
+        # nasals
+        ('mont', 'mo~'),
+        ('nom', 'no~'),
+        ('oignon', 'onjo~'),
+        ('ping', 'piN'),
+        # liquid glides
+        ('long', 'lo~'),
+        ('rond', 'Ro~'),
+        ('coin', 'kwe~'),
+        ('juin', 'Zye~'),
+        ('pierre', 'pjER'),
+        # vowels
+        ('si', 'si'),
+        ('ses', 'se'),
+        ('seize', 'sEz'),
+        ('patte', 'pat'),
+        ('pâte', 'pat'),
+        ('comme', 'kOm'),
+        ('gros', 'gRo'),
+        ('doux', 'du'),
+        ('du', 'dy'),
+        ('deux', 'd2'),
+        ('neuf', 'n9f'),
+        ('justement', 'Zystma~'),
+        ('vin', 've~'),
+        ('vent', 'va~'),
+        ('bon', 'bo~'),
+        ('brun', 'bR9~')])
+def test_sampa_fr(backend, text, expected):
+    """Check the output of the mb-fr1 voice on single French words."""
+    separator = Separator(phone='')
+    phonemized = backend.phonemize(
+        [text], strip=True, separator=separator)
+    assert phonemized[0] == expected
+
+
+@pytest.mark.skipif(
+    not EspeakMbrolaBackend.is_available() or
+    not EspeakMbrolaBackend.is_supported_language('mb-fr1'),
+    reason='mbrola or mb-fr1 voice not installed')
+def test_french_sampa(backend):
+    """Check a French sentence with and without strip, then empty inputs."""
+    sep = Separator(word=None, phone=' ')
+    text = ['bonjour le monde']
+
+    unstripped = backend.phonemize(text, separator=sep, strip=False)
+    assert unstripped == ['b o~ Z u R l @ m o~ d ']
+
+    stripped = backend.phonemize(text, separator=sep, strip=True)
+    assert stripped == ['b o~ Z u R l @ m o~ d']
+
+    # an empty string and a lone quote both give an empty output
+    for empty in ('', '"'):
+        assert backend.phonemize([empty], separator=sep, strip=True) == ['']
+
+
+@pytest.mark.skipif(
+    not EspeakMbrolaBackend.is_available(),
+    reason='mbrola not installed')
+def test_mbrola_bad_language():
+    """The 'foo-bar' language code is reported as not supported."""
+    supported = EspeakMbrolaBackend.is_supported_language('foo-bar')
+    assert not supported
@@ -0,0 +1,291 @@
+# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the phonemizer.phonemize function"""
+
+# pylint: disable=missing-docstring
+
+import os
+import pytest
+
+from phonemizer.phonemize import phonemize
+from phonemizer.separator import Separator
+from phonemizer.backend import EspeakBackend, EspeakMbrolaBackend
+
+
+def test_bad_backend():
+    """phonemize() raises RuntimeError on bad backend/option combinations."""
+    # backend names that are not recognized
+    for unknown in ('fetiv', 'foo'):
+        with pytest.raises(RuntimeError):
+            phonemize('', backend=unknown)
+
+    # tie=True combined with each of these backend names must raise
+    for name in ('festival', 'mbrola', 'segments'):
+        with pytest.raises(RuntimeError):
+            phonemize('', tie=True, backend=name)
+
+    # tie=True with espeak must raise when given this separator
+    with pytest.raises(RuntimeError):
+        phonemize(
+            '', tie=True, backend='espeak',
+            separator=Separator(' ', None, '-'))
+
+
+def test_bad_language():
+    """phonemize() raises RuntimeError on each rejected language/backend."""
+    rejected = [
+        ('fr-fr', 'festival'),
+        ('ffr', 'espeak'),
+        ('/path/to/nonexisting/file', 'segments'),
+        ('creep', 'segments'),
+    ]
+    for language, backend in rejected:
+        with pytest.raises(RuntimeError):
+            phonemize('', language=language, backend=backend)
+
+
+def test_text_type():
+    """A list input gives a list output, a str input gives a str output."""
+    lines = ['one two', 'three', 'four five']
+    joined = os.linesep.join(lines)
+    options = {'language': 'en-us', 'backend': 'espeak', 'strip': True}
+
+    from_list = phonemize(lines, **options)
+    from_str = phonemize(joined, **options)
+    paired = phonemize(joined, prepend_text=True, **options)
+
+    # with prepend_text each item holds the input text, then its phonemes
+    inputs = [item[0] for item in paired]
+    phonemes = [item[1] for item in paired]
+
+    assert isinstance(from_list, list)
+    assert isinstance(from_str, str)
+    assert os.linesep.join(from_list) == from_str
+    assert os.linesep.join(phonemes) == from_str
+    assert inputs == lines
+
+
+@pytest.mark.skipif(
+    not EspeakBackend.is_espeak_ng(),
+    reason='language switch only exists for espeak-ng')
+def test_lang_switch():
+    """With 'remove-utterance' the first utterance comes out empty."""
+    utterances = ['bonjour apple', 'bonjour toi']
+    expected = [('bonjour apple', ''), ('bonjour toi', 'bɔ̃ʒuʁ twa ')]
+
+    result = phonemize(
+        utterances,
+        backend='espeak',
+        language='fr-fr',
+        language_switch='remove-utterance',
+        prepend_text=True)
+    assert result == expected
+
+
+@pytest.mark.parametrize('njobs', [2, 4])
+def test_espeak(njobs):
+    """Run espeak through phonemize() on list and str inputs."""
+    lines = ['one two', 'three', 'four five']
+    common = {'language': 'en-us', 'backend': 'espeak', 'njobs': njobs}
+
+    # list input, stripped
+    as_list = phonemize(lines, strip=True, **common)
+    assert as_list == ['wʌn tuː', 'θɹiː', 'foːɹ faɪv']
+
+    # everything on a single line, not stripped
+    single_line = phonemize(' '.join(lines), strip=False, **common)
+    assert single_line == ' '.join(['wʌn tuː', 'θɹiː', 'foːɹ faɪv '])
+
+    # one utterance per line, not stripped
+    multi_line = phonemize(os.linesep.join(lines), strip=False, **common)
+    assert multi_line == os.linesep.join(
+        ['wʌn tuː ', 'θɹiː ', 'foːɹ faɪv '])
+
+
+@pytest.mark.skipif(
+    not (EspeakMbrolaBackend.is_available()
+         and EspeakMbrolaBackend.is_supported_language('mb-fr1')),
+    reason='mbrola or mb-fr1 voice not installed')
+@pytest.mark.parametrize('njobs', [2, 4])
+def test_espeak_mbrola(caplog, njobs):
+    """Run espeak-mbrola through phonemize() and check the logged messages."""
+    utterances = ['un deux', 'trois', 'quatre cinq']
+
+    phonemized = phonemize(
+        utterances,
+        backend='espeak-mbrola',
+        language='mb-fr1',
+        preserve_punctuation=True,
+        njobs=njobs)
+    assert phonemized == ['9~d2', 'tRwa', 'katRse~k']
+
+    # the third field of each captured record tuple is the log message
+    logged = [record[2] for record in caplog.record_tuples]
+    for message in (
+            'espeak-mbrola backend cannot preserve punctuation',
+            'espeak-mbrola backend cannot preserve word separation'):
+        assert message in logged
+
+
+@pytest.mark.parametrize('njobs', [2, 4])
+def test_festival(njobs):
+    """Run festival through phonemize() on list and str inputs."""
+    lines = ['one two', 'three', 'four five']
+    common = {'language': 'en-us', 'backend': 'festival', 'njobs': njobs}
+
+    # list input, not stripped
+    as_list = phonemize(lines, strip=False, **common)
+    assert as_list == ['wahn tuw ', 'thriy ', 'faor fayv ']
+
+    # everything on a single line, stripped
+    single_line = phonemize(' '.join(lines), strip=True, **common)
+    assert single_line == ' '.join(['wahn tuw', 'thriy', 'faor fayv'])
+
+    # one utterance per line, stripped
+    multi_line = phonemize(os.linesep.join(lines), strip=True, **common)
+    assert multi_line == os.linesep.join(['wahn tuw', 'thriy', 'faor fayv'])
+
+
+def test_festival_bad():
+    """Options valid for espeak only must raise with the festival backend."""
+    utterances = ['one two', 'three', 'four five']
+    espeak_only = (
+        {'with_stress': True},
+        {'language_switch': 'remove-flags'},
+    )
+
+    for option in espeak_only:
+        with pytest.raises(RuntimeError):
+            phonemize(
+                utterances, language='en-us', backend='festival', **option)
+
+
+@pytest.mark.parametrize('njobs', [2, 4])
+def test_segments(njobs):
+    """Run the segments backend through phonemize() on list and str inputs."""
+    # one two three four five in Maya Yucatec
+    lines = ['untuʼuleʼ kaʼapʼeʼel', 'oʼoxpʼeʼel', 'kantuʼuloʼon chincho']
+    # expected transcription of each line, without trailing separator
+    stripped = ['untṵːlḛ ka̰ːpʼḛːl', 'o̰ːʃpʼḛːl', 'kantṵːlo̰ːn t̠͡ʃint̠͡ʃo']
+    common = {'language': 'yucatec', 'backend': 'segments', 'njobs': njobs}
+
+    # list input, not stripped: each line ends with a word separator
+    as_list = phonemize(lines, strip=False, **common)
+    assert as_list == [line + ' ' for line in stripped]
+
+    # single line, not stripped: only the very end has a trailing separator
+    single_line = phonemize(' '.join(lines), strip=False, **common)
+    assert single_line == ' '.join(stripped) + ' '
+
+    # one utterance per line, stripped
+    multi_line = phonemize(os.linesep.join(lines), strip=True, **common)
+    assert multi_line == os.linesep.join(stripped)
+
+
+# Each case is (backend, preserve_empty_lines, preserve_punctuation,
+# prepend_text, input text, expected output). The first half of the table
+# has prepend_text=False, the second half prepend_text=True.
+@pytest.mark.parametrize(
+    'backend, empty_lines, punctuation, prepend_text, text, expected', [
+        ('espeak', False, False, False,
+            ['hello world!', '', 'goodbye'],
+            ['həloʊ wɜːld ', 'ɡʊdbaɪ ']),
+        ('espeak', False, True, False,
+            ['hello world!', '', 'goodbye'],
+            ['həloʊ wɜːld! ', 'ɡʊdbaɪ ']),
+        ('espeak', True, False, False,
+            ['hello world!', '', 'goodbye'],
+            ['həloʊ wɜːld ', '', 'ɡʊdbaɪ ']),
+        ('espeak', True, True, False,
+            ['hello world!', '', 'goodbye'],
+            ['həloʊ wɜːld! ', '', 'ɡʊdbaɪ ']),
+        ('segments', False, False, False,
+            ['achi acho?', '', 'achi acho'],
+            [u'ʌtʃɪ ʌtʃʊ ', u'ʌtʃɪ ʌtʃʊ ']),
+        ('segments', False, True, False,
+            ['achi acho?', '', 'achi acho'],
+            [u'ʌtʃɪ ʌtʃʊ? ', u'ʌtʃɪ ʌtʃʊ ']),
+        ('segments', True, False, False,
+            ['achi acho?', '', 'achi acho'],
+            [u'ʌtʃɪ ʌtʃʊ ', '', u'ʌtʃɪ ʌtʃʊ ']),
+        ('segments', True, True, False,
+            ['achi acho?', '', 'achi acho'],
+            [u'ʌtʃɪ ʌtʃʊ? ', '', u'ʌtʃɪ ʌtʃʊ ']),
+        ('festival', False, False, False,
+            ['hello world!', '', 'goodbye'],
+            ['hhaxlow werld ', 'guhdbay ']),
+        ('festival', False, True, False,
+            ['hello world!', '', 'goodbye'],
+            ['hhaxlow werld! ', 'guhdbay ']),
+        ('festival', True, False, False,
+            ['hello world!', '', 'goodbye'],
+            ['hhaxlow werld ', '', 'guhdbay ']),
+        ('festival', True, True, False,
+            ['hello world!', '', 'goodbye'],
+            ['hhaxlow werld! ', '', 'guhdbay ']),
+        ('espeak', False, False, True,
+            ['hello world!', '', 'goodbye'],
+            [('hello world!', 'həloʊ wɜːld '), ('goodbye', 'ɡʊdbaɪ ')]),
+        ('espeak', False, True, True,
+            ['hello world!', '', 'goodbye'],
+            [('hello world!', 'həloʊ wɜːld! '), ('goodbye', 'ɡʊdbaɪ ')]),
+        ('espeak', True, False, True,
+            ['hello world!', '', 'goodbye'],
+            [('hello world!', 'həloʊ wɜːld '), ('', ''), ('goodbye', 'ɡʊdbaɪ ')]),
+        ('espeak', True, True, True,
+            ['hello world!', '', 'goodbye'],
+            [('hello world!', 'həloʊ wɜːld! '), ('', ''), ('goodbye', 'ɡʊdbaɪ ')]),
+        ('segments', False, False, True,
+            ['achi acho?', '', 'achi acho'],
+            [('achi acho?', 'ʌtʃɪ ʌtʃʊ '), ('achi acho', u'ʌtʃɪ ʌtʃʊ ')]),
+        ('segments', False, True, True,
+            ['achi acho?', '', 'achi acho'],
+            [('achi acho?', 'ʌtʃɪ ʌtʃʊ? '), ('achi acho', u'ʌtʃɪ ʌtʃʊ ')]),
+        ('segments', True, False, True,
+            ['achi acho?', '', 'achi acho'],
+            [('achi acho?', u'ʌtʃɪ ʌtʃʊ '), ('', ''), ('achi acho', u'ʌtʃɪ ʌtʃʊ ')]),
+        ('segments', True, True, True,
+            ['achi acho?', '', 'achi acho'],
+            [('achi acho?', u'ʌtʃɪ ʌtʃʊ? '), ('', ''), ('achi acho', u'ʌtʃɪ ʌtʃʊ ')]),
+        ('festival', False, False, True,
+            ['hello world!', '', 'goodbye'],
+            [('hello world!', 'hhaxlow werld '), ('goodbye', 'guhdbay ')]),
+        ('festival', False, True, True,
+            ['hello world!', '', 'goodbye'],
+            [('hello world!', 'hhaxlow werld! '), ('goodbye', 'guhdbay ')]),
+        ('festival', True, False, True,
+            ['hello world!', '', 'goodbye'],
+            [('hello world!', 'hhaxlow werld '), ('', ''), ('goodbye', 'guhdbay ')]),
+        ('festival', True, True, True,
+            ['hello world!', '', 'goodbye'],
+            [('hello world!', 'hhaxlow werld! '), ('', ''), ('goodbye', 'guhdbay ')])])
+def test_preserve_empty_lines(backend, empty_lines, punctuation, prepend_text, text, expected):
+    """Check how an empty input line is handled by phonemize().
+
+    Every input has an empty string as its second line. The expected
+    outputs keep that line (as '' or ('', '')) when ``empty_lines`` is
+    True and drop it otherwise.
+    """
+    # the segments cases are written for the 'cree' language
+    language = 'cree' if backend == 'segments' else 'en-us'
+
+    assert expected == phonemize(
+        text, language=language, backend=backend, prepend_text=prepend_text,
+        preserve_punctuation=punctuation, preserve_empty_lines=empty_lines)
+
+
+@pytest.mark.parametrize(
+    'backend, empty_lines, punctuation, text, expected', [
+        # a lone empty line is kept only when preserve_empty_lines is set
+        (backend, empty_lines, punctuation, [''], [''] if empty_lines else [])
+        for backend in ('espeak', 'segments', 'festival')
+        for empty_lines in (False, True)
+        for punctuation in (False, True)])
+def test_empty_input(backend, empty_lines, punctuation, text, expected):
+    """Phonemize an input made of a single empty line."""
+    language = 'en-us'
+    if backend == 'segments':
+        language = 'cree'
+
+    result = phonemize(
+        text, language=language, backend=backend,
+        preserve_punctuation=punctuation, preserve_empty_lines=empty_lines)
+    assert expected == result
@@ -0,0 +1,274 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the punctuation processing"""
+
+# pylint: disable=missing-docstring
+from pathlib import Path
+
+import pytest
+import re
+
+from phonemizer.backend import EspeakBackend, FestivalBackend, SegmentsBackend
+from phonemizer.punctuation import Punctuation
+from phonemizer.phonemize import phonemize
+from phonemizer.separator import Separator, default_separator
+
+# The flags below are computed once, when this module is imported, by
+# comparing the tuple returned by each backend's version() method.
+
+# True if we are using espeak>=1.50
+ESPEAK_150 = (EspeakBackend.version() >= (1, 50))
+
+# True if we are using espeak>=1.49.3
+ESPEAK_149 = (EspeakBackend.version() >= (1, 49, 3))
+
+# True if we are using festival>=2.5
+FESTIVAL_25 = (FestivalBackend.version() >= (2, 5))
+
+
+@pytest.mark.parametrize(
+    'inp, out', [
+        ('a, b,c.', 'a b c'),
+        ('abc de', 'abc de'),
+        ('!d.d. dd??  d!', 'd d dd d')])
+def test_remove(inp, out):
+    """Remove punctuation from a str using the default marks."""
+    cleaned = Punctuation().remove(inp)
+    assert cleaned == out
+
+
+@pytest.mark.parametrize(
+    'inp', [
+        ['.a.b.c.'],
+        ['a, a?', 'b, b'],
+        ['a, a?', 'b, b', '!'],
+        ['a, a?', '!?', 'b, b'],
+        ['!?', 'a, a?', 'b, b'],
+        ['a, a, a'],
+        ['a, a?', 'aaa bb', '.bb, b', 'c', '!d.d. dd??  d!'],
+        ['Truly replied, "Yes".'],
+        ['hi; ho,"'],
+        ["!?"],
+        ["!'"],
+        ["It is ! (I think so)"],
+        ["This {is} right"],
+        ["[He] is right"],
+    ])
+def test_preserve(inp):
+    """preserve() followed by restore() must give back the input text."""
+    punctuation = Punctuation()
+    stripped_text, marks = punctuation.preserve(inp)
+    restored = punctuation.restore(
+        stripped_text, marks, sep=default_separator, strip=True)
+    assert inp == restored
+
+
+@pytest.mark.parametrize(
+    'text, expected_restore, expected_output', [
+        (['hi; hi,"'], ['hi; hi," '], ['haɪ; haɪ, ']),
+        (['hi; "hi,'], ['hi; "hi, '], ['haɪ; haɪ, '] if ESPEAK_149 else ['haɪ;  haɪ, ']),
+        (['"hi; hi,'], ['"hi; hi, '], ['haɪ; haɪ, '] if ESPEAK_149 else [' haɪ; haɪ, '])])
+def test_preserve_2(text, expected_restore, expected_output):
+    """Preserve a custom set of marks that excludes the double quote."""
+    custom_marks = ".!;:,?"
+    punctuation = Punctuation(marks=custom_marks)
+
+    # round trip on the raw text, without stripping
+    preserved = punctuation.preserve(text)
+    restored = punctuation.restore(
+        *preserved, sep=default_separator, strip=False)
+    assert expected_restore == restored
+
+    # same marks through the espeak backend
+    phonemized = phonemize(
+        text, backend="espeak",
+        preserve_punctuation=True, punctuation_marks=custom_marks)
+    assert phonemized == expected_output
+
+
+def test_custom():
+    """Replace the default punctuation marks with custom ones."""
+    punctuation = Punctuation()
+    default_marks = set(punctuation.default_marks())
+    assert set(punctuation.marks) == default_marks
+    assert punctuation.remove('a,b.c') == 'a b c'
+
+    # marks given as a list are refused
+    with pytest.raises(ValueError):
+        punctuation.marks = ['?', '.']
+
+    # marks given as a str are accepted, the comma is then left in place
+    punctuation.marks = '?.'
+    assert len(punctuation.marks) == 2
+    assert punctuation.remove('a,b.c') == 'a,b c'
+
+
+def test_espeak():
+    """Combine preserve_punctuation and strip with the espeak backend."""
+    text = 'hello, world!'
+    # (preserve_punctuation, strip, expected transcription)
+    cases = [
+        (False, True, 'həloʊ wɜːld'),
+        (True, True, 'həloʊ, wɜːld!'),
+        (False, False, 'həloʊ wɜːld '),
+        (True, False, 'həloʊ, wɜːld! '),
+    ]
+
+    for preserve, strip, expected in cases:
+        backend = EspeakBackend('en-us', preserve_punctuation=preserve)
+        output = backend.phonemize([text], strip=strip)[0]
+        assert output == expected
+
+
+def test_festival():
+    """Combine preserve_punctuation and strip with the festival backend."""
+    text = 'hello, world!'
+    # (preserve_punctuation, strip, expected transcription)
+    cases = [
+        (False, True, 'hhaxlow werld'),
+        (True, True, 'hhaxlow, werld!'),
+        (False, False, 'hhaxlow werld '),
+        (True, False, 'hhaxlow, werld! '),
+    ]
+
+    for preserve, strip, expected in cases:
+        backend = FestivalBackend('en-us', preserve_punctuation=preserve)
+        output = backend.phonemize([text], strip=strip)[0]
+        assert output == expected
+
+
+def test_segments():
+    """Combine preserve_punctuation and strip with the segments backend."""
+    text = 'achi, acho!'
+    # (preserve_punctuation, strip, expected transcription)
+    cases = [
+        (False, True, 'ʌtʃɪ ʌtʃʊ'),
+        (True, True, 'ʌtʃɪ, ʌtʃʊ!'),
+        (False, False, 'ʌtʃɪ ʌtʃʊ '),
+        (True, False, 'ʌtʃɪ, ʌtʃʊ! '),
+    ]
+
+    for preserve, strip, expected in cases:
+        backend = SegmentsBackend('cree', preserve_punctuation=preserve)
+        output = backend.phonemize([text], strip=strip)[0]
+        assert output == expected
+
+
+# see https://github.com/bootphon/phonemizer/issues/54
+@pytest.mark.parametrize(
+    'text, expected', [
+        ("!'", "! "),
+        ("'!", "! "),
+        ("!'!", "!! "),
+        ("'!'", "! ")])
+def test_issue_54(text, expected):
+    """Phonemize inputs made only of '!' and apostrophes with espeak."""
+    outputs = phonemize(
+        [text], language='en-us', backend='espeak',
+        preserve_punctuation=True)
+    assert expected == outputs[0]
+
+
+# see https://github.com/bootphon/phonemizer/issues/55
+@pytest.mark.parametrize(
+    'backend, marks, text, expected', [
+        ('espeak', 'default', ['"Hey! "', '"hey,"'], ['"heɪ! " ', '"heɪ," ']),
+        ('espeak', '.!;:,?', ['"Hey! " ', '"hey," '],
+         ['heɪ! ', 'heɪ, '] if ESPEAK_150 else [' heɪ! ', ' heɪ, ']),
+        ('espeak', 'default', ['! ?', 'hey!'], ['! ? ', 'heɪ! ']),
+        ('espeak', '!', ['! ?', 'hey!'], ['! ', 'heɪ! ']),
+        ('segments', 'default', ['! ?', 'hey!'], ['! ? ', 'heːj! ']),
+        ('segments', '!', ['! ?', 'hey!'], ValueError),
+        ('festival', 'default', ['! ?', 'hey!'], ['! ? ', 'hhey! ']),
+        ('festival', '!', ['! ?', 'hey!'], ['! ', 'hhey! '])])
+def test_issue55(backend, marks, text, expected):
+    """Phonemize quoted and punctuation-only utterances, preserving marks.
+
+    ``marks`` is either the literal 'default' (replaced by
+    Punctuation.default_marks()) or a str of marks. ``expected`` is either
+    the expected output or an exception class that phonemize() must raise.
+    """
+    if marks == 'default':
+        marks = Punctuation.default_marks()
+    language = 'cree' if backend == 'segments' else 'en-us'
+
+    def run():
+        return phonemize(
+            text, language=language, backend=backend,
+            preserve_punctuation=True, punctuation_marks=marks)
+
+    # an exception class in the table means the call is expected to fail
+    if isinstance(expected, type) and issubclass(expected, BaseException):
+        with pytest.raises(expected):
+            run()
+        return
+
+    try:
+        output = run()
+    except RuntimeError:
+        # only the known festival failure is tolerated: a RuntimeError
+        # raised by any other backend is a genuine test failure
+        if backend != 'festival':
+            raise
+        # TODO on some installations festival fails to phonemize "?".
+        # It ends with a segmentation fault. This seems to only appear
+        # with festival-2.5 (but is working on travis and docker image)
+        pytest.skip('festival failed to phonemize the input')
+
+    assert expected == output
+
+
+@pytest.mark.parametrize(
+    'punctuation_marks, text, expected', [
+        (';:,.!?¡—…"«»“”',
+         'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
+         'həloʊ, ,wɜːld? θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. həloʊ? '),
+        (re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9'$@&+%\-=/\\]"),
+         'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
+         'həloʊ, ,wɜːld? ‡ θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. ¿həloʊ? '),
+        (re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9',.$@&+%\-=/\\]|[,.](?!\d)"),
+         'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
+         'həloʊ, ,wɜːld? ‡ θɹiː θaʊzənd, ɔːɹ tuː pɔɪnt faɪv ziəɹoʊ. ¿həloʊ? ')
+    ])
+def test_punctuation_marks_regex(punctuation_marks, text, expected):
+    """Give the punctuation marks as a str or as a compiled regex."""
+    phonemized = phonemize(
+        text, punctuation_marks=punctuation_marks, preserve_punctuation=True)
+    assert expected == phonemized
+
+
+def test_marks_getter_with_regex():
+    """Reading ``marks`` raises ValueError when built from a compiled regex."""
+    marks_re = re.compile(r"[^a-zA-Z0-9]")
+    punct = Punctuation(marks_re)
+    with pytest.raises(ValueError):
+        # only the attribute access is under test: the former comparison
+        # with marks_re was never evaluated and its result was discarded
+        _ = punct.marks
+
+
+def test_long_document():
+    """Phonemize every line of the pg67147.txt data file without error."""
+    # testing issue raised by #108
+    data_folder = Path(__file__).parent / "data"
+    # explicit encoding: the default one depends on the platform locale
+    with open(data_folder / "pg67147.txt", encoding="utf-8") as txt_file:
+        lines = txt_file.read().split("\n")
+
+    phonemize(lines, backend="espeak", preserve_punctuation=True)
+
+
+@pytest.mark.parametrize(
+    'text', [
+        # one line, without then with a comma
+        ['worked david ford i started in deloitte and i was immediately'],
+        ['worked david ford i started in deloitte, and i was immediately'],
+        # four lines, without then with punctuation
+        [
+            'worked david ford i started in deloitte and i was immediately',
+            'an offer of price waterhouse cooper and here i take may',
+            'we are now as maximum plan for a customer time and',
+            "they're going to meet all the xvin so great it",
+        ],
+        [
+            'worked david ford i started in deloitte, and i was immediately',
+            'an offer of price waterhouse cooper and here i take may',
+            'we are now as maximum plan for a customer time and',
+            "they're going to meet all the xvin so great it.",
+        ],
+    ])
+def test_multiline_punctuation(text):
+    """The output must have as many lines as the input."""
+    output = phonemize(text, preserve_punctuation=True)
+    assert len(text) == len(output)
@@ -0,0 +1,113 @@
+# Copyright 2015-2021 Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the segments backend"""
+
+# pylint: disable=missing-docstring
+
+import os
+import pytest
+
+from phonemizer.separator import Separator, default_separator
+from phonemizer.backend import SegmentsBackend
+from phonemizer.utils import get_package_resource
+
+
+def test_multiline():
+    """Phonemize inputs with newlines inside or at the end of utterances."""
+    backend = SegmentsBackend('cree')
+    assert backend.language == 'cree'
+
+    # (input utterances, expected output)
+    cases = [
+        (['a'], ['ʌ ']),
+        (['aa'], ['ʌʌ ']),
+        (['a\n'], ['ʌ ']),
+        (['a\na'], ['ʌ ʌ ']),
+        (['a\na\n'], ['ʌ ʌ ']),
+        (['a', 'a'], ['ʌ ', 'ʌ ']),
+        (['a\n', 'a\n'], ['ʌ ', 'ʌ ']),
+    ]
+    for utterances, expected in cases:
+        assert backend.phonemize(utterances) == expected
+
+
+def test_bad_morpheme():
+    """Phonemizing 'A' with the 'cree' language raises ValueError."""
+    cree = SegmentsBackend('cree')
+    unknown = ['A']
+    with pytest.raises(ValueError):
+        cree.phonemize(unknown)
+
+
+def test_separator():
+    """Phonemize with the default separator, with and without stripping."""
+    backend = SegmentsBackend('cree')
+    utterance = ['achi acho']
+
+    unstripped = backend.phonemize(utterance, separator=default_separator)
+    assert unstripped == ['ʌtʃɪ ʌtʃʊ ']
+
+    stripped = backend.phonemize(
+        utterance, separator=default_separator, strip=True)
+    assert stripped == ['ʌtʃɪ ʌtʃʊ']
+
+
+def test_separator_2():
+    """Phonemize with '_' between words and ' ' between phones."""
+    backend = SegmentsBackend('cree')
+    utterance = ['achi acho']
+    custom = Separator(word='_', phone=' ')
+
+    unstripped = backend.phonemize(utterance, separator=custom)
+    assert unstripped == ['ʌ tʃ ɪ _ʌ tʃ ʊ _']
+
+    stripped = backend.phonemize(utterance, separator=custom, strip=True)
+    assert stripped == ['ʌ tʃ ɪ_ʌ tʃ ʊ']
+
+
+def test_separator_3():
+    """Phonemize with ' ' between words and '_' between phones."""
+    backend = SegmentsBackend('cree')
+    utterance = ['achi acho']
+    custom = Separator(word=' ', syllable=None, phone='_')
+
+    unstripped = backend.phonemize(utterance, separator=custom)
+    assert unstripped == ['ʌ_tʃ_ɪ_ ʌ_tʃ_ʊ_ ']
+
+    stripped = backend.phonemize(utterance, separator=custom, strip=True)
+    assert stripped == ['ʌ_tʃ_ɪ ʌ_tʃ_ʊ']
+
+
+def test_separator_4():
+    """Phonemize with ' ' between phones and an empty word separator."""
+    backend = SegmentsBackend('cree')
+    utterance = ['achi acho']
+
+    # TODO bug when sep.phone == ' ' with no sep.word
+    custom = Separator(phone=' ', word='')
+
+    unstripped = backend.phonemize(utterance, separator=custom)
+    assert unstripped == ['ʌ tʃ ɪ ʌ tʃ ʊ ']
+
+    stripped = backend.phonemize(utterance, separator=custom, strip=True)
+    assert stripped == ['ʌ tʃ ɪʌ tʃ ʊ']
+
+
+def test_separator_5():
+    """Phonemize with ' ' between phones and '_' between words."""
+    backend = SegmentsBackend('cree')
+    utterance = ['achi acho']
+    custom = Separator(phone=' ', word='_')
+
+    unstripped = backend.phonemize(utterance, separator=custom)
+    assert unstripped == ['ʌ tʃ ɪ _ʌ tʃ ʊ _']
+
+    stripped = backend.phonemize(utterance, separator=custom, strip=True)
+    assert stripped == ['ʌ tʃ ɪ_ʌ tʃ ʊ']
+
+
+def test_language(tmpdir):
+    """Check language support by name, by g2p file and on a bad g2p file."""
+    supported = SegmentsBackend.is_supported_language
+
+    # languages given by name
+    assert supported('cree')
+    assert not supported('unexisting')
+
+    # languages given as a path to a g2p file
+    directory = get_package_resource('segments')
+    assert supported(os.path.join(directory, 'cree.g2p'))
+    assert not supported(os.path.join(directory, 'cree'))
+    assert not supported(os.path.join(directory, 'unexisting.g2p'))
+
+    # a g2p file with bad syntax is not supported
+    bad_g2p = tmpdir.join('foo.g2p')
+    bad_g2p.write('a a\nb b b\nc')
+    assert not supported(bad_g2p)
@@ -0,0 +1,82 @@
+# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
+#
+# This file is part of phonemizer: you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# Phonemizer is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
+"""Test of the Separator class"""
+
+# pylint: disable=missing-docstring
+
+import pytest
+
+from phonemizer.separator import Separator, default_separator
+
+
+def test_prop():
+    """The three separator attributes are read only."""
+    for attribute in ('phone', 'syllable', 'word'):
+        with pytest.raises(AttributeError):
+            setattr(default_separator, attribute, 'a')
+
+
+@pytest.mark.parametrize('val', [None, '', False])
+def test_empty(val):
+    """None, '' and False all give empty separators."""
+    separator = Separator(val, val, val)
+    fields = (separator.phone, separator.syllable, separator.word)
+    assert fields == ('', '', '')
+
+
+def test_same():
+    """Using ' ' for both word and phone separators raises ValueError."""
+    duplicated = ' '
+    with pytest.raises(ValueError):
+        Separator(word=duplicated, phone=duplicated)
+
+
+def test_str():
+    """Check the str representation of a custom and the default separator."""
+    custom = Separator(word='w', syllable='s', phone='p')
+    rendered = {
+        str(custom): '(phone: "p", syllable: "s", word: "w")',
+        str(default_separator): '(phone: "", syllable: "", word: " ")',
+    }
+    for actual, expected in rendered.items():
+        assert actual == expected
+
+
+def test_equal():
+    """Separators compare equal when their three fields are equal."""
+    first, second = Separator(), Separator()
+    assert first == second
+
+    explicit_default = Separator(phone='', syllable='', word=' ')
+    assert default_separator == explicit_default
+
+    # two spaces as word separator differ from the default single space
+    two_spaces = Separator(word='  ')
+    assert two_spaces != default_separator
+
+
+def test_field_separator():
+    """Check separator membership and the input/output separator choice."""
+    sep = Separator(word='w', syllable='s', phone='p')
+    assert 'w' in sep
+    assert 'p' in sep
+    assert 'wp' not in sep
+    assert ' ' not in sep
+
+    # falsy requests give no input/output separator
+    assert sep.input_output_separator(False) is False
+    assert sep.input_output_separator(None) is False
+    assert sep.input_output_separator('') is False
+    # True gives '|', a str is used as is
+    assert sep.input_output_separator(True) == '|'
+    assert sep.input_output_separator('io') == 'io'
+
+    # The messages are read from err.value, the raised exception itself:
+    # str() of the ExceptionInfo wrapper is not the exception message.
+    with pytest.raises(RuntimeError) as err:
+        sep.input_output_separator([1, 2])
+    assert 'invalid input/output separator' in str(err.value)
+    with pytest.raises(RuntimeError) as err:
+        sep.input_output_separator('w')
+    assert 'cannot prepend input with "w"' in str(err.value)
+
+    # with '|', '||' and '|||' already used by the separator, True gives '||||'
+    sep = Separator(phone='|', syllable='||', word='|||')
+    assert sep.input_output_separator(True) == '||||'
@@ -0,0 +1,52 @@
+"""Test of the phonemizer.utils module"""
+
+# pylint: disable=missing-docstring
+import os
+
+from phonemizer.utils import chunks, cumsum, str2list, list2str
+
+
+def test_cumsum():
+    """Check cumsum() on empty, single-element and longer lists."""
+    cases = [
+        ([], []),
+        ([0], [0]),
+        ([1, 2, 3], [1, 3, 6]),
+    ]
+    for values, expected in cases:
+        assert cumsum(values) == expected
+
+
+def test_list2str():
+    """Check list2str() on str and list inputs."""
+    cases = [
+        ('', ''),
+        ([], ''),
+        ([''], ''),
+        (['abc'], 'abc'),
+        # several items are joined with the platform line separator
+        (['a', 'b', 'c'], os.linesep.join('abc')),
+    ]
+    for value, expected in cases:
+        assert list2str(value) == expected
+
+
+def test_str2list():
+    """Check that str2list() splits on the platform line separator."""
+    eol = os.linesep
+    cases = [
+        ('', ['']),
+        ('a', ['a']),
+        ('ab', ['ab']),
+        # a space is not a line separator
+        ('a b', ['a b']),
+        (f'a{eol}b', ['a', 'b']),
+        # an inner empty line is kept, the final separator adds no item
+        (f'a{eol}{eol}b{eol}', ['a', '', 'b']),
+    ]
+    for value, expected in cases:
+        assert str2list(value) == expected
+
+
+def test_chunks():
+    """Check the value returned by chunks() for small inputs."""
+    # a single element gives the same result whatever the second argument
+    for requested in range(1, 5):
+        assert chunks(['a'], requested) == ([['a']], [0])
+
+    # (input length, second argument of chunks(), expected return value)
+    cases = [
+        (2, 1, ([['a', 'a']], [0])),
+        (2, 2, ([['a'], ['a']], [0, 1])),
+        (2, 10, ([['a'], ['a']], [0, 1])),
+        (3, 1, ([['a', 'a', 'a']], [0])),
+        (3, 2, ([['a'], ['a', 'a']], [0, 1])),
+        (3, 3, ([['a'], ['a'], ['a']], [0, 1, 2])),
+        (3, 10, ([['a'], ['a'], ['a']], [0, 1, 2])),
+        (4, 1, ([['a', 'a', 'a', 'a']], [0])),
+        (4, 2, ([['a', 'a'], ['a', 'a']], [0, 2])),
+        (4, 3, ([['a'], ['a'], ['a', 'a']], [0, 1, 2])),
+        (4, 10, ([['a'], ['a'], ['a'], ['a']], [0, 1, 2, 3])),
+    ]
+    for length, requested, expected in cases:
+        assert chunks(['a'] * length, requested) == expected