2025-12-01

This commit is contained in:
2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,225 @@
# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the espeak backend"""
# pylint: disable=missing-docstring
# pylint: disable=redefined-outer-name
import os
import shutil
import pytest
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.wrapper import EspeakWrapper
from phonemizer.separator import Separator, default_separator
def test_bad_text():
    """Check that phonemize() rejects a bare str but accepts a list."""
    backend = EspeakBackend('en-us')
    text = 'hello world'

    # a single string instead of a list of utterances is an error
    with pytest.raises(RuntimeError) as err:
        backend.phonemize(text, default_separator, True)
    # look at the message of the raised exception itself: str() of the
    # ExceptionInfo wrapper is only a repr-like summary of it
    assert 'input text to phonemize() is str' in str(err.value)

    assert backend.phonemize(
        [text], default_separator, True) == ['həloʊ wɜːld']
def test_english():
    """Phonemize four en-us utterances and check each output line."""
    utterances = ['hello world', 'goodbye', 'third line', 'yet another']
    phonemized = EspeakBackend('en-us').phonemize(
        utterances, default_separator, True)
    assert phonemized == ['həloʊ wɜːld', 'ɡʊdbaɪ', 'θɜːd laɪn', 'jɛt ɐnʌðɚ']
def test_stress():
    """Compare the en-us output with and without the with_stress option."""
    cases = (
        (False, ['həloʊ wɜːld']),
        (True, ['həlˈoʊ wˈɜːld']))

    for with_stress, expected in cases:
        backend = EspeakBackend('en-us', with_stress=with_stress)
        assert backend.phonemize(
            ['hello world'], default_separator, True) == expected
def test_french():
    """Phonemize a fr-fr utterance with custom word and phone separators."""
    separator = Separator(word=';eword ', syllable=None, phone=' ')
    phonemized = EspeakBackend('fr-fr').phonemize(
        ['bonjour le monde'], separator, False)
    assert phonemized == ['b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword ']
@pytest.mark.skipif(
    # Arabic is not supported by the Windows msi installer from the
    # espeak-ng github release
    not (EspeakBackend.is_espeak_ng()
         and EspeakBackend.is_supported_language('ar')),
    reason='Arabic is not supported')
def test_arabic():
    """Phonemize an Arabic utterance; the result depends on espeak version."""
    backend = EspeakBackend('ar')

    # Arabic seems to have changed starting at espeak-ng-1.49.3
    recent = EspeakBackend.version() >= (1, 49, 3)
    expected = ['ʔassalaːm ʕliːkm '] if recent else ['ʔassalaam ʕaliijkum ']

    assert backend.phonemize(['السلام عليكم'], Separator(), False) == expected
# see https://github.com/bootphon/phonemizer/issues/31
def test_phone_separator_simple():
    """Check the '_' phone separator on en-us, stripped and unstripped."""
    sentence = ['The lion and the tiger ran']
    underscore = Separator(phone='_')
    backend = EspeakBackend('en-us')

    stripped = backend.phonemize(sentence, separator=underscore, strip=True)
    assert ['ð_ə l_aɪə_n æ_n_d ð_ə t_aɪ_ɡ_ɚ ɹ_æ_n'] == stripped

    # same request with strip=False: trailing separators are expected
    unstripped = backend.phonemize(
        sentence, separator=underscore, strip=False)
    assert ['ð_ə_ l_aɪə_n_ æ_n_d_ ð_ə_ t_aɪ_ɡ_ɚ_ ɹ_æ_n_ '] == unstripped
@pytest.mark.parametrize(
    'text, expected',
    (('the hello but the', 'ð_ə h_ə_l_oʊ b_ʌ_t ð_ə'),
     # the two cases below are disabled in the original test suite
     # ('Here there and everywhere', 'h_ɪɹ ð_ɛɹ æ_n_d ɛ_v_ɹ_ɪ_w_ɛɹ'),
     # ('He was hungry and tired.', 'h_iː w_ʌ_z h_ʌ_ŋ_ɡ_ɹ_i æ_n_d t_aɪɚ_d'),
     ('He was hungry but tired.', 'h_iː w_ʌ_z h_ʌ_ŋ_ɡ_ɹ_i b_ʌ_t t_aɪɚ_d')))
def test_phone_separator(text, expected):
    """Phonemize one en-us sentence with '_' as phone separator."""
    underscore = Separator(phone='_')
    phonemized = EspeakBackend('en-us').phonemize(
        [text], separator=underscore, strip=True)
    assert phonemized[0] == expected
@pytest.mark.skipif(
    'PHONEMIZER_ESPEAK_LIBRARY' in os.environ,
    reason='cannot modify environment')
def test_path_good():
    """Set a valid espeak library explicitly and run the English test."""
    default_library = EspeakBackend.library()
    try:
        # setting the library to None must leave the detected one unchanged
        EspeakBackend.set_library(None)
        assert EspeakBackend.library() == default_library

        # explicitly use the library path reported by the wrapper
        EspeakBackend.set_library(EspeakWrapper().library_path)
        test_english()
    finally:
        # restore the espeak path to default
        EspeakBackend.set_library(None)
@pytest.mark.skipif(
    'PHONEMIZER_ESPEAK_LIBRARY' in os.environ,
    reason='cannot modify environment')
def test_path_bad():
    """Point the espeak library to invalid files and expect RuntimeError."""
    try:
        # corrupt the default espeak path, use the python executable instead
        EspeakBackend.set_library(shutil.which('python'))

        with pytest.raises(RuntimeError):
            EspeakBackend('en-us')
        with pytest.raises(RuntimeError):
            EspeakBackend.version()

        # this very test file is not a valid library either
        EspeakBackend.set_library(__file__)
        with pytest.raises(RuntimeError):
            EspeakBackend('en-us')
    finally:
        # restore the espeak path to default
        EspeakBackend.set_library(None)
@pytest.mark.skipif(
    'PHONEMIZER_ESPEAK_LIBRARY' in os.environ,
    reason='cannot modify environment')
def test_path_venv():
    """Set an invalid library through the environment, expect RuntimeError."""
    variable = 'PHONEMIZER_ESPEAK_LIBRARY'
    try:
        # the python executable is not an espeak library
        os.environ[variable] = shutil.which('python')
        with pytest.raises(RuntimeError):
            EspeakBackend('en-us').phonemize(['hello'])
        with pytest.raises(RuntimeError):
            EspeakBackend.version()

        # neither is this test file
        os.environ[variable] = __file__
        with pytest.raises(RuntimeError):
            EspeakBackend.version()
    finally:
        # remove the variable, ignoring the case where it is already unset
        os.environ.pop(variable, None)
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='tie only compatible with espeak-ng')
@pytest.mark.parametrize(
    'tie, expected', [
        (False, 'dʒ_æ_k_i_ tʃ_æ_n_ '),
        (True, 'd͡ʒæki t͡ʃæn '),
        ('8', 'd8ʒæki t8ʃæn ')])
def test_tie_simple(caplog, tie, expected):
    """Phonemize with the tie option while also asking a phone separator."""
    separator = Separator(word=' ', phone='_')
    phonemized = EspeakBackend('en-us', tie=tie).phonemize(
        ['Jackie Chan'], separator=separator)
    assert phonemized[0] == expected

    if not tie:
        return

    # with ties enabled a message about the ignored separator is logged
    logged = [record[2] for record in caplog.record_tuples]
    assert (
        'cannot use ties AND phone separation, ignoring phone separator'
        in logged)
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='tie only compatible with espeak-ng')
def test_tie_utf8():
    """Phonemize fr-fr text with ties, including language switch flags."""
    # NOTE this is a bug in espeak to append ties on (en) language switch
    # flags. For now phonemizer does not fix it.
    backend = EspeakBackend('fr-fr', tie=True)

    # used to be 'bɔ̃͡ʒuʁ '
    simple = backend.phonemize(['bonjour'])
    assert simple == ['bɔ̃ʒuʁ ']

    # used to be 'ty ɛm lə (͡e͡n͡)fʊtbɔ͡ːl(͡f͡r͡)'
    football = backend.phonemize(['tu aimes le football'])
    assert football == ['ty ɛm lə (͡e͡n)fʊtbɔːl(͡f͡r) ']

    apple = backend.phonemize(['bonjour apple'])
    assert apple == ['bɔ̃ʒuʁ (͡e͡n)apə͡l(͡f͡r) ']
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='tie only compatible with espeak-ng')
def test_tie_bad():
    """The tie value 'abc' must be rejected at backend creation."""
    invalid_tie = 'abc'
    with pytest.raises(RuntimeError):
        EspeakBackend('en-us', tie=invalid_tie)
@@ -0,0 +1,142 @@
# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the espeak backend language switch processing"""
# pylint: disable=missing-docstring
# pylint: disable=redefined-outer-name
import pytest
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator
@pytest.fixture
def langswitch_text():
    """French utterances used as input by the language switch tests."""
    utterances = [
        "j'aime l'anglais",
        "j'aime le football",
        "football",
        "surtout le real madrid",
        "n'utilise pas google",
    ]
    return utterances
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='language switch only exists for espeak-ng')
@pytest.mark.parametrize('njobs', [1, 3])
def test_language_switch_keep_flags(caplog, langswitch_text, njobs):
    """With the 'keep-flags' policy the (en)/(fr) flags stay in the output."""
    phonemizer = EspeakBackend('fr-fr', language_switch='keep-flags')
    phonemized = phonemizer.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
    assert phonemized == [
        'ʒɛm lɑ̃ɡlɛ',
        'ʒɛm lə (en)fʊtbɔːl(fr)',
        '(en)fʊtbɔːl(fr)',
        'syʁtu lə (en)ɹiəl(fr) madʁid',
        'nytiliz pa (en)ɡuːɡəl(fr)']

    # both the count of switches and the applied policy are logged
    logged = [record[2] for record in caplog.record_tuples]
    assert (
        '4 utterances containing language switches on lines 2, 3, 4, 5'
        in logged)
    assert (
        'language switch flags have been kept (applying "keep-flags" policy)'
        in logged)
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='language switch only exists for espeak-ng')
@pytest.mark.parametrize('njobs', [1, 3])
def test_language_switch_default(caplog, langswitch_text, njobs):
    """Without an explicit policy the language switch flags are kept."""
    # default behavior is to keep the flags
    phonemizer = EspeakBackend('fr-fr')
    phonemized = phonemizer.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
    assert phonemized == [
        'ʒɛm lɑ̃ɡlɛ',
        'ʒɛm lə (en)fʊtbɔːl(fr)',
        '(en)fʊtbɔːl(fr)',
        'syʁtu lə (en)ɹiəl(fr) madʁid',
        'nytiliz pa (en)ɡuːɡəl(fr)']

    logged = [record[2] for record in caplog.record_tuples]
    assert (
        '4 utterances containing language switches on lines 2, 3, 4, 5'
        in logged)
    assert (
        'language switch flags have been kept (applying "keep-flags" policy)'
        in logged)
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='language switch only exists for espeak-ng')
@pytest.mark.parametrize('njobs', [1, 3])
def test_language_switch_remove_flags(caplog, langswitch_text, njobs):
    """With the 'remove-flags' policy the (en)/(fr) flags are dropped."""
    phonemizer = EspeakBackend('fr-fr', language_switch='remove-flags')
    phonemized = phonemizer.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
    assert phonemized == [
        'ʒɛm lɑ̃ɡlɛ',
        'ʒɛm lə fʊtbɔːl',
        'fʊtbɔːl',
        'syʁtu lə ɹiəl madʁid',
        'nytiliz pa ɡuːɡəl']

    logged = [record[2] for record in caplog.record_tuples]
    assert (
        '4 utterances containing language switches on lines 2, 3, 4, 5'
        in logged)
    assert (
        'language switch flags have been removed '
        '(applying "remove-flags" policy)'
        in logged)
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='language switch only exists for espeak-ng')
@pytest.mark.parametrize('njobs', [1, 3])
def test_language_switch_remove_utterance(caplog, langswitch_text, njobs):
    """With 'remove-utterance', lines with a language switch become empty.

    Also checks that an unknown policy name is rejected when the backend is
    created.

    """
    backend = EspeakBackend('fr-fr', language_switch='remove-utterance')
    out = backend.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
    # only the first utterance has no language switch
    assert out == ['ʒɛm lɑ̃ɡlɛ', '', '', '', '']

    messages = [msg[2] for msg in caplog.record_tuples]
    assert (
        'removed 4 utterances containing language switches '
        '(applying "remove-utterance" policy)'
        in messages)

    # the constructor is expected to raise, so its result is not bound
    with pytest.raises(RuntimeError):
        EspeakBackend('fr-fr', language_switch='foo')
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='language switch only exists for espeak-ng')
@pytest.mark.parametrize(
    'policy', ('keep-flags', 'remove-flags', 'remove-utterance'))
def test_no_switch(policy, caplog):
    """Text without language switch gives the same output for any policy."""
    utterances = ["j'aime l'anglais", "tu parles le français"]
    phonemizer = EspeakBackend('fr-fr', language_switch=policy)
    phonemized = phonemizer.phonemize(
        utterances, separator=Separator(), strip=True)
    assert phonemized == ['ʒɛm lɑ̃ɡlɛ', 'ty paʁl lə fʁɑ̃sɛ']

    # nothing must have been logged
    logged = [record[2] for record in caplog.record_tuples]
    assert not logged
@@ -0,0 +1,79 @@
"""Tests of the phonemizer.backend.espeak.words_mismatch module"""
# pylint: disable=missing-docstring
# pylint: disable=redefined-outer-name
import pytest
import re
from phonemizer import phonemize
from phonemizer.backend.espeak.words_mismatch import Ignore
from phonemizer.separator import Separator, default_separator
@pytest.fixture
def text():
    """Three English utterances used as input by the mismatch tests."""
    utterances = ["How are you?", "I have been busy", "I won't have time"]
    return utterances
def test_count_words():
    """Check Ignore._count_words on empty, single and padded inputs."""
    # pylint: disable=protected-access
    def count_words(phn):
        return Ignore._count_words(phn, wordsep=default_separator.word)

    cases = (
        ([''], [0]),
        (['a'], [1]),
        (['aaa'], [1]),
        ([' aaa '], [1]),
        ([' a a \taa '], [3]))
    for phonemized, expected in cases:
        assert count_words(phonemized) == expected
def test_bad():
    """Invalid uses of the words_mismatch option raise RuntimeError."""
    rejected = (
        {'words_mismatch': 'foo'},
        {'backend': 'festival', 'words_mismatch': 'remove'})

    for options in rejected:
        with pytest.raises(RuntimeError):
            phonemize('', **options)
@pytest.mark.parametrize('mode', ['ignore', 'warn', 'remove'])
def test_mismatch(caplog, text, mode):
    """Check output and log messages for each words_mismatch mode."""
    phonemized = phonemize(
        text, backend='espeak', language='en-us', words_mismatch=mode)
    summary = 'words count mismatch on 67.0% of the lines (2/3)'

    if mode == 'remove':
        # the two mismatched lines are replaced by empty strings
        assert phonemized == ['haʊ ɑːɹ juː ', '', '']
        logged = [record[2] for record in caplog.record_tuples]
        assert len(logged) == 2
        assert summary in logged
        assert 'removing the mismatched lines' in logged
    elif mode in ('ignore', 'warn'):
        # both modes leave the phonemized lines untouched
        assert phonemized == [
            'haʊ ɑːɹ juː ', 'aɪ hɐvbɪn bɪzi ', 'aɪ woʊntɐv taɪm ']
        logged = [record[2] for record in caplog.record_tuples]
        if mode == 'ignore':
            assert len(logged) == 1
        else:
            # 'warn' additionally logs one message per mismatched line
            assert len(logged) == 3
            assert (
                'words count mismatch on line 2 (expected 4 words but get 3)'
                in logged)
            assert (
                'words count mismatch on line 3 (expected 4 words but get 3)'
                in logged)
        assert summary in logged
# from https://github.com/bootphon/phonemizer/issues/169
def test_custom_separator(caplog):
    """A custom word separator must not trigger a mismatch warning."""
    separator = Separator(word='|', phone=' ')
    phonemized = phonemize(
        'try',
        backend='espeak',
        language='en-us',
        separator=separator,
        words_mismatch='warn')
    assert phonemized == 't ɹ aɪ |'

    logged = [record[2] for record in caplog.record_tuples]
    assert len(logged) == 0
@@ -0,0 +1,134 @@
# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the EspeakWrapper class"""
# pylint: disable=missing-docstring
# pylint: disable=redefined-outer-name
import os
import pathlib
import pickle
import sys
import pytest
from phonemizer.backend.espeak.wrapper import EspeakWrapper
from phonemizer.backend import EspeakMbrolaBackend
@pytest.fixture
def wrapper():
    """Provide a new EspeakWrapper instance to each test requesting it."""
    instance = EspeakWrapper()
    return instance
def test_basic(wrapper):
    """Check version, library path and data path of the wrapper."""
    assert wrapper.version >= (1, 48)

    library = wrapper.library_path
    assert 'espeak' in str(library)
    assert os.path.isabs(library)

    # not None, no raise
    assert os.path.isabs(wrapper.data_path)
def test_available_voices(wrapper):
    """Default voices exist and do not overlap with the mbrola ones."""
    espeak_voices = set(wrapper.available_voices())
    assert espeak_voices

    mbrola_voices = set(wrapper.available_voices('mbrola'))
    # can be empty if no mbrola voice installed (occurs only on Windows, at
    # least within the github CI pipeline)
    if not mbrola_voices:
        return
    assert not espeak_voices.intersection(mbrola_voices)
def test_set_get_voice(wrapper):
    """Check voice selection on the wrapper with valid and invalid codes."""
    # no voice is selected on a new wrapper
    assert wrapper.voice is None

    with pytest.raises(RuntimeError) as err:
        wrapper.set_voice('')
    # look at the message of the raised exception itself: str() of the
    # ExceptionInfo wrapper is only a repr-like summary of it
    assert 'invalid voice code ""' in str(err.value)

    wrapper.set_voice('fr-fr')
    assert wrapper.voice.language == 'fr-fr'
    assert wrapper.voice.name in (
        'French (France)',  # >1.48.3
        'french')  # older espeak

    wrapper.set_voice('en-us')
    assert wrapper.voice.language == 'en-us'
    assert wrapper.voice.name in (
        'English (America)',  # >1.48.3
        'english-us')  # older espeak

    # no mbrola voices available on Windows by default (at least on the github
    # CI pipeline)
    if sys.platform != 'win32':
        wrapper.set_voice('mb-af1')
        assert wrapper.voice.language == 'af'
        assert wrapper.voice.name == 'afrikaans-mbrola-1'

    with pytest.raises(RuntimeError) as err:
        wrapper.set_voice('some non existant voice code')
    assert 'invalid voice code' in str(err.value)
def _test_pickle(voice):
    """Pickle a wrapper set to `voice` and compare it with its copy."""
    # the wrapper is pickled when using espeak backend on multiple jobs
    original = EspeakWrapper()
    original.set_voice(voice)

    restored = pickle.loads(pickle.dumps(original))

    for attribute in ('version', 'library_path', 'data_path', 'voice'):
        assert getattr(original, attribute) == getattr(restored, attribute)
def test_pickle_en_us():
    """Run the pickle round trip with the en-us voice."""
    voice = 'en-us'
    _test_pickle(voice)
@pytest.mark.skipif(
    not (EspeakMbrolaBackend.is_available()
         and EspeakMbrolaBackend.is_supported_language('mb-fr1')),
    reason='mbrola or mb-fr1 voice not installed')
def test_pickle_mb_fr1():
    """Run the pickle round trip with the mb-fr1 mbrola voice."""
    voice = 'mb-fr1'
    _test_pickle(voice)
def test_twice():
    """Two wrappers share library and data but have independent voices."""
    first = EspeakWrapper()
    second = EspeakWrapper()

    for attribute in ('data_path', 'version', 'library_path'):
        assert getattr(first, attribute) == getattr(second, attribute)

    first.set_voice('fr-fr')
    assert first.voice.language == 'fr-fr'

    # setting a voice on the second wrapper must not change the first one
    second.set_voice('en-us')
    assert second.voice.language == 'en-us'
    assert first.voice.language == 'fr-fr'

    # pylint: disable=protected-access
    assert first._espeak._tempdir != second._espeak._tempdir
@pytest.mark.skipif(sys.platform == 'win32', reason='not supported on Windows')
def test_deletion():
    """The wrapper temporary directory is gone once the wrapper is deleted."""
    # pylint: disable=protected-access
    wrapper = EspeakWrapper()
    tempdir = pathlib.Path(wrapper._espeak._tempdir)

    del wrapper
    assert not tempdir.exists()
@@ -0,0 +1,119 @@
# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the festival backend"""
# pylint: disable=missing-docstring
import os
import pathlib
import shutil
import pytest
from phonemizer.separator import Separator
from phonemizer.backend import FestivalBackend
def _test(text, separator=Separator(
        word=' ', syllable='|', phone='-')):
    """Phonemize `text` with a new en-us festival backend.

    The private `_phonemize_aux` method is called directly, with `text`, 0,
    `separator` and True as positional arguments, and its result is returned.

    """
    festival = FestivalBackend('en-us')
    # pylint: disable=protected-access
    phonemized = festival._phonemize_aux(text, 0, separator, True)
    return phonemized
@pytest.mark.skipif(
    FestivalBackend.version() <= (2, 1),
    reason='festival-2.1 gives different results than further versions '
    'for syllable boundaries')
def test_hello():
    """Phonemize 'hello world' as one utterance and as two utterances."""
    single = _test(['hello world'])
    assert single == ['hh-ax|l-ow w-er-l-d']

    split = _test(['hello', 'world'])
    assert split == ['hh-ax|l-ow', 'w-er-l-d']
@pytest.mark.parametrize('text', ['', ' ', ' ', '(', '()', '"', "'"])
def test_bad_input(text):
    """Blank or punctuation-only input gives an empty output list."""
    phonemized = _test(text)
    assert phonemized == []
def test_quote():
    """Check how quotes inside a word affect the festival output."""
    cases = (
        (["it's"], ['ih-t-s']),
        (["its"], ['ih-t-s']),
        (["it s"], ['ih-t eh-s']),
        (['it "s'], ['ih-t eh-s']))

    for utterance, expected in cases:
        assert _test(utterance) == expected
def test_im():
    """Compare "I'm" and "Im" at the start of an otherwise equal sentence."""
    separator = Separator(word=' ', syllable='', phone='')

    with_quote = _test(["I'm looking for an image"], separator)
    assert with_quote == ['aym luhkaxng faor axn ihmaxjh']

    without_quote = _test(["Im looking for an image"], separator)
    assert without_quote == ['ihm luhkaxng faor axn ihmaxjh']
@pytest.mark.skipif(
    not shutil.which('festival'), reason='festival not in PATH')
def test_path_good():
    """Set the festival executable explicitly to the one found in PATH."""
    try:
        festival = shutil.which('festival')
        FestivalBackend.set_executable(festival)

        expected = pathlib.Path(festival)
        assert FestivalBackend('en-us').executable() == expected
    finally:
        # restore the festival path to default
        FestivalBackend.set_executable(None)
@pytest.mark.skipif(
    'PHONEMIZER_FESTIVAL_EXECUTABLE' in os.environ,
    reason='environment variable precedence')
def test_path_bad():
    """Point the festival executable to invalid files, expect RuntimeError."""
    try:
        # corrupt the default festival path, use the python executable instead
        FestivalBackend.set_executable(shutil.which('python'))

        with pytest.raises(RuntimeError):
            FestivalBackend('en-us').phonemize(['hello'])
        with pytest.raises(RuntimeError):
            FestivalBackend.version()

        # this test file is rejected directly by set_executable
        with pytest.raises(RuntimeError):
            FestivalBackend.set_executable(__file__)
    finally:
        # restore the festival path to default
        FestivalBackend.set_executable(None)
@pytest.mark.skipif(
    'PHONEMIZER_FESTIVAL_EXECUTABLE' in os.environ,
    reason='cannot modify environment')
def test_path_venv():
    """Set an invalid executable through the environment, expect errors."""
    variable = 'PHONEMIZER_FESTIVAL_EXECUTABLE'
    try:
        # the python executable is not festival
        os.environ[variable] = shutil.which('python')
        with pytest.raises(RuntimeError):
            FestivalBackend('en-us').phonemize(['hello'])
        with pytest.raises(RuntimeError):
            FestivalBackend.version()

        # neither is this test file
        os.environ[variable] = __file__
        with pytest.raises(RuntimeError):
            FestivalBackend.version()
    finally:
        # remove the variable, ignoring the case where it is already unset
        os.environ.pop(variable, None)
@@ -0,0 +1,14 @@
"""Tests to import the phonemize function"""
# pylint: disable=missing-docstring
# pylint: disable=import-outside-toplevel
def test_relative():
    """Import phonemize from the package top level and call it."""
    from phonemizer import phonemize

    phonemized = phonemize('a')
    assert phonemized == 'eɪ '
def test_absolute():
    """Import phonemize from the phonemizer.phonemize module and call it."""
    from phonemizer.phonemize import phonemize

    phonemized = phonemize('a')
    assert phonemized == 'eɪ '
@@ -0,0 +1,158 @@
# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the command line interface"""
# pylint: disable=missing-docstring
import os
import pathlib
import tempfile
import shlex
import sys
import pytest
from phonemizer.backend import EspeakMbrolaBackend, EspeakBackend
from phonemizer import main, backend, logger
def _test(text, expected_output, args=''):
    """Run the command line program on `text` and check what it writes.

    `text` is written (utf8 encoded) to a temporary input file, then
    `main.main()` is called with `sys.argv` set to read that file and write
    to a temporary output file. `args` is a string of extra command line
    options, split with `shlex.split`. The decoded content of the output
    file, stripped of line separators, must be equal to `expected_output`.

    `sys.argv` is restored on exit, so that the arguments used here (paths
    to files deleted with the temporary directory) do not leak into
    whatever runs next.

    """
    saved_argv = sys.argv
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            input_file = pathlib.Path(tmpdir) / 'input.txt'
            output_file = pathlib.Path(tmpdir) / 'output.txt'
            with open(input_file, 'wb') as finput:
                finput.write(text.encode('utf8'))

            # the program arguments are provided through sys.argv
            sys.argv = ['unused', f'{input_file}', '-o', f'{output_file}']
            if args:
                sys.argv += shlex.split(args)
            main.main()

            with open(output_file, 'rb') as foutput:
                output = foutput.read().decode()

            # silly fix for windows
            assert output.replace('\r', '').strip(os.linesep) \
                == expected_output.replace('\r', '')
    finally:
        sys.argv = saved_argv
def test_help():
    """The -h option makes the program exit."""
    arguments = ['foo', '-h']
    sys.argv = arguments
    with pytest.raises(SystemExit):
        main.main()
def test_version():
    """The --version option runs without raising."""
    arguments = ['foo', '--version']
    sys.argv = arguments
    main.main()
def test_list_languages():
    """The --list-languages option runs without raising."""
    arguments = ['foo', '--list-languages']
    sys.argv = arguments
    main.main()
def test_readme():
    """Run a series of command line examples and check their output."""
    examples = (
        ('hello world', 'həloʊ wɜːld ', '--verbose'),
        ('hello world', 'həloʊ wɜːld ', '--quiet'),
        ('hello world', 'hello world | həloʊ wɜːld ', '--prepend-text'),
        ('hello world', 'hhaxlow werld', '-b festival --strip'),
        ('bonjour le monde', 'bɔ̃ʒuʁ lə mɔ̃d ', '-l fr-fr'),
        ('bonjour le monde',
         'b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword ',
         '-l fr-fr -p " " -w ";eword "'))

    for text, expected, options in examples:
        _test(text, expected, options)
@pytest.mark.skipif(
    # version() is compared as a tuple of integers, as in the festival
    # backend tests: a membership test against the string '2.1' would never
    # match and the test would never be skipped
    backend.FestivalBackend.version() <= (2, 1),
    reason='festival-2.1 gives different results than further versions '
    'for syllable boundaries')
def test_readme_festival_syll():
    """Run the festival command line example with syllable separators."""
    _test('hello world',
          'hh ax ;esyll l ow ;esyll ;eword w er l d ;esyll ;eword ',
          "-p ' ' -s ';esyll ' -w ';eword ' -b festival -l en-us")
@pytest.mark.parametrize('njobs', [1, 6])
def test_njobs(njobs):
    """The output lines keep the input order whatever the number of jobs."""
    input_lines = (
        'hello world',
        'goodbye',
        'third line',
        'yet another')
    expected_lines = (
        'h-ə-l-oʊ w-ɜː-l-d',
        'ɡ-ʊ-d-b-aɪ',
        'θ-ɜː-d l-aɪ-n',
        'j-ɛ-t ɐ-n-ʌ-ð-ɚ')
    options = f'--strip -j {njobs} -l en-us -b espeak -p "-" -s "|" -w " "'

    _test(
        os.linesep.join(input_lines),
        os.linesep.join(expected_lines),
        options)
def test_unicode():
    """Run the segments backend on a yucatec word from the command line."""
    options = '-l yucatec -b segments'
    _test('untuʼule', 'untṵːle ', options)
def test_logger():
    """get_logger raises RuntimeError when called with verbosity=1."""
    invalid_verbosity = 1
    with pytest.raises(RuntimeError):
        logger.get_logger(verbosity=invalid_verbosity)
@pytest.mark.skipif(
    not (EspeakMbrolaBackend.is_available()
         and EspeakMbrolaBackend.is_supported_language('mb-fr1')),
    reason='mbrola or mb-fr1 voice not installed')
def test_espeak_mbrola():
    """Run the espeak-mbrola backend with mb-fr1 from the command line."""
    options = '-b espeak-mbrola -l mb-fr1 -p" " --preserve-punctuation'
    _test('coucou toi!', 'k u k u t w a ', options)
def test_espeak_path():
    """Pass the espeak library path through the --espeak-library option."""
    library = pathlib.Path(backend.EspeakBackend.library())
    if sys.platform == 'win32':
        # escape backslashes and spaces in the path on Windows
        library = str(library).replace('\\', '\\\\').replace(' ', '\\ ')

    option = f'--espeak-library={library}'
    _test('hello world', 'həloʊ wɜːld ', option)
def test_festival_path():
    """Pass the festival path through the --festival-executable option."""
    executable = pathlib.Path(backend.FestivalBackend.executable())
    if sys.platform == 'win32':
        # escape backslashes and spaces in the path on Windows
        executable = str(executable).replace(
            '\\', '\\\\').replace(' ', '\\ ')

    options = f'--festival-executable={executable} -b festival'
    _test('hello world', 'hhaxlow werld ', options)
@pytest.mark.parametrize(
    'args, expected', [
        ('',
         'həloʊ wɜːld θɹiː ziəɹoʊziəɹoʊ ziəɹoʊ ɔːɹ tuː fɪfti həloʊ '),
        ('--preserve-punctuation',
         'həloʊ, ,wɜːld? θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. ¿həloʊ? '),
        ('--preserve-punctuation '
         '--punctuation-marks-is-regex '
         '--punctuation-marks "[^a-zA-ZÀ-ÖØ-öø-ÿ0-9\'\\-]"',
         'həloʊ, ,wɜːld? ‡ θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. ¿həloʊ? '),
        ('--preserve-punctuation '
         '--punctuation-marks-is-regex '
         '--punctuation-marks "[;:\\!?¡¿—…\\\"«»“”]|[,.](?!\\d)"',
         'həloʊ, ,wɜːld? θɹiː θaʊzənd, ɔːɹ tuː pɔɪnt faɪv ziəɹoʊ. ¿həloʊ? ')])
def test_punctuation_is_regex(args, expected):
    """Check punctuation handling options given on the command line."""
    sentence = "hello, ,world? ‡ 3,000, or 2.50. ¿hello?"
    print(args)
    _test(sentence, expected, args)
def test_invalid_punctuation_regex():
    """The punctuation regex "[*," makes the program exit."""
    options = '--punctuation-marks-is-regex --punctuation-marks "[*,"'
    with pytest.raises(SystemExit):
        _test('hello world', None, options)
@@ -0,0 +1,109 @@
# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the espeak-mbrola backend"""
# pylint: disable=missing-docstring
# pylint: disable=redefined-outer-name
import pytest
from phonemizer.backend import EspeakMbrolaBackend
from phonemizer.separator import Separator
@pytest.fixture(scope='session')
def backend():
    """Provide a session-scoped espeak-mbrola backend for mb-fr1."""
    mbrola = EspeakMbrolaBackend('mb-fr1')
    return mbrola
@pytest.mark.skipif(
    not (EspeakMbrolaBackend.is_available()
         and EspeakMbrolaBackend.is_supported_language('mb-fr1')),
    reason='mbrola or mb-fr1 voice not installed')
@pytest.mark.parametrize(
    'text, expected',
    [
        # plosives
        ('pont', 'po~'),
        ('bon', 'bo~'),
        ('temps', 'ta~'),
        ('dans', 'da~'),
        ('quand', 'ka~'),
        ('gant', 'ga~'),
        # fricatives
        ('femme', 'fam'),
        ('vent', 'va~'),
        ('sans', 'sa~'),
        ('champ', 'Sa~'),
        ('gens', 'Za~'),
        ('ion', 'jo~'),
        # nasals
        ('mont', 'mo~'),
        ('nom', 'no~'),
        ('oignon', 'onjo~'),
        ('ping', 'piN'),
        # liquid glides
        ('long', 'lo~'),
        ('rond', 'Ro~'),
        ('coin', 'kwe~'),
        ('juin', 'Zye~'),
        ('pierre', 'pjER'),
        # vowels
        ('si', 'si'),
        ('ses', 'se'),
        ('seize', 'sEz'),
        ('patte', 'pat'),
        ('pâte', 'pat'),
        ('comme', 'kOm'),
        ('gros', 'gRo'),
        ('doux', 'du'),
        ('du', 'dy'),
        ('deux', 'd2'),
        ('neuf', 'n9f'),
        ('justement', 'Zystma~'),
        ('vin', 've~'),
        ('vent', 'va~'),
        ('bon', 'bo~'),
        ('brun', 'bR9~')])
def test_sampa_fr(backend, text, expected):
    """Phonemize one French word with mb-fr1 and an empty phone separator."""
    phonemized = backend.phonemize(
        [text], strip=True, separator=Separator(phone=''))
    assert phonemized[0] == expected
@pytest.mark.skipif(
    not (EspeakMbrolaBackend.is_available()
         and EspeakMbrolaBackend.is_supported_language('mb-fr1')),
    reason='mbrola or mb-fr1 voice not installed')
def test_french_sampa(backend):
    """Phonemize a French sentence with mb-fr1, with and without strip."""
    sentence = ['bonjour le monde']
    separator = Separator(word=None, phone=' ')

    unstripped = backend.phonemize(sentence, separator=separator, strip=False)
    assert unstripped == ['b o~ Z u R l @ m o~ d ']

    stripped = backend.phonemize(sentence, separator=separator, strip=True)
    assert stripped == ['b o~ Z u R l @ m o~ d']

    # an empty string or a lone quote gives an empty output
    for blank in ([''], ['"']):
        assert backend.phonemize(
            blank, separator=separator, strip=True) == ['']
@pytest.mark.skipif(
    not EspeakMbrolaBackend.is_available(),
    reason='mbrola not installed')
def test_mbrola_bad_language():
    """The language code 'foo-bar' is reported as not supported."""
    supported = EspeakMbrolaBackend.is_supported_language('foo-bar')
    assert not supported
@@ -0,0 +1,291 @@
# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the phonemizer.phonemize function"""
# pylint: disable=missing-docstring
import os
import pytest
from phonemizer.phonemize import phonemize
from phonemizer.separator import Separator
from phonemizer.backend import EspeakBackend, EspeakMbrolaBackend
def test_bad_backend():
    """Unknown backends and invalid uses of tie raise RuntimeError."""
    # backend names that do not exist
    for unknown in ('fetiv', 'foo'):
        with pytest.raises(RuntimeError):
            phonemize('', backend=unknown)

    # the tie option is rejected with those backend names
    for name in ('festival', 'mbrola', 'segments'):
        with pytest.raises(RuntimeError):
            phonemize('', tie=True, backend=name)

    # tie requested on espeak together with a custom separator
    with pytest.raises(RuntimeError):
        phonemize(
            '', tie=True, backend='espeak',
            separator=Separator(' ', None, '-'))
def test_bad_language():
    """Languages not supported by the given backend raise RuntimeError."""
    rejected = (
        ('fr-fr', 'festival'),
        ('ffr', 'espeak'),
        ('/path/to/nonexisting/file', 'segments'),
        ('creep', 'segments'))

    for language, backend in rejected:
        with pytest.raises(RuntimeError):
            phonemize('', language=language, backend=backend)
def test_text_type():
    """The output type follows the input type (list or multiline str)."""
    as_list = ['one two', 'three', 'four five']
    as_str = os.linesep.join(as_list)
    options = dict(language='en-us', backend='espeak', strip=True)

    phn_list = phonemize(as_list, **options)
    phn_str = phonemize(as_str, **options)
    prepended = phonemize(as_str, prepend_text=True, **options)

    # with prepend_text each output item is indexed as (text, phonemes)
    texts = [item[0] for item in prepended]
    phones = [item[1] for item in prepended]

    assert isinstance(phn_list, list)
    assert isinstance(phn_str, str)
    assert os.linesep.join(phn_list) == phn_str
    assert os.linesep.join(phones) == phn_str
    assert texts == as_list
@pytest.mark.skipif(
    not EspeakBackend.is_espeak_ng(),
    reason='language switch only exists for espeak-ng')
def test_lang_switch():
    """Combine prepend_text with the 'remove-utterance' switch policy."""
    utterances = ['bonjour apple', 'bonjour toi']
    phonemized = phonemize(
        utterances,
        language='fr-fr',
        backend='espeak',
        prepend_text=True,
        language_switch='remove-utterance')

    # the first utterance is phonemized to an empty string
    expected = [('bonjour apple', ''), ('bonjour toi', 'bɔ̃ʒuʁ twa ')]
    assert phonemized == expected
@pytest.mark.parametrize('njobs', [2, 4])
def test_espeak(njobs):
    """Phonemize list, space-joined and line-joined input with espeak."""
    lines = ['one two', 'three', 'four five']
    options = dict(language='en-us', backend='espeak', njobs=njobs)

    # list input
    from_list = phonemize(lines, strip=True, **options)
    assert from_list == ['wʌn tuː', 'θɹiː', 'foːɹ faɪv']

    # single line string input
    from_line = phonemize(' '.join(lines), strip=False, **options)
    assert from_line == ' '.join(['wʌn tuː', 'θɹiː', 'foːɹ faɪv '])

    # multiline string input
    from_lines = phonemize(os.linesep.join(lines), strip=False, **options)
    assert from_lines == os.linesep.join(
        ['wʌn tuː ', 'θɹiː ', 'foːɹ faɪv '])
@pytest.mark.skipif(
    not (EspeakMbrolaBackend.is_available()
         and EspeakMbrolaBackend.is_supported_language('mb-fr1')),
    reason='mbrola or mb-fr1 voice not installed')
@pytest.mark.parametrize('njobs', [2, 4])
def test_espeak_mbrola(caplog, njobs):
    """Phonemize with espeak-mbrola and check the logged limitations."""
    lines = ['un deux', 'trois', 'quatre cinq']
    phonemized = phonemize(
        lines,
        language='mb-fr1',
        backend='espeak-mbrola',
        njobs=njobs,
        preserve_punctuation=True)
    assert phonemized == ['9~d2', 'tRwa', 'katRse~k']

    logged = [record[2] for record in caplog.record_tuples]
    for expected_message in (
            'espeak-mbrola backend cannot preserve punctuation',
            'espeak-mbrola backend cannot preserve word separation'):
        assert expected_message in logged
@pytest.mark.parametrize('njobs', [2, 4])
def test_festival(njobs):
    """Phonemize English with festival on several jobs, as list and as str."""
    lines = ['one two', 'three', 'four five']

    def run(inp, strip):
        # every call shares the language, the backend and the job count
        return phonemize(
            inp, language='en-us', backend='festival',
            strip=strip, njobs=njobs)

    # list input, not stripped
    assert run(lines, False) == ['wahn tuw ', 'thriy ', 'faor fayv ']

    # str inputs, stripped
    stripped = ['wahn tuw', 'thriy', 'faor fayv']
    assert run(' '.join(lines), True) == ' '.join(stripped)
    assert run(os.linesep.join(lines), True) == os.linesep.join(stripped)
def test_festival_bad():
    """Options valid for espeak only must be rejected by festival."""
    text = ['one two', 'three', 'four five']
    espeak_only_options = (
        {'with_stress': True},
        {'language_switch': 'remove-flags'})

    for option in espeak_only_options:
        with pytest.raises(RuntimeError):
            phonemize(text, language='en-us', backend='festival', **option)
@pytest.mark.parametrize('njobs', [2, 4])
def test_segments(njobs):
    """Phonemize Yucatec with the segments backend on several jobs."""
    # one two three four five in Maya Yucatec
    lines = ['untuʼuleʼ kaʼapʼeʼel', 'oʼoxpʼeʼel', 'kantuʼuloʼon chincho']

    def run(inp, strip):
        # every call shares the language, the backend and the job count
        return phonemize(
            inp, language='yucatec', backend='segments',
            strip=strip, njobs=njobs)

    # list input, not stripped
    assert run(lines, False) == [
        'untṵːlḛ ka̰ːpʼḛːl ', 'o̰ːʃpʼḛːl ', 'kantṵːlo̰ːn t̠͡ʃint̠͡ʃo ']

    # a single space-joined utterance, not stripped
    assert run(' '.join(lines), False) == ' '.join(
        ['untṵːlḛ ka̰ːpʼḛːl', 'o̰ːʃpʼḛːl', 'kantṵːlo̰ːn t̠͡ʃint̠͡ʃo '])

    # a multi-line string, stripped
    assert run(os.linesep.join(lines), True) == os.linesep.join(
        ['untṵːlḛ ka̰ːpʼḛːl', 'o̰ːʃpʼḛːl', 'kantṵːlo̰ːn t̠͡ʃint̠͡ʃo'])
@pytest.mark.parametrize(
    'backend, empty_lines, punctuation, prepend_text, text, expected', [
        # --- without prepend_text: output is a list of phonemizations ---
        ('espeak', False, False, False,
         ['hello world!', '', 'goodbye'],
         ['həloʊ wɜːld ', 'ɡʊdbaɪ ']),
        ('espeak', False, True, False,
         ['hello world!', '', 'goodbye'],
         ['həloʊ wɜːld! ', 'ɡʊdbaɪ ']),
        ('espeak', True, False, False,
         ['hello world!', '', 'goodbye'],
         ['həloʊ wɜːld ', '', 'ɡʊdbaɪ ']),
        ('espeak', True, True, False,
         ['hello world!', '', 'goodbye'],
         ['həloʊ wɜːld! ', '', 'ɡʊdbaɪ ']),
        ('segments', False, False, False,
         ['achi acho?', '', 'achi acho'],
         ['ʌtʃɪ ʌtʃʊ ', 'ʌtʃɪ ʌtʃʊ ']),
        ('segments', False, True, False,
         ['achi acho?', '', 'achi acho'],
         ['ʌtʃɪ ʌtʃʊ? ', 'ʌtʃɪ ʌtʃʊ ']),
        ('segments', True, False, False,
         ['achi acho?', '', 'achi acho'],
         ['ʌtʃɪ ʌtʃʊ ', '', 'ʌtʃɪ ʌtʃʊ ']),
        ('segments', True, True, False,
         ['achi acho?', '', 'achi acho'],
         ['ʌtʃɪ ʌtʃʊ? ', '', 'ʌtʃɪ ʌtʃʊ ']),
        ('festival', False, False, False,
         ['hello world!', '', 'goodbye'],
         ['hhaxlow werld ', 'guhdbay ']),
        ('festival', False, True, False,
         ['hello world!', '', 'goodbye'],
         ['hhaxlow werld! ', 'guhdbay ']),
        ('festival', True, False, False,
         ['hello world!', '', 'goodbye'],
         ['hhaxlow werld ', '', 'guhdbay ']),
        ('festival', True, True, False,
         ['hello world!', '', 'goodbye'],
         ['hhaxlow werld! ', '', 'guhdbay ']),
        # --- with prepend_text: output is a list of (text, phones) ---
        ('espeak', False, False, True,
         ['hello world!', '', 'goodbye'],
         [('hello world!', 'həloʊ wɜːld '),
          ('goodbye', 'ɡʊdbaɪ ')]),
        ('espeak', False, True, True,
         ['hello world!', '', 'goodbye'],
         [('hello world!', 'həloʊ wɜːld! '),
          ('goodbye', 'ɡʊdbaɪ ')]),
        ('espeak', True, False, True,
         ['hello world!', '', 'goodbye'],
         [('hello world!', 'həloʊ wɜːld '),
          ('', ''),
          ('goodbye', 'ɡʊdbaɪ ')]),
        ('espeak', True, True, True,
         ['hello world!', '', 'goodbye'],
         [('hello world!', 'həloʊ wɜːld! '),
          ('', ''),
          ('goodbye', 'ɡʊdbaɪ ')]),
        ('segments', False, False, True,
         ['achi acho?', '', 'achi acho'],
         [('achi acho?', 'ʌtʃɪ ʌtʃʊ '),
          ('achi acho', 'ʌtʃɪ ʌtʃʊ ')]),
        ('segments', False, True, True,
         ['achi acho?', '', 'achi acho'],
         [('achi acho?', 'ʌtʃɪ ʌtʃʊ? '),
          ('achi acho', 'ʌtʃɪ ʌtʃʊ ')]),
        ('segments', True, False, True,
         ['achi acho?', '', 'achi acho'],
         [('achi acho?', 'ʌtʃɪ ʌtʃʊ '),
          ('', ''),
          ('achi acho', 'ʌtʃɪ ʌtʃʊ ')]),
        ('segments', True, True, True,
         ['achi acho?', '', 'achi acho'],
         [('achi acho?', 'ʌtʃɪ ʌtʃʊ? '),
          ('', ''),
          ('achi acho', 'ʌtʃɪ ʌtʃʊ ')]),
        ('festival', False, False, True,
         ['hello world!', '', 'goodbye'],
         [('hello world!', 'hhaxlow werld '),
          ('goodbye', 'guhdbay ')]),
        ('festival', False, True, True,
         ['hello world!', '', 'goodbye'],
         [('hello world!', 'hhaxlow werld! '),
          ('goodbye', 'guhdbay ')]),
        ('festival', True, False, True,
         ['hello world!', '', 'goodbye'],
         [('hello world!', 'hhaxlow werld '),
          ('', ''),
          ('goodbye', 'guhdbay ')]),
        ('festival', True, True, True,
         ['hello world!', '', 'goodbye'],
         [('hello world!', 'hhaxlow werld! '),
          ('', ''),
          ('goodbye', 'guhdbay ')]),
    ])
def test_preserve_empty_lines(
        backend, empty_lines, punctuation, prepend_text, text, expected):
    """Check how empty input lines are kept or dropped by phonemize().

    Each case combines a backend with the ``preserve_empty_lines``,
    ``preserve_punctuation`` and ``prepend_text`` options and compares the
    whole output against the expected value.
    """
    # the segments backend is exercised on cree, the others on English
    if backend == 'segments':
        language = 'cree'
    else:
        language = 'en-us'

    observed = phonemize(
        text, language=language, backend=backend, prepend_text=prepend_text,
        preserve_punctuation=punctuation, preserve_empty_lines=empty_lines)
    assert expected == observed
@pytest.mark.parametrize(
    'backend, empty_lines, punctuation, text, expected', [
        # the single empty line is expected back only when empty lines
        # are preserved, whatever the backend or punctuation setting
        (backend, empty_lines, punctuation, [''], [''] if empty_lines else [])
        for backend in ('espeak', 'segments', 'festival')
        for empty_lines in (False, True)
        for punctuation in (False, True)])
def test_empty_input(backend, empty_lines, punctuation, text, expected):
    """Phonemize an input made of a single empty line."""
    # the segments backend is exercised on cree, the others on English
    if backend == 'segments':
        language = 'cree'
    else:
        language = 'en-us'

    observed = phonemize(
        text, language=language, backend=backend,
        preserve_punctuation=punctuation, preserve_empty_lines=empty_lines)
    assert expected == observed
@@ -0,0 +1,274 @@
# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the punctuation processing"""
# pylint: disable=missing-docstring
from pathlib import Path
import pytest
import re
from phonemizer.backend import EspeakBackend, FestivalBackend, SegmentsBackend
from phonemizer.punctuation import Punctuation
from phonemizer.phonemize import phonemize
from phonemizer.separator import Separator, default_separator
# Version flags, each evaluated once at import time from the version
# tuple reported by the corresponding backend.

# espeak is at least 1.50
ESPEAK_150 = EspeakBackend.version() >= (1, 50)

# espeak is at least 1.49.3
ESPEAK_149 = EspeakBackend.version() >= (1, 49, 3)

# festival is at least 2.5
FESTIVAL_25 = FestivalBackend.version() >= (2, 5)
@pytest.mark.parametrize(
    'inp, out', [
        ('a, b,c.', 'a b c'),
        ('abc de', 'abc de'),
        ('!d.d. dd?? d!', 'd d dd d'),
    ])
def test_remove(inp, out):
    """Punctuation.remove() with the default marks on a plain string."""
    cleaned = Punctuation().remove(inp)
    assert cleaned == out
@pytest.mark.parametrize(
    'inp', [
        ['.a.b.c.'],
        ['a, a?', 'b, b'],
        ['a, a?', 'b, b', '!'],
        ['a, a?', '!?', 'b, b'],
        ['!?', 'a, a?', 'b, b'],
        ['a, a, a'],
        ['a, a?', 'aaa bb', '.bb, b', 'c', '!d.d. dd?? d!'],
        ['Truly replied, "Yes".'],
        ['hi; ho,"'],
        ["!?"],
        ["!'"],
        ["It is ! (I think so)"],
        ["This {is} right"],
        ["[He] is right"],
    ])
def test_preserve(inp):
    """preserve() followed by restore() must give back the input text."""
    punct = Punctuation()
    # preserve() output is unpacked as (text, marks)
    stripped_text, marks = punct.preserve(inp)
    restored = punct.restore(
        stripped_text, marks, sep=default_separator, strip=True)
    assert inp == restored
@pytest.mark.parametrize(
    'text, expected_restore, expected_output', [
        (['hi; hi,"'],
         ['hi; hi," '],
         ['haɪ; haɪ, ']),
        (['hi; "hi,'],
         ['hi; "hi, '],
         ['haɪ; haɪ, '] if ESPEAK_149 else ['haɪ; haɪ, ']),
        (['"hi; hi,'],
         ['"hi; hi, '],
         ['haɪ; haɪ, '] if ESPEAK_149 else [' haɪ; haɪ, ']),
    ])
def test_preserve_2(text, expected_restore, expected_output):
    """Preserve punctuation with a custom set of marks excluding quotes.

    Checks both the preserve/restore round trip (not stripped) and the
    espeak phonemization obtained with the same marks.
    """
    marks = ".!;:,?"
    punct = Punctuation(marks=marks)

    preserved = punct.preserve(text)
    restored = punct.restore(*preserved, sep=default_separator, strip=False)
    assert expected_restore == restored

    phonemized = phonemize(
        text, backend="espeak",
        preserve_punctuation=True, punctuation_marks=marks)
    assert phonemized == expected_output
def test_custom():
    """Check the default marks and the assignment of custom marks."""
    sample = 'a,b.c'
    punct = Punctuation()

    # a fresh instance uses the default marks
    assert set(punct.marks) == set(punct.default_marks())
    assert punct.remove(sample) == 'a b c'

    # assigning a list of marks is rejected
    with pytest.raises(ValueError):
        punct.marks = ['?', '.']

    # assigning a str is accepted, only those two marks are then removed
    punct.marks = '?.'
    assert len(punct.marks) == 2
    assert punct.remove(sample) == 'a,b c'
def test_espeak():
    """Punctuation handling of the espeak backend, stripped or not."""
    text = 'hello, world!'

    # (preserve_punctuation, strip, expected phonemization)
    cases = [
        (False, True, 'həloʊ wɜːld'),
        (True, True, 'həloʊ, wɜːld!'),
        (False, False, 'həloʊ wɜːld '),
        (True, False, 'həloʊ, wɜːld! '),
    ]

    for preserve, strip, expected in cases:
        backend = EspeakBackend('en-us', preserve_punctuation=preserve)
        observed = backend.phonemize([text], strip=strip)[0]
        assert observed == expected
def test_festival():
    """Punctuation handling of the festival backend, stripped or not."""
    text = 'hello, world!'

    # (preserve_punctuation, strip, expected phonemization)
    cases = [
        (False, True, 'hhaxlow werld'),
        (True, True, 'hhaxlow, werld!'),
        (False, False, 'hhaxlow werld '),
        (True, False, 'hhaxlow, werld! '),
    ]

    for preserve, strip, expected in cases:
        backend = FestivalBackend('en-us', preserve_punctuation=preserve)
        observed = backend.phonemize([text], strip=strip)[0]
        assert observed == expected
def test_segments():
    """Punctuation handling of the segments backend, stripped or not."""
    text = 'achi, acho!'

    # (preserve_punctuation, strip, expected phonemization)
    cases = [
        (False, True, 'ʌtʃɪ ʌtʃʊ'),
        (True, True, 'ʌtʃɪ, ʌtʃʊ!'),
        (False, False, 'ʌtʃɪ ʌtʃʊ '),
        (True, False, 'ʌtʃɪ, ʌtʃʊ! '),
    ]

    for preserve, strip, expected in cases:
        backend = SegmentsBackend('cree', preserve_punctuation=preserve)
        observed = backend.phonemize([text], strip=strip)[0]
        assert observed == expected
# see https://github.com/bootphon/phonemizer/issues/54
@pytest.mark.parametrize(
    'text, expected', [
        ("!'", "! "),
        ("'!", "! "),
        ("!'!", "!! "),
        ("'!'", "! "),
    ])
def test_issue_54(text, expected):
    """Phonemize utterances made only of '!' and apostrophes with espeak."""
    phonemized = phonemize(
        [text], language='en-us', backend='espeak',
        preserve_punctuation=True)
    output = phonemized[0]
    assert expected == output
# see https://github.com/bootphon/phonemizer/issues/55
@pytest.mark.parametrize(
    'backend, marks, text, expected', [
        ('espeak', 'default', ['"Hey! "', '"hey,"'], ['"heɪ! " ', '"heɪ," ']),
        ('espeak', '.!;:,?', ['"Hey! " ', '"hey," '],
         ['heɪ! ', 'heɪ, '] if ESPEAK_150 else [' heɪ! ', ' heɪ, ']),
        ('espeak', 'default', ['! ?', 'hey!'], ['! ? ', 'heɪ! ']),
        ('espeak', '!', ['! ?', 'hey!'], ['! ', 'heɪ! ']),
        ('segments', 'default', ['! ?', 'hey!'], ['! ? ', 'heːj! ']),
        ('segments', '!', ['! ?', 'hey!'], ValueError),
        ('festival', 'default', ['! ?', 'hey!'], ['! ? ', 'hhey! ']),
        ('festival', '!', ['! ?', 'hey!'], ['! ', 'hhey! '])])
def test_issue55(backend, marks, text, expected):
    """Phonemize punctuation-heavy text with default or custom marks.

    ``expected`` is either the expected phonemization (a list) or an
    exception class that phonemize() must raise for that configuration.
    """
    if marks == 'default':
        marks = Punctuation.default_marks()
    language = 'cree' if backend == 'segments' else 'en-us'

    def run():
        return phonemize(
            text, language=language, backend=backend,
            preserve_punctuation=True, punctuation_marks=marks)

    # Decide explicitly whether an exception is expected, instead of
    # relying on pytest.raises() raising TypeError when handed a list
    # (which would also hide a genuine TypeError raised by phonemize).
    if isinstance(expected, type) and issubclass(expected, BaseException):
        with pytest.raises(expected):
            run()
        return

    try:
        output = run()
    except RuntimeError:
        # only the known festival failure is tolerated, a RuntimeError
        # from any other backend is a real error and must not be hidden
        if backend != 'festival':
            raise
        # TODO on some installations festival fails to phonemize "?".
        # It ends with a segmentation fault. This seems to only appear
        # with festival-2.5 (but is working on travis and docker image)
        pytest.skip('festival failed to phonemize the text')

    assert expected == output
@pytest.mark.parametrize(
    'punctuation_marks, text, expected', [
        # marks given as a plain string
        (';:,.!?¡—…"«»“”',
         'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
         'həloʊ, ,wɜːld? θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. həloʊ? '),
        # marks given as a compiled regular expression
        (re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9'$@&+%\-=/\\]"),
         'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
         'həloʊ, ,wɜːld? ‡ θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. ¿həloʊ? '),
        # regex with a lookahead on ',' and '.' followed by a digit
        (re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9',.$@&+%\-=/\\]|[,.](?!\d)"),
         'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
         'həloʊ, ,wɜːld? ‡ θɹiː θaʊzənd, ɔːɹ tuː pɔɪnt faɪv ziəɹoʊ. ¿həloʊ? '),
    ])
def test_punctuation_marks_regex(punctuation_marks, text, expected):
    """Preserve punctuation with marks given as a str or a compiled regex."""
    phonemized = phonemize(
        text, preserve_punctuation=True, punctuation_marks=punctuation_marks)
    assert expected == phonemized
def test_marks_getter_with_regex():
    """Reading ``marks`` must fail when the marks were given as a regex."""
    marks_re = re.compile(r"[^a-zA-Z0-9]")
    punct = Punctuation(marks_re)

    # Only the attribute access matters here: the getter is expected to
    # raise, so no comparison of its result is needed.
    with pytest.raises(ValueError):
        _ = punct.marks
def test_long_document():
    """Phonemize a whole text file with punctuation preservation.

    The test has no assertion: it only checks that phonemize() completes
    without raising on a long document.
    """
    # testing issue raised by #108
    data_folder = Path(__file__).parent / "data"

    # Read with an explicit encoding so the test does not depend on the
    # locale of the machine running it.
    # NOTE(review): assumes pg67147.txt is UTF-8 encoded -- confirm.
    with open(data_folder / "pg67147.txt", encoding="utf-8") as txt_file:
        lines = txt_file.read().split("\n")

    phonemize(lines, backend="espeak", preserve_punctuation=True)
@pytest.mark.parametrize(
    'text', [
        # single line, no punctuation
        ['worked david ford i started in deloitte and i was immediately'],
        # single line, with a comma
        ['worked david ford i started in deloitte, and i was immediately'],
        # several lines, no punctuation
        ['worked david ford i started in deloitte and i was immediately',
         'an offer of price waterhouse cooper and here i take may',
         'we are now as maximum plan for a customer time and',
         "they're going to meet all the xvin so great it"],
        # several lines, with a comma and a final period
        ['worked david ford i started in deloitte, and i was immediately',
         'an offer of price waterhouse cooper and here i take may',
         'we are now as maximum plan for a customer time and',
         "they're going to meet all the xvin so great it."],
    ])
def test_multiline_punctuation(text):
    """The output must have as many lines as the input text."""
    output = phonemize(text, preserve_punctuation=True)
    assert len(text) == len(output)
@@ -0,0 +1,113 @@
# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the segments backend"""
# pylint: disable=missing-docstring
import os
import pytest
from phonemizer.separator import Separator, default_separator
from phonemizer.backend import SegmentsBackend
from phonemizer.utils import get_package_resource
def test_multiline():
    """Phonemize single and multi-line utterances with the cree language."""
    backend = SegmentsBackend('cree')
    assert backend.language == 'cree'

    # (input utterances, expected phonemization)
    cases = [
        (['a'], ['ʌ ']),
        (['aa'], ['ʌʌ ']),
        (['a\n'], ['ʌ ']),
        (['a\na'], ['ʌ ʌ ']),
        (['a\na\n'], ['ʌ ʌ ']),
        (['a', 'a'], ['ʌ ', 'ʌ ']),
        (['a\n', 'a\n'], ['ʌ ', 'ʌ ']),
    ]
    for utterances, expected in cases:
        assert backend.phonemize(utterances) == expected
def test_bad_morpheme():
    """Phonemizing the uppercase 'A' with cree must raise a ValueError."""
    cree = SegmentsBackend('cree')
    unknown = ['A']
    with pytest.raises(ValueError):
        cree.phonemize(unknown)
def test_separator():
    """Phonemize with the default separator, stripped and not stripped."""
    cree = SegmentsBackend('cree')
    utterance = ['achi acho']

    raw = cree.phonemize(utterance, separator=default_separator)
    assert raw == ['ʌtʃɪ ʌtʃʊ ']

    stripped = cree.phonemize(
        utterance, separator=default_separator, strip=True)
    assert stripped == ['ʌtʃɪ ʌtʃʊ']
def test_separator_2():
    """Phonemize with '_' between words and a space between phones."""
    cree = SegmentsBackend('cree')
    utterance = ['achi acho']
    custom = Separator(word='_', phone=' ')

    raw = cree.phonemize(utterance, separator=custom)
    assert raw == ['ʌ tʃ ɪ _ʌ tʃ ʊ _']

    stripped = cree.phonemize(utterance, separator=custom, strip=True)
    assert stripped == ['ʌ tʃ ɪ_ʌ tʃ ʊ']
def test_separator_3():
    """Phonemize with a space between words and '_' between phones."""
    cree = SegmentsBackend('cree')
    utterance = ['achi acho']
    custom = Separator(word=' ', syllable=None, phone='_')

    raw = cree.phonemize(utterance, separator=custom)
    assert raw == ['ʌ_tʃ_ɪ_ ʌ_tʃ_ʊ_ ']

    stripped = cree.phonemize(utterance, separator=custom, strip=True)
    assert stripped == ['ʌ_tʃ_ɪ ʌ_tʃ_ʊ']
def test_separator_4():
    """Phonemize with a space between phones and no word separator."""
    cree = SegmentsBackend('cree')
    utterance = ['achi acho']

    # TODO bug when sep.phone == ' ' with no sep.word
    custom = Separator(phone=' ', word='')

    raw = cree.phonemize(utterance, separator=custom)
    assert raw == ['ʌ tʃ ɪ ʌ tʃ ʊ ']

    stripped = cree.phonemize(utterance, separator=custom, strip=True)
    assert stripped == ['ʌ tʃ ɪʌ tʃ ʊ']
def test_separator_5():
    """Phonemize with a space between phones and '_' between words."""
    cree = SegmentsBackend('cree')
    utterance = ['achi acho']
    custom = Separator(phone=' ', word='_')

    raw = cree.phonemize(utterance, separator=custom)
    assert raw == ['ʌ tʃ ɪ _ʌ tʃ ʊ _']

    stripped = cree.phonemize(utterance, separator=custom, strip=True)
    assert stripped == ['ʌ tʃ ɪ_ʌ tʃ ʊ']
def test_language(tmpdir):
    """Check is_supported_language() on names, g2p paths and a bad file."""
    supported = SegmentsBackend.is_supported_language

    # check languages by name
    assert supported('cree')
    assert not supported('unexisting')

    # check languages by g2p file
    directory = get_package_resource('segments')
    existing_g2p = os.path.join(directory, 'cree.g2p')
    no_extension = os.path.join(directory, 'cree')
    missing_g2p = os.path.join(directory, 'unexisting.g2p')
    assert supported(existing_g2p)
    assert not supported(no_extension)
    assert not supported(missing_g2p)

    # bad syntax in g2p file
    g2p = tmpdir.join('foo.g2p')
    bad_content = '\n'.join(['a a', 'b b b', 'c'])
    g2p.write(bad_content)
    assert not supported(g2p)
@@ -0,0 +1,82 @@
# Copyright 2015-2021 Thomas Schatz, Xuan Nga Cao, Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the Separator class"""
# pylint: disable=missing-docstring
import pytest
from phonemizer.separator import Separator, default_separator
def test_prop():
    """The three separator attributes must be read only."""
    for attribute in ('phone', 'syllable', 'word'):
        # assigning any of them must raise an AttributeError
        with pytest.raises(AttributeError):
            setattr(default_separator, attribute, 'a')
@pytest.mark.parametrize('val', [None, '', False])
def test_empty(val):
    """None, '' and False must all give empty-string separators."""
    separator = Separator(val, val, val)
    observed = (separator.phone, separator.syllable, separator.word)
    assert observed[0] == ''
    assert observed[1] == ''
    assert observed[2] == ''
def test_same():
    """Using the same string for word and phone must raise a ValueError."""
    duplicated = ' '
    with pytest.raises(ValueError):
        Separator(word=duplicated, phone=duplicated)
def test_str():
    """Check the string representation of a custom and the default separator."""
    custom = Separator(word='w', syllable='s', phone='p')

    custom_repr = str(custom)
    assert custom_repr == '(phone: "p", syllable: "s", word: "w")'

    default_repr = str(default_separator)
    assert default_repr == '(phone: "", syllable: "", word: " ")'
def test_equal():
    """Check equality and inequality between Separator instances."""
    # two separators built with no argument are equal
    assert Separator() == Separator()

    # the default separator equals one built from the same three fields
    explicit = Separator(phone='', syllable='', word=' ')
    assert default_separator == explicit

    # giving only the word field does not reproduce the default separator
    word_only = Separator(word=' ')
    assert word_only != default_separator
def test_field_separator():
    """Check membership tests and input_output_separator() of a Separator."""
    sep = Separator(word='w', syllable='s', phone='p')

    # membership only holds for the exact separator strings
    assert 'w' in sep
    assert 'p' in sep
    assert 'wp' not in sep
    assert ' ' not in sep

    # falsy requests give False, True gives '|', a str is returned as is
    assert sep.input_output_separator(False) is False
    assert sep.input_output_separator(None) is False
    assert sep.input_output_separator('') is False
    assert sep.input_output_separator(True) == '|'
    assert sep.input_output_separator('io') == 'io'

    # The messages are checked on the raised exception itself
    # (``err.value``), not on the string form of the pytest ExceptionInfo
    # wrapper, whose formatting is pytest's own.
    with pytest.raises(RuntimeError) as err:
        sep.input_output_separator([1, 2])
    assert 'invalid input/output separator' in str(err.value)

    # a str already used as a token separator is rejected
    with pytest.raises(RuntimeError) as err:
        sep.input_output_separator('w')
    assert 'cannot prepend input with "w"' in str(err.value)

    # with '|', '||' and '|||' already taken, True gives '||||'
    sep = Separator(phone='|', syllable='||', word='|||')
    assert sep.input_output_separator(True) == '||||'
@@ -0,0 +1,52 @@
"""Test of the phonemizer.utils module"""
# pylint: disable=missing-docstring
import os
from phonemizer.utils import chunks, cumsum, str2list, list2str
def test_cumsum():
    """Check cumsum() on an empty, a single-item and a three-item list."""
    cases = [
        ([], []),
        ([0], [0]),
        ([1, 2, 3], [1, 3, 6]),
    ]
    for values, expected in cases:
        assert cumsum(values) == expected
def test_list2str():
    """Check list2str() on empty inputs and on one or several items."""
    # an empty str, an empty list and a list of one empty str all give ''
    for empty in ('', [], ['']):
        assert list2str(empty) == ''

    assert list2str(['abc']) == 'abc'

    # several items are joined with the platform line separator
    expected = os.linesep.join(['a', 'b', 'c'])
    assert list2str(['a', 'b', 'c']) == expected
def test_str2list():
    """Check str2list() on single-line and multi-line strings."""
    # strings without a line separator give a single-item list
    for single in ('', 'a', 'ab', 'a b'):
        assert str2list(single) == [single]

    # lines are split on the platform line separator
    two_lines = os.linesep.join(['a', 'b'])
    assert str2list(two_lines) == ['a', 'b']

    # an inner empty line is kept, the trailing separator adds no item
    with_empty_line = os.linesep.join(['a', '', 'b', ''])
    assert str2list(with_empty_line) == ['a', '', 'b']
def test_chunks():
    """Check chunks() for one to four items and various chunk counts."""
    # a single item always gives a single chunk
    for num in range(1, 5):
        assert chunks(['a'], num) == ([['a']], [0])

    # (number of items, requested chunks, expected result)
    cases = [
        (2, 1, ([['a', 'a']], [0])),
        (2, 2, ([['a'], ['a']], [0, 1])),
        (2, 10, ([['a'], ['a']], [0, 1])),
        (3, 1, ([['a', 'a', 'a']], [0])),
        (3, 2, ([['a'], ['a', 'a']], [0, 1])),
        (3, 3, ([['a'], ['a'], ['a']], [0, 1, 2])),
        (3, 10, ([['a'], ['a'], ['a']], [0, 1, 2])),
        (4, 1, ([['a', 'a', 'a', 'a']], [0])),
        (4, 2, ([['a', 'a'], ['a', 'a']], [0, 2])),
        (4, 3, ([['a'], ['a'], ['a', 'a']], [0, 1, 2])),
        (4, 10, ([['a'], ['a'], ['a'], ['a']], [0, 1, 2, 3])),
    ]
    for size, num, expected in cases:
        assert chunks(['a'] * size, num) == expected