2025-12-01

This commit is contained in:
2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,274 @@
# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the punctuation processing"""
# pylint: disable=missing-docstring
from pathlib import Path
import pytest
import re
from phonemizer.backend import EspeakBackend, FestivalBackend, SegmentsBackend
from phonemizer.punctuation import Punctuation
from phonemizer.phonemize import phonemize
from phonemizer.separator import Separator, default_separator
# Version flags, evaluated once at import time by comparing the version
# tuple reported by each backend against a minimum version.

# espeak is at least version 1.50
ESPEAK_150 = EspeakBackend.version() >= (1, 50)

# espeak is at least version 1.49.3
ESPEAK_149 = EspeakBackend.version() >= (1, 49, 3)

# festival is at least version 2.5
FESTIVAL_25 = FestivalBackend.version() >= (2, 5)
@pytest.mark.parametrize(
    'inp, out',
    [
        ('a, b,c.', 'a b c'),
        ('abc de', 'abc de'),
        ('!d.d. dd?? d!', 'd d dd d'),
    ])
def test_remove(inp, out):
    """Check that `Punctuation.remove` turns `inp` into `out`.

    Uses a `Punctuation` instance built with its default settings.
    """
    cleaned = Punctuation().remove(inp)
    assert cleaned == out
@pytest.mark.parametrize(
    'inp',
    [
        ['.a.b.c.'],
        ['a, a?', 'b, b'],
        ['a, a?', 'b, b', '!'],
        ['a, a?', '!?', 'b, b'],
        ['!?', 'a, a?', 'b, b'],
        ['a, a, a'],
        ['a, a?', 'aaa bb', '.bb, b', 'c', '!d.d. dd?? d!'],
        ['Truly replied, "Yes".'],
        ['hi; ho,"'],
        ["!?"],
        ["!'"],
        ["It is ! (I think so)"],
        ["This {is} right"],
        ["[He] is right"],
    ])
def test_preserve(inp):
    """Check that preserve() followed by restore() gives back the input.

    The text is restored with the default separator and `strip=True`,
    and must be equal to the original list of utterances.
    """
    punctuation = Punctuation()
    stripped_text, saved_marks = punctuation.preserve(inp)
    restored = punctuation.restore(
        stripped_text, saved_marks, sep=default_separator, strip=True)
    assert inp == restored
@pytest.mark.parametrize(
    'text, expected_restore, expected_output',
    [
        (['hi; hi,"'], ['hi; hi," '], ['haɪ; haɪ, ']),
        # NOTE(review): both branches of this conditional are identical as
        # written -- confirm whether the older-espeak expectation should
        # differ (e.g. in whitespace).
        (['hi; "hi,'], ['hi; "hi, '],
         ['haɪ; haɪ, '] if ESPEAK_149 else ['haɪ; haɪ, ']),
        (['"hi; hi,'], ['"hi; hi, '],
         ['haɪ; haɪ, '] if ESPEAK_149 else [' haɪ; haɪ, ']),
    ])
def test_preserve_2(text, expected_restore, expected_output):
    """Check preserve/restore and espeak output with a reduced mark set.

    The double quote is not part of the custom marks used here. The test
    first checks the unstripped preserve/restore round trip, then the
    result of phonemizing the same text with espeak while preserving
    punctuation.
    """
    marks = ".!;:,?"
    punctuation = Punctuation(marks=marks)

    preserved = punctuation.preserve(text)
    restored = punctuation.restore(
        *preserved, sep=default_separator, strip=False)
    assert expected_restore == restored

    phonemized = phonemize(
        text,
        backend="espeak",
        preserve_punctuation=True,
        punctuation_marks=marks)
    assert phonemized == expected_output
def test_custom():
    """Check the default marks and the behaviour of the `marks` setter."""
    punctuation = Punctuation()

    # a fresh instance uses the default marks
    default_marks = set(punctuation.default_marks())
    assert set(punctuation.marks) == default_marks
    assert punctuation.remove('a,b.c') == 'a b c'

    # assigning a list of marks is rejected
    with pytest.raises(ValueError):
        punctuation.marks = ['?', '.']

    # assigning a string is accepted, the comma is then no longer removed
    punctuation.marks = '?.'
    assert len(punctuation.marks) == 2
    assert punctuation.remove('a,b.c') == 'a,b c'
def test_espeak():
    """Check espeak output for each preserve_punctuation/strip combination."""
    text = 'hello, world!'

    # (preserve_punctuation, strip, expected output), checked in this order
    cases = [
        (False, True, 'həloʊ wɜːld'),
        (True, True, 'həloʊ, wɜːld!'),
        (False, False, 'həloʊ wɜːld '),
        (True, False, 'həloʊ, wɜːld! '),
    ]

    for preserve, strip, expected in cases:
        backend = EspeakBackend('en-us', preserve_punctuation=preserve)
        phonemized = backend.phonemize([text], strip=strip)
        assert phonemized[0] == expected
def test_festival():
    """Check festival output for each preserve_punctuation/strip combination."""
    text = 'hello, world!'

    # (preserve_punctuation, strip, expected output), checked in this order
    cases = [
        (False, True, 'hhaxlow werld'),
        (True, True, 'hhaxlow, werld!'),
        (False, False, 'hhaxlow werld '),
        (True, False, 'hhaxlow, werld! '),
    ]

    for preserve, strip, expected in cases:
        backend = FestivalBackend('en-us', preserve_punctuation=preserve)
        phonemized = backend.phonemize([text], strip=strip)
        assert phonemized[0] == expected
def test_segments():
    """Check segments output for each preserve_punctuation/strip combination."""
    text = 'achi, acho!'

    # (preserve_punctuation, strip, expected output), checked in this order
    cases = [
        (False, True, 'ʌtʃɪ ʌtʃʊ'),
        (True, True, 'ʌtʃɪ, ʌtʃʊ!'),
        (False, False, 'ʌtʃɪ ʌtʃʊ '),
        (True, False, 'ʌtʃɪ, ʌtʃʊ! '),
    ]

    for preserve, strip, expected in cases:
        backend = SegmentsBackend('cree', preserve_punctuation=preserve)
        phonemized = backend.phonemize([text], strip=strip)
        assert phonemized[0] == expected
# Regression test, see https://github.com/bootphon/phonemizer/issues/54
@pytest.mark.parametrize(
    'text, expected',
    [
        ("!'", "! "),
        ("'!", "! "),
        ("!'!", "!! "),
        ("'!'", "! "),
    ])
def test_issue_54(text, expected):
    """Check espeak output on inputs made only of '!' and apostrophes."""
    phonemized = phonemize(
        [text],
        language='en-us',
        backend='espeak',
        preserve_punctuation=True)
    assert expected == phonemized[0]
# see https://github.com/bootphon/phonemizer/issues/55
@pytest.mark.parametrize(
    'backend, marks, text, expected', [
        ('espeak', 'default', ['"Hey! "', '"hey,"'], ['"heɪ! " ', '"heɪ," ']),
        ('espeak', '.!;:,?', ['"Hey! " ', '"hey," '],
         ['heɪ! ', 'heɪ, '] if ESPEAK_150 else [' heɪ! ', ' heɪ, ']),
        ('espeak', 'default', ['! ?', 'hey!'], ['! ? ', 'heɪ! ']),
        ('espeak', '!', ['! ?', 'hey!'], ['! ', 'heɪ! ']),
        ('segments', 'default', ['! ?', 'hey!'], ['! ? ', 'heːj! ']),
        ('segments', '!', ['! ?', 'hey!'], ValueError),
        ('festival', 'default', ['! ?', 'hey!'], ['! ? ', 'hhey! ']),
        ('festival', '!', ['! ?', 'hey!'], ['! ', 'hhey! '])])
def test_issue55(backend, marks, text, expected):
    """Check punctuation preservation across backends and mark sets.

    Parameters:
        backend: name of the backend passed to `phonemize`.
        marks: punctuation marks to preserve, or the string 'default' to
            use `Punctuation.default_marks()`.
        text: list of utterances to phonemize.
        expected: either the expected list of phonemized utterances, or
            an exception class that the call must raise.
    """
    if marks == 'default':
        marks = Punctuation.default_marks()
    language = 'cree' if backend == 'segments' else 'en-us'

    def run():
        return phonemize(
            text, language=language, backend=backend,
            preserve_punctuation=True, punctuation_marks=marks)

    # Decide explicitly between the "must raise" and "must match" cases
    # instead of relying on pytest.raises() rejecting a non-exception
    # argument with a TypeError.
    if isinstance(expected, type) and issubclass(expected, BaseException):
        with pytest.raises(expected):
            run()
        return

    try:
        output = run()
    except RuntimeError:
        # Only the festival failure is tolerated: a RuntimeError from any
        # other backend is a genuine failure and must not be swallowed.
        if backend != 'festival':
            raise
        # TODO on some installations festival fails to phonemize "?".
        # It ends with a segmentation fault. This seems to only appear
        # with festival-2.5 (but is working on travis and docker image)
        return

    assert expected == output
@pytest.mark.parametrize(
    'punctuation_marks, text, expected',
    [
        # marks given as a plain string
        (
            ';:,.!?¡—…"«»“”',
            'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
            'həloʊ, ,wɜːld? θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. həloʊ? ',
        ),
        # marks given as a compiled regular expression
        (
            re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9'$@&+%\-=/\\]"),
            'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
            'həloʊ, ,wɜːld? ‡ θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. ¿həloʊ? ',
        ),
        # regex whose second alternative only matches ',' and '.' when
        # they are not followed by a digit
        (
            re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9',.$@&+%\-=/\\]|[,.](?!\d)"),
            'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
            'həloʊ, ,wɜːld? ‡ θɹiː θaʊzənd, ɔːɹ tuː pɔɪnt faɪv ziəɹoʊ. ¿həloʊ? ',
        ),
    ])
def test_punctuation_marks_regex(punctuation_marks, text, expected):
    """Check phonemize() with marks given as a string or a compiled regex."""
    phonemized = phonemize(
        text,
        preserve_punctuation=True,
        punctuation_marks=punctuation_marks)
    assert expected == phonemized
def test_marks_getter_with_regex():
    """Check that reading `marks` raises when built from a regex.

    A `Punctuation` instance is created from a compiled pattern; reading
    its `marks` attribute is then expected to raise a ValueError.
    """
    marks_re = re.compile(r"[^a-zA-Z0-9]")
    punct = Punctuation(marks_re)
    with pytest.raises(ValueError):
        # Only the attribute read matters here: bind it explicitly rather
        # than hiding it in a comparison whose result is thrown away.
        _ = punct.marks
def test_long_document():
    """Phonemize a long document with espeak, preserving punctuation.

    Regression test for the issue raised by #108. There is no assertion
    on the output: the test passes if `phonemize` completes without
    raising.
    """
    data_folder = Path(__file__).parent / "data"
    # Explicit encoding so that reading the document does not depend on the
    # platform's default locale encoding.
    # NOTE(review): assumes pg67147.txt is stored as UTF-8 -- confirm.
    with open(data_folder / "pg67147.txt", encoding="utf-8") as txt_file:
        lines = txt_file.read().split("\n")
    phonemize(lines, backend="espeak", preserve_punctuation=True)
@pytest.mark.parametrize(
    'text',
    [
        # one utterance, no punctuation
        [
            'worked david ford i started in deloitte and i was immediately',
        ],
        # one utterance, with a comma
        [
            'worked david ford i started in deloitte, and i was immediately',
        ],
        # several utterances, no punctuation marks
        [
            'worked david ford i started in deloitte and i was immediately',
            'an offer of price waterhouse cooper and here i take may',
            'we are now as maximum plan for a customer time and',
            "they're going to meet all the xvin so great it",
        ],
        # several utterances, with a comma and a final period
        [
            'worked david ford i started in deloitte, and i was immediately',
            'an offer of price waterhouse cooper and here i take may',
            'we are now as maximum plan for a customer time and',
            "they're going to meet all the xvin so great it.",
        ],
    ])
def test_multiline_punctuation(text):
    """Check that phonemize() returns one output per input utterance."""
    phonemized = phonemize(text, preserve_punctuation=True)
    assert len(text) == len(phonemized)