275 lines
9.5 KiB
Python
275 lines
9.5 KiB
Python
# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
|
||
"""Test of the punctuation processing"""
|
||
|
||
# pylint: disable=missing-docstring
|
||
from pathlib import Path
|
||
|
||
import pytest
|
||
import re
|
||
|
||
from phonemizer.backend import EspeakBackend, FestivalBackend, SegmentsBackend
|
||
from phonemizer.punctuation import Punctuation
|
||
from phonemizer.phonemize import phonemize
|
||
from phonemizer.separator import Separator, default_separator
|
||
|
||
# Version flags, evaluated once at import time and used to select the
# expected outputs of version-dependent test cases.

# True when the espeak in use reports a version of 1.50 or newer
ESPEAK_150 = EspeakBackend.version() >= (1, 50)

# True when the espeak in use reports a version of 1.49.3 or newer
ESPEAK_149 = EspeakBackend.version() >= (1, 49, 3)

# True when the festival in use reports a version of 2.5 or newer
FESTIVAL_25 = FestivalBackend.version() >= (2, 5)
|
||
|
||
|
||
@pytest.mark.parametrize(
    ('inp', 'out'),
    [
        ('a, b,c.', 'a b c'),
        ('abc de', 'abc de'),
        ('!d.d. dd?? d!', 'd d dd d'),
    ],
)
def test_remove(inp, out):
    """Check that `Punctuation().remove(inp)` returns `out`."""
    cleaned = Punctuation().remove(inp)
    assert cleaned == out
|
||
|
||
|
||
@pytest.mark.parametrize(
    'inp',
    [
        ['.a.b.c.'],
        ['a, a?', 'b, b'],
        ['a, a?', 'b, b', '!'],
        ['a, a?', '!?', 'b, b'],
        ['!?', 'a, a?', 'b, b'],
        ['a, a, a'],
        ['a, a?', 'aaa bb', '.bb, b', 'c', '!d.d. dd?? d!'],
        ['Truly replied, "Yes".'],
        ['hi; ho,"'],
        ["!?"],
        ["!'"],
        ["It is ! (I think so)"],
        ["This {is} right"],
        ["[He] is right"],
    ],
)
def test_preserve(inp):
    """Check that `preserve` followed by `restore` gives back `inp`."""
    handler = Punctuation()
    stripped_text, saved_marks = handler.preserve(inp)

    # Restoring with the default separator and strip=True must reproduce
    # the original utterances unchanged.
    restored = handler.restore(
        stripped_text, saved_marks, sep=default_separator, strip=True)
    assert inp == restored
|
||
|
||
|
||
@pytest.mark.parametrize(
    ('text', 'expected_restore', 'expected_output'),
    [
        (['hi; hi,"'], ['hi; hi," '], ['haɪ; haɪ, ']),
        (
            ['hi; "hi,'],
            ['hi; "hi, '],
            ['haɪ; haɪ, '] if ESPEAK_149 else ['haɪ; haɪ, '],
        ),
        (
            ['"hi; hi,'],
            ['"hi; hi, '],
            ['haɪ; haɪ, '] if ESPEAK_149 else [' haɪ; haɪ, '],
        ),
    ],
)
def test_preserve_2(text, expected_restore, expected_output):
    """Check preserve/restore and espeak output with a custom marks set.

    The double quote is not part of the marks used here; the expected
    espeak output depends on the `ESPEAK_149` version flag.
    """
    marks = ".!;:,?"
    handler = Punctuation(marks=marks)

    # Round trip without stripping.
    stripped_text, saved_marks = handler.preserve(text)
    restored = handler.restore(
        stripped_text, saved_marks, sep=default_separator, strip=False)
    assert expected_restore == restored

    # Full phonemization through the espeak backend with the same marks.
    output = phonemize(
        text,
        backend="espeak",
        preserve_punctuation=True,
        punctuation_marks=marks,
    )
    assert output == expected_output
|
||
|
||
|
||
def test_custom():
    """Check default marks, then replace them through the `marks` setter."""
    handler = Punctuation()
    sample = 'a,b.c'

    # A fresh instance carries the default marks.
    assert set(handler.marks) == set(handler.default_marks())
    assert handler.remove(sample) == 'a b c'

    # Assigning a list of marks is expected to raise ValueError.
    with pytest.raises(ValueError):
        handler.marks = ['?', '.']

    # Assigning a string is accepted; ',' is no longer removed afterwards.
    handler.marks = '?.'
    assert len(handler.marks) == 2
    assert handler.remove(sample) == 'a,b c'
|
||
|
||
|
||
def test_espeak():
    """Check espeak output for each preserve_punctuation/strip combination."""
    text = 'hello, world!'

    # (preserve_punctuation, strip, expected output)
    cases = [
        (False, True, 'həloʊ wɜːld'),
        (True, True, 'həloʊ, wɜːld!'),
        (False, False, 'həloʊ wɜːld '),
        (True, False, 'həloʊ, wɜːld! '),
    ]

    for preserve, strip, expected in cases:
        backend = EspeakBackend('en-us', preserve_punctuation=preserve)
        phonemized = backend.phonemize([text], strip=strip)[0]
        assert phonemized == expected
|
||
|
||
|
||
def test_festival():
    """Check festival output for each preserve_punctuation/strip combination."""
    text = 'hello, world!'

    # (preserve_punctuation, strip, expected output)
    cases = [
        (False, True, 'hhaxlow werld'),
        (True, True, 'hhaxlow, werld!'),
        (False, False, 'hhaxlow werld '),
        (True, False, 'hhaxlow, werld! '),
    ]

    for preserve, strip, expected in cases:
        backend = FestivalBackend('en-us', preserve_punctuation=preserve)
        phonemized = backend.phonemize([text], strip=strip)[0]
        assert phonemized == expected
|
||
|
||
|
||
def test_segments():
    """Check segments output for each preserve_punctuation/strip combination."""
    text = 'achi, acho!'

    # (preserve_punctuation, strip, expected output)
    cases = [
        (False, True, 'ʌtʃɪ ʌtʃʊ'),
        (True, True, 'ʌtʃɪ, ʌtʃʊ!'),
        (False, False, 'ʌtʃɪ ʌtʃʊ '),
        (True, False, 'ʌtʃɪ, ʌtʃʊ! '),
    ]

    for preserve, strip, expected in cases:
        backend = SegmentsBackend('cree', preserve_punctuation=preserve)
        phonemized = backend.phonemize([text], strip=strip)[0]
        assert phonemized == expected
|
||
|
||
|
||
# see https://github.com/bootphon/phonemizer/issues/54
|
||
@pytest.mark.parametrize(
    ('text', 'expected'),
    [
        ("!'", "! "),
        ("'!", "! "),
        ("!'!", "!! "),
        ("'!'", "! "),
    ],
)
def test_issue_54(text, expected):
    """Check espeak output on inputs made only of '!' and apostrophes."""
    phonemized = phonemize(
        [text],
        language='en-us',
        backend='espeak',
        preserve_punctuation=True,
    )
    output = phonemized[0]
    assert expected == output
|
||
|
||
|
||
# see https://github.com/bootphon/phonemizer/issues/55
|
||
@pytest.mark.parametrize(
    'backend, marks, text, expected', [
        ('espeak', 'default', ['"Hey! "', '"hey,"'], ['"heɪ! " ', '"heɪ," ']),
        ('espeak', '.!;:,?', ['"Hey! " ', '"hey," '],
         ['heɪ! ', 'heɪ, '] if ESPEAK_150 else [' heɪ! ', ' heɪ, ']),
        ('espeak', 'default', ['! ?', 'hey!'], ['! ? ', 'heɪ! ']),
        ('espeak', '!', ['! ?', 'hey!'], ['! ', 'heɪ! ']),
        ('segments', 'default', ['! ?', 'hey!'], ['! ? ', 'heːj! ']),
        ('segments', '!', ['! ?', 'hey!'], ValueError),
        ('festival', 'default', ['! ?', 'hey!'], ['! ? ', 'hhey! ']),
        ('festival', '!', ['! ?', 'hey!'], ['! ', 'hhey! '])])
def test_issue55(backend, marks, text, expected):
    """Check phonemization with preserved punctuation across backends.

    `marks` is either the literal string 'default' (replaced by
    `Punctuation.default_marks()`) or a custom string of marks.
    `expected` is either the expected list of phonemized utterances or an
    exception class that `phonemize` must raise.

    A RuntimeError is tolerated only for the festival backend (see the
    TODO below); for any other backend it is re-raised so the failure is
    reported instead of being silently ignored.
    """
    if marks == 'default':
        marks = Punctuation.default_marks()
    language = 'cree' if backend == 'segments' else 'en-us'

    def run():
        return phonemize(
            text, language=language, backend=backend,
            preserve_punctuation=True, punctuation_marks=marks)

    # Decide explicitly whether an exception is expected, rather than
    # relying on pytest.raises() rejecting a non-exception argument.
    if isinstance(expected, type) and issubclass(expected, BaseException):
        with pytest.raises(expected):
            run()
        return

    try:
        output = run()
    except RuntimeError:
        if backend != 'festival':
            # Only the festival crash is a known issue; anything else is
            # a genuine failure and must not be hidden.
            raise
        # TODO on some installations festival fails to phonemize "?".
        # It ends with a segmentation fault. This seems to only appear
        # with festival-2.5 (but is working on travis and docker image)
        return

    assert expected == output
|
||
|
||
|
||
@pytest.mark.parametrize(
    ('punctuation_marks', 'text', 'expected'),
    [
        (
            ';:,.!?¡—…"«»“”',
            'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
            'həloʊ, ,wɜːld? θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. həloʊ? ',
        ),
        (
            re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9'$@&+%\-=/\\]"),
            'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
            'həloʊ, ,wɜːld? ‡ θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. ¿həloʊ? ',
        ),
        (
            re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9',.$@&+%\-=/\\]|[,.](?!\d)"),
            'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
            'həloʊ, ,wɜːld? ‡ θɹiː θaʊzənd, ɔːɹ tuː pɔɪnt faɪv ziəɹoʊ. ¿həloʊ? ',
        ),
    ],
)
def test_punctuation_marks_regex(punctuation_marks, text, expected):
    """Check `phonemize` with marks given as a string or a compiled regex."""
    phonemized = phonemize(
        text,
        preserve_punctuation=True,
        punctuation_marks=punctuation_marks,
    )
    assert expected == phonemized
|
||
|
||
|
||
def test_marks_getter_with_regex():
    """Check that evaluating `marks` on a regex-built instance raises.

    The instance is built from a compiled pattern; comparing its `marks`
    attribute with that pattern is expected to raise ValueError.
    """
    pattern = re.compile(r"[^a-zA-Z0-9]")
    handler = Punctuation(pattern)

    with pytest.raises(ValueError):
        # The result is irrelevant, only the raised exception matters.
        _ = handler.marks == pattern
|
||
|
||
|
||
def test_long_document():
    """Phonemize a whole text file line by line (issue raised by #108).

    The test has no assertion: it only requires that `phonemize` completes
    on the document with punctuation preserved.
    """
    document = Path(__file__).parent / "data" / "pg67147.txt"

    with open(document) as handle:
        lines = handle.read().split("\n")

    phonemize(lines, backend="espeak", preserve_punctuation=True)
|
||
|
||
|
||
@pytest.mark.parametrize(
    'text',
    [
        [
            'worked david ford i started in deloitte and i was immediately',
        ],
        [
            'worked david ford i started in deloitte, and i was immediately',
        ],
        [
            'worked david ford i started in deloitte and i was immediately',
            'an offer of price waterhouse cooper and here i take may',
            'we are now as maximum plan for a customer time and',
            "they're going to meet all the xvin so great it",
        ],
        [
            'worked david ford i started in deloitte, and i was immediately',
            'an offer of price waterhouse cooper and here i take may',
            'we are now as maximum plan for a customer time and',
            "they're going to meet all the xvin so great it.",
        ],
    ],
)
def test_multiline_punctuation(text):
    """Check that `phonemize` returns one output per input utterance."""
    phonemized = phonemize(text, preserve_punctuation=True)

    n_inputs = len(text)
    n_outputs = len(phonemized)
    assert n_inputs == n_outputs
|