# blender-portable-repo/extensions/.local/lib/python3.11/site-packages/test/test_punctuation.py

# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Test of the punctuation processing"""

# pylint: disable=missing-docstring
from pathlib import Path

import pytest
import re

from phonemizer.backend import EspeakBackend, FestivalBackend, SegmentsBackend
from phonemizer.punctuation import Punctuation
from phonemizer.phonemize import phonemize
from phonemizer.separator import Separator, default_separator

# Version gates used to select backend-dependent expectations below. Each
# flag compares the version tuple reported by the backend to a minimum.

# espeak >= 1.50
ESPEAK_150 = EspeakBackend.version() >= (1, 50)

# espeak >= 1.49.3
ESPEAK_149 = EspeakBackend.version() >= (1, 49, 3)

# festival >= 2.5
FESTIVAL_25 = FestivalBackend.version() >= (2, 5)


@pytest.mark.parametrize(
    'inp, out',
    [
        ('a, b,c.', 'a b c'),
        ('abc de', 'abc de'),
        ('!d.d. dd??  d!', 'd d dd d'),
    ])
def test_remove(inp, out):
    """Check `Punctuation.remove` with the default marks.

    `inp` is the raw text and `out` the text expected once the
    punctuation has been removed.
    """
    cleaned = Punctuation().remove(inp)
    assert cleaned == out


@pytest.mark.parametrize(
    'inp',
    [
        ['.a.b.c.'],
        ['a, a?', 'b, b'],
        ['a, a?', 'b, b', '!'],
        ['a, a?', '!?', 'b, b'],
        ['!?', 'a, a?', 'b, b'],
        ['a, a, a'],
        ['a, a?', 'aaa bb', '.bb, b', 'c', '!d.d. dd??  d!'],
        ['Truly replied, "Yes".'],
        ['hi; ho,"'],
        ["!?"],
        ["!'"],
        ["It is ! (I think so)"],
        ["This {is} right"],
        ["[He] is right"],
    ])
def test_preserve(inp):
    """Check that `preserve` followed by `restore` is a round trip.

    The marks extracted by `preserve` are handed back to `restore`
    (default separator, `strip=True`) and the result must equal `inp`.
    """
    punctuation = Punctuation()
    stripped_text, extracted_marks = punctuation.preserve(inp)
    restored = punctuation.restore(
        stripped_text, extracted_marks, sep=default_separator, strip=True)
    assert inp == restored


@pytest.mark.parametrize(
    'text, expected_restore, expected_output',
    [
        (['hi; hi,"'],
         ['hi; hi," '],
         ['haɪ; haɪ, ']),
        (['hi; "hi,'],
         ['hi; "hi, '],
         ['haɪ; haɪ, '] if ESPEAK_149 else ['haɪ;  haɪ, ']),
        (['"hi; hi,'],
         ['"hi; hi, '],
         ['haɪ; haɪ, '] if ESPEAK_149 else [' haɪ; haɪ, ']),
    ])
def test_preserve_2(text, expected_restore, expected_output):
    """Check preserve/restore and espeak output with a custom mark set.

    The double quote is not part of the custom marks. The expected
    espeak output depends on the installed version (see ESPEAK_149).
    """
    custom_marks = ".!;:,?"
    punctuation = Punctuation(marks=custom_marks)

    # round trip without stripping
    preserved = punctuation.preserve(text)
    restored = punctuation.restore(
        *preserved, sep=default_separator, strip=False)
    assert expected_restore == restored

    # full phonemization with the same marks
    phonemized = phonemize(
        text,
        backend="espeak",
        preserve_punctuation=True,
        punctuation_marks=custom_marks)
    assert phonemized == expected_output


def test_custom():
    """Check the default marks and the `marks` setter of `Punctuation`."""
    punctuation = Punctuation()
    sample = 'a,b.c'

    # a fresh instance uses the default marks
    assert set(punctuation.marks) == set(punctuation.default_marks())
    assert punctuation.remove(sample) == 'a b c'

    # assigning a list of marks is rejected
    with pytest.raises(ValueError):
        punctuation.marks = ['?', '.']

    # assigning a string is accepted and changes what gets removed
    punctuation.marks = '?.'
    assert len(punctuation.marks) == 2
    assert punctuation.remove(sample) == 'a,b c'


def test_espeak():
    """Phonemize one utterance with espeak for each option combination.

    Each case is (preserve_punctuation, strip, expected output). A new
    backend is built for every case, in the same order as listed.
    """
    text = 'hello, world!'
    cases = [
        (False, True, 'həloʊ wɜːld'),
        (True, True, 'həloʊ, wɜːld!'),
        (False, False, 'həloʊ wɜːld '),
        (True, False, 'həloʊ, wɜːld! '),
    ]

    for preserve, strip, expected in cases:
        backend = EspeakBackend('en-us', preserve_punctuation=preserve)
        output = backend.phonemize([text], strip=strip)[0]
        assert output == expected


def test_festival():
    """Phonemize one utterance with festival for each option combination.

    Each case is (preserve_punctuation, strip, expected output). A new
    backend is built for every case, in the same order as listed.
    """
    text = 'hello, world!'
    cases = [
        (False, True, 'hhaxlow werld'),
        (True, True, 'hhaxlow, werld!'),
        (False, False, 'hhaxlow werld '),
        (True, False, 'hhaxlow, werld! '),
    ]

    for preserve, strip, expected in cases:
        backend = FestivalBackend('en-us', preserve_punctuation=preserve)
        output = backend.phonemize([text], strip=strip)[0]
        assert output == expected


def test_segments():
    """Phonemize one utterance with segments for each option combination.

    Each case is (preserve_punctuation, strip, expected output). A new
    backend is built for every case, in the same order as listed.
    """
    text = 'achi, acho!'
    cases = [
        (False, True, 'ʌtʃɪ ʌtʃʊ'),
        (True, True, 'ʌtʃɪ, ʌtʃʊ!'),
        (False, False, 'ʌtʃɪ ʌtʃʊ '),
        (True, False, 'ʌtʃɪ, ʌtʃʊ! '),
    ]

    for preserve, strip, expected in cases:
        backend = SegmentsBackend('cree', preserve_punctuation=preserve)
        output = backend.phonemize([text], strip=strip)[0]
        assert output == expected


# see https://github.com/bootphon/phonemizer/issues/54
@pytest.mark.parametrize(
    'text, expected',
    [
        ("!'", "! "),
        ("'!", "! "),
        ("!'!", "!! "),
        ("'!'", "! "),
    ])
def test_issue_54(text, expected):
    """Check espeak output for inputs made only of '!' and apostrophes."""
    phonemized = phonemize(
        [text],
        language='en-us',
        backend='espeak',
        preserve_punctuation=True)
    assert expected == phonemized[0]


# see https://github.com/bootphon/phonemizer/issues/55
@pytest.mark.parametrize(
    'backend, marks, text, expected', [
        ('espeak', 'default', ['"Hey! "', '"hey,"'], ['"heɪ! " ', '"heɪ," ']),
        ('espeak', '.!;:,?', ['"Hey! " ', '"hey," '],
         ['heɪ! ', 'heɪ, '] if ESPEAK_150 else [' heɪ! ', ' heɪ, ']),
        ('espeak', 'default', ['! ?', 'hey!'], ['! ? ', 'heɪ! ']),
        ('espeak', '!', ['! ?', 'hey!'], ['! ', 'heɪ! ']),
        ('segments', 'default', ['! ?', 'hey!'], ['! ? ', 'heːj! ']),
        ('segments', '!', ['! ?', 'hey!'], ValueError),
        ('festival', 'default', ['! ?', 'hey!'], ['! ? ', 'hhey! ']),
        ('festival', '!', ['! ?', 'hey!'], ['! ', 'hhey! '])])
def test_issue55(backend, marks, text, expected):
    """Phonemize utterances made of (or wrapped in) punctuation marks.

    `marks` is either the literal string 'default', replaced by
    `Punctuation.default_marks()`, or the marks to use as-is.

    `expected` is either the list of expected phonemized utterances or
    an exception class that `phonemize` must raise.
    """
    if marks == 'default':
        marks = Punctuation.default_marks()
    language = 'cree' if backend == 'segments' else 'en-us'

    def run():
        return phonemize(
            text, language=language, backend=backend,
            preserve_punctuation=True, punctuation_marks=marks)

    # An exception class as expectation means the call itself must fail.
    # Decide this explicitly instead of relying on pytest.raises rejecting
    # a non-exception argument with a TypeError.
    if isinstance(expected, type) and issubclass(expected, BaseException):
        with pytest.raises(expected):
            run()
        return

    try:
        output = run()
    except RuntimeError:
        # only the festival failure is a known issue: any other backend
        # raising RuntimeError is a genuine failure and must not be hidden
        if backend != 'festival':
            raise
        # TODO on some installations festival fails to phonemize "?".
        # It ends with a segmentation fault. This seems to only appear
        # with festival-2.5 (but is working on travis and docker image)
        pytest.skip('festival failed to phonemize the input')

    assert expected == output


@pytest.mark.parametrize(
    'punctuation_marks, text, expected',
    [
        (
            ';:,.!?¡—…"«»“”',
            'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
            'həloʊ, ,wɜːld? θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. həloʊ? ',
        ),
        (
            re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9'$@&+%\-=/\\]"),
            'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
            'həloʊ, ,wɜːld? ‡ θɹiː,ziəɹoʊziəɹoʊ ziəɹoʊ, ɔːɹ tuː.fɪfti. ¿həloʊ? ',
        ),
        (
            re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9',.$@&+%\-=/\\]|[,.](?!\d)"),
            'hello, ,world? ‡ 3,000, or 2.50. ¿hello?',
            'həloʊ, ,wɜːld? ‡ θɹiː θaʊzənd, ɔːɹ tuː pɔɪnt faɪv ziəɹoʊ. ¿həloʊ? ',
        ),
    ])
def test_punctuation_marks_regex(punctuation_marks, text, expected):
    """Check `phonemize` with marks given as a string or a compiled regex.

    The same text is phonemized with each `punctuation_marks` value and
    compared to the output expected for that value.
    """
    phonemized = phonemize(
        text,
        preserve_punctuation=True,
        punctuation_marks=punctuation_marks)
    assert expected == phonemized


def test_marks_getter_with_regex():
    """Reading `marks` must raise when built from a compiled regex."""
    marks_re = re.compile(r"[^a-zA-Z0-9]")
    punct = Punctuation(marks_re)
    with pytest.raises(ValueError):
        # the attribute read itself is what must raise; no comparison
        # is needed (its result was discarded anyway)
        _ = punct.marks


def test_long_document():
    """Phonemize a whole text file line by line with espeak.

    The test has no assertion: it passes as long as `phonemize` does
    not raise on the document.
    """
    # testing issue raised by #108
    data_folder = Path(__file__).parent / "data"
    # Decode explicitly so the test does not depend on the platform's
    # default locale encoding.
    # NOTE(review): the file is assumed to be UTF-8 encoded — confirm.
    with open(data_folder / "pg67147.txt", encoding="utf-8") as txt_file:
        lines = txt_file.read().split("\n")
    phonemize(lines, backend="espeak", preserve_punctuation=True)


@pytest.mark.parametrize(
    'text',
    [
        ['worked david ford i started in deloitte and i was immediately'],
        ['worked david ford i started in deloitte, and i was immediately'],
        [
            'worked david ford i started in deloitte and i was immediately',
            'an offer of price waterhouse cooper and here i take may',
            'we are now as maximum plan for a customer time and',
            "they're going to meet all the xvin so great it",
        ],
        [
            'worked david ford i started in deloitte, and i was immediately',
            'an offer of price waterhouse cooper and here i take may',
            'we are now as maximum plan for a customer time and',
            "they're going to meet all the xvin so great it.",
        ],
    ])
def test_multiline_punctuation(text):
    """Check that `phonemize` returns one output per input line.

    The cases are one or several lines, with and without punctuation.
    """
    outputs = phonemize(text, preserve_punctuation=True)
    assert len(text) == len(outputs)