# Copyright 2015-2021 Mathieu Bernard
#
# This file is part of phonemizer: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Phonemizer is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with phonemizer. If not, see <http://www.gnu.org/licenses/>.
"""Implementation of punctuation processing"""

import collections
import re
from typing import List, Optional, Union, Tuple, Pattern

from phonemizer.utils import str2list
from phonemizer.separator import Separator

# The punctuation marks considered by default.
_DEFAULT_MARKS = ';:,.!?¡¿—…"«»“”(){}[]'

# A punctuation mark extracted from a line of text:
#   index: number of the line the mark was found in
#   mark: the matched text (the mark(s) and any surrounding whitespace)
#   position: 'B' (begin), 'E' (end), 'I' (in the middle) or 'A' (alone)
_MarkIndex = collections.namedtuple(
    '_mark_index', ['index', 'mark', 'position'])


class Punctuation:
    """Preserve or remove the punctuation during phonemization

    Backends behave differently with punctuation: festival and espeak
    ignore it and remove it silently whereas segments will raise an
    error. The Punctuation class solves that issue by "hiding" the
    punctuation to the phonemization backend and restoring it
    afterwards.

    Parameters
    ----------
    marks (str or re.Pattern) : The punctuation marks to consider for
        processing (either removal or preservation). If a string, each
        mark must be made of a single character. Default to
        Punctuation.default_marks().

    Raises
    ------
    ValueError if `marks` is neither a string nor a compiled pattern.

    """

    def __init__(self, marks: Union[str, Pattern] = _DEFAULT_MARKS):
        # the marks as a string, None when initialized from a regex
        self._marks: Optional[str] = None
        # the compiled expression used to find the marks in a text
        self._marks_re: Optional[Pattern[str]] = None
        # goes through the property setter, which fills both attributes
        self.marks = marks

    @staticmethod
    def default_marks():
        """Returns the default punctuation marks as a string"""
        return _DEFAULT_MARKS

    @property
    def marks(self):
        """The punctuation marks as a string

        Duplicated characters are removed, the order of first
        occurrence is kept.

        Raises a ValueError if the instance was initialized from a
        regular expression, as no string representation exists then.

        """
        if self._marks is None:
            raise ValueError('punctuation initialized from regex, cannot access marks as a string')
        return self._marks

    @marks.setter
    def marks(self, value: Union[str, Pattern]):
        if isinstance(value, Pattern):
            # catch the pattern surrounded by zero or more spaces on either
            # side (note that a run of whitespace alone matches as well). The
            # flags of the original pattern (e.g. re.IGNORECASE) are forwarded
            # so that the wrapped expression matches what the user compiled.
            self._marks_re = re.compile(
                r'((' + value.pattern + r')|\s)+', value.flags)
            self._marks = None
        elif isinstance(value, str):
            # remove duplicated marks while keeping a deterministic order
            # (a set would reorder the marks from one run to another)
            self._marks = ''.join(dict.fromkeys(value))

            # catching all the marks in one regular expression: zero or more
            # spaces + one or more marks + zero or more spaces.
            self._marks_re = re.compile(fr'(\s*[{re.escape(self._marks)}]+\s*)+')
        else:
            raise ValueError('punctuation marks must be defined as a string or re.Pattern')

    def remove(self, text: Union[str, List[str]]) -> Union[str, List[str]]:
        """Returns the `text` with all punctuation marks replaced by spaces

        The input `text` can be a string or a list and is returned with the
        same type and punctuation removed. Leading and trailing whitespace
        is stripped from each processed string.

        """
        def aux(text: str) -> str:
            return re.sub(self._marks_re, ' ', text).strip()

        if isinstance(text, str):
            return aux(text)
        return [aux(line) for line in text]

    def preserve(self, text: Union[List[str], str]) -> Tuple[List[str], List[_MarkIndex]]:
        """Removes punctuation from `text`, allowing for further restoration

        This method returns the text as a flat list of punctuated chunks
        (empty chunks are dropped), along with the list of punctuation
        marks, as _MarkIndex tuples, for further restoration. Schematically:

            'hello, my world!' -> ['hello', 'my world'], [',', '!']

        """
        # NOTE(review): str2list presumably turns a single string into a
        # list of lines -- it is defined in phonemizer.utils, confirm there
        text: List[str] = str2list(text)
        preserved_text = []
        preserved_marks = []

        for num, line in enumerate(text):
            line, marks = self._preserve_line(line, num)
            preserved_text += line
            preserved_marks += marks
        return [line for line in preserved_text if line], preserved_marks

    def _preserve_line(self, line: str, num: int) -> Tuple[List[str], List[_MarkIndex]]:
        """Auxiliary method for Punctuation.preserve()

        Splits a single `line` (of index `num` in the text) into the
        chunks found between punctuation marks, and returns those chunks
        along with the marks. The returned chunks may include empty
        strings, they are filtered out by preserve().

        """
        matches = list(re.finditer(self._marks_re, line))
        if not matches:
            return [line], []

        # the line is made only of punctuation marks
        if len(matches) == 1 and matches[0].group() == line:
            return [], [_MarkIndex(num, line, 'A')]

        # build the list of mark indexes required to restore the punctuation
        marks = []
        for match in matches:
            # find the position of the punctuation mark in the utterance:
            # begin (B), end (E), in the middle (I) or alone (A)
            position = 'I'
            if match == matches[0] and line.startswith(match.group()):
                position = 'B'
            elif match == matches[-1] and line.endswith(match.group()):
                position = 'E'
            marks.append(_MarkIndex(num, match.group(), position))

        # split the line into sublines, each separated by a punctuation mark:
        # cut at the first occurrence of each mark and carry on with the
        # remaining part of the line
        preserved_line = []
        for mark in marks:
            prefix, _, line = line.partition(mark.mark)
            preserved_line.append(prefix)

        # append any trailing text to the preserved line
        return preserved_line + [line], marks

    @classmethod
    def restore(cls, text: Union[str, List[str]],
                marks: List[_MarkIndex],
                sep: Separator,
                strip: bool) -> List[str]:
        """Restore punctuation in a text.

        This is the reverse operation of Punctuation.preserve(). It takes a
        list of punctuated chunks and a list of punctuation marks, as well
        as the separator and strip parameters used by phonemize. It returns
        the punctuated text as a list:

            ['hello', 'my world'], [',', '!'] -> ['hello, my world!']

        Spaces inside the marks are replaced by the word separator
        `sep.word`. When `strip` is False, each restored line ends with
        the word separator.

        """
        text = str2list(text)
        punctuated_text = []
        # index of the line being rebuilt, compared to the marks' index
        pos = 0

        while text or marks:
            if not marks:
                for line in text:
                    # if strip is False, ensure the final word ends with a word separator
                    if not strip and sep.word and not line.endswith(sep.word):
                        line = line + sep.word
                    punctuated_text.append(line)
                text = []
            elif not text:
                # nothing has been phonemized, returns the marks alone, with
                # internal spaces replaced by the word separator. A plain
                # str.replace is used so the separator is inserted verbatim
                # (re.sub would interpret backslashes it contains).
                punctuated_text.append(
                    ''.join(m.mark for m in marks).replace(' ', sep.word))
                marks = []
            else:
                current_mark = marks[0]
                if current_mark.index == pos:
                    # place the current mark here
                    marks = marks[1:]
                    # replace internal spaces in the current mark with the
                    # word separator (inserted verbatim, see above)
                    mark = current_mark.mark.replace(' ', sep.word)

                    # remove the word last separator from the current word
                    if sep.word and text[0].endswith(sep.word):
                        text[0] = text[0][:-len(sep.word)]

                    if current_mark.position == 'B':
                        text[0] = mark + text[0]
                    elif current_mark.position == 'E':
                        punctuated_text.append(
                            text[0] + mark
                            + ('' if strip or mark.endswith(sep.word) else sep.word))
                        text = text[1:]
                        pos = pos + 1
                    elif current_mark.position == 'A':
                        punctuated_text.append(
                            mark
                            + ('' if strip or mark.endswith(sep.word) else sep.word))
                        pos = pos + 1
                    else:
                        # position == 'I'
                        if len(text) == 1:  # pragma: nocover
                            # a corner case where the final part of an intermediate
                            # mark (I) has not been phonemized
                            text[0] = text[0] + mark
                        else:
                            # glue the two chunks surrounding the mark
                            first_word = text[0]
                            text = text[1:]
                            text[0] = first_word + mark + text[0]
                else:
                    # no mark on this line, keep the chunk as it is
                    punctuated_text.append(text[0])
                    text = text[1:]
                    pos = pos + 1

        return punctuated_text