# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re
from g2p_en import G2p
from string import punctuation
from typing import Any, Dict, List, Optional, Pattern, Union
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.language_switch import LanguageSwitch
from phonemizer.backend.espeak.words_mismatch import WordMismatch
from phonemizer.punctuation import Punctuation
from phonemizer.separator import Separator

# A failed pypinyin import is deliberately ignored here; the imported names are
# only used inside PypinyinBackend.phonemize.
try:
    from pypinyin import Style, pinyin
    from pypinyin.style._utils import get_finals, get_initials
except Exception:
    pass

# This code is modified from
# https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/tokenizer.py


class PypinyinBackend:
    """Pypinyin-based phonemizer backend for Chinese.

    Most of the code is adapted from espnet. Two output flavours are supported:

    * ``"pypinyin"``: one token per syllable, e.g. "ni1 hao3".
    * ``"pypinyin_initials_finals"``: each syllable is split into its initial
      and final, e.g. "n i1 h ao3". ``"initials_finals"`` (the constructor
      default) is accepted as an alias for this flavour.
    """

    def __init__(
        self,
        backend="initials_finals",
        punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
    ) -> None:
        """Store the backend flavour and the punctuation character set.

        Args:
            backend: Name of the output flavour (see the class docstring). It
                is validated when ``phonemize`` is called, not here.
            punctuation_marks: Characters treated as punctuation. ``phonemize``
                tests single characters with ``in``, so a plain string is
                expected in practice.
        """
        self.backend = backend
        self.punctuation_marks = punctuation_marks

    def phonemize(
        self, text: List[str], separator: Separator, strip=True, njobs=1
    ) -> List[str]:
        """Convert each input string to a pinyin-based phoneme string.

        Args:
            text: List of sentences to convert.
            separator: Provides the ``word``, ``syllable`` and ``phone``
                separator strings inserted into the output.
            strip: Unused; kept for signature compatibility with the other
                backend used by ``G2PModule``.
            njobs: Unused; kept for the same reason.

        Returns:
            One phonemized string per input sentence.

        Raises:
            NotImplementedError: If ``self.backend`` is not a known flavour.
            ValueError: If a non-punctuation token cannot be split into an
                initial and a final because it does not end with an
                alphanumeric character.
        """
        assert isinstance(text, list)

        phonemized = []
        for _text in text:
            # Collapse runs of spaces, then mark word boundaries explicitly.
            _text = re.sub(" +", " ", _text.strip())
            _text = _text.replace(" ", separator.word)

            if self.backend == "pypinyin":
                split_syllables = False
            elif self.backend in ("pypinyin_initials_finals", "initials_finals"):
                split_syllables = True
            else:
                raise NotImplementedError(f"{self.backend}")

            phones = []
            for py in pinyin(_text, style=Style.TONE3, neutral_tone_with_five=True):
                token = py[0]
                if all(c in self.punctuation_marks for c in token):
                    # Punctuation attaches directly to the previous syllable,
                    # so remove the syllable separator emitted after it.
                    if phones:
                        assert phones[-1] == separator.syllable
                        phones.pop()
                    phones.extend(token)
                elif not split_syllables:
                    phones.extend([token, separator.syllable])
                elif token[-1].isalnum():
                    initial = get_initials(token, strict=False)
                    if token[-1].isdigit():
                        # Keep the trailing tone digit attached to the final.
                        final = get_finals(token[:-1], strict=False) + token[-1]
                    else:
                        final = get_finals(token, strict=False)
                    phones.extend(
                        [initial, separator.phone, final, separator.syllable]
                    )
                else:
                    # The original `assert ValueError` was always true and
                    # silently dropped such tokens.
                    raise ValueError(
                        f"Cannot split {token!r} into an initial and a final."
                    )

            phonemized.append(
                "".join(phones).rstrip(f"{separator.word}{separator.syllable}")
            )
        return phonemized


class G2PModule:
    """Phonemize Text."""

    # We support espeak to extract IPA (International Phonetic Alphabet), which supports 100 languages,
    # https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md

    def __init__(
        self,
        language="en-us",
        backend="espeak",
        separator=Separator(word="_", syllable="-", phone="|"),
        preserve_punctuation=True,
        punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
        with_stress: bool = False,
        tie: Union[bool, str] = False,
        language_switch: LanguageSwitch = "keep-flags",
        words_mismatch: WordMismatch = "ignore",
    ) -> None:
        """Create the phonemizer backend selected by ``backend``.

        Args:
            language: Language code handed to the backend.
            backend: ``"espeak"``, ``"pypinyin"`` or
                ``"pypinyin_initials_finals"``.
            separator: Word/syllable/phone separators used in the output.
            preserve_punctuation: Forwarded to ``EspeakBackend``.
            punctuation_marks: Characters treated as punctuation.
            with_stress: Forwarded to ``EspeakBackend``.
            tie: Forwarded to ``EspeakBackend``.
            language_switch: Forwarded to ``EspeakBackend``.
            words_mismatch: Forwarded to ``EspeakBackend``.
        """
        # The separator must be set first: _initialize_backend reads it.
        self.separator = separator
        self.backend = self._initialize_backend(
            backend,
            language,
            punctuation_marks,
            preserve_punctuation,
            with_stress,
            tie,
            language_switch,
            words_mismatch,
        )

    def _initialize_backend(
        self,
        backend,
        language,
        punctuation_marks,
        preserve_punctuation,
        with_stress,
        tie,
        language_switch,
        words_mismatch,
    ):
        """Build and return the backend object for ``backend``.

        Raises:
            ValueError: If a pypinyin backend is requested for a language
                other than ``"cmn"``.
            NotImplementedError: If ``backend`` is not recognised.
        """
        if backend == "espeak":
            return EspeakBackend(
                language,
                punctuation_marks=punctuation_marks,
                preserve_punctuation=preserve_punctuation,
                with_stress=with_stress,
                tie=tie,
                language_switch=language_switch,
                words_mismatch=words_mismatch,
            )
        elif backend in ["pypinyin", "pypinyin_initials_finals"]:
            if language != "cmn":
                raise ValueError(
                    f"{language} is not supported for pypinyin and pypinyin_initials_finals."
                )
            return PypinyinBackend(
                backend=backend,
                # The word separator is added so that PypinyinBackend treats
                # it like punctuation.
                punctuation_marks=punctuation_marks + self.separator.word,
            )
        else:
            raise NotImplementedError(f"{backend}")

    def to_list(self, phonemized: str) -> List[str]:
        """Split a phonemized string into a flat list of tokens.

        The string is split on the word separator; each word is tokenized into
        runs of word characters or single non-space symbols; tokens equal to
        the phone separator are dropped; and the word separator is re-inserted
        as its own token between words.

        Args:
            phonemized: A single phonemized string produced by the backend.

        Returns:
            The list of tokens, without a trailing word separator.
        """
        fields = []
        for word in phonemized.split(self.separator.word):
            pp = re.findall(r"\w+|[^\w\s]", word, re.UNICODE)
            fields.extend(
                [p for p in pp if p != self.separator.phone] + [self.separator.word]
            )
        # Sanity check: apart from phone separators, no character was lost.
        assert len("".join(fields[:-1])) == len(phonemized) - phonemized.count(
            self.separator.phone
        )
        return fields[:-1]

    def phonemization(self, text, strip=True) -> List[List[str]]:
        """Phonemize one string or a list of strings.

        Args:
            text: A string or a list of strings.
            strip: Forwarded to the backend's ``phonemize``.

        Returns:
            One token list (see ``to_list``) per input string.
        """
        if isinstance(text, str):
            text = [text]

        phonemized = self.backend.phonemize(
            text, separator=self.separator, strip=strip, njobs=1
        )
        phonemes = [self.to_list(p) for p in phonemized]
        return phonemes

    def g2p_conversion(self, text: str) -> List[str]:
        """Return the token list for a single, whitespace-stripped string."""
        phonemes = self.phonemization([text.strip()])
        return phonemes[0]


class LexiconModule:
    """Lexicon lookup with a G2P fallback for words missing from the lexicon."""

    def __init__(self, lex_path, language="en-us") -> None:
        """Load the lexicon file and create the fallback G2P model.

        Each non-blank line is read as ``WORD PHONE PHONE ...`` separated by
        whitespace. Words are lower-cased and the first entry for a word wins.

        Args:
            lex_path: Path to the lexicon text file (read as UTF-8).
            language: Language code; only ``"en-us"`` is handled by
                ``g2p_conversion``.
        """
        # todo: check lexicon derivation, merge with G2PModule?
        lexicon = {}
        with open(lex_path, encoding="utf-8") as f:
            for line in f:
                # str.split() ignores leading/trailing whitespace, so no empty
                # word or empty phone can be produced.
                fields = line.split()
                if not fields:
                    continue
                lexicon.setdefault(fields[0].lower(), fields[1:])

        self.lexicon = lexicon
        self.language = language
        self.lang2g2p = {"en-us": G2p()}

    def g2p_conversion(self, text):
        """Convert ``text`` to a space-separated phone string.

        Raises:
            NotImplementedError: If ``self.language`` is not ``"en-us"``.
                This is a ``RuntimeError`` subclass, which is what the former
                bare ``raise`` produced.
        """
        # todo: preprocess with other languages
        if self.language == "en-us":
            return self.preprocess_english(text)
        raise NotImplementedError(f"No support to {self.language}")

    def preprocess_english(self, text):
        """Convert English text to phones via the lexicon, falling back to G2P.

        Trailing punctuation is stripped, the text is split on punctuation and
        whitespace (the delimiters are kept as tokens), and each token is
        looked up case-insensitively in the lexicon or passed to the G2P model.

        Args:
            text: The English sentence to convert.

        Returns:
            The phones joined by single spaces; per the substitution below,
            ``{}`` and ``{<single symbol>}`` groups become ``{sp}`` before the
            final join.
        """
        text = text.rstrip(punctuation)

        g2p = self.lang2g2p["en-us"]
        phones = []
        for w in re.split(r"([,;.\-\?\!\s+])", text):
            if not w:
                # re.split yields empty strings between adjacent delimiters.
                continue
            key = w.lower()
            if key in self.lexicon:
                phones += self.lexicon[key]
            else:
                phones += [p for p in g2p(w) if p != " "]

        phones = "}{".join(phones)
        phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
        phones = phones.replace("}{", " ")
        return phones