Skip to content

Instantly share code, notes, and snippets.

@esnya
Created April 22, 2023 05:50
Show Gist options
  • Save esnya/76661d9dfb1a6dea41c099d435865844 to your computer and use it in GitHub Desktop.
Save esnya/76661d9dfb1a6dea41c099d435865844 to your computer and use it in GitHub Desktop.
Monkey patch for espnet2.bin.tts_inference.Text2Speech to support pyopenjtalk 0.3 and Python 3.10
# Original code: Apache License 2.0
# https://github.com/espnet/espnet/blob/master/LICENSE
# Modified by: esnya
# https://github.com/esnya
from espnet2.bin.tts_inference import Text2Speech
from espnet2.text.phoneme_tokenizer import (
pyopenjtalk_g2p_accent_with_pause,
pyopenjtalk_g2p_prosody,
)
def pyopenjtalk_g2p_accent_with_pause_patch(text) -> list[str]:
    """Convert text into a flat list of phoneme and accent tokens, keeping pauses.

    Full-context labels are produced with
    ``pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))``. Each label is
    handled as follows:

    * if its current phoneme (the text between the first ``-`` and the
      following ``+``) is ``pau``, the single token ``"pau"`` is emitted;
    * otherwise, if the label pattern matches exactly once, three tokens are
      emitted in this order: the phoneme, the digits captured after
      ``/F:..._`` and the value captured after ``/A:``;
    * any other label contributes nothing.

    Args:
        text: Input text handed to ``pyopenjtalk.run_frontend``.

    Returns:
        list[str]: The collected tokens.
    """
    import re

    import pyopenjtalk

    # Captures (phoneme, /A: value, digits after "/F:..._") from one label.
    label_pattern = re.compile(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)")

    tokens = []
    for label in pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)):
        current_phoneme = label.split("-")[1].split("+")[0]
        if current_phoneme == "pau":
            tokens.append("pau")
            continue

        matches = label_pattern.findall(label)
        if len(matches) != 1:
            continue

        phoneme, a_value, f_value = matches[0]
        # Note the output order: the /F: capture comes before the /A: capture.
        tokens.extend((phoneme, f_value, a_value))
    return tokens
def pyopenjtalk_g2p_prosody_patch(
    text: str, drop_unvoiced_vowels: bool = True
) -> list[str]:
    """Extract phoneme + prosody symbol sequence from input full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Full-context labels are obtained with
    ``pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))``.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): If True, the upper-case vowels
            ``A``/``E``/``I``/``O``/``U`` are lower-cased, i.e. unvoiced vowels
            are treated as normal vowels.

    Returns:
        list[str]: List of phoneme + prosody symbols. Besides phonemes the
        list may contain ``^`` (leading ``sil``), ``$`` or ``?`` (trailing
        ``sil``, declarative / question), ``_`` (``pau``), ``#`` (accent phrase
        border), ``]`` (pitch falling) and ``[`` (pitch rising).

    Raises:
        AssertionError: If a ``sil`` label appears anywhere other than the
            first or last position.

    Examples:
        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

    """
    import re

    import pyopenjtalk
    from espnet2.text.phoneme_tokenizer import _numeric_feature_by_regex

    # Exact-match phoneme sets. A substring test such as ``p3 in "AEIOU"``
    # would also accept "", "AE", "c", "l" or "Nc", none of which is intended.
    unvoiced_vowels = frozenset("AEIOU")
    border_phonemes = frozenset("aeiouAEIOU") | {"N", "cl"}

    current_phoneme_re = re.compile(r"\-(.*?)\+")

    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    N = len(labels)

    phones = []
    for n, lab_curr in enumerate(labels):
        # current phoneme
        p3 = current_phoneme_re.search(lab_curr).group(1)  # type: ignore

        # deal unvoiced vowels as normal vowels
        if drop_unvoiced_vowels and p3 in unvoiced_vowels:
            p3 = p3.lower()

        # deal with sil at the beginning and the end of text
        if p3 == "sil":
            assert n == 0 or n == N - 1
            if n == 0:
                phones.append("^")
            elif n == N - 1:
                # check question form or not; any value other than 0 or 1
                # appends no end-of-text symbol
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue
        elif p3 == "pau":
            phones.append("_")
            continue
        else:
            phones.append(p3)

        # accent type and position info (forward or backward)
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

        # Looks one label ahead; this relies on the sequence ending with a
        # ``sil``/``pau`` label so that ``n + 1`` is always a valid index here.
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])

        # accent phrase border
        if a3 == 1 and a2_next == 1 and p3 in border_phonemes:
            phones.append("#")
        # pitch falling
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # pitch rising
        elif a2 == 1 and a2_next == 2:
            phones.append("[")

    return phones
def patch_tts(tts: Text2Speech) -> Text2Speech:
    """Swap the pyopenjtalk g2p function of ``tts`` for its patched variant.

    If ``tts.preprocess_fn.tokenizer.g2p`` is ``pyopenjtalk_g2p_prosody`` or
    ``pyopenjtalk_g2p_accent_with_pause`` it is replaced in place by
    ``pyopenjtalk_g2p_prosody_patch`` or
    ``pyopenjtalk_g2p_accent_with_pause_patch`` respectively. In every other
    case, including when the preprocessor, tokenizer or ``g2p`` attribute is
    absent or ``None``, ``tts`` is left untouched.

    Args:
        tts: The ``Text2Speech`` instance to patch (modified in place).

    Returns:
        Text2Speech: The same ``tts`` object, for call chaining.
    """
    # Walk the attribute chain defensively so that an object with nothing to
    # patch is returned as-is instead of raising AttributeError.
    # NOTE(review): presumably this covers models built without a preprocessor
    # or with a non-phoneme tokenizer; confirm against ESPnet.
    preprocess_fn = getattr(tts, "preprocess_fn", None)
    tokenizer = getattr(preprocess_fn, "tokenizer", None)
    g2p = getattr(tokenizer, "g2p", None)
    if g2p is None:
        return tts

    if g2p == pyopenjtalk_g2p_prosody:
        tokenizer.g2p = pyopenjtalk_g2p_prosody_patch
    elif g2p == pyopenjtalk_g2p_accent_with_pause:
        tokenizer.g2p = pyopenjtalk_g2p_accent_with_pause_patch
    return tts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment