Extract the Preprocessing for Silero into a file and Improve it (#757)
This commit is contained in:
parent
744bf7cbf2
commit
3f922d4bfb
4 changed files with 311 additions and 29 deletions
195
extensions/silero_tts/tts_preprocessor.py
Normal file
195
extensions/silero_tts/tts_preprocessor.py
Normal file
|
@ -0,0 +1,195 @@
|
|||
import re
|
||||
|
||||
from num2words import num2words
|
||||
|
||||
|
||||
punctuation = r'[\s,.?!/)\'\]>]'
|
||||
alphabet_map = {
|
||||
"A": " Ei ",
|
||||
"B": " Bee ",
|
||||
"C": " See ",
|
||||
"D": " Dee ",
|
||||
"E": " Eee ",
|
||||
"F": " Eff ",
|
||||
"G": " Jee ",
|
||||
"H": " Eich ",
|
||||
"I": " Eye ",
|
||||
"J": " Jay ",
|
||||
"K": " Kay ",
|
||||
"L": " El ",
|
||||
"M": " Emm ",
|
||||
"N": " Enn ",
|
||||
"O": " Ohh ",
|
||||
"P": " Pee ",
|
||||
"Q": " Queue ",
|
||||
"R": " Are ",
|
||||
"S": " Ess ",
|
||||
"T": " Tee ",
|
||||
"U": " You ",
|
||||
"V": " Vee ",
|
||||
"W": " Double You ",
|
||||
"X": " Ex ",
|
||||
"Y": " Why ",
|
||||
"Z": " Zed " # Zed is weird, as I (da3dsoul) am American, but most of the voice models sound British, so it matches
|
||||
}
|
||||
|
||||
|
||||
def preprocess(string):
|
||||
# the order for some of these matter
|
||||
# For example, you need to remove the commas in numbers before expanding them
|
||||
string = remove_surrounded_chars(string)
|
||||
string = string.replace('"', '')
|
||||
string = string.replace('\u201D', '').replace('\u201C', '') # right and left quote
|
||||
string = string.replace('\u201F', '') # italic looking quote
|
||||
string = string.replace('\n', ' ')
|
||||
string = convert_num_locale(string)
|
||||
string = replace_negative(string)
|
||||
string = replace_roman(string)
|
||||
string = hyphen_range_to(string)
|
||||
string = num_to_words(string)
|
||||
|
||||
# TODO Try to use a ML predictor to expand abbreviations. It's hard, dependent on context, and whether to actually
|
||||
# try to say the abbreviation or spell it out as I've done below is not agreed upon
|
||||
|
||||
# For now, expand abbreviations to pronunciations
|
||||
# replace_abbreviations adds a lot of unnecessary whitespace to ensure separation
|
||||
string = replace_abbreviations(string)
|
||||
string = replace_lowercase_abbreviations(string)
|
||||
|
||||
# cleanup whitespaces
|
||||
# remove whitespace before punctuation
|
||||
string = re.sub(rf'\s+({punctuation})', r'\1', string)
|
||||
string = string.strip()
|
||||
# compact whitespace
|
||||
string = ' '.join(string.split())
|
||||
|
||||
return string
|
||||
|
||||
|
||||
def remove_surrounded_chars(string):
|
||||
# this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR
|
||||
# 'as few symbols as possible (0 upwards) between an asterisk and the end of the string'
|
||||
return re.sub(r'\*[^*]*?(\*|$)', '', string)
|
||||
|
||||
|
||||
def convert_num_locale(text):
|
||||
# This detects locale and converts it to American without comma separators
|
||||
pattern = re.compile(r'(?:\s|^)\d{1,3}(?:\.\d{3})+(,\d+)(?:\s|$)')
|
||||
result = text
|
||||
while True:
|
||||
match = pattern.search(result)
|
||||
if match is None:
|
||||
break
|
||||
|
||||
start = match.start()
|
||||
end = match.end()
|
||||
result = result[0:start] + result[start:end].replace('.', '').replace(',', '.') + result[end:len(result)]
|
||||
|
||||
# removes comma separators from existing American numbers
|
||||
pattern = re.compile(r'(\d),(\d)')
|
||||
result = pattern.sub(r'\1\2', result)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def replace_negative(string):
|
||||
# handles situations like -5. -5 would become negative 5, which would then be expanded to negative five
|
||||
return re.sub(rf'(\s)(-)(\d+)({punctuation})', r'\1negative \3\4', string)
|
||||
|
||||
|
||||
def replace_roman(string):
|
||||
# find a string of roman numerals.
|
||||
# Only 2 or more, to avoid capturing I and single character abbreviations, like names
|
||||
pattern = re.compile(rf'\s[IVXLCDM]{{2,}}{punctuation}')
|
||||
result = string
|
||||
while True:
|
||||
match = pattern.search(result)
|
||||
if match is None:
|
||||
break
|
||||
|
||||
start = match.start()
|
||||
end = match.end()
|
||||
result = result[0:start + 1] + str(roman_to_int(result[start + 1:end - 1])) + result[end - 1:len(result)]
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def roman_to_int(s):
|
||||
rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
|
||||
int_val = 0
|
||||
for i in range(len(s)):
|
||||
if i > 0 and rom_val[s[i]] > rom_val[s[i - 1]]:
|
||||
int_val += rom_val[s[i]] - 2 * rom_val[s[i - 1]]
|
||||
else:
|
||||
int_val += rom_val[s[i]]
|
||||
return int_val
|
||||
|
||||
|
||||
def hyphen_range_to(text):
|
||||
pattern = re.compile(r'(\d+)[-–](\d+)')
|
||||
result = pattern.sub(lambda x: x.group(1) + ' to ' + x.group(2), text)
|
||||
return result
|
||||
|
||||
|
||||
def num_to_words(text):
|
||||
# 1000 or 10.23
|
||||
pattern = re.compile(r'\d+\.\d+|\d+')
|
||||
result = pattern.sub(lambda x: num2words(float(x.group())), text)
|
||||
return result
|
||||
|
||||
|
||||
def replace_abbreviations(string):
|
||||
# abbreviations 1 to 4 characters long. It will get things like A and I, but those are pronounced with their letter
|
||||
pattern = re.compile(rf'(^|[\s(.\'\[<])([A-Z]{{1,4}})({punctuation}|$)')
|
||||
result = string
|
||||
while True:
|
||||
match = pattern.search(result)
|
||||
if match is None:
|
||||
break
|
||||
|
||||
start = match.start()
|
||||
end = match.end()
|
||||
result = result[0:start] + replace_abbreviation(result[start:end]) + result[end:len(result)]
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def replace_lowercase_abbreviations(string):
|
||||
# abbreviations 1 to 4 characters long, separated by dots i.e. e.g.
|
||||
pattern = re.compile(rf'(^|[\s(.\'\[<])(([a-z]\.){{1,4}})({punctuation}|$)')
|
||||
result = string
|
||||
while True:
|
||||
match = pattern.search(result)
|
||||
if match is None:
|
||||
break
|
||||
|
||||
start = match.start()
|
||||
end = match.end()
|
||||
result = result[0:start] + replace_abbreviation(result[start:end].upper()) + result[end:len(result)]
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def replace_abbreviation(string):
|
||||
result = ""
|
||||
for char in string:
|
||||
result += match_mapping(char)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def match_mapping(char):
|
||||
for mapping in alphabet_map.keys():
|
||||
if char == mapping:
|
||||
return alphabet_map[char]
|
||||
|
||||
return char
|
||||
|
||||
|
||||
def __main__(args):
|
||||
print(preprocess(args[1]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
__main__(sys.argv)
|
Loading…
Add table
Add a link
Reference in a new issue