"""
Normalize input text to a format that Soprano recognizes.
Adapted from https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/utils/tokenizer.py
"""
import os
import re

import inflect
from unidecode import unidecode

_inflect = inflect.engine()

####################################################################################################
# Abbreviations

# (compiled pattern, replacement) pairs for dotted, case-insensitive abbreviations ("Dr." -> "doctor").
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misuss'),
    ('ms', 'miss'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
]]

# (compiled pattern, replacement) pairs for case-sensitive whole-word abbreviations.
# Plural forms are listed before their singular so the reading order mirrors the data.
_cased_abbreviations = [(re.compile('\\b%s\\b' % x[0]), x[1]) for x in [
    ('TTS', 'text to speech'),
    ('Hz', 'hertz'),
    ('kHz', 'kilohertz'),
    ('KBs', 'kilobytes'),
    ('KB', 'kilobyte'),
    ('MBs', 'megabytes'),
    ('MB', 'megabyte'),
    ('GBs', 'gigabytes'),
    ('GB', 'gigabyte'),
    ('TBs', 'terabytes'),
    ('TB', 'terabyte'),
    ('APIs', 'a p i\'s'),
    ('API', 'a p i'),
    ('CLIs', 'c l i\'s'),
    ('CLI', 'c l i'),
    ('CPUs', 'c p u\'s'),
    ('CPU', 'c p u'),
    ('GPUs', 'g p u\'s'),
    ('GPU', 'g p u'),
    ('Ave', 'avenue'),
    ('etc', 'etcetera'),
]]


def expand_abbreviations(text):
    """Replace every known abbreviation in *text* with its spoken form.

    The dotted, case-insensitive table is applied first, then the
    case-sensitive whole-word table.
    """
    for regex, replacement in _abbreviations + _cased_abbreviations:
        text = re.sub(regex, replacement, text)
    return text


####################################################################################################
# Numbers

_num_prefix_re = re.compile(r'#\d')
# The trailing \b keeps the magnitude letter from matching the start of a longer
# word or suffix: without it "5th" became "5 trillionh" and "5TB" "5 trillionB".
_num_suffix_re = re.compile(r'\d(K|M|B|T)\b', re.IGNORECASE)
# Splits digit/letter adjacency ("100x1" -> "100 x 1"). The negative lookahead
# leaves lowercase ordinal suffixes attached ("1st", "22nd") so the later
# ordinal expansion can still see them; (?-i:...) keeps that exemption
# case-sensitive, matching the case-sensitive ordinal pattern.
_num_letter_split_re = re.compile(r'(\d(?!(?-i:st|nd|rd|th)\b)[a-z]|[a-z]\d)', re.IGNORECASE)
_comma_number_re = re.compile(r'(\d[\d\,]+\d)')
_date_re = re.compile(r'(^|[^/])(\d\d?[/-]\d\d?[/-]\d\d(?:\d\d)?)($|[^/])')
_phone_number_re = re.compile(r'(\(?\d{3}\)?[-.\s]\d{3}[-.\s]?\d{4})')
_time_re = re.compile(r'(\d\d?:\d\d(?::\d\d)?)')
_pounds_re = re.compile(r'£([\d\,]*\d+)')
_dollars_re = re.compile(r'\$([\d\.\,]*\d+)')
_decimal_number_re = re.compile(r'(\d+(?:\.\d+)+)')
_multiply_re = re.compile(r'(\d\s?\*\s?\d)')
_divide_re = re.compile(r'(\d\s?/\s?\d)')
_add_re = re.compile(r'(\d\s?\+\s?\d)')
_subtract_re = re.compile(r'(\d?\s?-\s?\d)')  # also does negative numbers
_fraction_re = re.compile(r'(\d+(?:/\d+)+)')
_ordinal_re = re.compile(r'\d+(st|nd|rd|th)')
_number_re = re.compile(r'\d+')


def _expand_num_prefix(m):
    """Turn a matched '#<digit>' into 'number <digit>'."""
    match = m.group(0)
    return f"number {match[1]}"


def _expand_num_suffix(m):
    """Turn a matched '<digit><K|M|B|T>' into '<digit> thousand/million/billion/trillion'."""
    match = m.group(0)
    magnitudes = {'K': 'thousand', 'M': 'million', 'B': 'billion', 'T': 'trillion'}
    word = magnitudes.get(match[1].upper())
    if word is None:
        return match  # unexpected format
    return f"{match[0]} {word}"


def _split_alphanumeric(m):
    """Insert a space between the two characters of a digit/letter pair."""
    match = m.group(1)
    return f"{match[0]} {match[1]}"


def _remove_commas(m):
    """Drop the commas from a matched comma-grouped number."""
    return m.group(1).replace(',', '')


def _expand_date(m):
    """Join the components of a matched date with ' dash ', keeping the surrounding characters."""
    parts = re.split('[./-]', m.group(2))
    return m.group(1) + ' dash '.join(parts) + m.group(3)


def _expand_phone_number(m):
    """Spell a matched 10-digit phone number digit by digit in 3, 3, 4 groups."""
    digits = re.sub(r'\D', '', m.group(1))
    if len(digits) != 10:
        # The pattern only admits 3+3+4 digits; leave anything else untouched
        # rather than relying on an assert that is stripped under -O.
        return m.group(1)
    return f"{' '.join(digits[:3])}, {' '.join(digits[3:6])}, {' '.join(digits[6:])}"


def _spoken_time_field(value, zero_form):
    """Return the spoken form of one two-digit time field.

    '00' becomes *zero_form*, a leading-zero value gets an 'oh ' prefix,
    anything else is returned unchanged.
    """
    if value == '00':
        return zero_form
    if value.startswith('0'):
        return f'oh {value}'
    return value


def _expand_time(m):
    """Rewrite a matched 'h:mm' or 'h:mm:ss' time as space-separated fields.

    Two-field times: ':00' yields '0' for hour zero, '<h> minutes' for hours
    above 12 and "<h> o'clock" otherwise; a leading-zero minute becomes
    'oh <digit>'. Three-field times drop leading all-zero fields.
    """
    parts = m.group(1).split(':')
    if len(parts) == 2:
        hours, minutes = parts
        if minutes == '00':
            if int(hours) == 0:
                return '0'
            elif int(hours) > 12:
                return f"{hours} minutes"
            return f"{hours} o'clock"
        elif minutes.startswith('0'):
            minutes = f'oh {minutes[1:]}'
        return f"{hours} {minutes}"

    hours, minutes, seconds = parts
    if int(hours) != 0:
        # The minutes used to be interpolated as a set literal ({minutes}),
        # which rendered as "{'15'}" in the output; emit the plain string.
        return f"{hours} {_spoken_time_field(minutes, 'oh oh')} {_spoken_time_field(seconds, '')}"
    elif minutes != '00':
        return f"{minutes} {_spoken_time_field(seconds, 'oh oh')}"
    else:
        return seconds


def _expand_dollars(m):
    """Rewrite a matched dollar amount as '<n> dollar(s), <n> cent(s)'."""
    # Commas between digits are normally removed earlier; strip any stragglers
    # so int() below cannot fail on input such as "$,5".
    match = m.group(1).replace(',', '')
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    dollar_unit = 'dollar' if dollars == 1 else 'dollars'
    cent_unit = 'cent' if cents == 1 else 'cents'
    if dollars and cents:
        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        return '%s %s' % (dollars, dollar_unit)
    elif cents:
        return '%s %s' % (cents, cent_unit)
    else:
        return 'zero dollars'


def _expand_decimal_point(m):
    """Rewrite a dotted number as '<int> point d d d', spelling fractional digits one by one."""
    parts = m.group(1).split('.')
    return parts[0] + ' point ' + ' point '.join(' '.join(part) for part in parts[1:])


def _expand_fraction(m):
    """Join a two-part fraction with ' over ' and longer slash chains with ' slash '."""
    parts = m.group(1).split('/')
    return ' over '.join(parts) if len(parts) == 2 else ' slash '.join(parts)


def _expand_multiply(m):
    """Replace '*' in the match with ' times '."""
    return ' times '.join(m.group(1).split('*'))


def _expand_divide(m):
    """Replace '/' in the match with ' over '."""
    return ' over '.join(m.group(1).split('/'))


def _expand_add(m):
    """Replace '+' in the match with ' plus '."""
    return ' plus '.join(m.group(1).split('+'))


def _expand_subtract(m):
    """Replace '-' in the match with ' minus '."""
    return ' minus '.join(m.group(1).split('-'))


def _expand_ordinal(m):
    """Convert a matched ordinal (digits plus suffix) to words via inflect."""
    return _inflect.number_to_words(m.group(0), andword='')


def _expand_number(m):
    """Convert a matched integer to words; values between 1000 and 3000 are read in year style."""
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return 'two thousand'
        elif num > 2000 and num < 2010:
            return 'two thousand ' + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')


def normalize_numbers(text):
    """Expand every numeric construct in *text* into words.

    The substitutions run in a fixed order; later patterns rely on the
    rewrites made by earlier ones (for example commas are removed before
    currency and plain-number expansion).
    """
    text = re.sub(_num_prefix_re, _expand_num_prefix, text)
    text = re.sub(_num_suffix_re, _expand_num_suffix, text)
    for _ in range(2):  # need to do this twice to find all matches
        text = re.sub(_num_letter_split_re, _split_alphanumeric, text)
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_date_re, _expand_date, text)
    text = re.sub(_phone_number_re, _expand_phone_number, text)
    text = re.sub(_time_re, _expand_time, text)
    text = re.sub(_pounds_re, r'\1 pounds', text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_multiply_re, _expand_multiply, text)
    text = re.sub(_divide_re, _expand_divide, text)
    text = re.sub(_add_re, _expand_add, text)
    text = re.sub(_subtract_re, _expand_subtract, text)
    text = re.sub(_fraction_re, _expand_fraction, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text


####################################################################################################
# Special characters & other patterns

# Applied in order, so multi-character symbols ('<3', '<=', '>=') precede '<', '>' and '='.
_special_characters = [(re.compile(x[0]), x[1]) for x in [
    ('@', ' at '),
    ('&', ' and '),
    ('%', ' percent '),
    (':', '.'),
    (';', ','),
    (r'\+', ' plus '),
    (r'\\', ' backslash '),
    ('~', ' about '),
    ('(^| )<3', ' heart '),
    ('<=', ' less than or equal to '),
    ('>=', ' greater than or equal to '),
    ('<', ' less than '),
    ('>', ' greater than '),
    ('=', ' equals '),
    ('/', ' slash '),
    ('_', ' '),
]]

_link_header_re = re.compile(r'(https?://)')
_dash_re = re.compile(r'(. - .)')
_dot_re = re.compile(r'([A-Z]\.[A-Z])', re.IGNORECASE)
_parentheses_re = re.compile(r'[\(\[\{].*[\)\]\}](.|$)')


def expand_special_characters(text):
    """Replace each symbol in the special-character table with its spoken form."""
    for regex, replacement in _special_characters:
        text = re.sub(regex, replacement, text)
    return text


def _expand_link_header(m):
    """Spell out a matched URL scheme, distinguishing http from https."""
    if m.group(1).startswith('https'):
        return 'h t t p s colon slash slash '
    return 'h t t p colon slash slash '


def _expand_dash(m):
    """Turn a spaced dash ('x - y') into a comma pause ('x, y')."""
    match = m.group(0)
    return f"{match[0]}, {match[4]}"


def _expand_dot(m):
    """Turn a dot between two letters into the word 'dot'."""
    match = m.group(0)
    return f"{match[0]} dot {match[2]}"


def _closing_bracket_pause(m):
    """Replace a closing bracket with ', ' while keeping a non-space follower."""
    follower = m.group(1)
    # A following whitespace character is absorbed into the ', ' pause; any
    # other character is kept so that "(a)b" no longer loses the "b".
    return ', ' if follower.isspace() else ', ' + follower


def _expand_parantheses(m):
    """Turn a bracketed aside into a comma-delimited one.

    Opening brackets become ', '; a closing bracket followed by anything
    other than '$', '.', '!', '?' or ',' becomes ', '; remaining closing
    brackets are removed.
    """
    match = m.group(0)
    match = re.sub(r'[\(\[\{]', ', ', match)
    match = re.sub(r'[\)\]\}]([^$.!?,])', _closing_bracket_pause, match)
    match = re.sub(r'[\)\]\}]', '', match)
    return match


def normalize_special(text):
    """Rewrite link headers, spaced dashes, letter-dot-letter and bracketed asides."""
    text = re.sub(_link_header_re, _expand_link_header, text)
    text = re.sub(_dash_re, _expand_dash, text)
    text = re.sub(_dot_re, _expand_dot, text)
    text = re.sub(_parentheses_re, _expand_parantheses, text)
    return text


####################################################################################################
# Misc

def lowercase(text):
    """Return *text* lowercased."""
    return text.lower()


def convert_to_ascii(text):
    """Transliterate *text* with unidecode."""
    return unidecode(text)


def normalize_newlines(text):
    """Join the lines of *text* with spaces, ending each non-empty line with punctuation.

    Every line is stripped; a non-empty line that does not already end in
    '.', '!' or '?' gets a trailing '.'. Empty and whitespace-only lines are
    kept as empty strings (whitespace-only lines used to raise IndexError).
    """
    lines = text.split('\n')
    for i, line in enumerate(lines):
        line = line.strip()
        if line and line[-1] not in '.!?':
            line = f"{line}."
        lines[i] = line
    return ' '.join(lines)


def remove_unknown_characters(text):
    """Drop every character outside the allowed set, then drop '<', '>', '/', '_' and '+'."""
    # '-' is escaped explicitly; the earlier ',-.' spelling only kept it
    # because it happened to form a range covering ',', '-' and '.'.
    text = re.sub(r"[^A-Za-z !$%&'*+,\-./0-9<>?_]", "", text)
    text = re.sub(r"[<>/_+]", "", text)
    return text


def collapse_whitespace(text):
    """Collapse whitespace runs to one space and remove the space before '.', '?', '!' or ','."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r' [.\?!,]', lambda m: m.group(0)[1], text)
    return text


def dedup_punctuation(text):
    """Collapse runs of punctuation to a single mark while preserving ellipses.

    Runs of three or more dots are protected behind a placeholder, the
    remaining comma / period / '!' / '?' runs are collapsed, and the
    placeholder is turned back into '...'.
    """
    placeholder = "[ELLIPSIS]"
    text = re.sub(r"\.\.\.+", placeholder, text)
    text = re.sub(r",+", ",", text)
    text = re.sub(r"[\.,]*\.[\.,]*", ".", text)
    text = re.sub(r"[\.,!]*![\.,!]*", "!", text)
    text = re.sub(r"[\.,!\?]*\?[\.,!\?]*", "?", text)
    # Literal replacement: as a regex, "[ELLIPSIS]" is a character class and
    # replaced each of E, L, I, P, S separately, leaving the brackets behind.
    text = text.replace(placeholder, "...")
    return text


def clean_text(text):
    """Run the full normalization pipeline on *text* and return the result."""
    # NOTE(review): ASCII conversion runs before number expansion, so a '£'
    # may already be transliterated before the pounds pattern sees it - confirm.
    text = convert_to_ascii(text)
    text = normalize_newlines(text)
    text = normalize_numbers(text)
    text = normalize_special(text)
    text = expand_abbreviations(text)
    text = expand_special_characters(text)
    text = lowercase(text)
    text = remove_unknown_characters(text)
    text = collapse_whitespace(text)
    text = dedup_punctuation(text)
    return text


if __name__ == '__main__':
    # Manual smoke test: print the number normalization of representative inputs.
    for sample in (
        '1,2,3,456,176',
        '123,456,789',
        '123,456,789th',
        '123-456-7890',
        '111-111-1111',
        '(111) 111-1111',
        'A(111) 111-1111',
        'A (111) 111-1111',
        '$2.47',
        '$247',
        '$0.27',
        '$1.00',
        '£20',
    ):
        print(normalize_numbers(sample))
    for i in range(1990, 2030):
        print(normalize_numbers(str(i)))
    for sample in (
        '2656',
        '1024',
        '2.47023',
        '20.47023',
        '1.17.1.1',
        '111.111.1111',
        '1/1/2025',
        '1-1-2025',
        '1-1-25',
        'A 1/1/11 A',
        'A 1/1 A',
        '1/1',
        '1/10',
        '1/1/10',
        '11/1/1/10',
        '0:00',
        '12:00',
        '13:00',
        '8:00',
        '8:05',
        '8:15',
        '0:00:00',
        '00:01:10',
        '00:10:01',
        '01:01:01',
        '00:01:00',
        '01:00:00',
        '-1 + 2 * 3 - 4 / 5',
        '-1+2*3-5/4/25',
        '100x1',
        '100k',
        '100m',
        '100b',
        '100t',
        '#1',
        '12:00',
        '11:59',
        '01:00',
        '0100',
    ):
        print(normalize_numbers(sample))