Spaces:
Running
on
A10G
Running
on
A10G
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# This module is modified from [Whisper](https://github.com/openai/whisper.git).
# ## Citations
# ```bibtex
# @inproceedings{openai-whisper,
#   author    = {Alec Radford and
#                Jong Wook Kim and
#                Tao Xu and
#                Greg Brockman and
#                Christine McLeavey and
#                Ilya Sutskever},
#   title     = {Robust Speech Recognition via Large-Scale Weak Supervision},
#   booktitle = {{ICML}},
#   series    = {Proceedings of Machine Learning Research},
#   volume    = {202},
#   pages     = {28492--28518},
#   publisher = {{PMLR}},
#   year      = {2023}
# }
# ```
#
import re | |
import unicodedata | |
import regex | |
# Non-ASCII letters that are not separated by "NFKD" normalization, mapped by
# hand to an ASCII replacement. Each replacement keeps the case of the letter
# it stands for (lower-case key -> lower-case value, upper-case key ->
# upper-case value), so the table can be applied to text that has not been
# lower-cased yet.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "TH",  # was "th": the only upper-case key with a lower-case value
    "ł": "l",
    "Ł": "L",
}
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Strip diacritics and blank out markers, symbols, and punctuation.

    The input is decomposed with "NFKD" normalization, then every resulting
    character is handled by the first rule that applies:

    1. a character contained in `keep` is copied through untouched;
    2. a key of ADDITIONAL_DIACRITICS is replaced by its mapped value;
    3. a nonspacing mark (category "Mn") is dropped;
    4. any other mark, symbol, or punctuation character (major category
       "M", "S", or "P") becomes a single space;
    5. anything else is copied through unchanged.

    Note that `keep` is matched against the decomposed characters, not
    against the characters of the original string.
    """
    pieces = []
    for ch in unicodedata.normalize("NFKD", s):
        if ch in keep:
            pieces.append(ch)
        elif ch in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[ch])
        else:
            category = unicodedata.category(ch)
            if category == "Mn":
                # combining accent left over from the decomposition: drop it
                continue
            pieces.append(" " if category[0] in "MSP" else ch)
    return "".join(pieces)
def remove_symbols(s: str):
    """
    Blank out markers, symbols, and punctuation while keeping diacritics.

    The input is composed with "NFKC" normalization; every character whose
    Unicode major category is "M", "S", or "P" is then replaced by a single
    space, and all other characters are copied through unchanged.
    """
    out = []
    for ch in unicodedata.normalize("NFKC", s):
        major_class = unicodedata.category(ch)[0]
        out.append(ch if major_class not in "MSP" else " ")
    return "".join(out)
class BasicTextNormalizer:
    """
    Callable that lower-cases text, removes bracketed spans, and blanks symbols.

    Calling an instance on a string performs, in order:

    1. lower-casing;
    2. removal of spans that open with "<" or "[" and close with ">" or "]";
    3. removal of non-empty spans enclosed in parentheses;
    4. the configured cleaning function, followed by another lower-casing;
    5. optionally, splitting into extended grapheme clusters joined by spaces;
    6. collapsing every run of whitespace into one space.

    Leading and trailing whitespace is collapsed but not stripped.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        """
        Args:
            remove_diacritics: use `remove_symbols_and_diacritics` as the
                cleaning function when true, `remove_symbols` otherwise.
            split_letters: when true, separate the grapheme clusters of the
                cleaned text with spaces.
        """
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        """Return the normalized form of `s`."""
        text = s.lower()
        bracketed_spans = (
            r"[<\[][^>\]]*[>\]]",  # words between brackets
            r"\(([^)]+?)\)",  # words between parentheses
        )
        for pattern in bracketed_spans:
            text = re.sub(pattern, "", text)
        text = self.clean(text).lower()
        if self.split_letters:
            graphemes = regex.findall(r"\X", text, regex.U)
            text = " ".join(graphemes)
        # replace any successive whitespace characters with a space
        return re.sub(r"\s+", " ", text)