# Spaces: Sleeping
import functools
import re

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from IPython.display import Audio
from transformers import SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan
from transformers import SpeechT5Processor
# English names for the integers that cannot be composed from smaller parts:
# 0-19, the multiples of ten up to 90, and the scale words for 100 and 1000.
number_words = {
    # units
    0: "zero", 1: "one", 2: "two", 3: "three", 4: "four",
    5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
    # ten through nineteen
    10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen",
    15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen",
    # multiples of ten
    20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
    60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety",
    # scale words
    100: "hundred", 1000: "thousand",
}
# Ordered (source, target) substitutions applied to the text by cleanup_text().
#
# NOTE(review): four source strings in this table had all collapsed into the
# same mis-decoded character, which made every entry after the first
# unreachable. The code points below are inferred from the targets they map to
# (double quotes, a comma-like low quote, a dash) -- confirm against the
# author's original file.
#
# Symbol names are padded with spaces so they do not fuse with neighbouring
# characters ("50%" -> "50 percent ", not "50percent").
replacements = [
    ("\u201c", '"'),   # left double quotation mark
    ("\u201d", '"'),   # right double quotation mark
    ("\u201a", ","),   # single low-9 quotation mark (renders like a comma)
    ("_", " "),
    ("\xa0", " "),     # non-breaking space
    ("\n", " "),
    ("$", " dollar "),
    ("%", " percent "),
    ("&", " and "),
    ("*", " star "),
    ("+", " plus "),
    ("\u2013", "-"),   # en dash
]
def number_to_words(number):
    """Spell out an integer in English words.

    Words are separated by single spaces and never hyphenated
    (``21 -> "twenty one"``). Every scale gets an explicit count, so ``100``
    is ``"one hundred"`` and ``2001000`` is ``"two million one thousand"``.

    Args:
        number: The integer to spell out. Negative values are prefixed with
            ``"minus"``.

    Returns:
        The spelled-out number, or ``str(number)`` unchanged when the
        magnitude is 10**12 or more (no scale word is defined beyond
        "billion").
    """
    if number < 0:
        return "minus " + number_to_words(-number)

    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        words = number_words[tens * 10]
        return f"{words} {number_words[unit]}" if unit else words

    if number >= 1_000_000_000_000:
        # Beyond the largest supported scale: leave the digits as they are.
        return str(number)

    # Largest scale first, so the first one that fits is the leading group.
    scales = (
        (1_000_000_000, "billion"),
        (1_000_000, "million"),
        (1_000, "thousand"),
        (100, "hundred"),
    )
    for scale_value, scale_name in scales:
        if number >= scale_value:
            count, remainder = divmod(number, scale_value)
            # Always name the count, including "one", so that the result is
            # valid English at every scale and inside larger numbers.
            words = f"{number_to_words(count)} {scale_name}"
            if remainder:
                words += " " + number_to_words(remainder)
            return words

    # Unreachable for integers: every value >= 100 matches the "hundred" scale.
    return str(number)
def replace_numbers_with_words(text):
    """Return *text* with each standalone run of digits spelled out in words.

    Only digit runs delimited by word boundaries are converted; digits that
    touch a letter or underscore (for example ``"5G"`` or ``"mp3"``) are left
    untouched.
    """
    return re.sub(
        r'\b\d+\b',
        lambda found: number_to_words(int(found.group())),
        text,
    )
def cleanup_text(text):
    """Apply every (source, target) pair in ``replacements`` to *text*, in order."""
    cleaned = text
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned
def normalize_text(text):
    """Lowercase *text*, drop punctuation except apostrophes, collapse whitespace."""
    lowered = text.lower()
    # Keep only word characters, whitespace and apostrophes.
    without_punctuation = re.sub(r'[^\w\s\']', '', lowered)
    # split() with no argument discards leading/trailing whitespace and treats
    # any run of whitespace as a single separator.
    words = without_punctuation.split()
    return ' '.join(words)
# The text processor is loaded from the base SpeechT5 TTS checkpoint.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

# The speech model is loaded from a separate repository (presumably a
# fine-tuned variant, judging by its name). The repo id is kept exactly as
# written, including its spelling.
model = SpeechT5ForTextToSpeech.from_pretrained(
    "Yassmen/speecht5_finetuned_english_tehnical"
)
@functools.lru_cache(maxsize=1)
def _load_speaker_embeddings():
    """Read the speaker embedding from disk once and reuse it for every request."""
    return torch.tensor(np.load('speaker_embedding.npy'))


@functools.lru_cache(maxsize=1)
def _load_vocoder():
    """Instantiate the HiFi-GAN vocoder once instead of on every request."""
    return SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


def generate_wav_file(text):
    """Synthesize speech for *text* and write it to a WAV file.

    The text is passed through ``replace_numbers_with_words``,
    ``cleanup_text`` and ``normalize_text`` before being tokenized and fed to
    the model.

    Args:
        text: Raw user text.

    Returns:
        The path of the written WAV file (``"output.wav"``), or ``None`` when
        the text is empty after normalization or when any step fails. Failures
        are printed rather than raised.
    """
    try:
        converted_text = replace_numbers_with_words(text)
        cleaned_text = cleanup_text(converted_text)
        final_text = normalize_text(cleaned_text)

        # Nothing pronounceable left (empty input or punctuation only):
        # skip the model call instead of synthesizing from an empty prompt.
        if not final_text:
            return None

        inputs = processor(text=final_text, return_tensors="pt")

        # Both resources are identical for every request, so they are loaded
        # lazily on first use and cached. lru_cache does not cache exceptions,
        # so a failed load is retried on the next call.
        speaker_embeddings = _load_speaker_embeddings()
        vocoder = _load_vocoder()

        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

        # Convert the speech to a WAV file.
        # NOTE(review): 16000 is assumed to match the vocoder's output
        # sampling rate -- confirm against the vocoder configuration.
        output_file = "output.wav"
        sf.write(output_file, speech.detach().cpu().numpy(), 16000)
        return output_file  # Return the file path for download
    except Exception as e:
        # Top-level boundary for the UI callback: report the failure and
        # return None rather than propagate the exception.
        print(f"Error: {e}")
        return None
# UI components: a three-line text box in, an audio player (fed by file path) out.
text_input = gr.Textbox(lines=3, label="Enter text to convert to speech")
audio_output = gr.Audio(type="filepath", label="Generated Audio")

# Wire the synthesis function to the components.
iface = gr.Interface(
    fn=generate_wav_file,
    inputs=text_input,
    outputs=audio_output,
    title="Text-to-Speech Technical EN",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()