import os
import datetime
import tempfile
from typing import Optional

import gradio as gr
from transformers import pipeline
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
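
# Hugging Face access token read from the environment (e.g. a Space secret); not referenced elsewhere in this script.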
HF_TOKEN = os.environ.get("HF_TOKEN")
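
# Speech recognition: wav2vec2 ASR pipeline from Hugging Face transformers.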
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
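
# Coqui TTS model names in <language>/<dataset>/<architecture> form; each gets a "tts_models/" prefix when downloaded below.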
MODEL_NAMES = [
    "en/ljspeech/tacotron2-DDC",
    "en/ljspeech/glow-tts",
    "en/ljspeech/speedy-speech-wn",
    "en/ljspeech/vits",
    "en/sam/tacotron-DDC",
    "fr/mai/tacotron2-DDC",
    "de/thorsten/tacotron2-DCA",
]
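
# Download every model (plus its default vocoder, when one is defined) and keep a ready Synthesizer per model name.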
MODELS = {}
manager = ModelManager()
for MODEL_NAME in MODEL_NAMES:
    print(f"downloading {MODEL_NAME}")
    model_path, config_path, model_item = manager.download_model(f"tts_models/{MODEL_NAME}")
    vocoder_name: Optional[str] = model_item["default_vocoder"]
    vocoder_path = None
    vocoder_config_path = None
    if vocoder_name is not None:
        vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)

    synthesizer = Synthesizer(
        model_path, config_path, None, vocoder_path, vocoder_config_path,
    )
    MODELS[MODEL_NAME] = synthesizer
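

# Transcribe an audio file with the ASR pipeline (speech_to_text below does the same and is the one wired to the UI).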
def transcribe(audio):
    text = asr(audio)["text"]
    return text
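

# Sentiment classifier using the default text-classification pipeline model; used only by text_to_sentiment.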
classifier = pipeline("text-classification")
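

# Convert recorded speech to text; handler for the "Recognize Speech" button.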
def speech_to_text(speech):
    text = asr(speech)["text"]
    return text
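

# Return the top sentiment label for a piece of text (not connected to a button in this demo).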
def text_to_sentiment(text):
    sentiment = classifier(text)[0]["label"]
    return sentiment
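

# Firestore helpers: upsert, select, and selectall assume a Cloud Firestore client named `db`,
# which is not created in this file, and none of them are wired to the Gradio UI below.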
def upsert(text):
    date_time = str(datetime.datetime.today())
    doc_ref = db.collection('Text2SpeechSentimentSave').document(date_time)
    doc_ref.set({
        u'firefield': 'Recognize Speech',
        u'first': 'https://huggingface.co/spaces/awacke1/TTS-STT-Blocks/',
        u'last': text,
        u'born': date_time,
    })
    # Read the document back from the same collection it was just written to.
    saved = select('Text2SpeechSentimentSave', date_time)
    return saved


def select(collection, document):
    doc_ref = db.collection(collection).document(document)
    doc = doc_ref.get()
    docid = f"The id is: {doc.id}"
    contents = f"The contents are: {doc.to_dict()}"
    return contents


def selectall(text):
    docs = db.collection('Text2SpeechSentimentSave').stream()
    doclist = ''
    for doc in docs:
        r = f'{doc.id} => {doc.to_dict()}'
        doclist += r
    return doclist
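

# Synthesize `text` with the selected Coqui model and return the path of a temporary WAV file for the gr.Audio output.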
def tts(text: str, model_name: str):
    print(text, model_name)
    synthesizer = MODELS.get(model_name, None)
    if synthesizer is None:
        raise NameError("model not found")
    wavs = synthesizer.tts(text)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synthesizer.save_wav(wavs, fp)
    return fp.name
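

# Gradio Blocks UI: record speech, transcribe it, then read the transcript back aloud with the chosen TTS model.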
demo = gr.Blocks()
with demo:
    audio_file = gr.Audio(source="microphone", type="filepath")
    text = gr.Textbox(label="Speech to Text")
    TTSchoice = gr.Radio(label="Pick a Text to Speech Model", choices=MODEL_NAMES)
    audio = gr.Audio(label="Output", interactive=False)

    b1 = gr.Button("Recognize Speech")
    b5 = gr.Button("Read It Back Aloud")

    b1.click(speech_to_text, inputs=audio_file, outputs=text)
    b5.click(tts, inputs=[text, TTSchoice], outputs=audio)

demo.launch(share=True)