Update app.py
app.py CHANGED
@@ -1,9 +1,10 @@
 import gradio as gr
 import langcodes
-from transformers import pipeline
+from transformers import pipeline, VitsModel, AutoTokenizer, set_seed
 from huggingface_hub import InferenceClient
 from langdetect import detect, DetectorFactory
-
+import uuid
+import scipy.io.wavfile as wav
 
 
 playground = gr.Blocks()
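After this commit the top of app.py reads as below. One likely omission: gen_speech (added in the next hunk) calls torch.no_grad(), but the diff never imports torch, so the last line here is an assumed fix rather than part of the commit.

import gradio as gr
import langcodes
from transformers import pipeline, VitsModel, AutoTokenizer, set_seed
from huggingface_hub import InferenceClient
from langdetect import detect, DetectorFactory
import uuid
import scipy.io.wavfile as wav
import torch  # assumed addition: needed for torch.no_grad() in gen_speech()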
@@ -12,18 +13,19 @@ client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 image_pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 summary_pipe = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
 ner_pipe = pipeline("ner", model="dslim/bert-base-NER")
-
-
-
-
-#
-
-
+tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
+
+def gen_speech(text):
+    set_seed(555)  # Make it deterministic
+    input_text = tts_tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        outputs = tts_model(**input_text)
+    waveform_np = outputs.waveform[0].cpu().numpy()
+    output_file = f"{str(uuid.uuid4())}.wav"
+    wav.write(output_file, rate=tts_model.config.sampling_rate, data=waveform_np)
+    return output_file
 
-# # Use IPythonAudio to play the audio
-# audio = IPythonAudio(audio_data, rate=sampling_rate)
-# return audio_data, sampling_rate
-
 def detect_language(text):
     DetectorFactory.seed = 0  # Ensure consistent results
     return detect(text)
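The new text-to-speech path, as one self-contained sketch. Model IDs and logic come from the hunk above; the torch import is an assumption, since the diff omits it. set_seed(555) pins VITS's stochastic sampling so the same text always produces the same waveform, and each call writes a uniquely named .wav that Gradio can serve by filepath.

import uuid

import torch  # assumed: required for torch.no_grad(), not imported by the diff
import scipy.io.wavfile as wav
from transformers import VitsModel, AutoTokenizer, set_seed

tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

def gen_speech(text):
    set_seed(555)  # fixed seed: VITS sampling is stochastic, this makes runs repeatable
    inputs = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # inference only; skip gradient tracking
        outputs = tts_model(**inputs)
    waveform_np = outputs.waveform[0].cpu().numpy()  # 1-D float32 audio samples
    output_file = f"{uuid.uuid4()}.wav"  # unique filename per request
    wav.write(output_file, rate=tts_model.config.sampling_rate, data=waveform_np)
    return output_file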
@@ -64,7 +66,8 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
 def launch_image_pipe(input):
     out = image_pipe(input)
     text = out[0]['generated_text']
-
+    audio_output_filepath = gen_speech(text)
+    return text, audio_output_filepath
 
 def translate(input_text, source, target):
     try:
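launch_image_pipe now returns a (caption, wav_path) pair, so any click handler bound to it must list two output components, which the final hunk does. A hypothetical call, assuming pil_img is a PIL image:

caption, wav_path = launch_image_pipe(pil_img)  # pil_img: hypothetical PIL.Image input
# caption  -> BLIP caption string for the image
# wav_path -> path of the freshly written .wav from gen_speech(caption)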
@@ -139,10 +142,10 @@ with playground:
         with gr.Column():
             generated_textbox = gr.Textbox(lines=2, placeholder="", label="Generated Text")
             # generate_audio_button = gr.Button(value="Generate Audio", variant="primary")
-
+            audio_output = gr.Audio(type="filepath", label="Generated Speech")
             ITT_Clear_button = gr.ClearButton(components=[img, generated_textbox], value="Clear")
 
-    ITT_button.click(launch_image_pipe, inputs=[img], outputs=[generated_textbox])
+    ITT_button.click(launch_image_pipe, inputs=[img], outputs=[generated_textbox, audio_output])
     # generate_audio_button.click(generate_audio, inputs=[generated_textbox], outputs=[audio_output])
 
     ## ================================================================================================================================
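The diff only touches a few lines of the Blocks layout; here is a minimal sketch of how the image-to-text section plausibly fits together after this change. The surrounding layout, img, and ITT_button definitions are assumptions (they are not shown in the diff); component names and the click wiring come from the hunk above, and gen_speech/launch_image_pipe are as defined earlier.

playground = gr.Blocks()

with playground:
    with gr.Row():
        with gr.Column():
            img = gr.Image(type="pil", label="Upload Image")  # assumed definition
            ITT_button = gr.Button(value="Generate Text", variant="primary")  # assumed
        with gr.Column():
            generated_textbox = gr.Textbox(lines=2, placeholder="", label="Generated Text")
            audio_output = gr.Audio(type="filepath", label="Generated Speech")  # new component
            ITT_Clear_button = gr.ClearButton(components=[img, generated_textbox], value="Clear")

    # launch_image_pipe returns (text, filepath), matching the two outputs below
    ITT_button.click(launch_image_pipe, inputs=[img], outputs=[generated_textbox, audio_output])

playground.launch()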