rayl-aoit committed on
Commit
dac2ec6
·
verified ·
1 Parent(s): 475c063

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -16
app.py CHANGED
@@ -1,9 +1,10 @@
1
  import gradio as gr
2
  import langcodes
3
- from transformers import pipeline
4
  from huggingface_hub import InferenceClient
5
  from langdetect import detect, DetectorFactory
6
- # from IPython.display import Audio as IPythonAudio
 
7
 
8
 
9
  playground = gr.Blocks()
@@ -12,18 +13,19 @@ client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
12
  image_pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
13
  summary_pipe = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
14
  ner_pipe = pipeline("ner", model="dslim/bert-base-NER")
15
- # narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
16
-
17
- # def generate_audio(text):
18
- # # Generate speech from text
19
- # narrated_text = narrator(text)
20
- # audio_data = narrated_text["audio"][0]
21
- # sampling_rate = narrated_text["sampling_rate"]
 
 
 
 
 
22
 
23
- # # Use IPythonAudio to play the audio
24
- # audio = IPythonAudio(audio_data, rate=sampling_rate)
25
- # return audio_data, sampling_rate
26
-
27
def detect_language(text):
    """Return the language code langdetect infers for *text*.

    Seeds DetectorFactory on every call so repeated detections of the
    same input always agree.
    """
    DetectorFactory.seed = 0  # fixed seed -> deterministic langdetect output
    detected = detect(text)
    return detected
@@ -64,7 +66,8 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
64
  def launch_image_pipe(input):
65
  out = image_pipe(input)
66
  text = out[0]['generated_text']
67
- return text
 
68
 
69
  def translate(input_text, source, target):
70
  try:
@@ -139,10 +142,10 @@ with playground:
139
  with gr.Column():
140
  generated_textbox = gr.Textbox(lines=2, placeholder="", label="Generated Text")
141
  # generate_audio_button = gr.Button(value="Generate Audio", variant="primary")
142
- # audio_output = gr.Audio(label="Generated Audio")
143
  ITT_Clear_button = gr.ClearButton(components=[img, generated_textbox], value="Clear")
144
 
145
- ITT_button.click(launch_image_pipe, inputs=[img], outputs=[generated_textbox])
146
  # generate_audio_button.click(generate_audio, inputs=[generated_textbox], outputs=[audio_output])
147
 
148
  ## ================================================================================================================================
 
1
  import gradio as gr
2
  import langcodes
3
+ from transformers import pipeline, VitsModel, AutoTokenizer, set_seed
4
  from huggingface_hub import InferenceClient
5
  from langdetect import detect, DetectorFactory
6
+ import uuid
7
+ import scipy.io.wavfile as wav
8
 
9
 
10
  playground = gr.Blocks()
 
13
  image_pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
14
  summary_pipe = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
15
  ner_pipe = pipeline("ner", model="dslim/bert-base-NER")
16
+ tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
17
+ tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
18
+
19
def gen_speech(text):
    """Synthesize English speech for *text* with MMS-TTS and save it as WAV.

    Tokenizes *text*, runs the module-level VITS model (`tts_model` /
    `tts_tokenizer`), and writes the waveform to a uniquely named .wav
    file in the working directory.

    Returns:
        str: path of the generated .wav file.
    """
    # BUGFIX: torch is used below but was never imported at module level,
    # so the original raised NameError on first call.
    import torch

    set_seed(555)  # make VITS sampling deterministic across calls
    inputs = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # inference only — no gradient bookkeeping
        outputs = tts_model(**inputs)
    waveform_np = outputs.waveform[0].cpu().numpy()
    output_file = f"{uuid.uuid4()}.wav"  # unique name avoids clobbering earlier clips
    wav.write(output_file, rate=tts_model.config.sampling_rate, data=waveform_np)
    return output_file
28
 
 
 
 
 
29
def detect_language(text):
    """Detect the language of *text* via langdetect.

    The detector factory is re-seeded before each detection so results
    are reproducible for identical inputs.
    """
    DetectorFactory.seed = 0  # re-seed for consistent results
    return detect(text)
 
66
def launch_image_pipe(input):
    """Caption an image and narrate the caption.

    Runs the BLIP image-captioning pipeline on *input*, then feeds the
    caption to gen_speech to produce an audio file.

    Returns:
        tuple[str, str]: (caption text, path to the generated speech WAV).
    """
    caption = image_pipe(input)[0]['generated_text']
    audio_filepath = gen_speech(caption)
    return caption, audio_filepath
71
 
72
  def translate(input_text, source, target):
73
  try:
 
142
  with gr.Column():
143
  generated_textbox = gr.Textbox(lines=2, placeholder="", label="Generated Text")
144
  # generate_audio_button = gr.Button(value="Generate Audio", variant="primary")
145
+ audio_output = gr.Audio(type="filepath", label="Generated Speech")
146
  ITT_Clear_button = gr.ClearButton(components=[img, generated_textbox], value="Clear")
147
 
148
+ ITT_button.click(launch_image_pipe, inputs=[img], outputs=[generated_textbox, audio_output])
149
  # generate_audio_button.click(generate_audio, inputs=[generated_textbox], outputs=[audio_output])
150
 
151
  ## ================================================================================================================================