Ubuntu committed
Commit f3fbcc1 · 1 Parent(s): 2ac88d4

support adding youtube

Files changed (1): app.py (+85 -8)
app.py CHANGED

@@ -2,6 +2,11 @@ import gradio as gr
 import requests
 import uuid
 import os
+from typing import Optional
+import tempfile
+from pydub import AudioSegment
+import re
+
 ASR_API = "http://astarwiz.com:9998/asr"
 TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
 TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'
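
A deployment note on the new imports: pydub does not decode or encode audio itself; AudioSegment.from_file and export shell out to the ffmpeg binary. A preflight check along these lines (a sketch, not part of this commit) would fail fast if the Space is missing the binary:

    import shutil

    # pydub delegates decoding/encoding to the ffmpeg executable; checking
    # the PATH up front gives a clearer error than a failed decode later.
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found on PATH; pydub cannot convert audio")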
@@ -16,6 +21,70 @@ LANGUAGE_MAP = {
 # Add a password for developer mode
 DEVELOPER_PASSWORD = os.getenv("DEV_PWD")
 
+# Add this constant for the RapidAPI key
+RAPID_API_KEY = os.getenv("RAPID_API_KEY")
+
+def fetch_youtube_id(youtube_url: str) -> str:
+    if 'v=' in youtube_url:
+        return youtube_url.split("v=")[1]
+    elif 'shorts' in youtube_url:
+        return youtube_url.split("/")[-1]
+    else:
+        raise Exception("Unsupported URL format")
+
+def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[str]:
+    video_id = fetch_youtube_id(youtube_url)
+
+    if not video_id:
+        return None
+
+    if output_dir is None:
+        output_dir = tempfile.gettempdir()
+
+    output_filename = os.path.join(output_dir, f"{video_id}.mp3")
+
+    if os.path.exists(output_filename):
+        return output_filename  # Return if the file already exists
+
+    url = "https://youtube86.p.rapidapi.com/api/youtube/links"
+    headers = {
+        'Content-Type': 'application/json',
+        'x-rapidapi-host': 'youtube86.p.rapidapi.com',
+        'x-rapidapi-key': RAPID_API_KEY
+    }
+    data = {
+        "url": youtube_url
+    }
+
+    response = requests.post(url, headers=headers, json=data)
+    print('Fetched audio links')
+
+    if response.status_code == 200:
+        result = response.json()
+        for url in result[0]['urls']:
+            if url.get('isBundle'):
+                audio_url = url['url']
+                extension = url['extension']
+                audio_response = requests.get(audio_url)
+
+                if audio_response.status_code == 200:
+                    temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
+                    with open(temp_filename, 'wb') as audio_file:
+                        audio_file.write(audio_response.content)
+
+                    # Convert to MP3 and downsample to 16000 Hz
+                    audio = AudioSegment.from_file(temp_filename, format=extension)
+                    audio = audio.set_frame_rate(16000)
+                    audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
+
+                    os.remove(temp_filename)  # Remove the temporary file
+                    return output_filename  # Return the final MP3 filename
+
+        return None  # Return None if no successful download occurs
+    else:
+        print("Error:", response.status_code, response.text)
+        return None  # Return None on failure
+
 def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
     print(input_text)
     one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
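
Two caveats in the new helpers: fetch_youtube_id returns everything after "v=", so a URL like ...watch?v=ID&t=30s yields "ID&t=30s" rather than the bare ID, and the requests calls in download_youtube_audio carry no timeout (the loop variable also shadows the endpoint url). Since the commit already imports re but never uses it, a stricter extraction could look like this sketch (extract_youtube_id is a hypothetical replacement, not what the commit ships):

    import re

    def extract_youtube_id(youtube_url: str) -> str:
        # YouTube video IDs are 11 characters from [A-Za-z0-9_-]; anchor on
        # the 'v=' query parameter or on the path segment after 'shorts/'.
        match = re.search(r"(?:v=|shorts/)([A-Za-z0-9_-]{11})", youtube_url)
        if not match:
            raise ValueError(f"Unsupported YouTube URL: {youtube_url}")
        return match.group(1)

    # extract_youtube_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=30s")
    # -> "dQw4w9WgXcQ"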
@@ -37,9 +106,14 @@ def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
     else:
         return "The system got some error during vLLM generation. Please try it again."
 
-def transcribe_and_speak(audio, source_lang, target_lang):
+def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None):
+    if youtube_url:
+        audio = download_youtube_audio(youtube_url)
+        if not audio:
+            return "Failed to download YouTube audio.", None, None
+
     if not audio:
-        return "Please provide an audio input.", None, None
+        return "Please provide an audio input or a valid YouTube URL.", None, None
 
     # ASR
     file_id = str(uuid.uuid4())
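
The new youtube_url parameter takes precedence over any uploaded audio, and every early exit preserves the three-element (transcription, translation, audio_url) return shape. A hypothetical call exercising the URL path (it needs the ASR/TTS endpoints and RAPID_API_KEY to be reachable; not part of the commit):

    # The URL branch runs first, so uploaded audio is ignored whenever a
    # YouTube link is supplied.
    message, translation, audio_url = transcribe_and_speak(
        audio=None,
        source_lang="en",
        target_lang="zh",
        youtube_url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    )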
@@ -79,18 +153,19 @@ def transcribe_and_speak(audio, source_lang, target_lang):
 def check_password(password):
     return password == DEVELOPER_PASSWORD
 
-def user_interface(audio, source_lang, target_lang):
-    _, _, audio_url = transcribe_and_speak(audio, source_lang, target_lang)
-    return audio_url
+def user_interface(audio, source_lang, target_lang, youtube_url):
+    transcription, translated_text, audio_url = transcribe_and_speak(audio, source_lang, target_lang, youtube_url)
+    return transcription, translated_text, audio_url
 
 with gr.Blocks() as demo:
     gr.Markdown("# Speech Translation")
 
     with gr.Tab("User Mode"):
-        gr.Markdown("Speak into the microphone or upload an audio file. The app will translate and speak it back to you.")
+        gr.Markdown("Speak into the microphone, upload an audio file, or provide a YouTube URL. The app will translate and speak it back to you.")
 
         with gr.Row():
             user_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
+            user_youtube_url = gr.Textbox(label="YouTube URL (optional)")
             user_source_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Source Language", value="en")
             user_target_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Target Language", value="zh")
 
@@ -98,12 +173,14 @@ with gr.Blocks() as demo:
         user_button = gr.Button("Translate and Speak")
 
         with gr.Row():
+            user_transcription_output = gr.Textbox(label="Transcription")
+            user_translation_output = gr.Textbox(label="Translation")
             user_audio_output = gr.Audio(label="Translated Speech")
 
         user_button.click(
             fn=user_interface,
-            inputs=[user_audio_input, user_source_lang, user_target_lang],
-            outputs=[user_audio_output]
+            inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url],
+            outputs=[user_transcription_output, user_translation_output, user_audio_output]
         )
 
     with gr.Tab("Developer Mode"):
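
Configuration note: after this commit the app depends on two secrets, DEV_PWD for developer mode and the new RAPID_API_KEY for the youtube86.p.rapidapi.com endpoint. os.getenv returns None when a variable is unset, so a startup guard like this sketch (not part of the commit) would surface a missing key immediately rather than on the first download:

    import os

    # os.getenv silently returns None; without this check, every YouTube
    # download would fail later with an unauthorized RapidAPI response.
    if not os.getenv("RAPID_API_KEY"):
        raise RuntimeError("RAPID_API_KEY is not set; YouTube ingestion will fail")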