Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Community

Yilin0601 committed on Mar 26

Commit

5fb2e7c

verified ·

1 Parent(s): 25763d0

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -25

app.py CHANGED Viewed

@@ -6,11 +6,11 @@ from transformers import pipeline, VitsModel, AutoTokenizer
 import scipy  # if needed for processing
 # ------------------------------------------------------
-# 1. ASR Pipeline (English) using Whisper-small
 # ------------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
-    model="openai/whisper-small"
 )
 # ------------------------------------------------------
@@ -30,12 +30,13 @@ translation_tasks = {
 # ------------------------------------------------------
 # 3. TTS Model Configurations
-# For Spanish, we keep the MMS TTS.
-# For Chinese & Japanese, use myshell-ai/MeloTTS-Chinese.
 # ------------------------------------------------------
 tts_config = {
     "Spanish": {
-        "model_id": "facebook/mms-tts-spa",  # MMS Spanish
         "architecture": "vits"
     },
     "Chinese": {
@@ -84,7 +85,7 @@ def get_tts_model(lang):
     arch = config["architecture"]
     try:
-        # Assuming the model follows VITS-based inference
         model = VitsModel.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
     except Exception as e:
@@ -107,14 +108,15 @@ def run_tts_inference(lang, text):
     with torch.no_grad():
         output = model(**inputs)
-    # VitsModel output is typically provided via .waveform attribute
-    if hasattr(output, "waveform"):
-        waveform_tensor = output.waveform
-    else:
-        raise RuntimeError("TTS model output does not contain 'waveform'.")
     waveform = waveform_tensor.squeeze().cpu().numpy()
-    sample_rate = 16000  # Typically used sample rate for these models
     return (sample_rate, waveform)
 # ------------------------------------------------------
@@ -122,25 +124,25 @@ def run_tts_inference(lang, text):
 # ------------------------------------------------------
 def predict(audio, text, target_language):
     """
-    1. Obtain English text (via ASR using Whisper-small or text input).
-    2. Translate English text to the target language.
-    3. Synthesize speech with the target language TTS model.
     """
-    # Step 1: Get English text
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
-        # Ensure float32 data type
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
-        # Convert stereo to mono if necessary
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
-        # Resample to 16kHz if necessary
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
@@ -150,7 +152,7 @@ def predict(audio, text, target_language):
     else:
         return "No input provided.", "", None
-    # Step 2: Translation
     translator = get_translator(target_language)
     try:
         translation_result = translator(english_text)
@@ -162,6 +164,7 @@ def predict(audio, text, target_language):
     try:
         sample_rate, waveform = run_tts_inference(target_language, translated_text)
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
     return english_text, translated_text, (sample_rate, waveform)
@@ -181,12 +184,11 @@ iface = gr.Interface(
         gr.Textbox(label="Translation (Target Language)"),
         gr.Audio(label="Synthesized Speech")
     ],
-    title="Multimodal Language Learning Aid (ASR / TTS)",
     description=(
-        "This app:\n"
-        "1. Transcribes English speech or English text.\n"
-        "2. Translates to Spanish, Chinese, or Japanese (using Helsinki-NLP models).\n"
-        "3. Provides synthetic speech with TTS models:\n"
     ),
     allow_flagging="never"
 )

 import scipy  # if needed for processing
 # ------------------------------------------------------
+# 1. ASR Pipeline (English) using Wav2Vec2
 # ------------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
+    model="facebook/wav2vec2-base-960h"
 )
 # ------------------------------------------------------
 # ------------------------------------------------------
 # 3. TTS Model Configurations
+#    - Spanish: facebook/mms-tts-spa
+#    - Chinese: myshell-ai/MeloTTS-Chinese
+#    - Japanese: myshell-ai/MeloTTS-Japanese
 # ------------------------------------------------------
 tts_config = {
     "Spanish": {
+        "model_id": "facebook/mms-tts-spa",
         "architecture": "vits"
     },
     "Chinese": {
     arch = config["architecture"]
     try:
+        # Attempt VITS-based loading
         model = VitsModel.from_pretrained(model_id)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
     except Exception as e:
     with torch.no_grad():
         output = model(**inputs)
+    # VitsModel output is typically `.waveform`
+    if not hasattr(output, "waveform"):
+        raise RuntimeError("TTS model output does not contain 'waveform' attribute.")
+    waveform_tensor = output.waveform
     waveform = waveform_tensor.squeeze().cpu().numpy()
+    # Typically 16 kHz for these VITS models
+    sample_rate = 16000
     return (sample_rate, waveform)
 # ------------------------------------------------------
 # ------------------------------------------------------
 def predict(audio, text, target_language):
     """
+    1. Obtain English text (ASR with Wav2Vec2 or text input).
+    2. Translate English -> target_language.
+    3. TTS for that language (using configured models).
     """
+    # Step 1: English text
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
+        # Convert to float32 if needed
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
+        # Stereo -> mono if needed
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
+        # Resample to 16k if needed
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
     else:
         return "No input provided.", "", None
+    # Step 2: Translate
     translator = get_translator(target_language)
     try:
         translation_result = translator(english_text)
     try:
         sample_rate, waveform = run_tts_inference(target_language, translated_text)
     except Exception as e:
+        # Return error info in place of audio
         return english_text, translated_text, f"TTS error: {e}"
     return english_text, translated_text, (sample_rate, waveform)
         gr.Textbox(label="Translation (Target Language)"),
         gr.Audio(label="Synthesized Speech")
     ],
+    title="Multimodal Language Learning Aid",
     description=(
+        "1. Transcribes English speech using Wav2Vec2 (or takes English text).\n"
+        "2. Translates to Spanish, Chinese, or Japanese (Helsinki-NLP models).\n"
+        "3. Provides synthetic speech with TTS models.\n"
     ),
     allow_flagging="never"
 )