Spaces:

Sunbird
/

sb-mms-inference

Sleeping

App Files Files Community

akera committed on Feb 20

Commit

a39ac0d

•

1 Parent(s): 6d352f5

added formatting

Browse files

Files changed (1) hide show

app.py +32 -28

app.py CHANGED Viewed

@@ -1,56 +1,60 @@
 import gradio as gr
-from transformers import Wav2Vec2ForCTC, AutoProcessor, Wav2Vec2Processor
 import torch
 import librosa
 import json
 import os
-import huggingface_hub
-from transformers import pipeline
-auth_token = os.environ.get("HF_TOKEN")
-target_lang_options = {"English": "eng", "Luganda": "lug", "Acholi": "ach", "Runyankole": "nyn", "Lugbara": "lgg"}
-languages = list(target_lang_options.keys())
-# Transcribe audio using custom model
-def transcribe_audio(input_file, language,chunk_length_s=10,
-                      stride_length_s=(4, 2), return_timestamps="word"):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     target_lang_code = target_lang_options[language]
     # Determine the model_id based on the language
     if target_lang_code == "eng":
         model_id = "facebook/mms-1b-all"
     else:
-        model_id = "Sunbird/sunbird-mms"
     pipe = pipeline(model=model_id, device=device, token=auth_token)
-    pipe.tokenizer.set_target_lang(target_lang_code)
-    pipe.model.load_adapter(target_lang_code)
-    # Read audio file
-    # audio_data = input_file
     output = pipe(input_file, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)
-    return output
 description = '''ASR with salt-mms'''
 iface = gr.Interface(fn=transcribe_audio,
                      inputs=[
                          gr.Audio(source="upload", type="filepath", label="upload file to transcribe"),
-                         gr.Dropdown(choices=languages, label="Language", value="English")
-                         ],
                      outputs=gr.Textbox(label="Transcription"),
                      description=description
                      )
-iface.launch()

 import gradio as gr
+from transformers import pipeline
 import torch
 import librosa
 import json
 import os
+# Assuming other necessary imports and setup are already done
+# Helper function to format and group word timestamps
def format_and_group_timestamps(chunks, interval=5.0):
    """Render word-level ASR chunks as a transcript plus time-bucketed words.

    Args:
        chunks: Iterable of dicts, each with a ``'text'`` entry (the word) and
            a ``'timestamp'`` entry holding a ``(start, end)`` pair in seconds.
        interval: Width of each time bucket in seconds; must be positive.

    Returns:
        A string containing the full transcript, a separator line, and one
        line per non-empty bucket in the form
        ``(bucket_start, bucket_end) -- words``. Buckets appear in the order
        in which their first word was encountered.

    Raises:
        ValueError: If ``interval`` is not greater than zero.
    """
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval!r}")

    words = []
    grouped = {}
    # A chunk without a start time is filed under the previous word's bucket;
    # before any timed word has been seen that is the first bucket.
    bucket_start = 0 * interval
    for chunk in chunks:
        start, _end = chunk['timestamp']
        word = chunk['text']
        words.append(word)
        if start is not None:
            bucket_start = int(start // interval) * interval
        grouped.setdefault(bucket_start, []).append(word)

    transcript = " ".join(words).strip()
    # ``:g`` prints 5.0 as "5" so the default header text stays unchanged,
    # while non-default intervals are reported truthfully.
    header = (
        f"Transcript: {transcript}\n\n-------\n\n"
        f"word-stamped transcripts (every {interval:g} seconds):\n\n"
    )
    lines = [
        f"({bucket}, {bucket + interval}) -- {' '.join(bucket_words)}\n"
        for bucket, bucket_words in grouped.items()
    ]
    return header + "".join(lines)
+# Modified transcribe_audio function
def transcribe_audio(input_file, language, chunk_length_s=10, stride_length_s=(4, 2), return_timestamps="word"):
    """Transcribe an audio file and return a formatted, time-grouped transcript.

    Args:
        input_file: Audio input handed straight to the ASR pipeline (the
            Gradio interface supplies a file path).
        language: Display name of the language; must be a key of the
            ``target_lang_options`` mapping defined in the body.
        chunk_length_s: Forwarded unchanged to the pipeline call.
        stride_length_s: Forwarded unchanged to the pipeline call.
        return_timestamps: Forwarded unchanged to the pipeline call.

    Returns:
        The string built by ``format_and_group_timestamps`` when the pipeline
        output contains ``'chunks'``; otherwise the output's plain ``'text'``.

    Raises:
        KeyError: If ``language`` is not a supported display name.
    """
    target_lang_options = {"English": "eng", "Luganda": "lug", "Acholi": "ach", "Runyankole": "nyn", "Lugbara": "lgg"}
    # Resolve the language first so an unsupported name fails before any
    # model is downloaded or loaded.
    target_lang_code = target_lang_options[language]

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Determine the model_id based on the language
    if target_lang_code == "eng":
        model_id = "facebook/mms-1b-all"
    else:
        model_id = "Sunbird/sunbird-mms"

    auth_token = os.environ.get("HF_TOKEN")
    pipe = pipeline(model=model_id, device=device, token=auth_token)

    # Point the tokenizer and the model's adapter at the requested language;
    # without this the pipeline keeps whatever language the checkpoint
    # defaults to, regardless of the dropdown selection.
    pipe.tokenizer.set_target_lang(target_lang_code)
    pipe.model.load_adapter(target_lang_code)

    output = pipe(input_file, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)

    # 'chunks' is read only when present, so a caller that disables
    # timestamps still gets the transcript text instead of a KeyError.
    chunks = output.get("chunks")
    if chunks is None:
        return output["text"]
    return format_and_group_timestamps(chunks)
+# Interface setup remains the same
 description = '''ASR with salt-mms'''
 iface = gr.Interface(fn=transcribe_audio,
                      inputs=[
                          gr.Audio(source="upload", type="filepath", label="upload file to transcribe"),
+                         gr.Dropdown(choices=list(target_lang_options.keys()), label="Language", value="English")
+                     ],
                      outputs=gr.Textbox(label="Transcription"),
                      description=description
                      )
+# Launch the interface
+iface.launch()