Spaces:

Sunbird
/

sb-mms-inference

Sleeping

akera committed on Feb 20

Commit

6ce3643

•

1 Parent(s): a39ac0d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -7,6 +7,11 @@ import os
 # Assuming other necessary imports and setup are already done
 # Helper function to format and group word timestamps
 def format_and_group_timestamps(chunks, interval=5.0):
     grouped = {}
@@ -35,11 +40,11 @@ def transcribe_audio(input_file, language, chunk_length_s=10, stride_length_s=(4
     if target_lang_code == "eng":
         model_id = "facebook/mms-1b-all"
     else:
-        model_id = "custom_model_id_for_other_languages"  # Placeholder for actual model IDs
-    auth_token = os.environ.get("HF_TOKEN")
     pipe = pipeline(model=model_id, device=device, token=auth_token)
-    # Assuming necessary setup for tokenizer and loading adapter
     output = pipe(input_file, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)
     formatted_output = format_and_group_timestamps(output['chunks'])

 # Assuming other necessary imports and setup are already done
+auth_token = os.environ.get("HF_TOKEN")
+target_lang_options = {"English": "eng", "Luganda": "lug", "Acholi": "ach", "Runyankole": "nyn", "Lugbara": "lgg"}
+languages = list(target_lang_options.keys())
 # Helper function to format and group word timestamps
 def format_and_group_timestamps(chunks, interval=5.0):
     grouped = {}
     if target_lang_code == "eng":
         model_id = "facebook/mms-1b-all"
     else:
+        model_id = "Sunbird/sunbird-mms"
     pipe = pipeline(model=model_id, device=device, token=auth_token)
+    pipe.tokenizer.set_target_lang(target_lang_code)
+    pipe.model.load_adapter(target_lang_code)
     output = pipe(input_file, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)
     formatted_output = format_and_group_timestamps(output['chunks'])