akera committed on
Commit
6ce3643
1 Parent(s): a39ac0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -3
app.py CHANGED
@@ -7,6 +7,11 @@ import os
7
 
8
  # Assuming other necessary imports and setup are already done
9
 
 
 
 
 
 
10
  # Helper function to format and group word timestamps
11
  def format_and_group_timestamps(chunks, interval=5.0):
12
  grouped = {}
@@ -35,11 +40,11 @@ def transcribe_audio(input_file, language, chunk_length_s=10, stride_length_s=(4
35
  if target_lang_code == "eng":
36
  model_id = "facebook/mms-1b-all"
37
  else:
38
- model_id = "custom_model_id_for_other_languages" # Placeholder for actual model IDs
39
 
40
- auth_token = os.environ.get("HF_TOKEN")
41
  pipe = pipeline(model=model_id, device=device, token=auth_token)
42
- # Assuming necessary setup for tokenizer and loading adapter
 
43
 
44
  output = pipe(input_file, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)
45
  formatted_output = format_and_group_timestamps(output['chunks'])
 
7
 
8
  # Assuming other necessary imports and setup are already done
9
 
10
+ auth_token = os.environ.get("HF_TOKEN")
11
+ target_lang_options = {"English": "eng", "Luganda": "lug", "Acholi": "ach", "Runyankole": "nyn", "Lugbara": "lgg"}
12
+ languages = list(target_lang_options.keys())
13
+
14
+
15
  # Helper function to format and group word timestamps
16
  def format_and_group_timestamps(chunks, interval=5.0):
17
  grouped = {}
 
40
  if target_lang_code == "eng":
41
  model_id = "facebook/mms-1b-all"
42
  else:
43
+ model_id = "Sunbird/sunbird-mms"
44
 
 
45
  pipe = pipeline(model=model_id, device=device, token=auth_token)
46
+ pipe.tokenizer.set_target_lang(target_lang_code)
47
+ pipe.model.load_adapter(target_lang_code)
48
 
49
  output = pipe(input_file, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)
50
  formatted_output = format_and_group_timestamps(output['chunks'])