palli23 committed on
Commit
8e021af
·
verified ·
1 Parent(s): 52d795c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -2
app.py CHANGED
@@ -27,6 +27,7 @@ def transcribe_files(audio_files):
27
  outdir = os.path.join(workdir, "transcripts")
28
  os.makedirs(outdir, exist_ok=True)
29
 
 
30
  pipe = pipeline(
31
  "automatic-speech-recognition",
32
  model="palli23/whisper-small-sam_spjall",
@@ -34,6 +35,9 @@ def transcribe_files(audio_files):
34
  device=0,
35
  )
36
 
 
 
 
37
  for file in audio_files:
38
  audio_path = file.name
39
  base = os.path.splitext(os.path.basename(audio_path))[0]
@@ -45,8 +49,7 @@ def transcribe_files(audio_files):
45
  batch_size=8,
46
  return_timestamps=False,
47
  generate_kwargs={
48
- "language": "is",
49
- "task": "transcribe",
50
  "num_beams": 5,
51
  "repetition_penalty": 1.2,
52
  "no_repeat_ngram_size": 3,
 
27
  outdir = os.path.join(workdir, "transcripts")
28
  os.makedirs(outdir, exist_ok=True)
29
 
30
+ # Create pipeline
31
  pipe = pipeline(
32
  "automatic-speech-recognition",
33
  model="palli23/whisper-small-sam_spjall",
 
35
  device=0,
36
  )
37
 
38
+ # Force Icelandic language using tokenizer
39
+ forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language="is", task="transcribe")
40
+
41
  for file in audio_files:
42
  audio_path = file.name
43
  base = os.path.splitext(os.path.basename(audio_path))[0]
 
49
  batch_size=8,
50
  return_timestamps=False,
51
  generate_kwargs={
52
+ "forced_decoder_ids": forced_decoder_ids,
 
53
  "num_beams": 5,
54
  "repetition_penalty": 1.2,
55
  "no_repeat_ngram_size": 3,