Merlintxu committed on
Commit
c55c408
1 Parent(s): 75c2204

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -15
app.py CHANGED
@@ -7,10 +7,11 @@ from langdetect import detect_langs
7
  import os
8
  import warnings
9
  from transformers import logging
 
10
 
11
  # Suppress warnings
12
- # warnings.filterwarnings("ignore")
13
- # logging.set_verbosity_error()
14
 
15
  # Updated models by language
16
  MODELS = {
@@ -29,7 +30,6 @@ MODELS = {
29
  "openai/whisper-medium",
30
  "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"
31
  ]
32
- # Add more languages and models as needed
33
  }
34
 
35
  def convert_audio_to_wav(audio_path):
@@ -60,14 +60,14 @@ def detect_language(audio_path):
60
 
61
  def transcribe_audio_stream(audio, model_name):
62
  wav_audio = convert_audio_to_wav(audio)
 
 
63
 
64
  if "whisper" in model_name:
65
  processor = WhisperProcessor.from_pretrained(model_name)
66
  model = WhisperForConditionalGeneration.from_pretrained(model_name)
67
 
68
  chunk_duration = 30 # seconds
69
- speech, rate = librosa.load(wav_audio, sr=16000)
70
- duration = len(speech) / rate
71
 
72
  for i in range(0, int(duration), chunk_duration):
73
  end = min(i + chunk_duration, duration)
@@ -77,19 +77,20 @@ def transcribe_audio_stream(audio, model_name):
77
  predicted_ids = model.generate(input_features)
78
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
79
 
80
- yield transcription
 
81
  else:
82
  transcriber = pipeline("automatic-speech-recognition", model=model_name)
83
 
84
  chunk_duration = 10 # seconds
85
- speech, rate = librosa.load(wav_audio, sr=16000)
86
- duration = len(speech) / rate
87
 
88
  for i in range(0, int(duration), chunk_duration):
89
  end = min(i + chunk_duration, duration)
90
  chunk = speech[int(i * rate):int(end * rate)]
91
  result = transcriber(chunk)
92
- yield result["text"]
 
 
93
 
94
  def detect_and_select_model(audio):
95
  wav_audio = convert_audio_to_wav(audio)
@@ -102,18 +103,22 @@ def combined_interface(audio):
102
  language, model_options = detect_and_select_model(audio)
103
  selected_model = model_options[0]
104
 
105
- yield language, model_options, selected_model, ""
106
 
107
  full_transcription = ""
108
- for partial_transcription in transcribe_audio_stream(audio, selected_model):
109
  full_transcription += partial_transcription + " "
110
- yield language, model_options, selected_model, full_transcription.strip()
 
 
111
 
112
  # Clean up temporary files
113
  os.remove("converted_audio.wav")
114
 
 
 
115
  except Exception as e:
116
- yield str(e), [], "", "An error occurred during processing."
117
 
118
  iface = gr.Interface(
119
  fn=combined_interface,
@@ -122,9 +127,11 @@ iface = gr.Interface(
122
  gr.Textbox(label="Detected Language"),
123
  gr.Dropdown(label="Available Models", choices=[]),
124
  gr.Textbox(label="Selected Model"),
125
- gr.Textbox(label="Transcription", lines=10)
 
 
126
  ],
127
- title="Multilingual Audio Transcriber with Real-time Display",
128
  description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
129
  live=True
130
  )
 
7
  import os
8
  import warnings
9
  from transformers import logging
10
+ import math
11
 
12
  # Suppress warnings
13
+ warnings.filterwarnings("ignore")
14
+ logging.set_verbosity_error()
15
 
16
  # Updated models by language
17
  MODELS = {
 
30
  "openai/whisper-medium",
31
  "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"
32
  ]
 
33
  }
34
 
35
  def convert_audio_to_wav(audio_path):
 
60
 
61
  def transcribe_audio_stream(audio, model_name):
62
  wav_audio = convert_audio_to_wav(audio)
63
+ speech, rate = librosa.load(wav_audio, sr=16000)
64
+ duration = len(speech) / rate
65
 
66
  if "whisper" in model_name:
67
  processor = WhisperProcessor.from_pretrained(model_name)
68
  model = WhisperForConditionalGeneration.from_pretrained(model_name)
69
 
70
  chunk_duration = 30 # seconds
 
 
71
 
72
  for i in range(0, int(duration), chunk_duration):
73
  end = min(i + chunk_duration, duration)
 
77
  predicted_ids = model.generate(input_features)
78
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
79
 
80
+ progress = min(100, (end / duration) * 100)
81
+ yield transcription, progress
82
  else:
83
  transcriber = pipeline("automatic-speech-recognition", model=model_name)
84
 
85
  chunk_duration = 10 # seconds
 
 
86
 
87
  for i in range(0, int(duration), chunk_duration):
88
  end = min(i + chunk_duration, duration)
89
  chunk = speech[int(i * rate):int(end * rate)]
90
  result = transcriber(chunk)
91
+
92
+ progress = min(100, (end / duration) * 100)
93
+ yield result["text"], progress
94
 
95
  def detect_and_select_model(audio):
96
  wav_audio = convert_audio_to_wav(audio)
 
103
  language, model_options = detect_and_select_model(audio)
104
  selected_model = model_options[0]
105
 
106
+ yield language, model_options, selected_model, "", 0, "Initializing..."
107
 
108
  full_transcription = ""
109
+ for partial_transcription, progress in transcribe_audio_stream(audio, selected_model):
110
  full_transcription += partial_transcription + " "
111
+ progress_int = math.floor(progress)
112
+ status = f"Transcribing... {progress_int}% complete"
113
+ yield language, model_options, selected_model, full_transcription.strip(), progress_int, status
114
 
115
  # Clean up temporary files
116
  os.remove("converted_audio.wav")
117
 
118
+ yield language, model_options, selected_model, full_transcription.strip(), 100, "Transcription complete!"
119
+
120
  except Exception as e:
121
+ yield str(e), [], "", "An error occurred during processing.", 0, "Error"
122
 
123
  iface = gr.Interface(
124
  fn=combined_interface,
 
127
  gr.Textbox(label="Detected Language"),
128
  gr.Dropdown(label="Available Models", choices=[]),
129
  gr.Textbox(label="Selected Model"),
130
+ gr.Textbox(label="Transcription", lines=10),
131
+ gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
132
+ gr.Textbox(label="Status")
133
  ],
134
+ title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
135
  description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
136
  live=True
137
  )