Spaces:

Merlintxu
/

Wav2Txt

Sleeping

App Files Community

Merlintxu committed on Jul 23

Commit

b7fce90

•

1 Parent(s): 5653d92

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -2

app.py CHANGED Viewed

@@ -10,11 +10,15 @@ from transformers import logging
 import math
 import json
 from pyannote.audio import Pipeline
 # Suppress warnings: ignore every Python warning process-wide and restrict
 # the imported `logging` helper to error-level output.
 warnings.filterwarnings("ignore")
 logging.set_verbosity_error()
 # Read the Hugging Face token from the environment variable
 # (os.getenv returns None when HF_TOKEN is not set).
 HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")
@@ -39,15 +43,19 @@ MODELS = {
 def convert_audio_to_wav(audio_path):
     """Convert an audio file to a mono, 16 kHz WAV file using ffmpeg.

     Args:
         audio_path: Path of the input audio file handed to ffmpeg.

     Returns:
         The path of the converted file ("converted_audio.wav", relative to
         the current working directory).

     Raises:
         RuntimeError: If ffmpeg cannot be started or exits with a non-zero
             status. The original exception is chained as the cause.
     """
     wav_path = "converted_audio.wav"
     # -y: overwrite a stale output left by an earlier run instead of refusing
     #     to write (which would silently leave the previous audio in place).
     # -ac 1 / -ar 16000: one channel, 16 kHz sample rate.
     command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
     try:
         # check=True turns a failed conversion into an exception; without it
         # the function would return a path to a missing or outdated file.
         subprocess.run(
             command,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             check=True,
         )
     except subprocess.CalledProcessError as e:
         # Surface ffmpeg's own diagnostics when they are available.
         detail = e.stderr.decode(errors="replace").strip() if e.stderr else e
         raise RuntimeError(f"Error converting audio to WAV: {detail}") from e
     except Exception as e:
         raise RuntimeError(f"Error converting audio to WAV: {e}") from e
     return wav_path
 def detect_language(audio_path):
     try:
         speech, _ = librosa.load(audio_path, sr=16000, duration=30)
         processor = WhisperProcessor.from_pretrained("openai/whisper-base")
@@ -63,18 +71,25 @@ def detect_language(audio_path):
         pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
         if abs(es_confidence - pt_confidence) < 0.2:
             return 'es'
-        return max(langs, key=lambda x: x.prob).lang
     except Exception as e:
         raise RuntimeError(f"Error detecting language: {e}")
 def diarize_audio(wav_audio):
     """Run the pyannote speaker-diarization pipeline on ``wav_audio``.

     The pipeline is loaded from "pyannote/speaker-diarization" using the
     module-level Hugging Face token, then applied to the given audio.

     Returns:
         Whatever the pipeline call returns for ``wav_audio``.

     Raises:
         RuntimeError: If loading or running the pipeline raises.
     """
     try:
         diarizer = Pipeline.from_pretrained(
             "pyannote/speaker-diarization",
             use_auth_token=HUGGINGFACE_TOKEN,
         )
         return diarizer(wav_audio)
     except Exception as err:
         raise RuntimeError(f"Error in diarization: {err}")
 def transcribe_audio_stream(audio, model_name):
@@ -118,10 +133,12 @@ def transcribe_audio_stream(audio, model_name):
                 transcriptions.append((timestamp, result["text"], progress))
                 yield transcriptions, progress
     except Exception as e:
         raise RuntimeError(f"Error in transcription: {e}")
 def merge_diarization_with_transcription(transcriptions, diarization, rate):
     try:
         speaker_transcriptions = []
         for segment in diarization.itertracks(yield_label=True):
             start, end, speaker = segment
@@ -132,37 +149,47 @@ def merge_diarization_with_transcription(transcriptions, diarization, rate):
                 if start_time <= ts <= end_time:
                     text_segment += text + " "
             speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
         return speaker_transcriptions
     except Exception as e:
         raise RuntimeError(f"Error merging diarization with transcription: {e}")
 def detect_and_select_model(audio):
     """Detect the language of ``audio`` and look up its model options.

     The audio is first converted to WAV, the language is detected on the
     converted file, and the language is looked up in ``MODELS``; languages
     without an entry fall back to ``MODELS["en"]``.

     Returns:
         A ``(language, model_options)`` tuple.

     Raises:
         RuntimeError: If conversion, detection, or the lookup raises.
     """
     try:
         language = detect_language(convert_audio_to_wav(audio))
         return language, MODELS.get(language, MODELS["en"])
     except Exception as err:
         raise RuntimeError(f"Error detecting and selecting model: {err}")
 def save_transcription(transcriptions, file_format):
     """Write speaker-attributed transcriptions to a file under /tmp.

     Args:
         transcriptions: Iterable of ``(start, end, speaker, text)`` entries;
             ``start`` and ``end`` are formatted with two decimals in the
             text output.
         file_format: ``"txt"`` for one line per entry, or ``"json"`` for a
             JSON dump of ``transcriptions``.

     Returns:
         The path of the written file, or ``None`` when ``file_format`` is
         neither ``"txt"`` nor ``"json"`` (nothing is written in that case).

     Raises:
         RuntimeError: If formatting or writing fails. The original
             exception is chained as the cause.
     """
     try:
         if file_format == "txt":
             file_path = "/tmp/transcription.txt"
             # Explicit UTF-8 so accented transcript text does not depend on
             # the locale's default encoding.
             with open(file_path, "w", encoding="utf-8") as f:
                 f.writelines(
                     f"[{start:.2f}-{end:.2f}] {speaker}: {text}\n"
                     for start, end, speaker, text in transcriptions
                 )
             return file_path
         if file_format == "json":
             file_path = "/tmp/transcription.json"
             with open(file_path, "w", encoding="utf-8") as f:
                 json.dump(transcriptions, f)
             return file_path
         # Unknown formats keep the original contract: no file, no path.
         return None
     except Exception as e:
         raise RuntimeError(f"Error saving transcription: {e}") from e
 def combined_interface(audio):
     try:
         language, model_options = detect_and_select_model(audio)
         selected_model = model_options[0]
@@ -189,8 +216,8 @@ def combined_interface(audio):
         os.remove(wav_audio)
         yield language, model_options, selected_model, transcriptions_text, 100, "Transcription complete!", txt_file_path, json_file_path
     except Exception as e:
         yield str(e), [], "", "An error occurred during processing.", 0, "Error", None, None
 iface = gr.Interface(

 import math
 import json
 from pyannote.audio import Pipeline
+import numpy as np  # Make sure numpy is imported
 # Suppress warnings: ignore every Python warning process-wide and restrict
 # the imported `logging` helper to error-level output.
 warnings.filterwarnings("ignore")
 logging.set_verbosity_error()
+# Initialize numpy correctly
 # NOTE(review): `_import_array` is not a documented NumPy Python API
 # (`import_array()` is a C-API macro); this call may raise AttributeError at
 # import time -- confirm against the installed NumPy and remove if so.
+np._import_array()
 # Read the Hugging Face token from the environment variable
 # (os.getenv returns None when HF_TOKEN is not set).
 HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")
 def convert_audio_to_wav(audio_path):
     """Convert an audio file to a mono, 16 kHz WAV file using ffmpeg.

     Writes to the fixed relative path "converted_audio.wav" and returns that
     path. Progress and errors are printed to stdout.

     Raises:
         RuntimeError: If starting ffmpeg raises an exception.
     """
     try:
+        print("Converting audio to WAV format...")
         wav_path = "converted_audio.wav"
         # -ac 1: one audio channel; -ar 16000: 16 kHz sample rate.
         command = ["ffmpeg", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
         # NOTE(review): check=True is not passed, so a non-zero ffmpeg exit
         # status does not raise and the path is returned regardless --
         # confirm whether that is intended.
         subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        print(f"Audio converted to {wav_path}")
         return wav_path
     except Exception as e:
+        print(f"Error converting audio to WAV: {e}")
         raise RuntimeError(f"Error converting audio to WAV: {e}")
 def detect_language(audio_path):
     try:
+        print("Detecting language...")
         speech, _ = librosa.load(audio_path, sr=16000, duration=30)
         processor = WhisperProcessor.from_pretrained("openai/whisper-base")
         pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
         if abs(es_confidence - pt_confidence) < 0.2:
+            print("Detected language: Spanish")
             return 'es'
+        detected_language = max(langs, key=lambda x: x.prob).lang
+        print(f"Detected language: {detected_language}")
+        return detected_language
     except Exception as e:
+        print(f"Error detecting language: {e}")
         raise RuntimeError(f"Error detecting language: {e}")
 def diarize_audio(wav_audio):
     """Run the pyannote speaker-diarization pipeline on ``wav_audio``.

     Returns the value produced by calling the pipeline on ``wav_audio``.
     Progress and errors are printed to stdout.

     Raises:
         RuntimeError: If loading or running the pipeline raises.
     """
     try:
+        print("Performing diarization...")
         # The pipeline is loaded on every call, authenticated with the
         # module-level Hugging Face token.
         pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HUGGINGFACE_TOKEN)
         diarization = pipeline(wav_audio)
+        print("Diarization complete.")
         return diarization
     except Exception as e:
+        print(f"Error in diarization: {e}")
         raise RuntimeError(f"Error in diarization: {e}")
 def transcribe_audio_stream(audio, model_name):
                 transcriptions.append((timestamp, result["text"], progress))
                 yield transcriptions, progress
     except Exception as e:
+        print(f"Error in transcription: {e}")
         raise RuntimeError(f"Error in transcription: {e}")
 def merge_diarization_with_transcription(transcriptions, diarization, rate):
     try:
+        print("Merging diarization with transcription...")
         speaker_transcriptions = []
         for segment in diarization.itertracks(yield_label=True):
             start, end, speaker = segment
                 if start_time <= ts <= end_time:
                     text_segment += text + " "
             speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
+        print("Merge complete.")
         return speaker_transcriptions
     except Exception as e:
+        print(f"Error merging diarization with transcription: {e}")
         raise RuntimeError(f"Error merging diarization with transcription: {e}")
 def detect_and_select_model(audio):
     """Detect the language of ``audio`` and look up its model options.

     Returns a ``(language, model_options)`` tuple. Progress and errors are
     printed to stdout.

     Raises:
         RuntimeError: If conversion, detection, or the lookup raises.
     """
     try:
+        print("Detecting and selecting model...")
         # Language detection runs on the converted WAV, not the raw input.
         wav_audio = convert_audio_to_wav(audio)
         language = detect_language(wav_audio)
         # Languages without an entry in MODELS fall back to the "en" entry.
         model_options = MODELS.get(language, MODELS["en"])
         # Indexing [0] assumes every MODELS entry is a non-empty sequence --
         # TODO confirm against the MODELS definition.
+        print(f"Selected model: {model_options[0]}")
         return language, model_options
     except Exception as e:
+        print(f"Error detecting and selecting model: {e}")
         raise RuntimeError(f"Error detecting and selecting model: {e}")
 def save_transcription(transcriptions, file_format):
     """Write transcriptions to a fixed file under /tmp and return its path.

     ``file_format`` "txt" writes one line per ``(start, end, speaker, text)``
     entry; "json" dumps ``transcriptions`` as JSON. Any other value writes
     nothing and the function implicitly returns None. Progress and errors
     are printed to stdout.

     Raises:
         RuntimeError: If formatting or writing raises.
     """
     try:
+        print(f"Saving transcription to {file_format} format...")
         if file_format == "txt":
             file_path = "/tmp/transcription.txt"
             # NOTE(review): no explicit encoding is given, so the locale
             # default is used for the transcript text -- confirm it is UTF-8
             # in the deployment environment.
             with open(file_path, "w") as f:
                 # Each entry must unpack into exactly four values.
                 for start, end, speaker, text in transcriptions:
                     f.write(f"[{start:.2f}-{end:.2f}] {speaker}: {text}\n")
+            print(f"Transcription saved to {file_path}")
             return file_path
         elif file_format == "json":
             file_path = "/tmp/transcription.json"
             with open(file_path, "w") as f:
                 json.dump(transcriptions, f)
+            print(f"Transcription saved to {file_path}")
             return file_path
     except Exception as e:
+        print(f"Error saving transcription: {e}")
         raise RuntimeError(f"Error saving transcription: {e}")
 def combined_interface(audio):
     try:
+        print("Starting combined interface...")
         language, model_options = detect_and_select_model(audio)
         selected_model = model_options[0]
         os.remove(wav_audio)
         yield language, model_options, selected_model, transcriptions_text, 100, "Transcription complete!", txt_file_path, json_file_path
     except Exception as e:
+        print(f"Error in combined interface: {e}")
         yield str(e), [], "", "An error occurred during processing.", 0, "Error", None, None
 iface = gr.Interface(