import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import subprocess
from langdetect import detect_langs
import os
import warnings
from transformers import logging
import math
import json

# Suppress warnings
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

# Candidate models per language, ordered by preference; the first entry is used by default
MODELS = {
    "es": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-xlsr-53-spanish",
        "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
    ],
    "en": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-960h",
        "microsoft/wav2vec2-base-960h"
    ],
    "pt": [
        "facebook/wav2vec2-large-xlsr-53-portuguese",
        "openai/whisper-medium",
        "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"
    ]
}

def convert_audio_to_wav(audio_path):
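    """Convert any ffmpeg-readable audio file to 16 kHz mono WAV.

    All models used here (Whisper and wav2vec2) expect 16 kHz
    single-channel input, so inputs are normalized up front.
    """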
    if os.path.isdir(audio_path):
        raise ValueError(f"The path provided is a directory: {audio_path}")
    wav_path = "converted_audio.wav"
    command = ["ffmpeg", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return wav_path

def detect_language(audio_path):
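    """Detect the dominant language from the first 30 seconds of audio.

    A small Whisper model produces a rough transcript and langdetect
    scores it; Spanish/Portuguese ties are resolved below.
    """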
    try:
        speech, _ = librosa.load(audio_path, sr=16000, duration=30)
    except Exception as e:
        raise ValueError(f"Error loading audio file with librosa: {e}")
    
    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
    
    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
    langs = detect_langs(transcription)
    
    es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
    pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
    
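    # langdetect often confuses Spanish and Portuguese on short transcripts;
    # when their confidences are within 0.2 of each other, default to Spanish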
    if abs(es_confidence - pt_confidence) < 0.2:
        return 'es'
    
    return max(langs, key=lambda x: x.prob).lang

def transcribe_audio_stream(audio, model_name):
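    """Transcribe audio chunk by chunk, yielding partial results.

    Yields (transcriptions, progress) pairs, where transcriptions is a
    list of {"start_time", "end_time", "text"} dicts and progress is a
    percentage in [0, 100].
    """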
    wav_audio = convert_audio_to_wav(audio)
    speech, rate = librosa.load(wav_audio, sr=16000)
    duration = len(speech) / rate
    
    transcriptions = []

    if "whisper" in model_name:
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        
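        # Whisper checkpoints are trained on 30-second windows, so chunk at 30 s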
        chunk_duration = 30  # seconds
        
        for i in range(0, int(duration), chunk_duration):
            end = min(i + chunk_duration, duration)
            chunk = speech[int(i * rate):int(end * rate)]
            
            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
            predicted_ids = model.generate(input_features)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            
            progress = min(100, (end / duration) * 100)
            transcriptions.append({
                "start_time": i,
                "end_time": end,
                "text": transcription
            })
            yield transcriptions, progress
    else:
        transcriber = pipeline("automatic-speech-recognition", model=model_name)
        
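        # CTC models have no fixed receptive field; shorter chunks give more
        # frequent progress updates at the cost of more chunk-boundary artifacts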
        chunk_duration = 10  # seconds
        
        for i in range(0, int(duration), chunk_duration):
            end = min(i + chunk_duration, duration)
            chunk = speech[int(i * rate):int(end * rate)]
            # pass the sampling rate explicitly rather than relying on the
            # pipeline's default assumption about raw arrays
            result = transcriber({"raw": chunk, "sampling_rate": rate})
            
            progress = min(100, (end / duration) * 100)
            transcriptions.append({
                "start_time": i,
                "end_time": end,
                "text": result["text"]
            })
            yield transcriptions, progress

def detect_and_select_model(audio):
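    """Return the detected language code and its candidate models,
    falling back to the English list for unsupported languages."""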
    wav_audio = convert_audio_to_wav(audio)
    language = detect_language(wav_audio)
    model_options = MODELS.get(language, MODELS["en"])
    return language, model_options

def save_transcription(transcriptions, file_format):
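    """Persist the transcription as pretty-printed JSON or as TXT with
    one comma-separated (start, end, text) line per chunk."""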
    if file_format == "JSON":
        file_path = "transcription.json"
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(transcriptions, f, ensure_ascii=False, indent=4)
    elif file_format == "TXT":
        file_path = "transcription.txt"
        with open(file_path, 'w', encoding='utf-8') as f:
            for entry in transcriptions:
                f.write(f"{entry['start_time']},{entry['end_time']},{entry['text']}\n")
    return file_path

def combined_interface(audio, file_format):
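    """Generator driving the Gradio app: detect the language, stream
    partial transcriptions, then expose the saved file for download.

    Each yield supplies one value per output component, in order:
    language, model dropdown, selected model, transcription, progress,
    status, download file.
    """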
    try:
        language, model_options = detect_and_select_model(audio)
        selected_model = model_options[0]
        # gr.update refreshes the dropdown's choices as well as its value;
        # yielding the raw list would only set the value and fail validation
        model_dropdown = gr.update(choices=model_options, value=selected_model)
        
        yield language, model_dropdown, selected_model, "", 0, "Initializing...", None
        
        transcriptions = []
        full_transcription = ""  # guard against audio too short to yield any chunk
        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
            transcriptions = partial_transcriptions
            full_transcription = " ".join([t["text"] for t in transcriptions])
            progress_int = math.floor(progress)
            status = f"Transcribing... {progress_int}% complete"
            yield language, model_dropdown, selected_model, full_transcription.strip(), progress_int, status, None
        
        # Save transcription file
        file_path = save_transcription(transcriptions, file_format)
        
        # Clean up temporary files
        if os.path.exists("converted_audio.wav"):
            os.remove("converted_audio.wav")
        
        yield language, model_dropdown, selected_model, full_transcription.strip(), 100, f"Transcription complete! Download {file_path}", file_path
        
    except Exception as e:
        yield str(e), gr.update(choices=[]), "", "An error occurred during processing.", 0, "Error", None

iface = gr.Interface(
    fn=combined_interface,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Radio(choices=["JSON", "TXT"], label="Choose output format")
    ],
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Dropdown(label="Available Models", choices=[]),
        gr.Textbox(label="Selected Model"),
        gr.Textbox(label="Transcription", lines=10),
        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
        gr.Textbox(label="Status"),
        gr.File(label="Download Transcription")
    ],
    title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
    description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
    live=True
)

if __name__ == "__main__":
    iface.queue().launch()
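
# A minimal sketch of programmatic use, bypassing the UI (the file name
# "sample.mp3" is hypothetical; any ffmpeg-readable file works):
#
#   lang, options = detect_and_select_model("sample.mp3")
#   for chunks, pct in transcribe_audio_stream("sample.mp3", options[0]):
#       print(f"{pct:5.1f}%  {chunks[-1]['text']}")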