|
import gradio as gr
import numpy as np
import torch
from scipy.io import wavfile
from scipy.signal import resample_poly
from transformers import AutoProcessor, SeamlessM4TModel, SeamlessM4Tv2Model
|
|
|
class SeamlessM4TApp:
    """Gradio backend wrapping SeamlessM4T-v2 for speech-to-text transcription."""

    # Checkpoint and the sample rate the SeamlessM4T processor expects.
    MODEL_NAME = "facebook/seamless-m4t-v2-large"
    TARGET_SR = 16000

    def __init__(self):
        # Prefer GPU when available; the v2-large checkpoint is slow on CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        self.processor = AutoProcessor.from_pretrained(self.MODEL_NAME)
        # BUG FIX: the v2 checkpoint must be loaded with SeamlessM4Tv2Model;
        # SeamlessM4TModel only implements the v1 architecture.
        self.model = SeamlessM4Tv2Model.from_pretrained(self.MODEL_NAME)
        self.model.to(self.device)

    def _load_audio(self, audio_path):
        """Read a WAV file, downmix to mono float32 in [-1, 1], resample to 16 kHz.

        NOTE(review): assumes Gradio hands us a WAV file (its default for
        microphone recordings); other containers would need e.g. torchaudio.
        """
        sample_rate, waveform = wavfile.read(audio_path)
        if waveform.dtype.kind == "i":
            # Integer PCM -> normalized float expected by the processor.
            waveform = waveform.astype(np.float32) / np.iinfo(waveform.dtype).max
        else:
            waveform = waveform.astype(np.float32)
        if waveform.ndim > 1:
            # Stereo (samples, channels) -> mono by channel average.
            waveform = waveform.mean(axis=1)
        if sample_rate != self.TARGET_SR:
            waveform = resample_poly(waveform, self.TARGET_SR, sample_rate)
        return waveform

    def transcribe_audio(self, audio_path):
        """Transcribe the audio file at *audio_path* to English text.

        Args:
            audio_path: Path to a WAV file (as provided by gr.Audio(type="filepath")).

        Returns:
            The transcription string, or an "Error during transcription: ..."
            message on failure (never raises, so the UI stays up).
        """
        try:
            # BUG FIX: the processor expects waveform arrays, not file paths,
            # so load and resample the file first.
            waveform = self._load_audio(audio_path)
            audio_inputs = self.processor(
                audios=waveform,
                return_tensors="pt",
                sampling_rate=self.TARGET_SR,
            ).to(self.device)

            with torch.no_grad():
                # BUG FIX: generate() has no `task` kwarg; speech-to-text is
                # selected by disabling speech synthesis.
                output_tokens = self.model.generate(
                    **audio_inputs,
                    tgt_lang="eng",
                    generate_speech=False,
                )

            # With generate_speech=False the first element holds the text token
            # ids with a batch axis (batch, seq); decode the single batch item.
            return self.processor.decode(
                output_tokens[0].tolist()[0],
                skip_special_tokens=True,
            )

        except Exception as e:
            # Surface the failure in the UI rather than crashing the app.
            return f"Error during transcription: {str(e)}"
|
|
|
|
|
def create_interface():
    """Build the Gradio speech-to-text UI around SeamlessM4TApp.

    Returns:
        gr.Interface: configured interface, ready for .launch().
    """
    app = SeamlessM4TApp()

    interface = gr.Interface(
        fn=app.transcribe_audio,
        inputs=gr.Audio(
            type="filepath",
            label="Upload Audio",
            # BUG FIX: Gradio 4.x renamed `source` to `sources` (a list); also
            # allow uploads so the widget matches the description below, which
            # promises both upload and microphone input.
            sources=["upload", "microphone"],
        ),
        outputs=gr.Textbox(label="Transcription"),
        title="SeamlessM4T Speech-to-Text",
        description="Upload audio or use microphone to transcribe speech to text using SeamlessM4T model.",
        examples=[],
        cache_examples=False,
    )

    return interface
|
|
|
if __name__ == "__main__":
    # Build the UI and start the local Gradio server (blocks until closed).
    create_interface().launch()