# kzs2t / app.py
# bektim's picture
# Create app.py
# 48cf8b9 verified
# raw
# history blame
# 2.07 kB
import gradio as gr
import numpy as np
import torch
from scipy.io import wavfile
from scipy.signal import resample_poly
from transformers import AutoProcessor, SeamlessM4TModel, SeamlessM4Tv2Model
class SeamlessM4TApp:
    """Speech-to-text transcription backed by Meta's SeamlessM4T v2 model."""

    # SeamlessM4T expects 16 kHz mono audio input.
    TARGET_SAMPLING_RATE = 16000

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        # The v2 checkpoint must be loaded with the v2 model class;
        # SeamlessM4TModel only supports the v1 checkpoints.
        self.processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
        self.model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
        self.model.to(self.device)

    @staticmethod
    def _load_waveform(audio_path, target_rate=16000):
        """Read a WAV file and return a mono float32 waveform at *target_rate* Hz.

        NOTE(review): only PCM WAV is handled (Gradio microphone recordings are
        WAV); compressed uploads (e.g. mp3) would need an additional decoder.
        """
        rate, data = wavfile.read(audio_path)
        if np.issubdtype(data.dtype, np.integer):
            # Normalize integer PCM into [-1.0, 1.0].
            data = data.astype(np.float32) / np.iinfo(data.dtype).max
        else:
            data = data.astype(np.float32)
        if data.ndim > 1:
            # Downmix multi-channel audio to mono.
            data = data.mean(axis=1)
        if rate != target_rate:
            data = resample_poly(data, target_rate, rate).astype(np.float32)
        return data

    def transcribe_audio(self, audio_path):
        """Transcribe the audio file at *audio_path* to English text.

        Returns the transcription string, or a human-readable error message
        on failure (the Gradio UI displays whatever string comes back).
        """
        try:
            # The processor expects a waveform array, not a file path, so the
            # file must be decoded and resampled first.
            waveform = self._load_waveform(audio_path, self.TARGET_SAMPLING_RATE)
            audio_inputs = self.processor(
                audios=waveform,
                sampling_rate=self.TARGET_SAMPLING_RATE,
                return_tensors="pt",
            ).to(self.device)
            with torch.no_grad():
                # generate_speech=False requests text tokens instead of audio;
                # "task" is not a valid generate() argument for SeamlessM4T.
                output = self.model.generate(
                    **audio_inputs,
                    tgt_lang="eng",
                    generate_speech=False,
                )
            # output[0] holds the generated token ids with shape (batch, seq).
            return self.processor.decode(
                output[0].tolist()[0],
                skip_special_tokens=True,
            )
        except Exception as e:
            # Surface failures in the UI rather than crashing the server.
            return f"Error during transcription: {str(e)}"
# Initialize the Gradio interface
def create_interface():
    """Build the Gradio interface wrapping a SeamlessM4TApp transcriber.

    Returns the (un-launched) gr.Interface; the model is loaded once here
    and shared across all requests.
    """
    app = SeamlessM4TApp()
    interface = gr.Interface(
        fn=app.transcribe_audio,
        inputs=gr.Audio(
            type="filepath",
            label="Upload Audio",
            # Gradio 4 replaced the removed `source=` kwarg with `sources=[...]`;
            # allow both microphone capture and file upload.
            sources=["microphone", "upload"],
        ),
        outputs=gr.Textbox(label="Transcription"),
        title="SeamlessM4T Speech-to-Text",
        description="Upload audio or use microphone to transcribe speech to text using SeamlessM4T model.",
        examples=[],
        cache_examples=False,
    )
    return interface
if __name__ == "__main__":
    # Build the UI and start the Gradio server when run as a script.
    create_interface().launch()