umarasif committed on
Commit
aa6660e
1 Parent(s): 31f65f2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -0
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
3
+ import gradio as gr
4
+ from moviepy.editor import VideoFileClip
5
+
6
# Run on the first CUDA GPU in half precision when one is available;
# otherwise fall back to the CPU in single precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

model_id = "distil-whisper/distil-large-v3"

# Load the speech seq2seq checkpoint and move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor
# that the pipeline below is wired up with.
processor = AutoProcessor.from_pretrained(model_id)

# Module-level speech-recognition pipeline used by speech_to_text().
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=25,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
)
29
+
30
def extract_audio_from_video(video_path, audio_output_path):
    """Extract the audio track of a video and write it to a file.

    Args:
        video_path: Path of the video file to read.
        audio_output_path: Path the extracted audio is written to.

    Returns:
        ``audio_output_path`` on success, or ``None`` if the video has no
        audio track or any step fails. Failures are printed, never raised.
    """
    video_clip = None
    try:
        video_clip = VideoFileClip(video_path)
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Report a missing audio track explicitly instead of letting it
            # surface as an opaque AttributeError on None.
            print("Error extracting audio: the video has no audio track")
            return None
        audio_clip.write_audiofile(audio_output_path)
        print(f"Audio extracted successfully and saved to: {audio_output_path}")
        return audio_output_path
    except Exception as e:
        print(f"Error extracting audio: {e}")
        return None
    finally:
        # Always release the clip's underlying reader resources, even when
        # extraction failed part-way through.
        if video_clip is not None:
            try:
                video_clip.close()
            except Exception as e:
                print(f"Error closing video clip: {e}")
41
+
42
def speech_to_text(input_file):
    """Transcribe an uploaded audio or video file to text.

    Video files (``.mp4``, ``.avi``, ``.mov``, matched case-insensitively)
    first have their audio extracted to a temporary MP3 file; any other file
    is handed to the speech-recognition pipeline directly.

    Args:
        input_file: The uploaded file. Either an object exposing a ``.name``
            path attribute or a plain path is accepted.
            NOTE(review): which of the two Gradio passes depends on its
            version/configuration; confirm against the deployed Gradio.

    Returns:
        The transcribed text, or a string starting with ``"Error: "`` when
        no file was given or any step fails. This function never raises.
    """
    import os
    import tempfile

    if input_file is None:
        return "Error: no file provided"

    try:
        # Accept both file-like wrappers (path in .name) and plain paths.
        source_path = os.fspath(getattr(input_file, "name", input_file))

        if not source_path.lower().endswith((".mp4", ".avi", ".mov")):
            # The ASR pipeline returns a dict whose "text" key holds the
            # transcription for a single input.
            return pipe(source_path)["text"]

        # Use a unique temporary file so concurrent requests cannot
        # overwrite each other's extracted audio.
        fd, audio_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        try:
            if extract_audio_from_video(source_path, audio_path) is None:
                return "Error: could not extract audio from the video file"
            return pipe(audio_path)["text"]
        finally:
            # Best-effort cleanup of the temporary audio file.
            try:
                os.remove(audio_path)
            except OSError:
                pass
    except Exception as e:
        return f"Error: {str(e)}"
54
+
55
# Gradio UI: a single file upload in, the transcript as plain text out.
iface = gr.Interface(
    fn=speech_to_text,
    inputs="file",
    outputs="text",
    title="Audio/Video-to-Text",
)

# Only start the web server when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(debug=True)