Tohidichi committed on
Commit 2bcca4d
1 Parent(s): 863bc0c

Upload 2 files

Files changed (2)
  1. app.py +109 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,109 @@
+ import gradio as gr
+ from pytube import YouTube
+ from moviepy.editor import VideoFileClip
+ import torch
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ model_id = "openai/whisper-tiny"
+
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+ )
+ model.to(device)
+
+ processor = AutoProcessor.from_pretrained(model_id)
+
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     max_new_tokens=128,
+     chunk_length_s=15,
+     batch_size=16,
+     return_timestamps=True,
+     torch_dtype=torch_dtype,
+     device=device,
+ )
+
+
+ def transcribe(url):
+     # download the YouTube video and extract its audio track
+     def get_video(yt_url):
+         try:
+             video = YouTube(yt_url)
+             video.streams.get_by_itag(22).download(filename='video.mp4')  # itag 22: 720p progressive MP4
+             print('Video successfully downloaded from YouTube')
+         except Exception as e:
+             print(f'Failed to download YouTube video \nerror : {e}')
+
+     def audio_from_video(video_path):
+         try:
+             video = VideoFileClip(video_path)
+             audio = video.audio
+             audio.write_audiofile('audio.wav')
+             video.close()
+             audio.close()
+         except Exception as e:
+             print(f'Failed to extract audio from video \nerror : {e}')
+
+     video_path = './video.mp4'
+
+     get_video(url)
+     audio_from_video(video_path)
+
+
+     # transcribe the extracted audio
+
+     audio = 'audio.wav'
+
+     text_audio = pipe(audio)
+
+     chunks = text_audio['chunks']
+
+     chunks_count = len(chunks)
+
+     chunk_id = []
+     texts = []
+     start_time = []
+     end_time = []
+
+
+     for i in range(chunks_count):
+         chunk_id.append(i)
+         texts.append(chunks[i]['text'])
+         start_time.append(chunks[i]['timestamp'][0])
+         end_time.append(chunks[i]['timestamp'][1])
+
+     chunk_length = []
+     for i in range(chunks_count):
+         # the final chunk may have no end timestamp; fall back to its start
+         end = end_time[i] if end_time[i] is not None else start_time[i]
+         chunk_length.append(round(end - start_time[i], 3))
+
+     output = list(zip(chunk_id, chunk_length, texts, start_time, end_time))
+
+     sample_output_list = []
+     for sublist in output:
+         chunk_dict = {
+             "chunk_id": sublist[0],
+             "chunk_length": sublist[1],
+             "text": sublist[2],
+             "start_time": sublist[3],
+             "end_time": sublist[4]
+         }
+         sample_output_list.append(chunk_dict)
+
+     return sample_output_list
+
+ intf = gr.Interface(
+     fn=transcribe,
+     inputs=["text"],
+     outputs=["text"],
+ )
+
+ intf.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ pytube
+ moviepy
+ transformers
+ accelerate
+ git+https://github.com/huggingface/transformers.git
+ datasets[audio]
+ torch
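
For reference, a minimal sketch (not part of the committed files) of how the chunked output that transcribe() post-processes can be inspected directly, using the same openai/whisper-tiny checkpoint and return_timestamps=True setting as app.py; the path 'audio.wav' is only a placeholder for a local audio file.

# Minimal sketch (assumption: a local audio file exists at the placeholder path).
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    chunk_length_s=15,
    return_timestamps=True,
)

result = asr("audio.wav")  # placeholder path, not part of the commit
for chunk in result["chunks"]:
    start, end = chunk["timestamp"]  # end may be None for the final chunk
    print(start, end, chunk["text"])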