Tohidichi committed on
Commit
0d9a9f0
1 Parent(s): 51190de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -109
app.py CHANGED
@@ -1,109 +1,108 @@
1
- import gradio as gr
2
- from pytube import YouTube
3
- from moviepy.editor import VideoFileClip
4
- import torch
5
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
6
-
7
-
8
# Pick the execution device once: first CUDA GPU with half precision when
# available, otherwise CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-tiny"

# Load the speech-to-text model and move it onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor bundles the tokenizer and feature extractor for model_id.
processor = AutoProcessor.from_pretrained(model_id)

# Module-level ASR pipeline shared by every request. return_timestamps=True
# is required because transcribe() reads the per-chunk 'timestamp' entries.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
    chunk_length_s=15,
    batch_size=16,
    max_new_tokens=128,
    return_timestamps=True,
)
32
-
33
-
34
def _chunk_record(index, chunk):
    """Build the output dict for one pipeline chunk at position *index*."""
    start = chunk['timestamp'][0]
    end = chunk['timestamp'][1]
    # A timestamp can be missing (None) -- notably the end of the final
    # chunk -- so only compute a length when both ends are known.
    if start is None or end is None:
        length = None
    else:
        length = round(end - start, 3)
    return {
        "chunk_id": index,
        "chunk_length": length,
        "text": chunk['text'],
        "start_time": start,
        "end_time": end,
    }


def transcribe(url):
    """Download a YouTube video, extract its audio and transcribe it.

    Args:
        url: Address of the YouTube video to transcribe.

    Returns:
        A list with one dict per transcribed chunk, each holding the keys
        ``chunk_id``, ``chunk_length``, ``text``, ``start_time`` and
        ``end_time``. ``chunk_length`` is ``None`` when either timestamp of
        the chunk is missing.

    Raises:
        gr.Error: If no audio file could be produced for *url*.
    """
    import os

    # get video and extract audio
    def get_video(yt_url):
        """Download *yt_url* to ``video.mp4``; failures are only printed."""
        try:
            video = YouTube(yt_url)
            # The stream with itag 22 is not offered for every video, so
            # fall back to the best progressive (audio + video) stream.
            stream = video.streams.get_by_itag(22)
            if stream is None:
                stream = video.streams.get_highest_resolution()
            if stream is None:
                raise ValueError('no downloadable stream found')
            stream.download(filename='video.mp4')
            print('Video succesfully downloaded from Youtube')
        except Exception as e:
            print(f'Failed to download Youtube video \nerror : {e}')

    def audio_from_video(video_path):
        """Write the audio track of *video_path* to ``audio.wav``."""
        try:
            video = VideoFileClip(video_path)
            audio = None
            try:
                audio = video.audio
                if audio is None:
                    raise ValueError('the video has no audio track')
                audio.write_audiofile('audio.wav')
            finally:
                # Always release the clip, even when extraction failed.
                video.close()
                if audio is not None:
                    audio.close()
        except Exception as e:
            print(f'Failed to extract audio from video \nerror : {e}')

    video_path = './video.mp4'
    audio_path = 'audio.wav'

    # The helpers swallow their errors, so delete leftovers from a previous
    # request first; otherwise a failed download would silently transcribe
    # the previous video.
    for stale_path in (video_path, audio_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    get_video(url)
    audio_from_video(video_path)

    if not os.path.exists(audio_path):
        raise gr.Error('Could not download the video or extract its audio.')

    # transcribe audio
    text_audio = pipe(audio_path)

    # Every chunk is reported, including the last one.
    return [
        _chunk_record(index, chunk)
        for index, chunk in enumerate(text_audio['chunks'])
    ]
102
-
103
# Web front end: a single text input (passed to transcribe as the video URL)
# and a single text output showing whatever transcribe returns.
intf = gr.Interface(fn=transcribe, inputs=["text"], outputs=["text"])

# Start the Gradio server.
intf.launch()
 
1
+ import gradio as gr
2
+ from pytube import YouTube
3
+ from moviepy.editor import VideoFileClip
4
+ import torch
5
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
6
+
7
+
8
# Pick the execution device once: first CUDA GPU with half precision when
# available, otherwise CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-tiny"

# Load the speech-to-text model and move it onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor bundles the tokenizer and feature extractor for model_id.
processor = AutoProcessor.from_pretrained(model_id)

# Module-level ASR pipeline shared by every request. return_timestamps=True
# is required because transcribe() reads the per-chunk 'timestamp' entries.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
    chunk_length_s=15,
    batch_size=16,
    max_new_tokens=128,
    return_timestamps=True,
)
32
+
33
+ # get video and extract audio
34
def get_video(yt_url):
    """Download the YouTube video at *yt_url* to ``video.mp4``.

    The download is best-effort: any failure is printed rather than raised,
    so callers must not assume the file exists afterwards.

    Args:
        yt_url: Address of the YouTube video to download.
    """
    try:
        video = YouTube(yt_url)
        # The stream with itag 22 is not offered for every video; in that
        # case get_by_itag returns None, so fall back to the best
        # progressive (audio + video) stream instead of failing.
        stream = video.streams.get_by_itag(22)
        if stream is None:
            stream = video.streams.get_highest_resolution()
        if stream is None:
            raise ValueError('no downloadable stream found')
        stream.download(filename='video.mp4')
        print('Video succesfully downloaded from Youtube')
    except Exception as e:
        print(f'Failed to download Youtube video \nerror : {e}')
41
+
42
def audio_from_video(video_path):
    """Extract the audio track of *video_path* into ``audio.wav``.

    Extraction is best-effort: any failure is printed rather than raised,
    so callers must not assume the audio file exists afterwards.

    Args:
        video_path: Path of the video file to read.
    """
    try:
        video = VideoFileClip(video_path)
        audio = None
        try:
            audio = video.audio
            # Fail with a readable message instead of an AttributeError
            # when the clip carries no audio.
            if audio is None:
                raise ValueError('the video has no audio track')
            audio.write_audiofile('audio.wav')
        finally:
            # Always release the clip, even when extraction failed, so the
            # underlying file handles are not leaked.
            video.close()
            if audio is not None:
                audio.close()
    except Exception as e:
        print(f'Failed to extract audio from video \nerror : {e}')
51
+
52
def _chunk_record(index, chunk):
    """Build the output dict for one pipeline chunk at position *index*."""
    start = chunk['timestamp'][0]
    end = chunk['timestamp'][1]
    # A timestamp can be missing (None) -- notably the end of the final
    # chunk -- so only compute a length when both ends are known.
    if start is None or end is None:
        length = None
    else:
        length = round(end - start, 3)
    return {
        "chunk_id": index,
        "chunk_length": length,
        "text": chunk['text'],
        "start_time": start,
        "end_time": end,
    }


def transcribe(url):
    """Download a YouTube video, extract its audio and transcribe it.

    Args:
        url: Address of the YouTube video to transcribe.

    Returns:
        A list with one dict per transcribed chunk, each holding the keys
        ``chunk_id``, ``chunk_length``, ``text``, ``start_time`` and
        ``end_time``. ``chunk_length`` is ``None`` when either timestamp of
        the chunk is missing.

    Raises:
        gr.Error: If no audio file could be produced for *url*.
    """
    import os

    video_path = './video.mp4'
    audio_path = 'audio.wav'

    # get_video and audio_from_video swallow their errors, so delete
    # leftovers from a previous request first; otherwise a failed download
    # would silently transcribe the previous video.
    for stale_path in (video_path, audio_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    get_video(url)
    audio_from_video(video_path)

    if not os.path.exists(audio_path):
        raise gr.Error('Could not download the video or extract its audio.')

    # transcribe audio
    text_audio = pipe(audio_path)

    # Every chunk is reported, including the last one.
    return [
        _chunk_record(index, chunk)
        for index, chunk in enumerate(text_audio['chunks'])
    ]
101
+
102
# Web front end: a single text input (passed to transcribe as the video URL)
# and a single text output showing whatever transcribe returns.
intf = gr.Interface(fn=transcribe, inputs=["text"], outputs=["text"])

# Start the Gradio server.
intf.launch(inline=False)