abnerh committed
Commit 262d511
1 Parent(s): a8f89c5

Add necessary file

Files changed (4):
  1. app.py +104 -0
  2. process_audio.py +16 -0
  3. requirements.txt +5 -0
  4. write_srt.py +35 -0
app.py ADDED
@@ -0,0 +1,104 @@
+ import os, re
+ import shutil
+ import subprocess
+ import soundfile
+ from process_audio import segment_audio
+ from write_srt import write_to_file
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
+ import torch
+ import gradio as gr
+
+
+ model = "facebook/wav2vec2-large-960h-lv60-self"
+ tokenizer = Wav2Vec2Tokenizer.from_pretrained(model)
+ asr_model = Wav2Vec2ForCTC.from_pretrained(model)  # append .to('cuda') for GPU inference
+
+ # Character vocabulary ordered by token id, with the word delimiter "|" shown as a space
+ vocab_dict = tokenizer.get_vocab()
+ sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
+ vocab = [x[1].replace("|", " ") if x[1] not in tokenizer.all_special_tokens else "_" for x in sort_vocab]
+
+
+ # Line count for SRT file
+ line_count = 0
+
+
+ def sort_alphanumeric(data):
+     # Natural sort, so "seg_10" sorts after "seg_9" rather than after "seg_1"
+     convert = lambda text: int(text) if text.isdigit() else text.lower()
+     alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
+
+     return sorted(data, key=alphanum_key)
+
+
+ def transcribe_audio(tokenizer, asr_model, audio_file, file_handle):
+     # Run Wav2Vec2.0 inference on each audio file generated after VAD segmentation.
+     global line_count
+
+     speech, rate = soundfile.read(audio_file)
+     input_values = tokenizer(speech, sampling_rate=16000, return_tensors="pt", padding="longest").input_values
+     with torch.no_grad():
+         logits = asr_model(input_values).logits
+     prediction = torch.argmax(logits, dim=-1)
+
+     # Decode, then restore the capitalisation of the pronoun "I"
+     inferred_text = tokenizer.batch_decode(prediction)[0].lower()
+     inferred_text = re.sub(r' +', ' ', inferred_text)  # collapse repeated spaces
+     inferred_text = re.sub(r'\bi\s', 'I ', inferred_text)
+     inferred_text = re.sub(r'\si$', ' I', inferred_text)
+     inferred_text = re.sub(r'i\'', 'I\'', inferred_text)
+
+     # Segment filenames end in "_<start>-<end>.wav"; recover the two timestamps
+     limits = audio_file.split(os.sep)[-1][:-4].split("_")[-1].split("-")
+
+     if len(inferred_text) > 1:
+         line_count += 1
+         write_to_file(file_handle, inferred_text, line_count, limits)
+
+
+ def get_subs(input_file):
+     # Start from a clean audio directory and a fresh subtitle counter
+     global line_count
+     line_count = 0
+     base_directory = os.getcwd()
+     audio_directory = os.path.join(base_directory, "audio")
+     if os.path.isdir(audio_directory):
+         shutil.rmtree(audio_directory)
+     os.mkdir(audio_directory)
+
+     # Extract 16 kHz mono audio from the video file
+     video_file = input_file
+     audio_file = os.path.join(audio_directory, "temp.wav")
+     command = ["ffmpeg", "-i", video_file, "-ac", "1", "-ar", "16000", "-vn", "-f", "wav", audio_file]
+     subprocess.run(command, check=True)
+
+     video_file = os.path.splitext(os.path.basename(input_file))[0]
+     srt_directory = os.path.join(base_directory, "srt")
+     os.makedirs(srt_directory, exist_ok=True)
+     srt_file_name = os.path.join(srt_directory, video_file + ".srt")
+
+     # Split audio file based on VAD silent segments
+     segment_audio(audio_file)
+     os.remove(audio_file)
+
+     # Transcribe each segment in chronological order into the SRT file
+     file_handle = open(srt_file_name, "w")
+     for file in sort_alphanumeric(os.listdir(audio_directory)):
+         audio_segment_path = os.path.join(audio_directory, file)
+         if audio_segment_path.split(os.sep)[-1] != audio_file.split(os.sep)[-1]:  # skip temp.wav if still present
+             transcribe_audio(tokenizer, asr_model, audio_segment_path, file_handle)
+
+     file_handle.close()
+     shutil.rmtree(audio_directory)
+
+     return srt_file_name
+
+
+ gradio_ui = gr.Interface(
+     fn=get_subs,
+     title="Autoblog - Video to Subtitle",
+     inputs=gr.inputs.Video(label="Upload Video File"),
+     outputs=gr.outputs.File(label="Auto-Transcript"),
+ )
+
+ gradio_ui.launch(inline=False)
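
A quick standalone check of the natural sort used above; the helper is repeated from app.py so the snippet runs on its own (importing app.py would also launch the Gradio UI), and the filenames are made up for illustration:

import re

def sort_alphanumeric(data):  # copy of the helper in app.py
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
    return sorted(data, key=alphanum_key)

print(sort_alphanumeric(["seg_10.wav", "seg_2.wav", "seg_1.wav"]))
# ['seg_1.wav', 'seg_2.wav', 'seg_10.wav'] -- plain sorted() would put 'seg_10.wav' before 'seg_2.wav'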
process_audio.py ADDED
@@ -0,0 +1,16 @@
+ import auditok
+
+
+ def segment_audio(audio_name):
+     # Energy-based voice activity detection: keep audible events, split on silence
+     audio_regions = auditok.split(audio_name,
+                                   min_dur=2,            # minimum duration of a valid audio event in seconds
+                                   max_dur=8,            # maximum duration of an audio segment
+                                   max_silence=0.8,      # maximum tolerated continuous silence within an event
+                                   energy_threshold=55,  # detection threshold
+                                   sampling_rate=16000
+                                   )
+
+     # Save each region; start/end times are embedded in the filename for app.py to parse
+     for i, r in enumerate(audio_regions):
+         filename = r.save(audio_name[:-4] + "_{meta.start:.3f}-{meta.end:.3f}.wav")
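
A minimal usage sketch; the path and the printed segment list are hypothetical, since actual boundaries depend on the audio:

import os
from process_audio import segment_audio

segment_audio("audio/temp.wav")  # hypothetical 16 kHz mono file
print(sorted(os.listdir("audio")))
# e.g. ['temp.wav', 'temp_0.000-5.250.wav', 'temp_6.100-9.800.wav']
# Each region's start and end time is baked into its filename, which
# transcribe_audio() in app.py later parses into SRT timestamps.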
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ soundfile
+ transformers
+ torch
+ gradio
+ auditok
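
The requirements are unpinned. app.py relies on the legacy gr.inputs / gr.outputs namespaces, which later Gradio releases removed, so reproducing this Space today will likely need a bound along these lines (the exact pin is an assumption, not part of the commit):

soundfile
transformers
torch
gradio<4  # assumed bound: gr.inputs / gr.outputs were dropped in later releases
auditok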
write_srt.py ADDED
@@ -0,0 +1,35 @@
+ import datetime
+
+
+ def write_to_file(file_handle, inferred_text, line_count, limits):
+     """Write one subtitle entry to the SRT file.
+
+     Follows the SRT format: entry number, "HH:MM:SS,mmm --> HH:MM:SS,mmm", then the text.
+
+     Args:
+         file_handle : SRT file handle
+         inferred_text : text to be written
+         line_count : subtitle line count
+         limits : starting and ending times for the text, in seconds
+     """
+
+     sep = ','
+
+     # str(timedelta) looks like "0:00:02.500000", or "0:00:02" when the fractional
+     # part is zero; prepend "0" for two-digit hours and keep three digits of the
+     # microseconds as milliseconds.
+     d = str(datetime.timedelta(seconds=float(limits[0])))
+     if "." in d:
+         from_dur = "0" + d.split(".")[0] + sep + d.split(".")[-1][:3]
+     else:
+         from_dur = "0" + d + sep + "000"
+
+     d = str(datetime.timedelta(seconds=float(limits[1])))
+     if "." in d:
+         to_dur = "0" + d.split(".")[0] + sep + d.split(".")[-1][:3]
+     else:
+         to_dur = "0" + d + sep + "000"
+
+     file_handle.write(str(line_count) + "\n")
+     file_handle.write(from_dur + " --> " + to_dur + "\n")
+     file_handle.write(inferred_text + "\n\n")
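
A small worked example of the formatting; the output file name and the limits are made up, with the limits mirroring a segment filename such as temp_2.500-7.250.wav:

from write_srt import write_to_file

with open("demo.srt", "w") as fh:
    write_to_file(fh, "hello world", 1, ["2.500", "7.250"])

# demo.srt now contains:
# 1
# 00:00:02,500 --> 00:00:07,250
# hello world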