import os, sys, re
import shutil
import subprocess
import soundfile
from process_audio import segment_audio
from write_srt import write_to_file
from clean_text import clean_english, clean_german, clean_spanish
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import AutoModelForCTC, AutoProcessor
import torch
import gradio as gr


# Load a Wav2Vec2 processor/model pair for each supported language at import
# time (downloads from the Hugging Face hub on first run).
# NOTE: the variable names matter — get_subs() resolves them dynamically via
# globals()[lang + '_tokenizer'] / globals()[lang + '_asr_model'].
english_model = "facebook/wav2vec2-large-960h-lv60-self"
english_tokenizer = Wav2Vec2Processor.from_pretrained(english_model)
english_asr_model = Wav2Vec2ForCTC.from_pretrained(english_model)

german_model = "flozi00/wav2vec2-large-xlsr-53-german-with-lm"
german_tokenizer = Wav2Vec2Processor.from_pretrained(german_model)
german_asr_model = Wav2Vec2ForCTC.from_pretrained(german_model)

spanish_model = "patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm"
spanish_tokenizer = Wav2Vec2Processor.from_pretrained(spanish_model)
spanish_asr_model = Wav2Vec2ForCTC.from_pretrained(spanish_model)

# Get German corpus and update nltk.
# Runs at import time; presumably needed by the clean_text helpers
# (TextBlob-based cleanup) — TODO confirm against clean_text.py.
command = ["python", "-m", "textblob.download_corpora"]
subprocess.run(command)


# Line count for SRT file; mutated by transcribe_audio() for each
# subtitle entry written (SRT entries are numbered sequentially).
line_count = 0

def sort_alphanumeric(data):
    """Return *data* sorted in natural order, so that embedded numbers
    compare numerically (e.g. ``chunk_2`` sorts before ``chunk_10``)."""
    def _piece(text):
        # Digit runs become ints; everything else compares case-insensitively.
        return int(text) if text.isdigit() else text.lower()

    def _natural_key(name):
        # re.split with a capturing group keeps the digit runs in the result.
        return [_piece(part) for part in re.split(r'([0-9]+)', name)]

    return sorted(data, key=_natural_key)

def transcribe_audio(tokenizer, asr_model, audio_file, file_handle):  
    """Run Wav2Vec2 CTC inference on one VAD audio segment and append the
    cleaned transcript to the open SRT file.

    Parameters:
        tokenizer: Wav2Vec2Processor used to encode audio and decode logits.
        asr_model: Wav2Vec2ForCTC model producing per-frame logits.
        audio_file: path to a segment WAV; its filename is expected to end
            in ``_start-end.wav`` so the time limits can be parsed from it.
        file_handle: open, appendable SRT file.

    Side effects: increments the global ``line_count`` and writes one SRT
    entry via write_to_file(). Reads the global ``lang`` (set by get_subs()
    before this is called) to pick the language-specific text cleaner.
    """
    # Run Wav2Vec2.0 inference on each audio file generated after VAD segmentation.
    global line_count
    
    # NOTE(review): sampling_rate is hard-coded to 16000; assumes the
    # segments were extracted at 16 kHz (rate is read but unused).
    speech, rate = soundfile.read(audio_file) 
    input_values = tokenizer(speech, sampling_rate=16000, return_tensors = "pt", padding='longest').input_values
    logits = asr_model(input_values).logits
    # Greedy CTC decoding: take the argmax token per frame.
    prediction = torch.argmax(logits, dim = -1)

    
    infered_text = tokenizer.batch_decode(prediction)[0].lower()
    # Skip near-empty transcripts (length <= 1); nothing is written for them.
    if len(infered_text) > 1:
        # ``lang`` is a module-level global assigned in get_subs().
        if lang == 'english':
            infered_text = clean_english(infered_text)
        elif lang == 'german':
            infered_text = clean_german(infered_text)
        elif lang == 'spanish':
            infered_text = clean_spanish(infered_text)

        print(infered_text)
        # Parse "start-end" timing from the filename: strip the directory,
        # drop the ".wav" extension, take the last "_"-separated field.
        limits = audio_file.split(os.sep)[-1][:-4].split("_")[-1].split("-")
        line_count += 1
        write_to_file(file_handle, infered_text, line_count, limits)
    else:
        infered_text = ''

        
def get_subs(input_file, language):
    """Generate an SRT subtitle file for a video.

    Extracts mono 16 kHz audio with ffmpeg, splits it on VAD-detected
    silences, transcribes each segment with the language's Wav2Vec2 model,
    and writes numbered SRT entries.

    Parameters:
        input_file: path to the uploaded video file.
        language: one of 'English', 'German', 'Spanish' (case-insensitive);
            used to select the module-level model/tokenizer pair.

    Returns:
        The path of the generated ``.srt`` file.
    """
    # Recreate a clean working directory for the audio segments.
    base_directory = os.getcwd()
    audio_directory = os.path.join(base_directory, "audio")
    if os.path.isdir(audio_directory):
        shutil.rmtree(audio_directory)
    os.mkdir(audio_directory)

    # Extract mono 16 kHz WAV audio (the rate the Wav2Vec2 models expect).
    # Fix: build the path with os.path.join so the os.sep comparison below
    # works regardless of platform (was a hard-coded '/').
    audio_file = os.path.join(audio_directory, "temp.wav")
    command = ["ffmpeg", "-i", input_file, "-ac", "1", "-ar", "16000",
               "-vn", "-f", "wav", audio_file]
    subprocess.run(command)

    # Name the SRT file after the video, extension stripped.
    video_name = input_file.split('/')[-1][:-4]
    srt_file_name = video_name + ".srt"

    # Split the extracted audio on VAD silent segments; the full-length
    # temp file is no longer needed afterwards.
    segment_audio(audio_file)
    os.remove(audio_file)

    # Select the model/tokenizer pair once, outside the loop (these lookups
    # are loop-invariant; the original resolved them on every iteration).
    # ``lang`` stays a global because transcribe_audio() reads it.
    global lang
    lang = language.lower()
    tokenizer = globals()[lang + '_tokenizer']
    asr_model = globals()[lang + '_asr_model']

    # Transcribe segments in natural numeric order, appending to the SRT
    # file. ``with`` guarantees the handle is closed even if a segment
    # raises (the original leaked it on error).
    with open(srt_file_name, "a+") as file_handle:
        file_handle.seek(0)
        for file in sort_alphanumeric(os.listdir(audio_directory)):
            audio_segment_path = os.path.join(audio_directory, file)
            # Guard against transcribing a leftover temp.wav.
            if audio_segment_path.split(os.sep)[-1] != audio_file.split(os.sep)[-1]:
                transcribe_audio(tokenizer, asr_model, audio_segment_path, file_handle)

    shutil.rmtree(audio_directory)

    return srt_file_name


# Wire the transcription pipeline to a Gradio web UI.
# NOTE(review): gr.inputs / gr.outputs and enable_queue are the legacy
# pre-3.x Gradio API — verify against the pinned gradio version.
gradio_ui = gr.Interface(
    enable_queue=True,
    fn=get_subs,
    title="Video to Subtitle",
    description="Get subtitles (SRT file) for your videos. Inference speed is about 10s/per 1min of video BUT the speed of uploading your video depends on your internet connection.",
    inputs=[gr.inputs.Video(label="Upload Video File"),
        gr.inputs.Radio(label="Choose Language", choices=['English', 'German', 'Spanish'])],
    outputs=gr.outputs.File(label="Auto-Transcript")
    )

# Start the web server (blocks until shut down).
gradio_ui.launch()