File size: 2,878 Bytes
2bcca4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import gradio as gr
from pytube import YouTube
from moviepy.editor import VideoFileClip
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


# Pick the execution target once: first CUDA device with half precision when
# a GPU is visible to torch, otherwise CPU with single precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Checkpoint used for both the model weights and the processor below.
model_id = "openai/whisper-tiny"

# Load the speech-to-text model and move it onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies the tokenizer and feature extractor handed to the
# pipeline.
processor = AutoProcessor.from_pretrained(model_id)

# Module-level ASR pipeline shared by every call to `transcribe`;
# `return_timestamps=True` is what makes the result carry a "chunks" list.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


def _download_video(yt_url, filename='video.mp4'):
    """Download the YouTube video at *yt_url* to *filename*.

    The stream with itag 22 is preferred (the stream this app has always
    requested); when YouTube does not offer it, the highest-resolution
    progressive stream is used instead.

    Raises:
        RuntimeError: if no stream is available or the download fails.
    """
    try:
        streams = YouTube(yt_url).streams
        stream = streams.get_by_itag(22)
        if stream is None:
            # itag 22 is not offered for every video; fall back instead of
            # failing with an opaque AttributeError on None.
            stream = streams.get_highest_resolution()
        if stream is None:
            raise RuntimeError('no downloadable stream found')
        stream.download(filename=filename)
    except Exception as e:
        print(f'Failed to download Youtube video \nerror : {e}')
        # Stop here: continuing would transcribe whatever file a previous
        # request left on disk.
        raise RuntimeError(f'Failed to download Youtube video: {e}') from e
    print('Video succesfully downloaded from Youtube')


def _extract_audio(video_path, audio_path='audio.wav'):
    """Write the audio track of the video at *video_path* to *audio_path*.

    Raises:
        RuntimeError: if the video cannot be opened, has no audio track, or
            the audio file cannot be written.
    """
    video = None
    audio = None
    try:
        video = VideoFileClip(video_path)
        audio = video.audio
        if audio is None:
            raise RuntimeError('video has no audio track')
        audio.write_audiofile(audio_path)
    except Exception as e:
        print(f'Failed to extract audio from video \nerror : {e}')
        raise RuntimeError(f'Failed to extract audio from video: {e}') from e
    finally:
        # Release the clips even when extraction fails part-way through.
        if video is not None:
            video.close()
        if audio is not None:
            audio.close()


def _chunks_to_records(chunks):
    """Convert pipeline chunks into one flat record per chunk.

    Each chunk is expected to be a mapping with a ``'text'`` entry and a
    ``'timestamp'`` pair ``(start, end)``. ``chunk_length`` is ``end - start``
    rounded to 3 decimals, or ``None`` when either bound is missing.
    """
    records = []
    for chunk_id, chunk in enumerate(chunks):
        start, end = chunk['timestamp']
        if start is None or end is None:
            # The pipeline can leave a bound unset (e.g. the end of the final
            # chunk when the audio is cut off); keep the chunk rather than
            # failing on the subtraction.
            length = None
        else:
            length = round(end - start, 3)
        records.append({
            "chunk_id": chunk_id,
            "chunk_length": length,
            "text": chunk['text'],
            "start_time": start,
            "end_time": end,
        })
    return records


def transcribe(url):
    """Download a YouTube video, extract its audio and transcribe it.

    The video is saved as ``video.mp4`` and its audio as ``audio.wav`` in the
    current working directory, then the audio is passed to the module-level
    ``pipe``.

    Args:
        url: URL of the YouTube video to transcribe.

    Returns:
        A list with one dict per transcribed chunk, with keys ``chunk_id``,
        ``chunk_length``, ``text``, ``start_time`` and ``end_time``. Every
        chunk returned by the pipeline is included, the last one as well.

    Raises:
        RuntimeError: if the download or the audio extraction fails.
    """
    video_file = 'video.mp4'
    audio_file = 'audio.wav'

    # get video and extract audio
    _download_video(url, filename=video_file)
    _extract_audio(f'./{video_file}', audio_path=audio_file)

    # transcribe audio
    result = pipe(audio_file)
    return _chunks_to_records(result['chunks'])

# Gradio front end: a single text input (the YouTube URL) wired to
# `transcribe`, with the returned value rendered as text.
intf = gr.Interface(fn=transcribe, inputs=["text"], outputs=["text"])

# Start serving the interface as soon as the module is executed.
intf.launch()