File size: 3,865 Bytes
4962437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#speech to text tool

import os
import subprocess

import whisperx
from pydub import AudioSegment
from pytube import YouTube


class SpeechToText:
    """Download a YouTube video's audio track and transcribe it with WhisperX.

    Pipeline: pytube download -> pydub format conversion -> WhisperX
    transcription, alignment, and speaker diarization.

    Example:
        speech_to_text = SpeechToText("https://youtube.com/watch?v=...")
        transcription = speech_to_text.transcribe_youtube_video()
        print(transcription)
    """

    def __init__(
            self, 
            video_url, 
            audio_format='mp3',
            device='cuda',
            batch_size=16,
            compute_type="float16",
            hf_api_key=None
        ):
        """
        Args:
            video_url: URL of the YouTube video to transcribe.
            audio_format: Target format for the extracted audio (pydub export).
            device: Device passed to WhisperX (e.g. 'cuda' or 'cpu').
            batch_size: Batch size for batched Whisper transcription.
            compute_type: Numeric precision for the Whisper model
                (e.g. 'float16', 'int8').
            hf_api_key: Hugging Face token for the diarization pipeline.
        """
        self.video_url = video_url
        self.audio_format = audio_format
        self.device = device
        self.batch_size = batch_size
        self.compute_type = compute_type
        self.hf_api_key = hf_api_key

    def install(self):
        """Install runtime dependencies via pip.

        One combined invocation with check=True so a failed install raises
        instead of being silently ignored (the original ran three unchecked
        subprocesses).
        """
        subprocess.run(
            ["pip", "install", "whisperx", "pytube", "pydub"],
            check=True,
        )

    def download_youtube_video(self):
        """Download the video's audio-only stream and convert it.

        Returns:
            Filename of the exported audio file, e.g. 'video.mp3'.
        """
        audio_file = f'video.{self.audio_format}'

        # Download the audio-only stream 📥
        yt = YouTube(self.video_url)
        yt_stream = yt.streams.filter(only_audio=True).first()
        yt_stream.download(filename='video.mp4')

        # Convert the downloaded container to the requested audio format 🎧
        video = AudioSegment.from_file("video.mp4", format="mp4")
        video.export(audio_file, format=self.audio_format)
        os.remove("video.mp4")  # intermediate container no longer needed

        return audio_file

    def transcribe_youtube_video(self):
        """Download the configured video and transcribe it.

        Previously this method hard-coded device='cuda', batch_size=16 and
        compute_type='float16', ignoring the constructor configuration and
        duplicating the whole pipeline from transcribe(). It now delegates,
        which honors the instance settings; the old hard-coded values equal
        the constructor defaults, so default behavior is unchanged.

        Returns:
            The joined transcription text, or None when WhisperX produced
            no 'segments' key.
        """
        audio_file = self.download_youtube_video()
        return self.transcribe(audio_file)

    def transcribe(self, audio_file):
        """Transcribe, align, and diarize an audio file with WhisperX.

        Args:
            audio_file: Path to the audio file to transcribe.

        Returns:
            The joined transcription text, or None when the aligned result
            has no 'segments' key.
        """
        # 1. Transcribe with original Whisper (batched) 🗣️
        # compute_type must be passed by keyword: the third positional
        # parameter of whisperx.load_model is device_index, not compute_type,
        # so the original positional call bound it to the wrong argument.
        model = whisperx.load_model(
            "large-v2",
            self.device,
            compute_type=self.compute_type
        )
        audio = whisperx.load_audio(audio_file)
        result = model.transcribe(
            audio,
            batch_size=self.batch_size
        )

        # 2. Align Whisper output 🔍
        # Bug fix: original passed `device=device` where no local `device`
        # existed, raising NameError; use the configured self.device.
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"],
            device=self.device
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            self.device,
            return_char_alignments=False
        )

        # 3. Assign speaker labels 🏷️
        diarize_model = whisperx.DiarizationPipeline(
            use_auth_token=self.hf_api_key,
            device=self.device
        )
        # NOTE(review): the diarization result is discarded here, matching
        # the original behavior — speaker labels are never merged into the
        # returned transcript. Confirm whether assign_word_speakers was
        # intended.
        diarize_model(audio_file)

        try:
            segments = result["segments"]
            transcription = " ".join(segment['text'] for segment in segments)
            return transcription
        except KeyError:
            # Preserve original best-effort behavior: report and return None.
            print("The key 'segments' is not found in the result.")