File size: 5,233 Bytes
c62f556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import whisper
from modules.subtitle_manager import get_srt,get_vtt,safe_filename
from modules.youtube_manager import get_ytdata,get_ytaudio
import gradio as gr
import os

# Whisper checkpoint loaded at startup; the UI can switch sizes at runtime.
DEFAULT_MODEL_SIZE="tiny"

class WhisperInference:
    """Gradio-facing wrapper around an OpenAI Whisper model.

    Holds a single loaded model (reloaded lazily when the requested size
    changes) and exposes three transcription entry points — uploaded files,
    YouTube links and microphone recordings — each returning the rendered
    subtitles as a plain string.
    """

    # Only these checkpoint sizes honour the "translate" task here;
    # every other size silently falls back to plain transcription.
    _TRANSLATABLE_MODELS = ("large", "large-v1", "large-v2")

    def __init__(self):
        print("\nInitializing Model..\n")
        self.current_model_size = DEFAULT_MODEL_SIZE
        self.model = whisper.load_model(name=DEFAULT_MODEL_SIZE, download_root="models")
        self.available_models = ["tiny", "tiny.en"]
        self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))

    def _prepare_model(self, model_size, progress):
        """Reload the model when *model_size* differs from the one in memory."""
        if model_size != self.current_model_size:
            progress(0, desc="Initializing Model..")
            self.current_model_size = model_size
            self.model = whisper.load_model(name=model_size, download_root="models")

    @staticmethod
    def _normalize_lang(lang):
        """Map the UI's "Automatic Detection" choice to Whisper's ``None``."""
        return None if lang == "Automatic Detection" else lang

    def _run(self, audio, lang, istranslate, progress):
        """Run Whisper on *audio* and return its result dict.

        Translates to English only when *istranslate* is set AND the loaded
        checkpoint is one of the large models (original behavior).
        """
        def progress_callback(progress_value):
            progress(progress_value, desc="Transcribing..")

        if istranslate and self.current_model_size in self._TRANSLATABLE_MODELS:
            task = "translate"
        else:
            task = "transcribe"  # whisper's default task, now explicit
        return self.model.transcribe(audio=audio, language=lang, verbose=False,
                                     task=task, progress_callback=progress_callback)

    @staticmethod
    def _format_subtitle(segments, subformat):
        """Render *segments* in the requested subtitle format.

        Raises:
            ValueError: for an unrecognized *subformat*.  (The original code
                left a local unbound and crashed later with ``NameError``.)
        """
        if subformat == "SRT":
            return get_srt(segments)
        if subformat == "WebVTT":
            return get_vtt(segments)
        raise ValueError(f"Unsupported subtitle format: {subformat}")

    def transcribe_file(self, fileobjs,
                        model_size, lang, subformat, istranslate,
                        progress=gr.Progress()):
        """Transcribe one or more uploaded files and return their subtitles.

        Args:
            fileobjs: Gradio file objects (``.name`` temp path, ``.orig_name``).
            model_size: Whisper checkpoint name to load/use.
            lang: source language, or "Automatic Detection" for auto-detect.
            subformat: "SRT" or "WebVTT".
            istranslate: translate to English when the model supports it.
            progress: Gradio progress reporter.

        Returns:
            One string containing every file's subtitles, each preceded by a
            dashed header with the (sanitized) file name.
        """
        self._prepare_model(model_size, progress)
        lang = self._normalize_lang(lang)
        progress(0, desc="Loading Audio..")

        files_info = {}
        for fileobj in fileobjs:
            print(f"\n\n {fileobj.name} \n\n")
            audio = whisper.load_audio(fileobj.name)
            result = self._run(audio, lang, istranslate, progress)
            progress(1, desc="Completed!")

            file_name, _file_ext = os.path.splitext(os.path.basename(fileobj.orig_name))
            # Gradio appends a random suffix to uploaded temp-file names;
            # dropping the last 9 chars restores the original stem
            # (assumes the suffix is always 9 chars — TODO confirm).
            file_name = safe_filename(file_name[:-9])
            files_info[file_name] = self._format_subtitle(result["segments"], subformat)

        # str.join instead of the original quadratic `+=` accumulation.
        total_result = ''.join(
            f'------------------------------------\n{name}\n\n{subtitle}'
            for name, subtitle in files_info.items()
        )
        return f"\n\n{total_result}"

    def transcribe_youtube(self, youtubelink,
                           model_size, lang, subformat, istranslate,
                           progress=gr.Progress()):
        """Download a YouTube video's audio, transcribe it, return subtitles.

        Args/behavior mirror :meth:`transcribe_file`, with *youtubelink* as
        the single input URL.
        """
        self._prepare_model(model_size, progress)
        lang = self._normalize_lang(lang)

        progress(0, desc="Loading Audio from Youtube..")
        yt = get_ytdata(youtubelink)
        audio = whisper.load_audio(get_ytaudio(yt))

        result = self._run(audio, lang, istranslate, progress)
        progress(1, desc="Completed!")

        # Kept for parity with the original; the sanitized title is not
        # used in the returned string.
        file_name = safe_filename(yt.title)

        subtitle = self._format_subtitle(result["segments"], subformat)
        return f"\n\n{subtitle}"

    def transcribe_mic(self, micaudio,
                       model_size, lang, subformat, istranslate,
                       progress=gr.Progress()):
        """Transcribe a microphone recording and return its subtitles.

        Args/behavior mirror :meth:`transcribe_file`, with *micaudio* passed
        straight to Whisper (path or array — whatever Gradio supplies).
        """
        self._prepare_model(model_size, progress)
        lang = self._normalize_lang(lang)
        progress(0, desc="Loading Audio..")

        result = self._run(micaudio, lang, istranslate, progress)
        progress(1, desc="Completed!")

        subtitle = self._format_subtitle(result["segments"], subformat)
        return f"\n\n{subtitle}"