File size: 4,989 Bytes
e711356
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
from pydub import AudioSegment
import whisper
from settings import MODEL_PARSER
from pytube import YouTube

class BagOfModels:
    """Catalogue of the models configured in ``MODEL_PARSER``.

    Constructor arguments:
       model            ->  catalogue indexed by model name; each entry holds the
                            keyword arguments (``task``, ``url``, ...) that
                            ``load_model`` unpacks into a ``Model``
       model_names      ->  model names that can be chosen from in streamlit
       model_settings   ->  settings of model that can be customized by user
       model_tasks      ->  tasks of the models (presumably the ones offered to
                            the user; confirm against ``settings``)
       kwargs           ->  any further configuration entries, stored untouched
    """
    args = MODEL_PARSER
    barfs = 5

    def __init__(self, model, model_names, model_settings, model_tasks, **kwargs):
        self.model = model
        self.model_names = model_names
        self.model_settings = model_settings
        self.model_tasks = model_tasks
        self.kwargs = kwargs

    @classmethod
    def _from_args(cls):
        """Build a ``BagOfModels`` from the attributes of ``cls.args``."""
        return BagOfModels(**vars(cls.args))

    @classmethod
    def get_model_settings(cls):
        """Return the configured model settings."""
        return cls._from_args().model_settings

    @classmethod
    def get_model_names(cls):
        """Return the configured model names."""
        return cls._from_args().model_names

    @classmethod
    def get_model(cls):
        """Return the configured model catalogue."""
        return cls._from_args().model

    @classmethod
    def get_model_tasks(cls):
        """Return the configured model tasks."""
        return cls._from_args().model_tasks

    @classmethod
    def load_model(cls, model_name, **kwargs):
        """Create the ``Model`` registered under ``model_name``.

        As a side effect the model catalogue is stored on the class as
        ``cls.model``. Extra keyword arguments are accepted but not used.

        Raises:
            AssertionError: if ``model_name`` is not one of the configured
                model names.
        """
        bag_of_models = cls._from_args()
        cls.model = bag_of_models.model
        # Raised explicitly instead of via ``assert`` so the check still runs
        # under ``python -O``; the exception type is kept for existing callers.
        if model_name not in bag_of_models.model_names:
            raise AssertionError(
                f"please pick one of the available models: {bag_of_models.model_names}"
            )
        return Model(model_name, **cls.model[model_name])
     
    
class Model:
    """One catalogue entry: a named model with a task and a Hugging Face URL."""

    def __init__(self, model_name, task, url, **kwargs):
        """Store the model description.

        ``name`` is the part of ``url`` that follows the
        ``https://huggingface.co/`` prefix. Remaining keyword arguments are
        kept in ``kwargs`` and forwarded to ``init_optional_args``, which only
        accepts ``year`` and ``description``.
        """
        hub_prefix = "https://huggingface.co/"
        self.model_name, self.task, self.url = model_name, task, url
        self.name = url.split(hub_prefix)[1]
        self.kwargs = kwargs
        self.init_optional_args(**kwargs)

    def init_optional_args(self, year=None, description=None):
        """Record the optional metadata of the model."""
        self._year, self._description = year, description

    def predict_stt(self, source, source_type, model_task):
        """Run speech-to-text on ``source`` and return the ``SoundToText`` object.

        The text after the first underscore of ``model_name`` is handed to
        ``whisper.load_model`` (tiny - base - medium).
        """
        whisper_size = self.model_name.split("_")[1]
        whisper_model = whisper.load_model(whisper_size)
        transcriber = SoundToText(
            source,
            source_type,
            model_task,
            model=whisper_model,
            tokenizer=None,
        )
        transcriber.whisper()
        return transcriber

    def predict_summary(self):
        """Load the Wav2Vec2 processor and model for ``self.name``.

        Both objects are discarded and the method returns ``None``.
        """
        Wav2Vec2Processor.from_pretrained(self.name)
        Wav2Vec2ForCTC.from_pretrained(self.name)  # Note: PyTorch Model

class Transcription:
    """Placeholder for a transcription; nothing is implemented yet."""

    def __init__(self, model, source, source_type) -> None:
        """Accept the model, source and source type without storing them."""

class SoundToText():
    """Transcribe an audio source (YouTube URL or uploaded file) to text.

    Args:
        source: a YouTube URL when ``source_type`` is ``"YouTube"``; a
            file-like object with a ``name`` attribute when it is ``"File"``.
        source_type: ``"YouTube"`` or ``"File"``.
        model_task: stored on the instance; not used by ``whisper()``.
        model: object with a ``transcribe`` method (a loaded Whisper model).
            When ``None``, ``whisper()`` loads the ``"base"`` model itself.
        tokenizer: stored on the instance; not used by ``whisper()``.
    """

    def __init__(self, source, source_type, model_task, model, tokenizer=None):
        self.source = source
        self.source_type = source_type
        self.model = model
        self.model_task = model_task
        self.tokenizer = tokenizer

    def wav2vec(self, size):
        """Not implemented yet."""
        pass

    def wav2vec2(self, size):
        """Not implemented yet."""
        pass

    def whisper(self):
        """Fetch the audio, transcribe it and store the result on the instance.

        Sets ``audio_path``, ``raw_output``, ``text``, ``language``,
        ``segments`` (without their token ids) and ``transcribed``.

        Raises:
            ValueError: if ``source_type`` is unknown, the uploaded file is
                neither ``.wav`` nor ``.mp3``, or the YouTube video has no
                stream with itag 140.
        """
        # Function-scope import keeps this block independent of the file header.
        import os

        if self.source_type == "YouTube":
            # NOTE(review): itag 140 is commonly YouTube's audio-only m4a stream; confirm.
            stream = YouTube(self.source).streams.get_by_itag(140)
            if stream is None:
                raise ValueError(f"no stream with itag 140 available for {self.source!r}")
            self.audio_path = stream.download("output/", filename="audio")
        elif self.source_type == "File":
            file_name = self.source.name.lower()
            if file_name.endswith(".wav"):
                audio = AudioSegment.from_wav(self.source)
            elif file_name.endswith(".mp3"):
                audio = AudioSegment.from_mp3(self.source)
            else:
                raise ValueError(
                    f"unsupported audio file {self.source.name!r}: expected .wav or .mp3"
                )
            # The export fails if the target directory does not exist yet.
            os.makedirs("output", exist_ok=True)
            audio.export("output/audio.wav", format="wav")
            self.audio_path = "output/audio.wav"
        else:
            raise ValueError(
                f"unknown source_type {self.source_type!r}: expected 'YouTube' or 'File'"
            )

        # Use the model handed to the constructor; reloading "base" here would
        # silently discard the model size chosen by the caller.
        model = self.model if self.model is not None else whisper.load_model("base")
        self.raw_output = model.transcribe(self.audio_path, verbose=True)

        self.text = self.raw_output["text"]
        self.language = self.raw_output["language"]
        self.segments = self.raw_output["segments"]

        # Remove token ids from the output
        for segment in self.segments:
            segment.pop("tokens", None)

        self.transcribed = True
                
class TextToSummary:
    """Summarize a text with the ``facebook/bart-large-cnn`` summarization pipeline."""

    def __init__(self, input_text, min_length, max_length):
        """Build the pipeline and immediately summarize ``input_text``.

        ``min_length`` and ``max_length`` are passed through to the
        summarizer; sampling is switched off.
        """
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.summary_input = input_text
        length_bounds = {"min_length": min_length, "max_length": max_length}
        self.summary_output = self.summarizer(
            self.summary_input,
            do_sample=False,
            **length_bounds,
        )

    def get_summary(self):
        """Return the summarizer output produced in the constructor."""
        return self.summary_output

    def wav2vec(self):
        """Not implemented yet."""
        return None

def record(model_name, source=None, source_type=None, model_task=None):
    """Load ``model_name`` and run speech-to-text on ``source``.

    The previous body called ``predict()``, which ``Model`` does not define,
    so every call ended in ``AttributeError``. The function now forwards to
    ``Model.predict_stt``; the three new parameters are optional so existing
    call sites keep their signature.

    Args:
        model_name: name of a configured model, e.g. ``"whisper_base"``.
        source: audio source handed to ``predict_stt``.
        source_type: kind of ``source`` handed to ``predict_stt``.
        model_task: task handed to ``predict_stt``.

    Returns:
        The object returned by ``Model.predict_stt``.

    Raises:
        AssertionError: if ``model_name`` is not a configured model name.
        ValueError: if ``source`` or ``source_type`` is missing.
    """
    # Validate the model name first, as before.
    model = BagOfModels.load_model(model_name)
    if source is None or source_type is None:
        raise ValueError("record() needs both a source and a source_type to transcribe")
    return model.predict_stt(source, source_type, model_task)

if __name__ == "__main__":
    # Manual smoke test: transcribe a YouTube video with the "whisper_base" model.
    # predict_stt() requires source, source_type and model_task; the previous
    # call passed none of them and therefore always raised TypeError.
    import sys

    if len(sys.argv) != 2:
        sys.exit(f"usage: python {sys.argv[0]} <youtube-url>")
    whisper_base = BagOfModels.load_model("whisper_base")
    # model_task is stored by SoundToText but not read by its whisper() method,
    # so None is sufficient here.
    transcription = whisper_base.predict_stt(sys.argv[1], "YouTube", None)
    print(transcription.text)