barto17 committed on
Commit 0c25646
1 Parent(s): b2dca93

Update app.py

Files changed (1)
  1. app.py +37 -8
app.py CHANGED
@@ -59,16 +59,34 @@ LANGUANGE_MAP = {
}


- processor = WhisperProcessor.from_pretrained(model_id)
- model = WhisperForConditionalGeneration.from_pretrained(model_id)
- model.eval()
- model.to(device)


- bos_token_id = processor.tokenizer.all_special_ids[-106]
- decoder_input_ids = torch.tensor([bos_token_id]).to(device)
+ from pytube import YouTube
+ import whisper
+
+ # define function for transcription
+ def transcribe(Microphone, File_Upload):
+     warn_output = ""
+     if (Microphone is not None) and (File_Upload is not None):
+         warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \
+                       "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+         file = Microphone
+
+     elif (Microphone is None) and (File_Upload is None):
+         return "ERROR: You have to either use the microphone or upload an audio file"
+
+     elif Microphone is not None:
+         file = Microphone
+     else:
+         file = File_Upload
+
+     language = None
+
+     options = whisper.DecodingOptions(without_timestamps=True)
+
+     loaded_model = whisper.load_model("base")
+     transcript = loaded_model.transcribe(file, language=language)
+
+     return detect_language(transcript["text"])

def detect_language(sentence):
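For reference, the new `transcribe()` builds on the high-level openai-whisper API. A minimal sketch of that call pattern (the audio path is a placeholder; note that the hunk above constructs `whisper.DecodingOptions(without_timestamps=True)` but never passes it to `transcribe()`, which instead accepts such flags as keyword arguments):

```python
# Minimal sketch of the openai-whisper API used in the new transcribe();
# "audio.mp3" is a placeholder path, not a file from this repo.
import whisper

model = whisper.load_model("base")  # downloads and caches the base checkpoint

# language=None lets Whisper auto-detect the spoken language; transcribe()
# forwards extra keyword arguments like without_timestamps to DecodingOptions,
# which is how the unused `options` object above would actually take effect.
result = model.transcribe("audio.mp3", language=None, without_timestamps=True)
print(result["text"])  # the plain transcription string
```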
@@ -80,7 +98,18 @@ def detect_language(sentence):
    predictions = torch.nn.functional.softmax(output.logits, dim=-1)
    probability, pred_idx = torch.max(predictions, dim=-1)
    language = LANGUANGE_MAP[pred_idx.item()]
-     return language, probability.item()
+     return sentence, language, probability.item()
+
+
+ """
+ processor = WhisperProcessor.from_pretrained(model_id)
+ model = WhisperForConditionalGeneration.from_pretrained(model_id)
+ model.eval()
+ model.to(device)
+
+
+ bos_token_id = processor.tokenizer.all_special_ids[-106]
+ decoder_input_ids = torch.tensor([bos_token_id]).to(device)


def process_audio_file(file, sampling_rate):
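`detect_language()` itself is a standard softmax-over-logits text classifier; its tokenizer and `model_id` are defined outside the lines shown in this diff. A hedged sketch of the pattern with a placeholder checkpoint:

```python
# Sketch of the classification pattern in detect_language(); the checkpoint
# is an assumption -- the app's real model_id is not visible in this diff.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = "papluca/xlm-roberta-base-language-detection"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id).eval()

def detect_language(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    with torch.no_grad():
        output = model(**inputs)
    # Softmax over the class logits, then the argmax class and its
    # probability, mirroring the hunk above.
    predictions = torch.nn.functional.softmax(output.logits, dim=-1)
    probability, pred_idx = torch.max(predictions, dim=-1)
    language = model.config.id2label[pred_idx.item()]  # stand-in for LANGUANGE_MAP
    return sentence, language, probability.item()
```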
@@ -123,7 +152,7 @@ def transcribe(Microphone, File_Upload):
    language, probability = detect_language(transcription)

    return transcription.capitalize(), language, probability
-
+ """

examples=['sample1.mp3', 'sample2.mp3', 'sample3.mp3']
examples = [[f"./{f}"] for f in examples]