Spaces:
Sleeping
Sleeping
import whisper | |
from tempfile import NamedTemporaryFile | |
class Transcription:
    """Detect the language of one audio source and transcribe it with Whisper.

    Attributes:
        source: The audio source given to the constructor. It is passed to
            ``whisper.load_audio``, so it is presumably a path to an audio
            file -- confirm against the caller.
        raw_output: Result dict returned by ``model.transcribe``, extended
            with ``name`` and ``language`` keys. Set by :meth:`transcribe`.
        segments: The ``'segments'`` list of ``raw_output`` with the
            per-segment ``'tokens'`` entries removed. Set by
            :meth:`transcribe`.
        output: Same object as ``raw_output``. Set by :meth:`transcribe`.
    """

    def __init__(self, source):
        """Remember the audio source; no audio is read until transcribe()."""
        self.source = source

    def transcribe(self, model):
        """Transcribe ``self.source`` and store the result on the instance.

        The language is detected first and then passed explicitly to
        ``model.transcribe``. Results are stored in ``self.raw_output``,
        ``self.segments`` and ``self.output``; nothing is returned.

        Args:
            model: A loaded Whisper model. This method uses its
                ``dims.n_mels``, ``device``, ``detect_language`` and
                ``transcribe`` members.
        """
        # Decode the file once and reuse the waveform for both language
        # detection and transcription (model.transcribe accepts a decoded
        # waveform as well as a path).
        audio = whisper.load_audio(self.source)
        language = self._detect_language(model, audio)
        raw_output = model.transcribe(
            audio,
            language=language,
            verbose=True,
            word_timestamps=True,
        )
        self._store_result(raw_output, language)

    @staticmethod
    def _detect_language(model, audio):
        """Return the most probable language code for ``audio``."""
        # Language detection works on a fixed-length window, so only a
        # padded/trimmed copy is used here; the full waveform is untouched.
        clip = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(
            clip, n_mels=model.dims.n_mels
        ).to(model.device)
        _, probs = model.detect_language(mel)
        return max(probs, key=probs.get)

    def _store_result(self, raw_output, language):
        """Strip token ids, tag the result with name/language and store it."""
        segments = raw_output['segments']
        for segment in segments:
            # Tolerate segments that carry no token list instead of failing
            # half-way through and leaving the instance partially populated.
            segment.pop('tokens', None)
        raw_output.update(
            # Store the whole source identifier; indexing it with [0] would
            # keep only the first character of a path string.
            name=self.source,
            language=language,
        )
        self.raw_output = raw_output
        self.segments = segments
        self.output = raw_output