# conspectum/transcriber.py
# Audio transcription helper built on OpenAI Whisper.
import whisper
from tempfile import NamedTemporaryFile
class Transcription:
    """Transcribe an audio source with a pre-loaded Whisper model.

    After :meth:`transcribe` runs, results are available on
    ``self.raw_output`` / ``self.output`` (the Whisper result dict,
    augmented with ``name`` and ``language`` and with per-segment token
    ids stripped) and ``self.segments`` (the segment list).
    """

    def __init__(self, source):
        # Path to the audio file to transcribe (passed straight to
        # whisper.load_audio / model.transcribe).
        self.source = source

    def transcribe(self, model):
        """Detect the spoken language, then transcribe ``self.source``.

        Parameters
        ----------
        model : whisper.Whisper
            A model already loaded via ``whisper.load_model(...)``.

        Returns
        -------
        dict
            The Whisper result dict, also stored on ``self.output``.
        """
        # --- Language detection on a padded/trimmed 30 s window ------
        audio = whisper.load_audio(self.source)
        audio = whisper.pad_or_trim(audio)
        # Use the model's own mel-bin count (80 for most checkpoints,
        # 128 for large-v3) rather than hard-coding it.
        mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
        _, probs = model.detect_language(mel)
        language = max(probs, key=probs.get)

        # --- Full transcription --------------------------------------
        self.raw_output = model.transcribe(
            self.source,
            language=language,
            verbose=True,
            word_timestamps=True,
        )

        # Drop raw token ids: they bloat the result and are not used
        # downstream.
        self.segments = self.raw_output['segments']
        for segment in self.segments:
            del segment['tokens']

        self.raw_output.update(
            # BUG FIX: the original recorded self.source[0], which for a
            # path string is just its first character; store the full
            # source identifier instead.
            name=self.source,
            language=language,
        )
        self.output = self.raw_output
        return self.output