File size: 2,195 Bytes
e4e56ea
 
 
 
 
 
 
 
6edd739
e4e56ea
 
 
 
 
 
6edd739
e4e56ea
 
 
 
 
 
 
 
 
 
6edd739
e4e56ea
6edd739
 
 
 
e4e56ea
6edd739
 
 
e4e56ea
6edd739
 
e4e56ea
6edd739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4e56ea
6edd739
 
 
e4e56ea
6edd739
 
 
 
e4e56ea
6edd739
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import whisper
from tempfile import NamedTemporaryFile


class Transcription:
    """Transcribe a single audio source with a caller-supplied Whisper model.

    After ``transcribe`` has run, the instance exposes:

    * ``raw_output`` / ``output`` -- the dict returned by ``model.transcribe``
      with ``name`` and ``language`` entries added (both attributes refer to
      the same dict object).
    * ``segments`` -- the ``'segments'`` list of that dict, with the
      ``'tokens'`` entry removed from every segment.
    """

    def __init__(self, source):
        """Remember the audio source to transcribe.

        Args:
            source: Audio input handed to ``whisper.load_audio``
                (presumably a file path string -- confirm against callers).
        """
        self.source = source

    def transcribe(self, model):
        """Detect the spoken language, then transcribe ``self.source``.

        Args:
            model: Loaded Whisper model. It must provide ``dims.n_mels``,
                ``device``, ``detect_language(mel)`` and ``transcribe(...)``.

        Returns:
            The result dict (also stored as ``self.output``), extended with
            ``name`` (the source) and ``language`` (the detected language).
        """
        # Decode the audio once and reuse the waveform for both language
        # detection and transcription, so the source is not decoded twice.
        audio = whisper.load_audio(self.source)

        # Language detection runs on a padded/trimmed copy only; the full
        # waveform is what gets transcribed below.
        detection_clip = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(
            detection_clip, n_mels=model.dims.n_mels
        ).to(model.device)

        _, probs = model.detect_language(mel)
        language = max(probs, key=probs.get)

        self.raw_output = model.transcribe(
            audio,
            language=language,
            verbose=True,
            word_timestamps=True,
        )

        self.segments = self.raw_output['segments']
        for segment in self.segments:
            # Drop the token ids from each segment; tolerate segments that
            # carry no 'tokens' entry instead of raising KeyError.
            segment.pop('tokens', None)

        # Store the whole source as the name. Indexing with [0] (a leftover
        # from when sources were kept in a list) would keep only the first
        # character of a path string.
        self.raw_output.update(
            name=self.source,
            language=language,
        )

        self.output = self.raw_output
        return self.output