sotirios-slv committed
Commit e0a729c · 1 Parent(s): 252f6f4

Removed some kwargs to simplify the implementation

Files changed (1)
  1. app.py +9 -19
app.py CHANGED
@@ -5,13 +5,10 @@ import gradio as gr
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 
-# from datasets import load_dataset
-
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
-# model_id = "openai/whisper-small"
 model_id = "openai/whisper-large-v3"
 
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
@@ -22,27 +19,20 @@ model.to(device)
 processor = AutoProcessor.from_pretrained(model_id)
 
 pipe = pipeline(
-    "automatic-speech-recognition",
+    task="automatic-speech-recognition",
     model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    max_new_tokens=128,
+    # tokenizer=processor.tokenizer,
+    # feature_extractor=processor.feature_extractor,
+    # max_new_tokens=128,
     chunk_length_s=30,
-    batch_size=16,
-    return_timestamps=True,
-    torch_dtype=torch_dtype,
+    batch_size=8,
+    # return_timestamps=True,
+    # torch_dtype=torch_dtype,
     device=device,
 )
 
-# dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
-# sample = dataset[0]["audio"]
-
-# result = pipe(sample)
-# print(result["text"])
-
 
-def reverse_audio(audio):
-    # sr, data = audio
+def transcribe_audio(audio):
     result = pipe(audio)
     logging.info(f'TRANSCRIPTION {result["text"]}')
     return result
@@ -57,7 +47,7 @@ input_audio = gr.Audio(
         show_controls=False,
     ),
 )
-demo = gr.Interface(fn=reverse_audio, inputs=input_audio, outputs="text")
+demo = gr.Interface(fn=transcribe_audio, inputs=input_audio, outputs="text")
 
 if __name__ == "__main__":
     demo.launch()
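
For reference, a minimal standalone sketch in the direction this commit simplifies toward: passing the model id as a string lets pipeline() resolve the model, tokenizer, and feature extractor itself, so none of the removed kwargs need to be supplied explicitly. The file name sample.wav is a hypothetical placeholder, not part of this repo.

import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# With a string model id, pipeline() loads the model, tokenizer, and
# feature extractor on its own, matching the kwargs dropped here.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
    chunk_length_s=30,  # split long inputs into 30 s chunks
    batch_size=8,       # halved from 16 in this commit
    device=device,
)

# "sample.wav" is a hypothetical local audio file, for illustration only.
result = pipe("sample.wav")
print(result["text"])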