hubsnippetai committed
Commit 2036c34
1 Parent(s): 2238241

Update app.py

Files changed (1):
  1. app.py +32 -8
app.py CHANGED
@@ -1,6 +1,6 @@
 import torch
 # from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from transformers import pipeline
+from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
 import gradio as gr
 import datetime
 
@@ -29,10 +29,24 @@ pipe = pipeline(
 """
 # call a text generation model to display the audio content after identifying the word(s) in the text output
 
-#import torch
-#from transformers import pipeline
-#from datasets import load_dataset
+# import torch
+# from transformers import pipeline
+# from datasets import load_dataset
 
+
+# from transformers import WhisperProcessor, WhisperForConditionalGeneration
+# from datasets import load_dataset
+
+# load model and processor
+processor = WhisperProcessor.from_pretrained("microsoft/whisper-base-webnn")
+model = WhisperForConditionalGeneration.from_pretrained("microsoft/whisper-base-webnn")
+model.config.forced_decoder_ids = None
+
+# load dummy dataset and read audio files
+# ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+# sample = ds[0]["audio"]
+
+"""
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 pipe = pipeline(
@@ -42,7 +56,7 @@ pipe = pipeline(
     chunk_length_s=30,
     device=device,
 )
-
+"""
 # ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 # sample = ds[0]["audio"]
 
@@ -52,9 +66,19 @@ pipe = pipeline(
 #prediction = pipe(sample.copy(), batch_size=8, return_timestamps=True)["chunks"]
 
 
-def audio2text(audio_file, prompt : str | list):
-    prediction = pipe(audio_file, batch_size=8, return_timestamps=True)["chunks"]
+def audio2text(audio_file, prompt : list):
+
+    input_features = processor(audio_file, sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+
+    # generate token ids
+    predicted_ids = model.generate(input_features)
+    # decode token ids to text
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
+
+    # transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+
+    # prediction = pipe(audio_file, batch_size=8, return_timestamps=True)["chunks"]
     #prediction=pipe(audio_file)
-    return prediction['text']
+    return transcription['text']
 
 gr.Interface(fn=audio2text, inputs=[gr.Audio(label='upload your audio file', sources='upload', type='filepath'), gr.Textbox(label="provide word(s) to search for")], outputs=[gr.Textbox(label="transcription")]).launch()
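Note: as committed, audio2text will fail at runtime. It references sample["sampling_rate"] even though the line that defines sample is commented out, it hands the Gradio file path straight to the processor (which expects a waveform array plus a sampling rate), and it indexes the list returned by batch_decode with ['text']. Below is a minimal sketch of what the function appears to intend, assuming librosa is available to load and resample the uploaded file, and substituting the standard openai/whisper-base checkpoint for the ONNX-oriented microsoft/whisper-base-webnn one used in the commit.

import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# assumption: a standard PyTorch checkpoint; the commit loads "microsoft/whisper-base-webnn" instead
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
model.config.forced_decoder_ids = None

def audio2text(audio_file, prompt=""):
    # gr.Audio(type='filepath') passes a path; Whisper expects 16 kHz mono audio
    speech, sampling_rate = librosa.load(audio_file, sr=16000)
    input_features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt").input_features
    # generate token ids, then decode them back to text
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    # batch_decode returns one string per input; return the first
    return transcription[0]

With this version the existing gr.Interface call works unchanged; the prompt textbox is still accepted but unused, exactly as in the committed code.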