mikr committed on
Commit
1808ded
1 Parent(s): 7df6e8c

working demo

Browse files
Files changed (2) hide show
  1. app.py +23 -37
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,64 +1,50 @@
1
  import gradio as gr
2
  import soundfile as sf
3
  import torch
4
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
 
 
5
 
6
  MODEL_NAME = "mikr/w2v-bert-2.0-czech-colab-cv16"
7
- lang = "cs"
8
 
9
  device = 0 if torch.cuda.is_available() else "cpu"
10
 
11
- model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME).to(device)
12
- processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
13
 
14
- pipe = pipeline(
15
- model=MODEL_NAME,
16
- )
17
 
18
- def transcribe(file_upload):
19
- warn_output = ""
20
- if (file_upload is None):
21
- return "ERROR: You have to either use the microphone or upload an audio file"
22
 
23
- file = file_upload
24
- text = pipe(file)["text"]
25
- return warn_output + text
26
 
 
 
27
 
28
- def readwav(a_f):
29
- wav, sr = sf.read(a_f, dtype=np.float32)
30
- if len(wav.shape) == 2:
31
- wav = wav.mean(1)
32
- if sr != 16000:
33
- wlen = int(wav.shape[0] / sr * 16000)
34
- wav = signal.resample(wav, wlen)
35
- return wav
36
 
37
- def transcribe2(file_upload):
38
- wav = readwav(file_upload)
39
- with torch.inference_mode():
40
- input_values = processor(wav, sampling_rate=16000).input_values[0]
41
- input_values = torch.tensor(input_values, device=device).unsqueeze(0)
42
- logits = model(input_values).logits
43
- pred_ids = torch.argmax(logits, dim=-1)
44
- xcp = processor.batch_decode(pred_ids)
45
- return xcp[0]
46
 
47
 
48
  iface = gr.Interface(
49
- fn=transcribe2,
50
  inputs=[
51
- gr.File(type="binary", label="Upload Audio File"), # Audio file upload
52
  ],
53
  outputs="text",
54
  theme="huggingface",
55
- title="Wav2Vec2-Bert demo - transcribe Czech Audio",
56
  description=(
57
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
58
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) from Whisper Fine Tuning Sprint Event 2022 "
59
  "and 🤗 Transformers to transcribe audio files of arbitrary length."
60
  ),
61
  allow_flagging="never",
62
  )
63
 
64
- iface.launch()
 
1
  import gradio as gr
2
  import soundfile as sf
3
  import torch
4
+ import numpy as np
5
+ import librosa
6
+ from transformers import AutoProcessor, Wav2Vec2BertForCTC
7
 
8
  MODEL_NAME = "mikr/w2v-bert-2.0-czech-colab-cv16"
 
9
 
10
  device = 0 if torch.cuda.is_available() else "cpu"
11
 
12
+ print("device:",device)
 
13
 
14
+ processor = AutoProcessor.from_pretrained(MODEL_NAME)
15
+ model = Wav2Vec2BertForCTC.from_pretrained(MODEL_NAME).to(device)
 
16
 
 
 
 
 
17
 
18
def transcribe(audio_path):
    """Transcribe an uploaded audio file with the fine-tuned CTC model.

    Args:
        audio_path: Path of the audio file to transcribe, or ``None`` when
            the form was submitted without a file.

    Returns:
        The decoded transcription of the audio, or an error message when no
        file was provided.
    """
    # Without this guard a missing upload reaches librosa.load(None) and the
    # user gets an exception instead of a readable message.
    if audio_path is None:
        return "ERROR: You have to upload an audio file"

    # librosa resamples to the requested rate while loading.
    waveform, sample_rate = librosa.load(audio_path, sr=16_000)

    input_features = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_features.to(device)).logits

    # Greedy CTC decoding: take the most likely token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)

    # The batch holds a single item, so return its transcription.
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]
 
 
 
 
 
 
32
 
33
 
34
  iface = gr.Interface(
35
+ fn=transcribe,
36
  inputs=[
37
+ gr.File(type="filepath", label="Upload Audio File"), # Audio file upload
38
  ],
39
  outputs="text",
40
  theme="huggingface",
41
+ title="Czech W2v-BERT 2.0 speech encoder demo - transcribe Czech Audio",
42
  description=(
43
+ "Transcribe audio inputs with the click of a button! Demo uses the fine-tuned"
44
+ f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) from Facebook W2v-BERT 2.0 speech encoder "
45
  "and 🤗 Transformers to transcribe audio files of arbitrary length."
46
  ),
47
  allow_flagging="never",
48
  )
49
 
50
+ iface.launch(server_name="0.0.0.0")
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
  git+https://github.com/huggingface/transformers
2
  torch
3
  soundfile
 
 
 
1
  git+https://github.com/huggingface/transformers
2
  torch
3
  soundfile
4
+ librosa
5
+ ffmpy