RikeshSilwal committed on
Commit 3bc51c8
1 Parent(s): 71b5a2b

Update app.py

Files changed (1)
  1. app.py +35 -34
app.py CHANGED
@@ -1,9 +1,9 @@
 import gradio as gr
-
+import librosa
 import torch
 import torchaudio
 from datasets import load_dataset
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import pandas as pd
 from sklearn.model_selection import train_test_split
 
@@ -13,41 +13,17 @@ from pydub import AudioSegment
 
 
 
-# processor = Wav2Vec2Processor.from_pretrained("RikeshSilwal/wav2vec2-nepali")
-# model = Wav2Vec2ForCTC.from_pretrained("RikeshSilwal/wav2vec2-nepali")
-
-processor = Wav2Vec2Processor.from_pretrained("RikeshSilwal/wav2vec2-nepali-rikeshsilwal")
-model = Wav2Vec2ForCTC.from_pretrained("RikeshSilwal/wav2vec2-nepali-rikeshsilwal")
-
 from torchaudio.transforms import Resample
 import numpy as np
 
 
 
-
-
-# def transcribe_audio(audio_file):
-#     input_arr, sampling_rate = torchaudio.load(audio_file)
-#     resampler = Resample(orig_freq=sampling_rate, new_freq=16000)
-#     input_arr = resampler(input_arr).squeeze().numpy()
-#     sampling_rate = 16000
-#     inputs = processor(input_arr, sampling_rate=16_000, return_tensors="pt", padding=True)
-
-#     with torch.no_grad():
-#         logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
-
-#     predicted_ids = torch.argmax(logits, dim=-1)
-
-#     predicted_words = processor.batch_decode(predicted_ids)
-
-#     return predicted_words[0]
-
 def transcribe_audio(audio_file):
     audio = AudioSegment.from_wav(audio_file)
 
     device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 
-    input_arr, sampling_rate = torchaudio.load(audio_file)
+    input_arr, sampling_rate = librosa.load(audio_file)
     # Create TorchGating instance
     tg = TG(sr=sampling_rate, nonstationary=True).to(device)
     try:
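Note: the TG class used for the noise gate is not imported in any hunk shown here; its constructor and usage match the TorchGate module from the noisereduce package, so the gating step presumably looks like this minimal sketch (the import and the body of the try: block are assumptions, not part of this diff):

    # Noise gating with noisereduce's TorchGate -- assumed import,
    # not visible in the commit
    import torch
    from noisereduce.torchgate import TorchGate as TG

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    tg = TG(sr=16000, nonstationary=True).to(device)  # non-stationary spectral gate
    noisy = torch.rand(1, 16000, device=device)       # one second of dummy audio at 16 kHz
    denoised = tg(noisy)                              # same shape as the input

One caveat: librosa.load returns a NumPy array while TorchGate operates on torch tensors, so the hidden try: body would need a tensor conversion for the gate to run at all; otherwise the bare except: silently skips denoising.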
@@ -55,13 +31,39 @@ def transcribe_audio(audio_file):
     except:
         input_arr = input_arr
     if sampling_rate != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
-        input_arr = resampler(input_arr).squeeze().numpy()
+        input_arr = librosa.resample(input_arr, orig_sr=sampling_rate, target_sr=16000)
+
+
+    MODEL_NAME = "rikeshsilwalekg/whisper-small-wer35-ekg"
+
+    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
+    )
+    model.to(device)
 
-    recognizer = pipeline("automatic-speech-recognition", model="Harveenchadha/vakyansh-wav2vec2-nepali-nem-130")
-    prediction = recognizer(input_arr, chunk_length_s=5, stride_length_s=(2,1))
-    prediction = recognizer(input_arr)
-    prediction = re.sub('[<s>]' , '' , str(prediction['text']))
+    processor = AutoProcessor.from_pretrained(MODEL_NAME)
+
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        max_new_tokens=128,
+        chunk_length_s=30,
+        batch_size=16,
+        return_timestamps=False,
+        torch_dtype=torch_dtype,
+        device=device,
+    )
+
+    # return_timestamps=True for sentence-level timestamps,
+    # return_timestamps="word" for word-level timestamps
+    prediction = pipe(input_arr)
+
+
+    prediction = prediction['text']
 
 audio_input = gr.inputs.Audio(source="upload", type="filepath")
 
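Note: in the hunk above the Whisper checkpoint and pipeline are rebuilt on every call to transcribe_audio, and no return statement appears in the changed lines. A sketch of the same flow with the model hoisted to module scope (names mirror the diff; the explicit return is an assumption about the unshown remainder of the function):

    import librosa
    import torch
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

    MODEL_NAME = "rikeshsilwalekg/whisper-small-wer35-ekg"
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Load once at startup instead of on every request
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
    ).to(device)
    processor = AutoProcessor.from_pretrained(MODEL_NAME)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )

    def transcribe_audio(audio_file):
        # sr=16000 makes librosa resample on load; without it the default
        # is 22050 Hz, so the explicit resample branch in the diff always runs
        input_arr, _ = librosa.load(audio_file, sr=16000)
        return pipe(input_arr)["text"]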
@@ -71,4 +73,3 @@ iface = gr.Interface(fn=transcribe_audio, inputs=audio_input,
                      button")
 
 iface.launch(inline=False)
-
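Note: the interface wiring left untouched by this commit uses the Gradio 3.x gr.inputs namespace, which was removed in Gradio 4. If the Space is upgraded, a rough equivalent would be (the outputs spec is not visible in the diff, so "text" here is an assumption):

    import gradio as gr

    iface = gr.Interface(
        fn=transcribe_audio,                                   # defined in app.py
        inputs=gr.Audio(sources=["upload"], type="filepath"),  # source= became sources=[...]
        outputs="text",                                        # assumed; not shown in the diff
    )
    iface.launch(inline=False)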
 