Spaces:

reach-vb
/

asr-pyctcdecode

Runtime error

App Files Files

xet

Community

Vaibhav Srivastav committed on Jan 17, 2022

Commit

b8af00e

1 Parent(s): 851eb15

adding decoding w lm

Browse files

Files changed (2) hide show

4gram_small.arpa.gz +3 -0
app.py +24 -2

4gram_small.arpa.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4c4fe64751abecdeb7040fe6ed7f2440c2d3f36ed35c43e3510f7cf95578f2a
+size 18358716

app.py CHANGED Viewed

@@ -42,6 +42,28 @@ def predict_and_ctc_decode(input_file, model_name):
   return transcribed_text
 def predict_and_greedy_decode(input_file, model_name):
   processor, model = return_processor_and_model(model_name)
   speech = load_and_fix_data(input_file)
@@ -57,12 +79,12 @@ def predict_and_greedy_decode(input_file, model_name):
   return transcribed_text
 def return_all_predictions(input_file, model_name):
-  return predict_and_ctc_decode(input_file, model_name), predict_and_greedy_decode(input_file, model_name)
 gr.Interface(return_all_predictions,
              inputs = [gr.inputs.Audio(source="microphone", type="filepath", label="Record/ Drop audio"), gr.inputs.Dropdown(["facebook/wav2vec2-base-960h", "facebook/hubert-large-ls960-ft"], label="Model Name")],
-             outputs = [gr.outputs.Textbox(label="Beam CTC decoding"), gr.outputs.Textbox(label="Greedy decoding")],
              title="ASR using Wav2Vec2/ Hubert & pyctcdecode",
              description = "Comparing greedy decoder with beam search CTC decoder, record/ drop your audio!",
              layout = "horizontal",

   return transcribed_text
+def predict_and_ctc_lm_decode(input_file, model_name):
+  processor, model = return_processor_and_model(model_name)
+  speech = load_and_fix_data(input_file)
+  input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
+  logits = model(input_values).logits.cpu().detach().numpy()[0]
+  vocab_list = list(processor.tokenizer.get_vocab().keys())
+  vocab_dict = processor.tokenizer.get_vocab()
+  sorted_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
+  decoder = build_ctcdecoder(
+    list(sorted_dict.keys()),
+    "4gram_small.arpa.gz",
+    )
+  pred = decoder.decode(logits)
+  transcribed_text = fix_transcription_casing(pred.lower())
+  return transcribed_text
 def predict_and_greedy_decode(input_file, model_name):
   processor, model = return_processor_and_model(model_name)
   speech = load_and_fix_data(input_file)
   return transcribed_text
 def return_all_predictions(input_file, model_name):
+  return predict_and_ctc_decode(input_file, model_name), predict_and_ctc_lm_decode(input_file, model_name), predict_and_greedy_decode(input_file, model_name)
 gr.Interface(return_all_predictions,
              inputs = [gr.inputs.Audio(source="microphone", type="filepath", label="Record/ Drop audio"), gr.inputs.Dropdown(["facebook/wav2vec2-base-960h", "facebook/hubert-large-ls960-ft"], label="Model Name")],
+             outputs = [gr.outputs.Textbox(label="Beam CTC decoding"), gr.outputs.Textbox(label="Beam CTC decoding w/ LM"), gr.outputs.Textbox(label="Greedy decoding")],
              title="ASR using Wav2Vec2/ Hubert & pyctcdecode",
              description = "Comparing greedy decoder with beam search CTC decoder, record/ drop your audio!",
              layout = "horizontal",