Spaces:

Slava917
/

pronunciation-trainer

Runtime error

File size: 1,859 Bytes

25b92d1
877be96
1d14371
b150df1
 
9719ddd
7432f05
 
 
9719ddd
ed47f0e
 
 
b150df1
ed47f0e
3e9b43d
ed47f0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a8c946
05277a2
82696f4
f259382
82696f4
 
 
 
 
 
 
c556980
ed47f0e
 
edfb611
ed47f0e
b0a4d88

import pandas as pd
import gradio as gr
print(gr.__version__)
import torch
import torchaudio


# Reference table of native pronunciations. `compare` reads its 'replica'
# (the word) and 'transcription' (dot-separated tokens) columns.
df= pd.read_csv('native_words_subset.csv')

# Turn off the TorchScript fusers through private torch._C switches before the
# scripted modules are loaded.
# NOTE(review): presumably a workaround for fuser problems with these
# checkpoints; these are private APIs and may not exist in every torch
# release -- confirm against the pinned torch version.
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)
torch._C._jit_set_texpr_fuser_enabled(False)
torch._C._jit_set_nvfuser_enabled(False)

# Scripted modules: `loader` is called with a file path in `predict`, and
# `model` (put in eval mode) is called with the loader's output plus lengths.
loader = torch.jit.load("audio_loader.pt")
model = torch.jit.load('QuartzNet_thunderspeech_3.pt').eval()

# Index -> token lookup used by `convert_probs`.
vocab = model.text_transform.vocab.itos
# Blank out the last entry so the truthiness check in `convert_probs` drops it
# (presumably the CTC blank symbol -- TODO confirm).
vocab[-1] = ''

def convert_probs(probs, labels=None):
    """Greedily decode model scores into a list of tokens.

    The highest-scoring index is taken along dimension 1 for the first item
    of the batch; consecutive repeats of the same index are collapsed into
    one, and indices whose label is an empty string are dropped.

    Args:
        probs: Score tensor; argmax is taken over dim 1 and only batch item 0
            is decoded (assumes a (batch, vocab, time) layout -- TODO confirm).
        labels: Optional index -> token sequence. Defaults to the module-level
            ``vocab`` so existing callers are unaffected.

    Returns:
        A list of decoded token strings; empty when there are no frames.
    """
    if labels is None:
        labels = vocab
    # Convert once to plain ints: avoids per-element tensor indexing and makes
    # a zero-length sequence a harmless empty loop instead of an IndexError.
    ids = probs.argmax(1)[0].tolist()
    decoded = []
    previous = None
    for token_id in ids:
        if token_id != previous:
            token = labels[token_id]
            if token:
                decoded.append(token)
        previous = token_id
    return decoded
 
  
def predict(path):
    """Transcribe the audio file at ``path`` into a list of tokens.

    Args:
        path: File path handed to the scripted ``loader`` module.

    Returns:
        The token list produced by ``convert_probs`` from the model output.
    """
    # Pure inference: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        audio = loader(path)
        # One length entry per batch item, each equal to the size of the last
        # dimension (assumes audio is (batch, samples) -- TODO confirm).
        lengths = torch.tensor(audio.shape[0] * [audio.shape[-1]], device=audio.device)
        probs = model(audio, lengths)[0]
    return convert_probs(probs)
 
 
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio (0.0 to 1.0) of sequences a and b."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def compare(chosen_word, path):
    """Score the user's recording against native pronunciations of a word.

    Args:
        chosen_word: Value matched against the 'replica' column of ``df``.
        path: File path of the user's recording; may be None/empty because
            the audio input is declared optional.

    Returns:
        A human-readable message with the best similarity coefficient, the
        user's transcription and the closest native transcription, or a short
        explanatory message when there is nothing to compare.
    """
    # The audio input is optional, so a submission may arrive without a file.
    if not path:
        return 'No recording received. Please record your pronunciation and try again.'

    transcriptions = df.loc[df['replica'] == chosen_word, 'transcription'].values
    etalons = [val.split('.') for val in transcriptions]
    # Without reference rows there is nothing to index into below.
    if not etalons:
        return f'No native pronunciation is available for "{chosen_word}".'

    user = predict(path)
    # max() keeps the first of equally scored candidates, matching the
    # strict ">" comparison of a manual best-so-far loop.
    coeff, closest = max(
        ((similar(user, etalon), etalon) for etalon in etalons),
        key=lambda scored: scored[0],
    )
    user_text = ''.join(user)
    closest_text = ''.join(closest)
    return f'The similarity coefficient of your pronunciation and the pronunciation of a native speaker is {coeff}. The closer the coefficient is to 1, the better.' + '\nYour pronunciation: [' + user_text + ']\nClosest native pronunciation: [' + closest_text + ']'


# NOTE(review): `gr.inputs.*` is the legacy Gradio component namespace --
# confirm the pinned Gradio version still provides it.

# Dropdown listing every distinct word of the reference table in sorted order.
word_choice = gr.inputs.Dropdown(sorted(df['replica'].unique()), label="Choose a word")

# Microphone recording passed to `compare` as a file path.
microphone_input = gr.inputs.Audio(source='microphone', type='filepath', optional=True)

# Build the two-input, text-output interface and start serving it.
demo = gr.Interface(fn=compare, inputs=[word_choice, microphone_input], outputs='text')
demo.launch(debug=True)