mizoru committed on
Commit
508fc3a
1 Parent(s): 6a69d6a

Copy from previous app2

Browse files
Files changed (3) hide show
  1. app.py +59 -0
  2. audio_loader.pt +3 -0
  3. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import gradio as gr
3
+ print(gr.__version__)
4
+ import torch
5
+ import torchaudio
6
+
7
+
8
+ df= pd.read_csv('native_words_subset.csv')
9
+
10
+ torch._C._jit_override_can_fuse_on_cpu(False)
11
+ torch._C._jit_override_can_fuse_on_gpu(False)
12
+ torch._C._jit_set_texpr_fuser_enabled(False)
13
+ torch._C._jit_set_nvfuser_enabled(False)
14
+
15
+ loader = torch.jit.load("audio_loader.pt")
16
+ model = torch.jit.load('QuartzNet_thunderspeech_3.pt').eval()
17
+
18
+ vocab = model.text_transform.vocab.itos
19
+ vocab[-1] = ''
20
+
21
+ def convert_probs(probs):
22
+ ids = probs.argmax(1)[0]
23
+ s = []
24
+ if vocab[ids[0]]: s.append(vocab[ids[0]])
25
+ for i in range(1,len(ids)):
26
+ if ids[i-1] != ids[i]:
27
+ new = vocab[ids[i]]
28
+ if new: s.append(new)
29
+ #return '.'.join(s)
30
+ return s
31
+
32
+
33
+ def predict(path):
34
+ audio = loader(path)
35
+ probs = model(audio, torch.tensor(audio.shape[0] * [audio.shape[-1]], device=audio.device))[0]
36
+ return convert_probs(probs)
37
+
38
+
39
+ from difflib import SequenceMatcher
40
+
41
+ def similar(a, b):
42
+ return SequenceMatcher(None, a, b).ratio()
43
+
44
+ def compare(chosen_word, path):
45
+ etalons = [list(val.split('.')) for val in df.loc[df['replica'] == chosen_word, 'transcription'].values]
46
+ user = predict(path)
47
+ coeff = 0.0
48
+ idx=0
49
+ for i in range(len(etalons)):
50
+ new_coeff = similar(user, etalons[i])
51
+ if new_coeff > coeff:
52
+ coeff = new_coeff
53
+ idx=i
54
+ return f'The similarity coefficient of your pronunciation and the pronunciation of a native speaker is {coeff}. The closer the coefficient is to 1, the better.' + '\nYour pronunciation: [' + ''.join(user) + ']\nClosest native pronunciation: [' + ''.join(etalons[idx]) + ']'
55
+
56
+
57
+ word_choice = gr.inputs.Dropdown(sorted(list(df['replica'].unique())), label="Choose a word")
58
+
59
+ gr.Interface(fn=compare, inputs=[word_choice, gr.inputs.Audio(source='microphone', type='filepath', optional=True)], outputs= 'text').launch(debug=True)
audio_loader.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7478d9de0a639a8a769684847bf697c8899e74825d9822e4d90895b916b2b51
3
+ size 23265
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ torchaudio