Aryan Wadhawan committed on
Commit
0cffe6d
β€’
1 Parent(s): 3e7b6ee

Implemented everything

Browse files
Files changed (1) hide show
  1. app.py +52 -48
app.py CHANGED
@@ -8,26 +8,17 @@ import io
8
  import base64
9
  from strsimpy.jaro_winkler import JaroWinkler
10
 
11
- # base64 to audio βœ…
12
- # audio to transcription βœ…
13
- # audio to text βœ…
14
- # text to phoneme βœ…
15
- # accuracy = jarowinkler(transcription, phoneme) βœ…
16
- # band = getBandFromAccuracy(accuracy) βœ…
17
- # return accuracy, band βœ…
18
 
19
-
20
- def lark(audioAsB64):
21
- # base64 to wav data conversion
22
  wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
23
-
24
- # audio to transcription
25
  processor = Wav2Vec2Processor.from_pretrained(
26
  "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
27
  )
28
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
29
 
30
- waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)
 
 
31
 
32
  input_values = processor(
33
  waveform, sampling_rate=sample_rate, return_tensors="pt"
@@ -37,55 +28,68 @@ def lark(audioAsB64):
37
  logits = model(input_values).logits
38
 
39
  predicted_ids = torch.argmax(logits, dim=-1)
40
- speechToPhonemeTranscription = processor.batch_decode(predicted_ids)[0]
 
 
 
 
 
 
 
41
 
42
- # audio to text
43
- processorSTT = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
 
 
44
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
45
 
46
- input_values = processorSTT(
47
  waveform, sampling_rate=sample_rate, return_tensors="pt"
48
  ).input_values
49
 
50
  logits = model(input_values).logits
51
 
52
  predicted_ids = torch.argmax(logits, dim=-1)
53
- speechToTextTranscripition = processor.batch_decode(predicted_ids)[0]
54
 
55
- # text to phoneme
56
- graphemeToPhonemeTranscription = phonemizer.phonemize(speechToTextTranscripition)
 
57
 
58
- # accuracy = jaroWinkler(transcription, phoneme)
59
 
 
60
  jarowinkler = JaroWinkler()
61
- similarity_score = jarowinkler.similarity(
62
- speechToPhonemeTranscription, graphemeToPhonemeTranscription
63
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- # ielts pronunciation band estimation
66
- def getBandFromSimilarityScore(similarity_score):
67
- if similarity_score >= 0.91:
68
- return 9
69
- elif similarity_score >= 0.81:
70
- return 8
71
- elif similarity_score >= 0.73:
72
- return 7
73
- elif similarity_score >= 0.65:
74
- return 6
75
- elif similarity_score >= 0.60:
76
- return 5
77
- elif similarity_score >= 0.46:
78
- return 4
79
- elif similarity_score >= 0.35:
80
- return 3
81
- elif similarity_score >= 0.1:
82
- return 2
83
- else:
84
- return 1
85
-
86
- IELTSband = getBandFromSimilarityScore(similarity_score)
87
-
88
- return [similarity_score, IELTSband, speechToTextTranscripition]
89
 
90
 
91
  iface = gr.Interface(fn=lark, inputs="text", outputs=["text", "text", "text"])
 
8
  import base64
9
  from strsimpy.jaro_winkler import JaroWinkler
10
 
 
 
 
 
 
 
 
11
 
12
def speechToPhonemeWS(audioAsB64):
    """Decode base64-encoded WAV audio and transcribe it directly to phonemes.

    The audio is resampled to 16 kHz (the rate the wav2vec2 model was
    trained on) and run through the multilingual phoneme-recognition
    checkpoint.

    Args:
        audioAsB64: base64-encoded WAV file contents as a string.

    Returns:
        The predicted phoneme string with all spaces stripped out.
    """
    model_id = "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
    processor = Wav2Vec2Processor.from_pretrained(model_id)
    model = Wav2Vec2ForCTC.from_pretrained(model_id)

    # Decode the payload and load it as a 16 kHz waveform.
    raw_bytes = base64.b64decode(audioAsB64.encode("utf-8"))
    waveform, sample_rate = librosa.load(io.BytesIO(raw_bytes), sr=16000)

    features = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    logits = model(features).logits
    best_ids = torch.argmax(logits, dim=-1)

    # batch_decode returns one string per batch item; we sent a single clip.
    phonemes = processor.batch_decode(best_ids)[0]
    # Strip spaces so the later Jaro-Winkler comparison is insensitive
    # to token spacing.
    return phonemes.replace(" ", "")
35
+
36
+
37
def speechToTextToPhonemeWS(audioAsB64):
    """Transcribe base64-encoded WAV audio to English text, then phonemize it.

    The audio is resampled to 16 kHz and run through the wav2vec2
    speech-to-text checkpoint; the resulting transcript is converted to
    phonemes with phonemizer.

    Args:
        audioAsB64: base64-encoded WAV file contents as a string.

    Returns:
        A two-element list ``[transcript, phoneme_string]`` where the
        phoneme string has all spaces stripped out.
    """
    raw_bytes = base64.b64decode(audioAsB64.encode("utf-8"))

    # Load as a 16 kHz waveform, the rate the model expects.
    waveform, sample_rate = librosa.load(io.BytesIO(raw_bytes), sr=16000)

    model_id = "facebook/wav2vec2-base-960h"
    processor = Wav2Vec2Processor.from_pretrained(model_id)
    model = Wav2Vec2ForCTC.from_pretrained(model_id)

    features = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    logits = model(features).logits
    best_ids = torch.argmax(logits, dim=-1)

    transcript = processor.batch_decode(best_ids)[0]
    # Grapheme-to-phoneme on the transcript; spaces stripped so it is
    # directly comparable with the speech-to-phoneme output.
    phonemes = phonemizer.phonemize(transcript).replace(" ", "")
    return [transcript, phonemes]
58
 
 
59
 
60
def similarity(S2P, G2P2T):
    """Return the Jaro-Winkler similarity between two phoneme strings.

    Args:
        S2P: phoneme string from the speech-to-phoneme model.
        G2P2T: phoneme string derived from the speech-to-text transcript.

    Returns:
        A float in [0, 1]; higher means more similar.
    """
    return JaroWinkler().similarity(S2P, G2P2T)
64
+
65
+
66
def similarityScoreToBand(similarity_score):
    """Map a Jaro-Winkler similarity score onto an IELTS-style band (1-9).

    Args:
        similarity_score: float in [0, 1].

    Returns:
        An int band from 9 (best) down to 1; scores below 0.1 fall
        through to band 1.
    """
    # (minimum score, band) pairs, highest band first.
    thresholds = (
        (0.91, 9),
        (0.81, 8),
        (0.73, 7),
        (0.65, 6),
        (0.60, 5),
        (0.46, 4),
        (0.35, 3),
        (0.1, 2),
    )
    for floor, band in thresholds:
        if similarity_score >= floor:
            return band
    return 1
85
 
86
+
87
def lark(audioAsB64):
    """End-to-end scoring pipeline for a base64-encoded audio clip.

    Transcribes the clip to phonemes twice (directly, and via text +
    grapheme-to-phoneme), compares the two with Jaro-Winkler, and maps
    the score to an IELTS-style band.

    Returns:
        ``[similarity_score, band, transcript]``.
    """
    spoken_phonemes = speechToPhonemeWS(audioAsB64)
    transcript, expected_phonemes = speechToTextToPhonemeWS(audioAsB64)
    score = similarity(expected_phonemes, spoken_phonemes)
    return [score, similarityScoreToBand(score), transcript]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
# Gradio UI: accepts the base64 audio payload as text and shows the three
# elements returned by lark() — similarity score, band, and transcript.
iface = gr.Interface(fn=lark, inputs="text", outputs=["text", "text", "text"])