RobPruzan committed on
Commit
04a6d97
1 Parent(s): a9db20d

Updating layout and introducing interpretability

Files changed (1):
  app.py +249 -144
app.py CHANGED
@@ -1,21 +1,26 @@
  import csv
  import statistics
  import string

  import gensim.downloader as api
  import gradio as gr
  import nltk
  import pandas as pd
  import readability
  import seaborn as sns
  import torch
  from fuzzywuzzy import fuzz
  from nltk.corpus import stopwords
  from nltk.tokenize import word_tokenize
  from sklearn.metrics.pairwise import cosine_similarity
  from transformers import DistilBertTokenizer
  from transformers import pipeline

  nltk.download('cmudict')

  nltk.download('stopwords')
@@ -29,85 +34,102 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  # loading model
  PATH = 'pytorchBERTmodel'
- model = torch.load(PATH, map_location=torch.device('cpu'))
  model.eval()

- model.to('cpu')

  p = pipeline("automatic-speech-recognition")

  w2v = dict({})
- for idx, key in enumerate(glove_vectors.key_to_index.keys()):
-     w2v[key] = glove_vectors.get_vector(key)


  def calculate_diversity(text):

-     stop_words = set(stopwords.words('english'))
-     for i in string.punctuation:
-         stop_words.add(i)

-     tokenized_text = word_tokenize(text)
-     tokenized_text = list(map(lambda word: word.lower(), tokenized_text))
-     sim_words = {}
-     if len(tokenized_text) <= 1:
-         return 1, "More Text Required"

-     for idx, anc_word in enumerate(tokenized_text):
-         if anc_word in stop_words:
-             continue

-         vocab = [anc_word]

-         for pos, comp_word in enumerate(tokenized_text):
-             if anc_word in sim_words.get(pos, []):
-                 if anc_word == sim_words[pos][0]:
-                     sim_words[idx] = sim_words[pos]
-                 continue
-             try:
-                 if not comp_word in stop_words and cosine_similarity(w2v[anc_word].reshape(1, -1), w2v[comp_word].reshape(1, -1)) > .75:
-                     vocab.append(comp_word)
-                     sim_words[idx] = vocab
-             except KeyError:
-                 continue

-     scores = {}
-     for key, value in sim_words.items():
-         if len(value) == 1:
-             scores[key] = 1
-             continue
-         if len(value) == 2:
-             scores[key] = -1
-             continue
-         t_sim = len(value) - 1
-         t_rep = (len(value) - 1) - (len(set(value[1:])))

-         score = ((t_sim - t_rep) / t_sim) ** 2

-         scores[key] = score

-     mean_score = 0
-     total = 0

-     for value in scores.values():
-         if value == -1:
-             continue
-         mean_score += value
-         total += 1

-     return scores, mean_score / total


  def dict_to_list(dictionary, max_size=10):
      outer_list = []
      inner_list = []

-     for value in dictionary.values():
          inner_list.append(value)
          if len(inner_list) == max_size:
              outer_list.append(inner_list)
@@ -140,7 +162,7 @@ def stats(text):
  def predict(text, tokenizer=tokenizer):
      model.eval()
-     model.to('cpu')

      def prepare_data(text, tokenizer):
          input_ids = []
@@ -166,14 +188,25 @@ def predict(text, tokenizer=tokenizer):
      tokenized_example_text = prepare_data(text, tokenizer)
      with torch.no_grad():
          result = model(
-             tokenized_example_text['input_ids'].to('cpu'),
-             attention_mask=tokenized_example_text['attention_masks'].to('cpu'),
              return_dict=True
          ).logits

      return result


  def reading_difficulty(excerpt):
      if len(excerpt) == 0:
          return "No Text Provided"
@@ -192,12 +225,14 @@ def reading_difficulty(excerpt):
              win_preds.append(predict(text, tokenizer).item())
          result = statistics.mean(win_preds)
          score = -(result * 1.786 + 6.4) + 10
-         return score

      else:
          result = predict(excerpt).item()
          score = -(result * 1.786 + 6.4) + 10
-         return score


  def calculate_stats(file_name, data_index):
@@ -237,11 +272,11 @@ def calculate_stats(file_name, data_index):
  def transcribe(audio):
      # speech to text using pipeline
      text = p(audio)["text"]
-     transcription.append(text)
      return text


  def compute_score(target, actual):
      target = target.lower()
      actual = actual.lower()
      return fuzz.ratio(target, actual)
@@ -256,94 +291,164 @@ def phon(text):
              pronun.append(alph[word][0])
          except Exception as e:
              pronun.append(word)
-     return pronun


- def gradio_fn(text, audio, target, actual_audio):
-     if text is None and audio is None and target is None and actual_audio is None:
-         return "No Inputs", "No Inputs", "No Inputs", "No Inputs"
-     speech_score = 0
-
-     if actual_audio is not None:
-         actual = p(actual_audio)["text"]
-         speech_score = compute_score(target, actual)
-
-         return "Difficulty Score: " + str(reading_difficulty(actual)), "Transcript: " + str(
-             actual.lower()), "Diversity Score: " + str(calculate_diversity(target)[1]), "Speech Score: " + str(speech_score)
-     div = calculate_diversity(text)
-     transcription = []
-     if audio is not None:
-         text = p(audio)["text"]
-         transcription.append(text)
-         state = div[0]
-         return "Difficulty Score: " + str(reading_difficulty(text)), "Transcript: " + str(
-             transcription[-1].lower()), "Diversity Score: " + str(div[1]), "No Inputs"
-
-     return "Difficulty Score: " + str(reading_difficulty(text)), "Diversity Score: " + str(
-         div[1]), "No Audio Provided", "No Audio Provided"


  def plot():
-     text = state
      diversity = calculate_diversity(text)[0]
      df = pd.DataFrame(dict_to_list(diversity))
      return heatmap(diversity, df)


- example_data = []
- x = 0
- with open('train.csv') as f:
-     reader = csv.reader(f)
-     next(reader)
-     for line in reader:
-         example_data.append([line[3]])
-         x += 1
-         if x > 100:
-             break
-
- state = {}
- interface = gr.Interface(
-     fn=gradio_fn,
-     inputs=[gr.components.Textbox(
-         label="Text Difficulty Scoring",
-         lines=6),
-         gr.components.Audio(
-             label="Speech Translation",
-             source="microphone",
-             type="filepath"),
-         gr.components.Textbox(
-             label="Type Your Target Text to Recite",
-             placeholder="How much wood would a woodchuck chuck if a woodchuck could chuck wood?"
-         ),
-         gr.components.Audio(
-             label="Read Text Typed Above for Pronunciation Score",
-             source="microphone",
-             type="filepath")
-     ],
-     outputs=["text", "text", "text", "text"],
-     theme="huggingface",
-     description="Enter text or speak into your microphone to have your text analyzed!",
-     rounded=True,
-     container=True,
-     article="""
-     Text Difficulty Score: Using a fine-tuned DistilBERT model, we automatically determine how difficult a text is to read while incorporating its underlying semantics.
-     To efficiently compute text difficulty, a pre-trained DistilBERT model is fine-tuned for regression on the CommonLit Ease of Readability (CLEAR)
-     Corpus (https://educationaldatamining.org/EDM2021/virtual/static/pdf/EDM21_paper_35.pdf). This dataset contains over 110,000 pairwise comparisons of
-     ~1100 texts, in which teachers answered the question, "Which text is easier for students to understand?". The model is trained end-to-end (from the
-     regression layer down to the first attention layer) to ensure the best performance (Merchant et al., 2020).
-
-     Speech Pronunciation Scoring: The Wav2Vec 2.0 model is used to convert audio into text in real time. The model predicts words or phonemes (the smallest
-     units of speech that distinguish one word, or word element, from another) from the user's input audio. Due to the nature of the model, users with poor
-     pronunciation receive inaccurate transcriptions. This project scores pronunciation by asking the user to read a target excerpt into the microphone, passing
-     the audio through Wav2Vec 2.0 to get the inferred intended words, and measuring the loss as the Levenshtein distance between the target and actual
-     transcripts; the Levenshtein distance between two strings is the minimum number of single-character edits required to change one into the other.
-
-     Lexical Diversity Score: The lexical diversity score is the squared ratio of unique similar words to total similar words. Two words count as similar when
-     the cosine similarity of their word2vec embeddings is greater than 0.75. It is poor writing and speaking practice to repeat the same words when it is
-     possible not to. Vocabulary diversity is usually computed as the ratio of unique strings to total strings, which gives no indication of whether the writer
-     has a large vocabulary or the topic simply does not require a diverse one. Words that are not in the word2vec vocabulary are not incorporated into the score.
-     """
- ).launch()
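
The similarity test described in the article above is easy to probe in isolation. A minimal sketch, assuming the `glove-wiki-gigaword-100` vectors (an assumption: the app's actual `glove_vectors` load falls outside the hunks shown in this diff):

```python
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# Downloads and caches the GloVe vectors on first run (model name is an
# assumption; the app's own load is not shown in this diff).
glove = api.load('glove-wiki-gigaword-100')

def embedding_similarity(a, b):
    # cosine_similarity expects 2-D arrays, hence the reshapes
    return cosine_similarity(glove[a].reshape(1, -1),
                             glove[b].reshape(1, -1))[0][0]

# Word pairs count as "similar" when this value clears the cutoff
# (0.75 in the removed code above; the new version lowers it to 0.7
# and also accepts WordNet synonyms).
print(embedding_similarity('happy', 'glad'))   # relatively high
print(embedding_similarity('happy', 'table'))  # relatively low
```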
@@ -1,21 +1,26 @@
  import csv
  import statistics
  import string

  import gensim.downloader as api
  import gradio as gr
+ import matplotlib.pyplot as plt
  import nltk
+ import numpy as np
  import pandas as pd
  import readability
  import seaborn as sns
  import torch
  from fuzzywuzzy import fuzz
  from nltk.corpus import stopwords
+ from nltk.corpus import wordnet as wn
  from nltk.tokenize import word_tokenize
  from sklearn.metrics.pairwise import cosine_similarity
  from transformers import DistilBertTokenizer
  from transformers import pipeline

+ nltk.download('wordnet')
+
+ nltk.download('omw-1.4')
+
  nltk.download('cmudict')

  nltk.download('stopwords')
@@ -29,85 +34,102 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  # loading model
  PATH = 'pytorchBERTmodel'
+ model = torch.load(PATH)
  model.eval()

+ model.to(device)

  p = pipeline("automatic-speech-recognition")

+
+ def syns(word):
+     synonyms = []
+     for syn in wn.synsets(word):
+         for lm in syn.lemmas():
+             synonyms.append(lm.name())
+     return set(synonyms)
+
+
  w2v = dict({})
+ for idx, key in enumerate(glove_vectors.wv.vocab):
+     w2v[key] = glove_vectors.wv.get_vector(key)


  def calculate_diversity(text):
+     stop_words = set(stopwords.words('english'))
+     for i in string.punctuation:
+         stop_words.add(i)

+     tokenized_text = word_tokenize(text)

+     tokenized_text = list(map(lambda word: word.lower(), tokenized_text))
+     sim_words = {}
+     if len(tokenized_text) <= 1:
+         return 1, "More Text Required"

+     for idx, anc in enumerate(tokenized_text):
+         if anc in stop_words or anc not in w2v or anc.isdigit():
+             sim_words[idx] = '@'
+             continue

+         vocab = [anc]

+         for pos, comp in enumerate(tokenized_text):
+             if pos == idx:
+                 continue
+             if comp in stop_words:
+                 continue
+             if not comp.isalpha():
+                 continue
+             try:
+                 if cosine_similarity(w2v[anc].reshape(1, -1), w2v[comp].reshape(1, -1)) > .7 or comp in syns(anc):
+                     vocab.append(comp)
+             except KeyError:
+                 continue
+         sim_words[idx] = vocab
+     print(sim_words)
+     scores = {}
+     for key, value in sim_words.items():
+         if len(value) == 1:
+             scores[key] = -1
+             continue
+         # if len(value) == 2:
+         #     scores[key] = -1
+         #     continue
+         t_sim = len(value)
+         t_rep = (len(value)) - (len(set(value)))
+
+         score = ((t_sim - t_rep) / t_sim) ** 2
+
+         scores[key] = score
+
+     mean_score = 0
+     total = 0

+     for value in scores.values():
+         if value == -1:
+             continue
+         mean_score += value
+         total += 1
+     try:
+         return scores, {"Diversity Score": mean_score / total}
+     except ZeroDivisionError:
+         return scores, {"Diversity Score": "Not Enough Data"}
+
+
+ def get_scores(text):
+     return calculate_diversity(text)[0]
+
+
+ def get_mean_score(text):
+     return calculate_diversity(text)[1]
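
A worked toy example of the per-anchor scoring rule above, the squared ratio of unique to total words in an anchor's similarity group (values hypothetical):

```python
# One anchor word's similarity group: the anchor plus every word that passed
# the cosine-similarity / synonym test.
vocab = ['glad', 'happy', 'joyful', 'happy']

t_sim = len(vocab)               # 4 words in the group
t_rep = t_sim - len(set(vocab))  # 1 repetition ('happy' appears twice)
score = ((t_sim - t_rep) / t_sim) ** 2
print(score)  # (3 / 4) ** 2 = 0.5625 -- repetition pulls the score below 1.0
```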

  def dict_to_list(dictionary, max_size=10):
      outer_list = []
      inner_list = []

+     for key, value in dictionary.items():
          inner_list.append(value)
          if len(inner_list) == max_size:
              outer_list.append(inner_list)
@@ -140,7 +162,7 @@ def stats(text):
  def predict(text, tokenizer=tokenizer):
      model.eval()
+     model.to(device)

      def prepare_data(text, tokenizer):
          input_ids = []
@@ -166,14 +188,25 @@ def predict(text, tokenizer=tokenizer):
      tokenized_example_text = prepare_data(text, tokenizer)
      with torch.no_grad():
          result = model(
+             tokenized_example_text['input_ids'].to(device),
+             attention_mask=tokenized_example_text['attention_masks'].to(device),
              return_dict=True
          ).logits

      return result


+ def level(score):
+     if score <= 3:
+         return "Elementary School"
+     elif 3 <= score <= 6:
+         return "Middle School"
+     elif 6 <= score <= 8:
+         return "High School"
+     else:
+         return "College"
+
+
  def reading_difficulty(excerpt):
      if len(excerpt) == 0:
          return "No Text Provided"
@@ -192,12 +225,14 @@ def reading_difficulty(excerpt):
              win_preds.append(predict(text, tokenizer).item())
          result = statistics.mean(win_preds)
          score = -(result * 1.786 + 6.4) + 10
+         return "Difficulty Level: " + str(round(score, 2)) + '/10' + ' | A ' + str(
+             level(score)) + " student could understand this"

      else:
          result = predict(excerpt).item()
          score = -(result * 1.786 + 6.4) + 10
+         return 'Difficulty Level: ' + str(round(score, 2)) + '/10' + ' | A ' + str(
+             level(score)) + " student could understand this"


  def calculate_stats(file_name, data_index):
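
For reference, the arithmetic of the affine rescaling above, with a hypothetical raw regression output:

```python
result = 0.0                          # hypothetical regression logit
score = -(result * 1.786 + 6.4) + 10  # affine map onto a 0-10 scale -> 3.6
print(round(score, 2), level(score))  # 3.6 Middle School
```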
@@ -237,11 +272,11 @@ def calculate_stats(file_name, data_index):
  def transcribe(audio):
      # speech to text using pipeline
      text = p(audio)["text"]
      return text


  def compute_score(target, actual):
+     print(target)
      target = target.lower()
      actual = actual.lower()
      return fuzz.ratio(target, actual)
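
`fuzz.ratio` is fuzzywuzzy's Levenshtein-based similarity: 100 for an exact match, lower as more single-character edits separate the two strings. A quick check:

```python
from fuzzywuzzy import fuzz

print(fuzz.ratio('the quick brown fox', 'the quick brown fox'))   # 100
print(fuzz.ratio('the quick brown fox', 'the quik brown focks'))  # high, but below 100
```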
 
@@ -256,94 +291,164 @@ def phon(text):
              pronun.append(alph[word][0])
          except Exception as e:
              pronun.append([word])  # wrap the raw word so remove_digits can index into it

+     def remove_digits(lists):
+         for lst in lists:
+             for idx, word in enumerate(lst):
+                 lst[idx] = ''.join([letter for letter in word if not letter.isdigit()])
+         return lists

+     output = []
+     for i in remove_digits(pronun):
+         output.append('-'.join(i).lower())
+     return ' '.join(output)
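
A rough usage sketch for the rewritten `phon`, assuming `alph` is a CMUdict lookup (defined outside the hunks shown here) and that CMUdict's first pronunciation of "hello" is HH AH0 L OW1:

```python
# Stress digits are stripped and phones hyphenated per word, so the output
# would look roughly like:
print(phon('hello world'))  # e.g. 'hh-ah-l-ow w-er-l-d'
```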

  def plot():
      diversity = calculate_diversity(text)[0]
+     print(diversity)
      df = pd.DataFrame(dict_to_list(diversity))
      return heatmap(diversity, df)


+ def diversity_inter(text):
+     words = word_tokenize(text)
+     scores = get_scores(text)
+     interpret_values = [('', 0.0)]
+     for key, value in scores.items():
+         interpret_values.append((words[key], value))
+     interpret_values.append(('', 0.0))
+     print(interpret_values)
+     return {'original': text, 'interpretation': interpret_values}
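
`diversity_inter` returns the payload shape that `gr.components.Interpretation` renders as a per-token heatmap: the original text plus `(token, score)` pairs. An illustrative value for "the happy glad dog", where 'happy' and 'glad' pass the similarity test and stopwords and singleton words carry the -1 sentinel:

```python
{
    'original': 'the happy glad dog',
    'interpretation': [
        ('', 0.0),       # leading pad
        ('the', -1),     # stopword sentinel
        ('happy', 1.0),  # similar to 'glad', no repetition -> (2/2)**2
        ('glad', 1.0),
        ('dog', -1),     # no similar words in the text
        ('', 0.0),       # trailing pad
    ],
}
```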
+
+ def sliding_window(text):
+     wind_preds = []
+     windows = []
+     new_values = []
+     heat_map = []
+     words = word_tokenize(text)
+     for idx, text in enumerate(words):
+         if idx <= len(words) - 26:
+             x = ' '.join(words[idx: idx + 25])
+             windows.append(x)
+
+     for text in windows:
+         prediction = -(predict(text).item() * 1.786 + 6.4) + 10
+         wind_preds.append(prediction)
+
+     size = 25
+     for i in wind_preds:
+         for j in range(size):
+             new_values.append(i)
+
+     heat_map = []
+     for idx, i in enumerate(new_values):
+         window = new_values[idx:idx + size]
+         heat_map.append(np.mean(window))
+     compressed_map = []
+     for idx, i in enumerate(heat_map):
+         if idx % size == 0:
+             window = heat_map[idx:idx + size]
+             compressed_map.append(np.mean(window))
+
+     inter_scores = compressed_map
+     while len(inter_scores) <= len(words) - 1:
+         inter_scores.append(compressed_map[-1])
+
+     x = list(range(len(inter_scores)))
+     y = inter_scores
+
+     fig, ax = plt.subplots()
+
+     ax.plot(x, y, color='orange', linewidth=2)
+     ax.grid(False)
+     plt.xlabel('Word Number', fontweight='bold')
+     plt.ylabel('Difficulty Score', fontweight='bold')
+     fig.patch.set_facecolor('white')
+     plt.suptitle('Difficulty Score Across Text', fontsize=14, fontweight='bold')
+     plt.style.use('ggplot')
+
+     fig = plt.gcf()
+
+     map = [('', 0)]
+     maxy = max(inter_scores)
+     miny = min(inter_scores)
+     spread = maxy - miny
+
+     for idx, i in enumerate(words):
+         map.append((i, (inter_scores[idx] - miny) / spread))
+     map.append(('', 0))
+
+     return fig, map
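
The smoothing in `sliding_window` repeats each 25-word window prediction once per word, takes a rolling mean, then keeps one averaged value per window. The same idea on hypothetical scores with a window of 3:

```python
import numpy as np

size = 3                              # the app uses 25
wind_preds = [4.0, 5.0, 7.0]          # hypothetical per-window difficulty scores
expanded = list(np.repeat(wind_preds, size))
rolling = [np.mean(expanded[i:i + size]) for i in range(len(expanded))]
compressed = [np.mean(rolling[i:i + size]) for i in range(0, len(rolling), size)]
print([round(float(c), 2) for c in compressed])  # [4.33, 5.67, 7.0]
```

Each kept value blends a window's prediction with its neighbor's, which is what smooths the plotted curve.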
+ def get_plot(text):
+     return sliding_window(text)[0]
+
+
+ def get_dif_inter(text):
+     return {'original': text, 'interpretation': sliding_window(text)[1]}
+
+
+ def speech_to_text(speech, target):
+     text = p(speech)["text"]
+     return text.lower(), {'Pronunciation Score': compute_score(text, target) / 100}, phon(target)
+
+
+ def my_i_func(text):
+     return {"original": "", "interpretation": [('', 0.0), ('what', -0.2), ('great', 0.3), ('day', 0.5), ('', 0.0)]}
+
+
+ inter = {"original": "what a wonderful day", "interpretation": [0, .2, .3, .5]}
+
+ with gr.Blocks() as demo:
+     with gr.Column():
+         with gr.Row():
+             with gr.Box():
+                 with gr.Column():
+                     with gr.Group():
+                         with gr.Tabs():
+                             with gr.TabItem("Text"):
+                                 in_text = gr.Textbox(label="In Text")
+                                 grade = gr.Button("Grade Your Text")
+                             with gr.TabItem("Speech"):
+                                 audio_file = gr.Audio(source="microphone", type="filepath")
+                                 grade1 = gr.Button("Grade Your Speech")
+
+             with gr.Box():
+                 diff_output = gr.Label(label='Difficulty Level', show_label=True)
+                 gr.Markdown("Difficulty Score Across Text")
+                 plotter = gr.Plot()
+
+         with gr.Row():
+             with gr.Box():
+                 div_output = gr.Label(label='Diversity Score', show_label=False)
+                 gr.Markdown("Diversity Heatmap")
+                 interpretation = gr.components.Interpretation(in_text, label="Diversity Heatmap")
+                 # gr.DataFrame(df)
+             with gr.Box():
+                 # plotter = gr.Plot()
+                 # gr.Markdown("*Nominal Score May Not Represent ")
+                 interpretation2 = gr.components.Interpretation(in_text, label="Difficulty Heatmap")
+         with gr.Row():
+             with gr.Box():
+                 with gr.Group():
+                     target = gr.Textbox(label="Target Text")
+                 with gr.Group():
+                     audio_file1 = gr.Audio(source="microphone", type="filepath")
+                     b1 = gr.Button("Grade Your Pronunciation")
+             with gr.Box():
+                 some_val = gr.Label()
+                 text = gr.Textbox()
+                 phones = gr.Textbox()
+
+     grade.click(reading_difficulty, inputs=in_text, outputs=diff_output)
+     grade.click(get_mean_score, inputs=in_text, outputs=div_output)
+     grade.click(diversity_inter, inputs=in_text, outputs=interpretation)
+     grade.click(get_dif_inter, inputs=in_text, outputs=interpretation2)
+     grade.click(get_plot, inputs=in_text, outputs=plotter)
+     # grade1.click(transcribe, inputs=input_audio, outputs=in_text)
+     # pronun.click(transcribe, inputs=pronon_audio, outputs=trans)
+     b1.click(speech_to_text, inputs=[audio_file1, target], outputs=[text, some_val, phones])
+ demo.launch(debug=True)