RobPruzan committed
Commit a4e5c29
1 Parent(s): 2801f34

reformatting

Files changed (1)
  1. app.py +235 -244
app.py CHANGED
@@ -1,32 +1,19 @@
- # -*- coding: utf-8 -*-
- """app.ipynb
-
- Automatically generated by Colaboratory.
-
- Original file is located at
-     https://colab.research.google.com/drive/10plMWPNgOBAggggGeW01XD195JH5cYlR
- """
-

  import gradio as gr
- import csv
- import string
- import readability
- import pandas as pd
  import nltk
- from nltk.tokenize import word_tokenize

  import torch
- import gensim
- import gensim.downloader as api

  from sklearn.metrics.pairwise import cosine_similarity
- from nltk.corpus import wordnet as wn
  from transformers import DistilBertTokenizer
- from nltk.corpus import stopwords
- from fuzzywuzzy import fuzz
- from fuzzywuzzy import process
- from transformers import pipeline
- import statistics
- import seaborn as sns

  nltk.download('cmudict')
@@ -39,7 +26,7 @@ glove_vectors = api.load('glove-wiki-gigaword-100')
  tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

- #loading model
  PATH = 'pytorchBERTmodel'
  model = torch.load(PATH, map_location=torch.device('cpu'))
  model.eval()
@@ -50,72 +37,69 @@ p = pipeline("automatic-speech-recognition")

  w2v = dict({})
  for idx, key in enumerate(glove_vectors.key_to_index.keys()):
-     w2v[key] = glove_vectors.get_vector(key)

- def calculate_diversity(text):
-
-     stop_words = set(stopwords.words('english'))
-     for i in string.punctuation:
-         stop_words.add(i)

-     tokenized_text = word_tokenize(text)
-     tokenized_text = list(map(lambda word: word.lower(), tokenized_text))
-     sim_words = {}
-     if len(tokenized_text) <= 1:
-         return 1,"More Text Required"

-     for idx, anc_word in enumerate(tokenized_text):
-         if anc_word in stop_words:
-             continue
-         if idx in sim_words:
-             sim_words[idx] = sim_words[idx]
-             continue

-         vocab = [anc_word]

-         for pos, comp_word in enumerate(tokenized_text):

-             try:
-                 if not comp_word in stop_words and cosine_similarity(w2v[anc_word].reshape(1, -1), w2v[comp_word].reshape(1, -1)) > .75:
-                     vocab.append(comp_word)

-                 sim_words[idx] = vocab

-             except KeyError:
-                 continue

-     scores = {}
-     for key, value in sim_words.items():
-         if len(value) == 1:
-             scores[key] = 1
-             continue

-         t_sim = len(value) - 1
-         t_rep = (len(value) - 1) - (len(set(value)) )
-
-         score = ((t_sim - t_rep)/t_sim)**2

-         scores[key] = score

-     mean_score = 0
-     total = 0

-     for value in scores.values():
-         mean_score += value
-         total += 1
-
-     return scores, mean_score/total

  def dict_to_list(dictionary, max_size=10):
      outer_list = []
      inner_list = []
-
-     for key, value in dictionary.items():
          inner_list.append(value)
          if len(inner_list) == max_size:
              outer_list.append(inner_list)
@@ -124,213 +108,220 @@ def dict_to_list(dictionary, max_size=10):
      outer_list.append(inner_list)
      return outer_list

- def heatmap(scores, df):
-     total = 0
-     loops = 0

-     for ratio in scores.values():
-         #conditional to visualize the difference between no ratio and a 0 ratio score
-         if ratio != -.3:
-             total += ratio
-             loops += 1

-     diversity_average = total/loops

-     return sns.heatmap(df, cmap='gist_gray_r', vmin = -.3).set(title='Word Diversity Score Heatmap (Average Score: ' + str(diversity_average) + ')')

  def stats(text):
-     results = readability.getmeasures(text, lang='en')
-     return results
  def predict(text, tokenizer=tokenizer):
-
-     model.eval()
-     model.to('cpu')
-     def prepare_data(text, tokenizer):
-
-         input_ids = []
-         attention_masks = []
-
-         encoded_text = tokenizer.encode_plus(
-             text,
-             truncation=True,
-             add_special_tokens = True,
-             max_length = 315,
-             pad_to_max_length=True,
-             return_attention_mask = True,
-             return_tensors = 'pt'
-         )
-
-         input_ids.append(encoded_text['input_ids'])
-         attention_masks.append(encoded_text['attention_mask'])
-
-         input_ids = torch.cat(input_ids, dim=0)
-         attention_masks = torch.cat(attention_masks, dim=0)
-         return {'input_ids':input_ids, 'attention_masks':attention_masks}
-     tokenized_example_text = prepare_data(text, tokenizer)
-     with torch.no_grad():
-
-         result = model(
-             tokenized_example_text['input_ids'].to('cpu'),
-             attention_mask = tokenized_example_text['attention_masks'].to('cpu'),
-             return_dict=True
-         ).logits
-
-     return result

  def reading_difficulty(excerpt):
-     if len(excerpt) == 0:
-         return "No Text Provided"
-     windows = []
-     words = tokenizer.tokenize(excerpt)
-
-     if len(words) > 301:
-         for idx, text in enumerate(words):
-             if idx % 300 == 0:
-                 if idx <= len(words) - 301:
-                     x = ' '.join(words[idx: idx+299])
-                     windows.append(x)
-
-         win_preds = []
-         for text in windows:
-             win_preds.append(predict(text, tokenizer).item())
-         result = statistics.mean(win_preds)
-         score = -(result * 1.786 + 6.4) + 10
-         return score
-
-     else:
-         result = predict(excerpt).item()
-         score = -(result * 1.786 + 6.4) + 10
-         return score

  def calculate_stats(file_name, data_index):
-     #unicode escape only for essays
-     with open(file_name, encoding= 'unicode_escape') as f:
-         information = {'lines':0, 'words_per_sentence':0, 'words':0, 'syll_per_word':0, 'characters_per_word':0, 'reading_difficulty':0 }
-         reader = csv.reader(f)
-
-         for line in reader:
-
-             if len(line[data_index]) < 100:
-                 continue
-
-             #if detect(line[data_index][len(line[data_index]) -400: len(line[data_index])-1]) == 'en':
-
-             try:
-                 stat = stats(line[data_index])
-
-             except ValueError:
-                 continue
-
-             information['lines'] += 1
-             print(information['lines'])
-             information['words_per_sentence'] += stat['sentence info']['words_per_sentence']
-             information['words'] += stat['sentence info']['words']
-             information['syll_per_word'] += stat['sentence info']['syll_per_word']
-             information['characters_per_word'] += stat['sentence info']['characters_per_word']
-             information['reading_difficulty'] += reading_difficulty(line[data_index])
-
-         for i in information:
-             if i != 'lines' and i != 'words':
-                 information[i] /= information['lines']
-
-         return information

  def transcribe(audio):
-     #speech to text using pipeline
-     text = p(audio)["text"]
-     transcription.append(text)
-     return text

  def compute_score(target, actual):
-     target = target.lower()
-     actual = actual.lower()
-     return fuzz.ratio(target,actual)

  def phon(text):
-     alph = nltk.corpus.cmudict.dict()
-     text = word_tokenize(text)
-     pronun = []
-     for word in text:
-         try:
-             pronun.append(alph[word][0])
-         except Exception as e:
-             pronun.append(word)
-     return pronun

  def gradio_fn(text, audio, target, actual_audio):
-     if text == None and audio == None and target == None and actual_audio == None:
-         return "No Inputs", "No Inputs", "No Inputs", "No Inputs"
-     speech_score = 0
-     div = calculate_diversity(text)
-
-     if actual_audio != None:
-         actual = p(actual_audio)["text"]
-         print('sdfgs')
-         speech_score = compute_score(target, actual)
-
-         return "Difficulty Score: " + str(reading_difficulty(actual)), "Transcript: " + str(actual.lower()), "Diversity Score: " + str(div[1]), "Speech Score: " + str(speech_score)
-
-     transcription = []
-     if audio != None:
-         text = p(audio)["text"]
-         transcription.append(text)
-         state = div[0]
-         return "Difficulty Score: " + str(reading_difficulty(text)), "Transcript: " + str(transcription[-1].lower()), "Diversity Score: " + str(div[1]), "No Inputs"
-
-     return "Difficulty Score: " + str(reading_difficulty(text)),"Diversity Score: " + str(div[1]), "No Audio Provided", "No Inputs"

  def plot():
-     text = state
-     diversity = calculate_diversity(text)[0]
-     print(diversity)
-     df = pd.DataFrame(dict_to_list(diversity))
-     return heatmap(diversity, df)

  import csv
  example_data = []
  x = 0
  with open('train.csv') as f:
-
-     reader = csv.reader(f)
-     next(reader)
-     for line in reader:
-         example_data.append([line[3]])
-         x += 1
-         if x > 100:
-             break

  state = {}
  interface = gr.Interface(
      fn=gradio_fn,
-     inputs= [gr.components.Textbox(
-         label="Text"),
-         gr.components.Audio(
-             label="Speech Translation",
-             source="microphone",
-             type="filepath"),
-         gr.components.Textbox(
-             label="Target Text to Recite"
-         ),
-         gr.components.Audio(
-             label="Read Text Above for Score",
-             source="microphone",
-             type="filepath")
-     ],
-
-     outputs = ["text", "text", "text", "text"],
      theme="huggingface",
      description="Enter text or speak into your microphone to have your text analyzed!",
-
      rounded=True,
      container=True
-
- ).launch()
+ import statistics
+ import string

+ import gensim.downloader as api
  import gradio as gr
  import nltk
+ import pandas as pd
+ import readability
+ import seaborn as sns
  import torch
+ from fuzzywuzzy import fuzz
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
  from sklearn.metrics.pairwise import cosine_similarity
  from transformers import DistilBertTokenizer
+ from transformers import pipeline

  nltk.download('cmudict')
  tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

+ # loading model
  PATH = 'pytorchBERTmodel'
  model = torch.load(PATH, map_location=torch.device('cpu'))
  model.eval()
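
  # cache every GloVe vector in a plain dict keyed by word, so calculate_diversity can index vectors directly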
  w2v = dict({})
  for idx, key in enumerate(glove_vectors.key_to_index.keys()):
+     w2v[key] = glove_vectors.get_vector(key)

+ def calculate_diversity(text):
+     stop_words = set(stopwords.words('english'))
+     for i in string.punctuation:
+         stop_words.add(i)
+
+     tokenized_text = word_tokenize(text)
+     tokenized_text = list(map(lambda word: word.lower(), tokenized_text))
+     sim_words = {}
+     if len(tokenized_text) <= 1:
+         return 1, "More Text Required"
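+     # pair each content word with every other word whose GloVe cosine similarity exceeds .75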
 
+     for idx, anc_word in enumerate(tokenized_text):
+         if anc_word in stop_words:
+             continue
+         if idx in sim_words:
+             sim_words[idx] = sim_words[idx]
+             continue
+
+         vocab = [anc_word]
+
+         for pos, comp_word in enumerate(tokenized_text):
+             try:
+                 if not comp_word in stop_words and cosine_similarity(w2v[anc_word].reshape(1, -1),
+                                                                      w2v[comp_word].reshape(1, -1)) > .75:
+                     vocab.append(comp_word)
+
+                 sim_words[idx] = vocab
+
+             except KeyError:
+                 continue
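+     # turn each similarity group into a score: lone words score 1, repeated similar words pull the score toward 0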
 
+     scores = {}
+     for k, value in sim_words.items():
+         if len(value) == 1:
+             scores[k] = 1
+             continue
+
+         t_sim = len(value) - 1
+         t_rep = (len(value) - 1) - (len(set(value)))
+
+         score = ((t_sim - t_rep) / t_sim) ** 2
+
+         scores[k] = score
+
+     mean_score = 0
+     total = 0
+
+     for value in scores.values():
+         mean_score += value
+         total += 1
+
+     return scores, mean_score / total

  def dict_to_list(dictionary, max_size=10):
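      # pack the score dict's values into rows of max_size entries for the heatmap grid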
      outer_list = []
      inner_list = []
+
+     for value in dictionary.values():
          inner_list.append(value)
          if len(inner_list) == max_size:
              outer_list.append(inner_list)
      outer_list.append(inner_list)
      return outer_list

+ def heatmap(scores, df):
+     total = 0
+     loops = 0
+
+     for ratio in scores.values():
+         # conditional to visualize the difference between no ratio and a 0 ratio score
+         if ratio != -.3:
+             total += ratio
+             loops += 1
+
+     diversity_average = total / loops
+
+     return sns.heatmap(df, cmap='gist_gray_r', vmin=-.3).set(
+         title='Word Diversity Score Heatmap (Average Score: ' + str(diversity_average) + ')')

  def stats(text):
+     results = readability.getmeasures(text, lang='en')
+     return results

  def predict(text, tokenizer=tokenizer):
+     model.eval()
+     model.to('cpu')
+
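+     # encode the text as one padded, truncated 315-token example with its attention mask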
+     def prepare_data(text, tokenizer):
+         input_ids = []
+         attention_masks = []
+
+         encoded_text = tokenizer.encode_plus(
+             text,
+             truncation=True,
+             add_special_tokens=True,
+             max_length=315,
+             pad_to_max_length=True,
+             return_attention_mask=True,
+             return_tensors='pt'
+         )
+
+         input_ids.append(encoded_text['input_ids'])
+         attention_masks.append(encoded_text['attention_mask'])
+
+         input_ids = torch.cat(input_ids, dim=0)
+         attention_masks = torch.cat(attention_masks, dim=0)
+         return {'input_ids': input_ids, 'attention_masks': attention_masks}
+
+     tokenized_example_text = prepare_data(text, tokenizer)
+     with torch.no_grad():
+         result = model(
+             tokenized_example_text['input_ids'].to('cpu'),
+             attention_mask=tokenized_example_text['attention_masks'].to('cpu'),
+             return_dict=True
+         ).logits
+
+     return result

  def reading_difficulty(excerpt):
+     if len(excerpt) == 0:
+         return "No Text Provided"
+     windows = []
+     words = tokenizer.tokenize(excerpt)
+
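+     # long texts: score consecutive 300-token windows and average the predictions;
+     # the linear rescaling below maps raw model output onto a roughly 0-10 difficulty scale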
+     if len(words) > 301:
+         for idx, text in enumerate(words):
+             if idx % 300 == 0:
+                 if idx <= len(words) - 301:
+                     x = ' '.join(words[idx: idx + 299])
+                     windows.append(x)
+
+         win_preds = []
+         for text in windows:
+             win_preds.append(predict(text, tokenizer).item())
+         result = statistics.mean(win_preds)
+         score = -(result * 1.786 + 6.4) + 10
+         return score
+
+     else:
+         result = predict(excerpt).item()
+         score = -(result * 1.786 + 6.4) + 10
+         return score

  def calculate_stats(file_name, data_index):
+     # unicode escape only for essays
+     with open(file_name, encoding='unicode_escape') as f:
+         information = {'lines': 0, 'words_per_sentence': 0, 'words': 0, 'syll_per_word': 0, 'characters_per_word': 0,
+                        'reading_difficulty': 0}
+         reader = csv.reader(f)
+
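+         # accumulate per-row readability measures, skipping rows under 100 characters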
+         for line in reader:
+
+             if len(line[data_index]) < 100:
+                 continue
+
+             # if detect(line[data_index][len(line[data_index]) -400: len(line[data_index])-1]) == 'en':
+
+             try:
+                 stat = stats(line[data_index])
+
+             except ValueError:
+                 continue
+
+             information['lines'] += 1
+             print(information['lines'])
+             information['words_per_sentence'] += stat['sentence info']['words_per_sentence']
+             information['words'] += stat['sentence info']['words']
+             information['syll_per_word'] += stat['sentence info']['syll_per_word']
+             information['characters_per_word'] += stat['sentence info']['characters_per_word']
+             information['reading_difficulty'] += reading_difficulty(line[data_index])
+
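+         # convert the running sums into per-line averages (the line and word counts stay raw totals)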
+         for i in information:
+             if i != 'lines' and i != 'words':
+                 information[i] /= information['lines']
+
+         return information

  def transcribe(audio):
+     # speech to text using pipeline
+     text = p(audio)["text"]
+     transcription.append(text)
+     return text

  def compute_score(target, actual):
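+     # fuzzywuzzy's Levenshtein-based similarity: 0 (no overlap) to 100 (identical strings)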
+     target = target.lower()
+     actual = actual.lower()
+     return fuzz.ratio(target, actual)

  def phon(text):
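+     # take each word's first CMUdict pronunciation, falling back to the raw word when it is missing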
+     alph = nltk.corpus.cmudict.dict()
+     text = word_tokenize(text)
+     pronun = []
+     for word in text:
+         try:
+             pronun.append(alph[word][0])
+         except Exception as e:
+             pronun.append(word)
+     return pronun

  def gradio_fn(text, audio, target, actual_audio):
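+     # routing: recited audio is transcribed and scored against the target text; free-form audio is
+     # transcribed then analyzed; otherwise only the typed text is analyzed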
+     if text == None and audio == None and target == None and actual_audio == None:
+         return "No Inputs", "No Inputs", "No Inputs", "No Inputs"
+     speech_score = 0
+     div = calculate_diversity(text)
+
+     if actual_audio != None:
+         actual = p(actual_audio)["text"]
+         print('sdfgs')
+         speech_score = compute_score(target, actual)
+
+         return "Difficulty Score: " + str(reading_difficulty(actual)), "Transcript: " + str(
+             actual.lower()), "Diversity Score: " + str(div[1]), "Speech Score: " + str(speech_score)
+
+     transcription = []
+     if audio != None:
+         text = p(audio)["text"]
+         transcription.append(text)
+         state = div[0]
+         return "Difficulty Score: " + str(reading_difficulty(text)), "Transcript: " + str(
+             transcription[-1].lower()), "Diversity Score: " + str(div[1]), "No Inputs"
+
+     return "Difficulty Score: " + str(reading_difficulty(text)), "Diversity Score: " + str(
+         div[1]), "No Audio Provided", "No Inputs"

  def plot():
+     text = state
+     diversity = calculate_diversity(text)[0]
+     print(diversity)
+     df = pd.DataFrame(dict_to_list(diversity))
+     return heatmap(diversity, df)

  import csv
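  # sample the first ~100 excerpts (column 3) from train.csv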
  example_data = []
  x = 0
  with open('train.csv') as f:
+     reader = csv.reader(f)
+     next(reader)
+     for line in reader:
+         example_data.append([line[3]])
+         x += 1
+         if x > 100:
+             break

  state = {}
  interface = gr.Interface(
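+     # two text and two audio inputs; the four text outputs match gradio_fn's four return values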
      fn=gradio_fn,
+     inputs=[gr.components.Textbox(
+         label="Text"),
+         gr.components.Audio(
+             label="Speech Translation",
+             source="microphone",
+             type="filepath"),
+         gr.components.Textbox(
+             label="Target Text to Recite"
+         ),
+         gr.components.Audio(
+             label="Read Text Above for Score",
+             source="microphone",
+             type="filepath")
+     ],
+
+     outputs=["text", "text", "text", "text"],
      theme="huggingface",
      description="Enter text or speak into your microphone to have your text analyzed!",
+
      rounded=True,
      container=True

+ ).launch()