RobPruzan committed on
Commit
3ed467a
1 Parent(s): b637fc2
Files changed (1) hide show
  1. app.py.py +336 -0
app.py.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/10plMWPNgOBAggggGeW01XD195JH5cYlR
8
+ """
9
+
10
+
11
+ import gradio as gr
12
+ import csv
13
+ import string
14
+ import readability
15
+ import pandas as pd
16
+ import nltk
17
+ from nltk.tokenize import word_tokenize
18
+ import torch
19
+ import gensim
20
+ import gensim.downloader as api
21
+ from sklearn.metrics.pairwise import cosine_similarity
22
+ from nltk.corpus import wordnet as wn
23
+ from transformers import DistilBertTokenizer
24
+ from nltk.corpus import stopwords
25
+ from fuzzywuzzy import fuzz
26
+ from fuzzywuzzy import process
27
+ from transformers import pipeline
28
+ import statistics
29
+ import seaborn as sns
30
+
31
# NLTK resources used further down: the CMU pronouncing dictionary (phon),
# the English stop-word list (calculate_diversity) and the punkt tokenizer
# models (word_tokenize).
nltk.download('cmudict')
nltk.download('stopwords')
nltk.download('punkt')

# Pre-trained GloVe word vectors fetched through the gensim downloader.
glove_vectors = api.load('glove-wiki-gigaword-100')

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# is_available must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# loading model
# Raw string so the backslashes are not read as escape sequences ('\U' is a
# SyntaxError in a normal literal); the stray embedded double quotes that were
# part of the path text have been dropped.
PATH = r'C:\Users\Robby\Desktop\automaticlit\pytorchBERTmodel'
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. map_location lets a checkpoint saved on GPU load on CPU.
model = torch.load(PATH, map_location=device)
model.eval()
model.to(device)

# Speech-to-text pipeline shared by transcribe() and gradio_fn().
p = pipeline("automatic-speech-recognition")

# Plain dict mapping every vocabulary word to its embedding vector.
# gensim 3 exposes the vocabulary as `.wv.vocab`; gensim 4 removed that in
# favour of `.key_to_index`, so both layouts are supported here.
_keyed_vectors = getattr(glove_vectors, 'wv', glove_vectors)
_vocabulary = getattr(_keyed_vectors, 'key_to_index', None)
if _vocabulary is None:
    _vocabulary = _keyed_vectors.vocab
w2v = {key: _keyed_vectors.get_vector(key) for key in _vocabulary}
54
+
55
def calculate_diversity(text):
    """Score how varied the wording of *text* is, per word and on average.

    Each non-stop-word token (the "anchor") is compared against every token
    of the text using cosine similarity of their ``w2v`` embeddings. Tokens
    scoring above 0.75 are collected in the anchor's group; the anchor always
    matches itself when it has an embedding. A group's score is
    ``(distinct words / similar words) ** 2``, so repeating the same word
    lowers the score while using different-but-similar words keeps it at 1.

    Args:
        text: The passage to analyse.

    Returns:
        A 2-tuple. Normally ``(scores, mean)`` where ``scores`` maps token
        position -> score and ``mean`` is their average. When the text has
        at most one token the historical sentinel ``(1, "More Text Required")``
        is returned; when no token could be scored the result is
        ``({}, "More Text Required")``.
    """
    stop_words = set(stopwords.words('english')) | set(string.punctuation)

    tokenized_text = [word.lower() for word in word_tokenize(text)]
    if len(tokenized_text) <= 1:
        return 1, "More Text Required"

    sim_words = {}
    for idx, anc_word in enumerate(tokenized_text):
        if anc_word in stop_words:
            continue

        vocab = [anc_word]
        for comp_word in tokenized_text:
            try:
                if comp_word not in stop_words and cosine_similarity(
                        w2v[anc_word].reshape(1, -1),
                        w2v[comp_word].reshape(1, -1)) > .75:
                    vocab.append(comp_word)
                # NOTE(review): recorded on every comparison that does not
                # raise, so an anchor without an embedding is still registered
                # (with only itself) when the text contains a stop word.
                sim_words[idx] = vocab
            except KeyError:
                # One of the two words has no embedding; skip this pair.
                continue

    scores = {}
    for key, value in sim_words.items():
        if len(value) == 1:
            scores[key] = 1
            continue

        t_sim = len(value) - 1
        t_rep = (len(value) - 1) - len(set(value))
        scores[key] = ((t_sim - t_rep) / t_sim) ** 2

    if not scores:
        # Every token was a stop word or lacked an embedding; averaging an
        # empty collection used to raise ZeroDivisionError.
        return scores, "More Text Required"

    return scores, sum(scores.values()) / len(scores)
113
+
114
def dict_to_list(dictionary, max_size=10):
    """Lay the values of *dictionary* out as rows of at most *max_size* items.

    Values are taken in the dictionary's iteration order; keys are ignored.
    The final row may be shorter than *max_size*, and an empty dictionary
    produces an empty list.
    """
    rows = [[]]
    for item in dictionary.values():
        rows[-1].append(item)
        if len(rows[-1]) == max_size:
            # Current row is full: open a fresh one for the next value.
            rows.append([])

    # Drop the trailing row if nothing was ever placed in it.
    if not rows[-1]:
        rows.pop()
    return rows
126
+
127
def heatmap(scores, df):
    """Draw the diversity heatmap for *df*, titled with the average score.

    Args:
        scores: Mapping whose values are the per-word diversity ratios.
        df: Data passed straight to ``sns.heatmap``.

    Returns:
        Whatever ``.set(title=...)`` returns on the object produced by
        ``sns.heatmap``.
    """
    # -.3 marks "no ratio" (as opposed to a genuine 0 ratio) so that the two
    # look different on the plot; such cells are left out of the average.
    valid_ratios = [ratio for ratio in scores.values() if ratio != -.3]

    # Avoid ZeroDivisionError when there is nothing to average.
    if valid_ratios:
        diversity_average = sum(valid_ratios) / len(valid_ratios)
    else:
        diversity_average = 'N/A'

    title = 'Word Diversity Score Heatmap (Average Score: ' + str(diversity_average) + ')'
    return sns.heatmap(df, cmap='gist_gray_r', vmin=-.3).set(title=title)
140
+
141
def stats(text):
    """Return the ``readability`` package's measures for English *text*."""
    return readability.getmeasures(text, lang='en')
144
+
145
def predict(text, tokenizer=tokenizer):
    """Run the loaded model on *text* and return its raw logits.

    Args:
        text: The passage to score.
        tokenizer: Tokenizer used to encode the passage; defaults to the
            module-level DistilBERT tokenizer.

    Returns:
        The ``logits`` tensor of the model output for the single encoded
        passage.
    """
    model.eval()
    model.to(device)

    # Encode one passage, truncated/padded to exactly 315 token ids.
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    encoded_text = tokenizer.encode_plus(
        text,
        truncation=True,
        add_special_tokens=True,
        max_length=315,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        result = model(
            encoded_text['input_ids'].to(device),
            attention_mask=encoded_text['attention_mask'].to(device),
            return_dict=True,
        ).logits

    return result
182
+
183
def reading_difficulty(excerpt):
    """Estimate the reading difficulty of *excerpt* with the loaded model.

    Short passages (at most 301 tokens) are scored in a single model call.
    Longer ones are cut into consecutive, non-overlapping 300-token windows;
    each full window is scored and the predictions are averaged. A trailing
    partial window is ignored.

    Args:
        excerpt: The passage to score.

    Returns:
        The rescaled score ``-(prediction * 1.786 + 6.4) + 10`` as a float,
        or the string ``"No Text Provided"`` when *excerpt* is empty or None.
    """
    if not excerpt:
        return "No Text Provided"

    def to_score(raw_prediction):
        # NOTE(review): constants taken as-is from the original formula;
        # their derivation is not visible in this file.
        return -(raw_prediction * 1.786 + 6.4) + 10

    words = tokenizer.tokenize(excerpt)
    if len(words) <= 301:
        return to_score(predict(excerpt).item())

    window = 300
    # Step through every *full* window. The previous slice took only 299 of
    # each 300 tokens and its bound skipped a final window that fit exactly.
    # convert_tokens_to_string re-merges word pieces, so "##" fragments are
    # not fed back to the tokenizer as literal text.
    windows = [
        tokenizer.convert_tokens_to_string(words[start:start + window])
        for start in range(0, len(words) - window + 1, window)
    ]
    win_preds = [predict(text, tokenizer).item() for text in windows]
    return to_score(statistics.mean(win_preds))
207
+
208
def calculate_stats(file_name, data_index):
    """Aggregate readability statistics over one column of a CSV file.

    Rows whose selected cell is missing, shorter than 100 characters, or
    rejected by ``stats`` with ``ValueError`` are skipped.

    Args:
        file_name: Path of the CSV file to read.
        data_index: Index of the column holding the text.

    Returns:
        A dict with ``lines`` (rows used) and ``words`` (running total) plus
        the per-row averages of ``words_per_sentence``, ``syll_per_word``,
        ``characters_per_word`` and ``reading_difficulty``. When no row
        qualifies every value is 0.
    """
    information = {'lines': 0, 'words_per_sentence': 0, 'words': 0, 'syll_per_word': 0, 'characters_per_word': 0, 'reading_difficulty': 0}

    # unicode escape only for essays
    with open(file_name, encoding='unicode_escape') as f:
        for line in csv.reader(f):
            try:
                passage = line[data_index]
            except IndexError:
                # Row does not have the requested column.
                continue

            if len(passage) < 100:
                continue

            try:
                stat = stats(passage)
            except ValueError:
                continue

            sentence_info = stat['sentence info']
            information['lines'] += 1
            print(information['lines'])  # progress indicator
            information['words_per_sentence'] += sentence_info['words_per_sentence']
            information['words'] += sentence_info['words']
            information['syll_per_word'] += sentence_info['syll_per_word']
            information['characters_per_word'] += sentence_info['characters_per_word']
            information['reading_difficulty'] += reading_difficulty(passage)

    # Turn the accumulated sums into averages; 'lines' and 'words' stay as
    # totals. Skipped entirely when no row was used, to avoid dividing by 0.
    if information['lines']:
        for key in information:
            if key != 'lines' and key != 'words':
                information[key] /= information['lines']

    return information
244
+
245
# Running log of every transcript produced by transcribe(). It was referenced
# by the function but never defined at module level, which raised NameError.
transcription = []


def transcribe(audio):
    """Convert *audio* to text with the speech pipeline and log the result.

    Args:
        audio: Input handed unchanged to the speech-recognition pipeline ``p``.

    Returns:
        The recognised text; it is also appended to ``transcription``.
    """
    # speech to text using pipeline
    text = p(audio)["text"]
    transcription.append(text)
    return text
250
+
251
def compute_score(target, actual):
    """Return the case-insensitive ``fuzz.ratio`` between *target* and *actual*."""
    return fuzz.ratio(target.lower(), actual.lower())
255
+
256
def phon(text):
    """Replace each token of *text* by its first CMU-dictionary pronunciation.

    Args:
        text: The passage to convert.

    Returns:
        A list with one entry per token: the first pronunciation listed for
        the word, or the token itself when the dictionary has no entry.
    """
    alph = nltk.corpus.cmudict.dict()
    pronun = []
    for word in word_tokenize(text):
        # Dictionary keys are lower-case, so look capitalised words up in
        # lower case instead of silently falling back to the raw token.
        entries = alph.get(word.lower())
        pronun.append(entries[0] if entries else word)
    return pronun
266
+
267
def gradio_fn(text, audio, target, actual_audio):
    """Gradio callback: analyse typed text, dictated speech or a recitation.

    Args:
        text: Typed passage, or None.
        audio: Recording to transcribe and analyse, or None.
        target: Text the user was asked to recite, or None.
        actual_audio: Recording of the recitation, or None.

    Returns:
        Four strings, always in the order difficulty, transcript, diversity,
        speech score. A recitation takes priority over dictated audio, which
        takes priority over typed text.
    """
    # Without this declaration the assignment below only bound a dead local
    # and the module-level `state` read by plot() was never updated.
    global state

    if text is None and audio is None and target is None and actual_audio is None:
        return "No Inputs", "No Inputs", "No Inputs", "No Inputs"

    if actual_audio is not None:
        actual = p(actual_audio)["text"]
        speech_score = compute_score(target or "", actual)
        # Analyse the recited transcript itself; diversity used to be taken
        # from the (possibly empty or None) typed text instead.
        div = calculate_diversity(actual)
        return (
            "Difficulty Score: " + str(reading_difficulty(actual)),
            "Transcript: " + str(actual.lower()),
            "Diversity Score: " + str(div[1]),
            "Speech Score: " + str(speech_score),
        )

    if audio is not None:
        text = p(audio)["text"]
        div = calculate_diversity(text)
        state = div[0]
        return (
            "Difficulty Score: " + str(reading_difficulty(text)),
            "Transcript: " + str(text.lower()),
            "Diversity Score: " + str(div[1]),
            "No Inputs",
        )

    if text is None:
        # Only `target` was supplied; treat the passage as empty.
        text = ""
    div = calculate_diversity(text)
    state = div[0]
    # Same slot order as the branches above (the diversity and transcript
    # slots were previously swapped for text-only input).
    return (
        "Difficulty Score: " + str(reading_difficulty(text)),
        "No Audio Provided",
        "Diversity Score: " + str(div[1]),
        "No Inputs",
    )
288
+
289
def plot():
    """Draw the diversity heatmap for the module-level ``state``.

    ``state`` may hold either a passage (scored here) or an already computed
    position -> score mapping (used directly). Feeding the mapping to
    ``calculate_diversity`` as if it were text used to crash.

    Returns:
        The result of ``heatmap``, or None when there are no scores to plot.
    """
    if isinstance(state, str):
        diversity = calculate_diversity(state)[0]
    else:
        diversity = state

    # calculate_diversity returns a non-dict sentinel for very short text,
    # and `state` starts out as an empty dict: nothing to draw in both cases.
    if not isinstance(diversity, dict) or not diversity:
        return None

    df = pd.DataFrame(dict_to_list(diversity))
    return heatmap(diversity, df)
295
+
296
import csv
from itertools import islice

# Example passages offered in the UI: column 3 of the first 101 CSV rows.
# NOTE(review): if the file starts with a header row, the header cell is
# included as the first example - confirm whether that is intended.
example_data = []
# Raw string: in a normal literal '\U' is a SyntaxError and '\a' / '\t' would
# be turned into control characters.
with open(r'C:\Users\Robby\Desktop\automaticlit\train.csv', newline='') as f:
    for line in islice(csv.reader(f), 101):
        # Skip rows that do not have a fourth column instead of crashing.
        if len(line) > 3:
            example_data.append([line[3]])
308
+
309
# Shared slot read by plot(); starts out empty.
state = {}

# The four inputs, in the order gradio_fn receives them:
# typed text, dictated audio, target text, recited audio.
_input_components = [
    gr.components.Textbox(label="Text"),
    gr.components.Audio(
        label="Speech Translation",
        source="microphone",
        type="filepath",
    ),
    gr.components.Textbox(label="Target Text to Recite"),
    gr.components.Audio(
        label="Read Text Above for Score",
        source="microphone",
        type="filepath",
    ),
]

# Build the UI and start serving immediately.
# NOTE(review): `interface` is bound to the return value of launch(), not to
# the gr.Interface object itself.
interface = gr.Interface(
    fn=gradio_fn,
    inputs=_input_components,
    outputs=["text", "text", "text", "text"],
    theme="huggingface",
    description="Enter text or speak into your microphone to have your text analyzed!",
    rounded=True,
    container=True,
    examples=example_data,
    examples_per_page=3,
).launch(debug=True)