RobPruzan committed on
Commit
84ed9cd
1 Parent(s): 6373e5b

Delete, wrong file name

Browse files
Files changed (1)
  1. app.py.py +0 -336
app.py.py DELETED
@@ -1,336 +0,0 @@
- # -*- coding: utf-8 -*-
- """app.ipynb
-
- Automatically generated by Colaboratory.
-
- Original file is located at
- https://colab.research.google.com/drive/10plMWPNgOBAggggGeW01XD195JH5cYlR
- """
-
- import gradio as gr
- import csv
- import string
- import readability
- import pandas as pd
- import nltk
- from nltk.tokenize import word_tokenize
- import torch
- import gensim
- import gensim.downloader as api
- from sklearn.metrics.pairwise import cosine_similarity
- from nltk.corpus import wordnet as wn
- from transformers import DistilBertTokenizer
- from nltk.corpus import stopwords
- from fuzzywuzzy import fuzz
- from fuzzywuzzy import process
- from transformers import pipeline
- import statistics
- import seaborn as sns
-
- nltk.download('cmudict')
- nltk.download('stopwords')
- nltk.download('punkt')
-
- glove_vectors = api.load('glove-wiki-gigaword-100')
-
- tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
- # loading the fine-tuned difficulty model from a local checkpoint
- PATH = r'C:\Users\Robby\Desktop\automaticlit\pytorchBERTmodel'
- model = torch.load(PATH)
- model.eval()
- model.to(device)
-
- p = pipeline("automatic-speech-recognition")
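- # no checkpoint is named here, so transformers falls back to its default model
- # for the automatic-speech-recognition task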
-
- w2v = {key: glove_vectors[key] for key in glove_vectors.key_to_index}
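- # w2v maps each word to its 100-dimensional GloVe vector; key_to_index is the
- # gensim 4.x vocabulary API (gensim 3.x exposed this mapping as .vocab instead)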
-
- def calculate_diversity(text):
-     stop_words = set(stopwords.words('english'))
-     for i in string.punctuation:
-         stop_words.add(i)
-
-     tokenized_text = word_tokenize(text)
-     tokenized_text = [word.lower() for word in tokenized_text]
-     sim_words = {}
-     if len(tokenized_text) <= 1:
-         return 1, "More Text Required"
-
-     for idx, anc_word in enumerate(tokenized_text):
-         if anc_word in stop_words or idx in sim_words:
-             continue
-
-         vocab = [anc_word]
-         for comp_word in tokenized_text:
-             try:
-                 if comp_word not in stop_words and cosine_similarity(w2v[anc_word].reshape(1, -1), w2v[comp_word].reshape(1, -1)) > .75:
-                     vocab.append(comp_word)
-                     sim_words[idx] = vocab
-             except KeyError:
-                 continue
-
-     scores = {}
-     for key, value in sim_words.items():
-         if len(value) == 1:
-             scores[key] = 1
-             continue
-
-         t_sim = len(value) - 1
-         t_rep = (len(value) - 1) - len(set(value))
-         score = ((t_sim - t_rep) / t_sim) ** 2
-         scores[key] = score
-
-     mean_score = 0
-     total = 0
-     for value in scores.values():
-         mean_score += value
-         total += 1
-
-     return scores, mean_score / total
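- # returns ({token_index: score, ...}, mean); each per-word score lands in [0, 1],
- # where 1 means the word's similar-word set contains no repeated mentions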
-
- def dict_to_list(dictionary, max_size=10):
-     outer_list = []
-     inner_list = []
-
-     for value in dictionary.values():
-         inner_list.append(value)
-         if len(inner_list) == max_size:
-             outer_list.append(inner_list)
-             inner_list = []
-     if len(inner_list) > 0:
-         outer_list.append(inner_list)
-     return outer_list
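- # chunks the scores into rows of at most max_size values for the heatmap, e.g.
- # dict_to_list({0: 1, 1: .5, 2: 1}, max_size=2) -> [[1, .5], [1]]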
-
- def heatmap(scores, df):
-     total = 0
-     loops = 0
-
-     for ratio in scores.values():
-         # conditional to visualize the difference between no ratio and a 0 ratio score
-         if ratio != -.3:
-             total += ratio
-             loops += 1
-
-     diversity_average = total / loops
-
-     return sns.heatmap(df, cmap='gist_gray_r', vmin=-.3).set(
-         title='Word Diversity Score Heatmap (Average Score: ' + str(diversity_average) + ')')
-
- def stats(text):
-     results = readability.getmeasures(text, lang='en')
-     return results
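- # getmeasures returns nested readability statistics, e.g.
- # results['sentence info']['words_per_sentence'], used by calculate_stats below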
-
- def predict(text, tokenizer=tokenizer):
-     model.eval()
-     model.to(device)
-
-     def prepare_data(text, tokenizer):
-         input_ids = []
-         attention_masks = []
-
-         encoded_text = tokenizer.encode_plus(
-             text,
-             truncation=True,
-             add_special_tokens=True,
-             max_length=315,
-             padding='max_length',
-             return_attention_mask=True,
-             return_tensors='pt'
-         )
-
-         input_ids.append(encoded_text['input_ids'])
-         attention_masks.append(encoded_text['attention_mask'])
-
-         input_ids = torch.cat(input_ids, dim=0)
-         attention_masks = torch.cat(attention_masks, dim=0)
-         return {'input_ids': input_ids, 'attention_masks': attention_masks}
-
-     tokenized_example_text = prepare_data(text, tokenizer)
-     with torch.no_grad():
-         result = model(
-             tokenized_example_text['input_ids'].to(device),
-             attention_mask=tokenized_example_text['attention_masks'].to(device),
-             return_dict=True
-         ).logits
-
-     return result
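- # returns the model's raw regression logits; reading_difficulty() rescales them
- # onto a friendlier difficulty scale below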
-
- def reading_difficulty(excerpt):
-     if len(excerpt) == 0:
-         return "No Text Provided"
-     words = tokenizer.tokenize(excerpt)
-
-     if len(words) > 301:
-         # long excerpts are scored as the mean over 300-token windows
-         windows = []
-         for idx in range(0, len(words) - 300, 300):
-             windows.append(' '.join(words[idx: idx + 299]))
-
-         win_preds = []
-         for text in windows:
-             win_preds.append(predict(text, tokenizer).item())
-         result = statistics.mean(win_preds)
-     else:
-         result = predict(excerpt).item()
-
-     return -(result * 1.786 + 6.4) + 10
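- # the linear rescale maps the model output onto a roughly 0-10 difficulty scale;
- # the constants appear to come from an offline calibration of the model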
-
- def calculate_stats(file_name, data_index):
-     # unicode escape only for essays
-     with open(file_name, encoding='unicode_escape') as f:
-         information = {'lines': 0, 'words_per_sentence': 0, 'words': 0,
-                        'syll_per_word': 0, 'characters_per_word': 0, 'reading_difficulty': 0}
-         reader = csv.reader(f)
-
-         for line in reader:
-             if len(line[data_index]) < 100:
-                 continue
-
-             # if detect(line[data_index][len(line[data_index]) - 400: len(line[data_index]) - 1]) == 'en':
-
-             try:
-                 stat = stats(line[data_index])
-             except ValueError:
-                 continue
-
-             information['lines'] += 1
-             print(information['lines'])
-             information['words_per_sentence'] += stat['sentence info']['words_per_sentence']
-             information['words'] += stat['sentence info']['words']
-             information['syll_per_word'] += stat['sentence info']['syll_per_word']
-             information['characters_per_word'] += stat['sentence info']['characters_per_word']
-             information['reading_difficulty'] += reading_difficulty(line[data_index])
-
-     for i in information:
-         if i != 'lines' and i != 'words':
-             information[i] /= information['lines']
-
-     return information
-
- # module-level transcript log; transcribe() and gradio_fn() append to it
- transcription = []
-
- def transcribe(audio):
-     # speech to text using pipeline
-     text = p(audio)["text"]
-     transcription.append(text)
-     return text
-
- def compute_score(target, actual):
-     target = target.lower()
-     actual = actual.lower()
-     return fuzz.ratio(target, actual)
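- # fuzz.ratio returns an integer similarity in [0, 100], so a perfect recitation
- # of the target text scores 100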
-
- def phon(text):
-     alph = nltk.corpus.cmudict.dict()
-     text = word_tokenize(text)
-     pronun = []
-     for word in text:
-         try:
-             pronun.append(alph[word][0])
-         except KeyError:
-             pronun.append(word)
-     return pronun
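- # falls back to the raw word whenever CMUdict has no pronunciation entry for it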
-
- def gradio_fn(text, audio, target, actual_audio):
-     global state
-     if text is None and audio is None and target is None and actual_audio is None:
-         return "No Inputs", "No Inputs", "No Inputs", "No Inputs"
-
-     if actual_audio is not None:
-         actual = p(actual_audio)["text"]
-         speech_score = compute_score(target, actual)
-         div = calculate_diversity(actual)
-         return ("Difficulty Score: " + str(reading_difficulty(actual)),
-                 "Transcript: " + actual.lower(),
-                 "Diversity Score: " + str(div[1]),
-                 "Speech Score: " + str(speech_score))
-
-     if audio is not None:
-         text = p(audio)["text"]
-         transcription.append(text)
-
-     div = calculate_diversity(text)
-     state = div[0]
-
-     if audio is not None:
-         return ("Difficulty Score: " + str(reading_difficulty(text)),
-                 "Transcript: " + transcription[-1].lower(),
-                 "Diversity Score: " + str(div[1]),
-                 "No Inputs")
-
-     return ("Difficulty Score: " + str(reading_difficulty(text)),
-             "Transcript: No Audio Provided",
-             "Diversity Score: " + str(div[1]),
-             "No Inputs")
-
- def plot():
-     # state holds the per-word scores dict stashed by the last gradio_fn call
-     diversity = state
-     df = pd.DataFrame(dict_to_list(diversity))
-     return heatmap(diversity, df)
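- # plot() builds the heatmap view but is not wired into the gr.Interface below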
-
- example_data = []
- x = 0
- with open(r'C:\Users\Robby\Desktop\automaticlit\train.csv') as f:
-     reader = csv.reader(f)
-
-     for line in reader:
-         example_data.append([line[3]])
-         x += 1
-         if x > 100:
-             break
-
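- # keeps only the first ~100 excerpts (CSV column index 3) as gradio examples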
- state = {}
- interface = gr.Interface(
-     fn=gradio_fn,
-     inputs=[gr.components.Textbox(
-                 label="Text"),
-             gr.components.Audio(
-                 label="Speech Translation",
-                 source="microphone",
-                 type="filepath"),
-             gr.components.Textbox(
-                 label="Target Text to Recite"),
-             gr.components.Audio(
-                 label="Read Text Above for Score",
-                 source="microphone",
-                 type="filepath")
-             ],
-     outputs=["text", "text", "text", "text"],
-     theme="huggingface",
-     description="Enter text or speak into your microphone to have your text analyzed!",
-     rounded=True,
-     container=True,
-     examples=example_data,
-     examples_per_page=3
- ).launch(debug=True)