Spaces:

RobPruzan
/

automaticlitassesment

Runtime error

App Files Files Community

RobPruzan committed on Aug 8, 2022

Commit

8218bc2

•

1 Parent(s): 84ed9cd

Upload app.py

Browse files

Files changed (1) hide show

app.py +336 -0

app.py ADDED Viewed

	@@ -0,0 +1,336 @@

+# -*- coding: utf-8 -*-
+"""app.ipynb
+Automatically generated by Colaboratory.
+Original file is located at
+    https://colab.research.google.com/drive/10plMWPNgOBAggggGeW01XD195JH5cYlR
+"""
+import gradio as gr
+import csv
+import string
+import readability
+import pandas as pd
+import nltk
+from nltk.tokenize import word_tokenize
+import torch
+import gensim
+import gensim.downloader as api
+from sklearn.metrics.pairwise import cosine_similarity
+from nltk.corpus import wordnet as wn
+from transformers import DistilBertTokenizer
+from nltk.corpus import stopwords
+from fuzzywuzzy import fuzz
+from fuzzywuzzy import process
+from transformers import pipeline
+import statistics
+import seaborn as sns
# Fetch the NLTK resources this script reads: the CMU pronouncing dictionary
# (phon), the stop-word list (calculate_diversity) and the punkt tokenizer
# models (word_tokenize).
nltk.download('cmudict')
nltk.download('stopwords')
nltk.download('punkt')
glove_vectors = api.load('glove-wiki-gigaword-100')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# is_available has to be *called*: the bare function object is always truthy,
# which would select 'cuda' even on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#loading model
# Raw string so the backslashes stay literal ('\U' in a normal string literal
# is an invalid unicode escape and a SyntaxError); the path itself must not
# contain quote characters.
PATH = r'C:\Users\Robby\Desktop\automaticlit\pytorchBERTmodel'
# NOTE(review): torch.load unpickles the file, so PATH must point to a trusted
# checkpoint. map_location lets a checkpoint saved on GPU load on CPU.
model = torch.load(PATH, map_location=device)
model.eval()
model.to(device)
p = pipeline("automatic-speech-recognition")
# Plain dict of word -> embedding vector, used for KeyError-based lookups in
# calculate_diversity. gensim 4 renamed `vocab` to `key_to_index` and
# api.load may return the KeyedVectors directly (no `.wv`), so support both.
_keyed_vectors = getattr(glove_vectors, 'wv', glove_vectors)
_vocabulary = getattr(_keyed_vectors, 'key_to_index', None)
if _vocabulary is None:
  _vocabulary = _keyed_vectors.vocab
w2v = {key: _keyed_vectors.get_vector(key) for key in _vocabulary}
def calculate_diversity(text):
  """Score how varied the wording of ``text`` is.

  Every token that is not a stop word or punctuation mark acts as an
  "anchor". The anchor is compared with every token of the text (itself
  included) via cosine similarity of their ``w2v`` vectors; tokens scoring
  above 0.75 are counted as matches. The anchor's score is
  ``(distinct words in its group / number of matches) ** 2``, so repeating
  one word lowers the score while using different related words keeps it
  at 1.

  Args:
    text: The passage to analyse. ``None`` or an empty string is treated as
      too short.

  Returns:
    A ``(scores, mean)`` tuple where ``scores`` maps the anchor's token index
    to its score and ``mean`` is the average of those scores. When the text
    has fewer than two tokens the result is ``(1, "More Text Required")``;
    when no token could be scored it is ``({}, "More Text Required")``.
  """
  if not text:
    return 1, "More Text Required"
  stop_words = set(stopwords.words('english'))
  stop_words.update(string.punctuation)
  tokenized_text = [word.lower() for word in word_tokenize(text)]
  if len(tokenized_text) <= 1:
    return 1, "More Text Required"
  sim_words = {}
  for idx, anc_word in enumerate(tokenized_text):
    if anc_word in stop_words:
      continue
    vocab = [anc_word]
    for comp_word in tokenized_text:
      try:
        if comp_word not in stop_words and cosine_similarity(w2v[anc_word].reshape(1, -1), w2v[comp_word].reshape(1, -1)) > .75:
          vocab.append(comp_word)
        # Recorded only when the comparison above did not raise: an anchor
        # missing from w2v is therefore kept (alone) only if the text also
        # contains a stop word, which short-circuits the vector lookup.
        sim_words[idx] = vocab
      except KeyError:
        continue
  scores = {}
  for key, value in sim_words.items():
    matches = len(value) - 1
    # A group holding only the anchor has nothing to compare against.
    scores[key] = 1 if matches == 0 else (len(set(value)) / matches) ** 2
  if not scores:
    # Every token was a stop word or unknown to w2v; avoid dividing by zero.
    return scores, "More Text Required"
  return scores, sum(scores.values()) / len(scores)
def dict_to_list(dictionary, max_size=10):
    """Split the values of ``dictionary`` into consecutive rows.

    Args:
        dictionary: Mapping whose values are collected in iteration order;
            the keys are ignored.
        max_size: Number of values per row.

    Returns:
        A list of lists. Every row holds ``max_size`` values except possibly
        the last, which holds the remainder. An empty mapping gives ``[]``.
    """
    outer_list = []
    inner_list = []
    for value in dictionary.values():
        inner_list.append(value)
        if len(inner_list) == max_size:
            outer_list.append(inner_list)
            inner_list = []
    if inner_list:
        outer_list.append(inner_list)
    return outer_list
def heatmap(scores, df):
  """Draw the word-diversity heatmap for ``df`` with the average in its title.

  Args:
    scores: Mapping whose values are diversity ratios. Values equal to -.3
      are excluded from the average.
    df: Data passed to ``sns.heatmap``.

  Returns:
    Whatever ``.set(title=...)`` returns on the object produced by
    ``sns.heatmap``.
  """
  #conditional to visualize the difference between no ratio and a 0 ratio score
  rated = [ratio for ratio in scores.values() if ratio != -.3]
  # No usable ratio at all: report 0 instead of dividing by zero.
  diversity_average = sum(rated) / len(rated) if rated else 0
  return sns.heatmap(df, cmap='gist_gray_r', vmin = -.3).set(title='Word Diversity Score Heatmap (Average Score: ' + str(diversity_average) + ')')
def stats(text):
  """Return the measures ``readability.getmeasures`` computes for ``text`` with ``lang='en'``."""
  return readability.getmeasures(text, lang='en')
def predict(text, tokenizer=tokenizer):
  """Run the loaded model on ``text`` and return its raw logits.

  Args:
    text: The passage to score.
    tokenizer: Tokenizer used to encode ``text``; defaults to the
      module-level ``tokenizer``.

  Returns:
    The ``logits`` attribute of the model output for the single encoded
    example.
  """
  model.eval()
  model.to(device)
  # Encode to exactly 315 positions: longer inputs are truncated, shorter
  # ones padded. `padding='max_length'` replaces the deprecated
  # `pad_to_max_length=True`.
  encoded_text = tokenizer.encode_plus(
      text,
      truncation=True,
      add_special_tokens=True,
      max_length=315,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
  )
  # Inference only, so skip gradient tracking.
  with torch.no_grad():
    result = model(
        encoded_text['input_ids'].to(device),
        attention_mask=encoded_text['attention_mask'].to(device),
        return_dict=True,
    ).logits
  return result
def reading_difficulty(excerpt):
  """Return a difficulty score for ``excerpt`` derived from ``predict``.

  Texts of at most 301 tokens are scored in one pass. Longer texts are cut
  into windows starting every 300 tokens, each holding 299 tokens; only
  windows that start at least 301 tokens before the end are used, so the
  trailing remainder is not scored. The window predictions are averaged.

  Args:
    excerpt: The text to score. ``None`` or an empty string yields the
      message below instead of a number.

  Returns:
    ``"No Text Provided"`` for empty input, otherwise the model prediction
    linearly rescaled as ``-(prediction * 1.786 + 6.4) + 10``.
  """
  if not excerpt:
    return "No Text Provided"
  words = tokenizer.tokenize(excerpt)
  if len(words) > 301:
    win_preds = []
    for start in range(0, len(words) - 300, 300):
      # Rebuild readable text from the sub-word tokens; joining them with
      # spaces would leave '##' continuation pieces in the text that is
      # tokenized again inside predict.
      window = tokenizer.convert_tokens_to_string(words[start:start + 299])
      win_preds.append(predict(window, tokenizer).item())
    result = statistics.mean(win_preds)
  else:
    result = predict(excerpt).item()
  return -(result * 1.786 + 6.4) + 10
def calculate_stats(file_name, data_index):
  """Aggregate readability statistics over one column of a CSV file.

  Rows whose selected cell is missing, shorter than 100 characters, or
  rejected by ``stats`` with a ``ValueError`` are skipped.

  Args:
    file_name: Path of the CSV file to read.
    data_index: Index of the column that holds the text.

  Returns:
    A dict with ``lines`` (number of rows used), ``words`` (summed over those
    rows) and the per-row averages ``words_per_sentence``, ``syll_per_word``,
    ``characters_per_word`` and ``reading_difficulty``. When no row qualifies
    every value is 0.
  """
  information = {'lines':0, 'words_per_sentence':0, 'words':0, 'syll_per_word':0, 'characters_per_word':0, 'reading_difficulty':0 }
  #unicode escape only for essays
  with open(file_name, encoding= 'unicode_escape') as f:
    for line in csv.reader(f):
      try:
        passage = line[data_index]
      except IndexError:
        # Row has too few columns (e.g. a blank line).
        continue
      if len(passage) < 100:
        continue
      try:
        stat = stats(passage)
      except ValueError:
        continue
      sentence_info = stat['sentence info']
      information['lines'] += 1
      print(information['lines'])
      information['words_per_sentence'] += sentence_info['words_per_sentence']
      information['words'] += sentence_info['words']
      information['syll_per_word'] += sentence_info['syll_per_word']
      information['characters_per_word'] += sentence_info['characters_per_word']
      information['reading_difficulty'] += reading_difficulty(passage)
  # Turn the running sums into averages; skip when nothing was counted so an
  # empty or unusable file does not divide by zero.
  if information['lines']:
    for i in information:
      if i != 'lines' and i != 'words':
        information[i] /= information['lines']
  return information
# Transcripts produced by transcribe(), oldest first. Defined here because
# transcribe() appends to a module-level list.
transcription = []


def transcribe(audio):
  """Convert ``audio`` to text with the speech-recognition pipeline ``p``.

  The transcript is also appended to the module-level ``transcription`` list.

  Args:
    audio: Input accepted by the pipeline ``p``.

  Returns:
    The ``"text"`` entry of the pipeline result.
  """
  #speech to text using pipeline
  text = p(audio)["text"]
  transcription.append(text)
  return text
def compute_score(target, actual):
  """Return ``fuzz.ratio`` of the two strings after lower-casing both."""
  return fuzz.ratio(target.lower(), actual.lower())
def phon(text):
  """Replace each token of ``text`` with its first CMU-dictionary pronunciation.

  Args:
    text: The text to convert.

  Returns:
    A list with one entry per token: the first pronunciation listed for the
    lower-cased token, or the token itself when the dictionary has no entry.
  """
  alph = nltk.corpus.cmudict.dict()
  pronun = []
  for word in word_tokenize(text):
    # Look up the lower-cased form so capitalised words are matched too.
    entries = alph.get(word.lower())
    pronun.append(entries[0] if entries else word)
  return pronun
def gradio_fn(text, audio, target, actual_audio):
  """Analyse the interface inputs and return the four output strings.

  The outputs are always ordered (difficulty, transcript, diversity, speech
  score). The first branch that applies wins:

  1. ``actual_audio`` given: it is transcribed, compared with ``target`` and
     the transcript is analysed.
  2. ``audio`` given: it is transcribed and the transcript is analysed.
  3. otherwise ``text`` is analysed.

  Args:
    text: Text typed by the user.
    audio: Recording to transcribe and analyse.
    target: Text the user was asked to recite.
    actual_audio: Recording of the user reciting ``target``.

  Returns:
    A 4-tuple of strings; all four are ``"No Inputs"`` when nothing was
    provided.
  """
  if not text and audio is None and not target and actual_audio is None:
    return "No Inputs", "No Inputs", "No Inputs", "No Inputs"
  if actual_audio is not None:
    actual = p(actual_audio)["text"]
    # An empty target box must not crash the comparison.
    speech_score = compute_score(target or "", actual)
    div = calculate_diversity(actual)
    return "Difficulty Score: " + str(reading_difficulty(actual)),  "Transcript: " + str(actual.lower()), "Diversity Score: " + str(div[1]), "Speech Score: " + str(speech_score)
  if audio is not None:
    text = p(audio)["text"]
    # Score the transcript itself rather than whatever was in the text box.
    div = calculate_diversity(text)
    return "Difficulty Score: " + str(reading_difficulty(text)),  "Transcript: " + str(text.lower()), "Diversity Score: " + str(div[1]), "No Inputs"
  text = text or ""
  div = calculate_diversity(text)
  return "Difficulty Score: " + str(reading_difficulty(text)), "No Audio Provided", "Diversity Score: " + str(div[1]), "No Inputs"
def plot():
  """Build the diversity heatmap for the text held in the global ``state``.

  Returns:
    The result of ``heatmap`` for the per-token diversity scores, or ``None``
    when ``state`` holds no text or the text yields no per-token scores.
  """
  text = state
  if not isinstance(text, str) or not text:
    return None
  diversity = calculate_diversity(text)[0]
  # calculate_diversity reports texts that are too short with a non-dict
  # first element, which cannot be laid out as a grid.
  if not isinstance(diversity, dict) or not diversity:
    return None
  df = pd.DataFrame(dict_to_list(diversity))
  return heatmap(diversity, df)
import csv
# Example inputs for the interface: the fourth column of the first 101 rows
# of the training CSV, each wrapped in its own list.
example_data = []
# Raw string so the backslashes stay literal ('\U' is an invalid escape and
# '\a' / '\t' would silently become control characters).
with open(r'C:\Users\Robby\Desktop\automaticlit\train.csv') as f:
  for row_number, line in enumerate(csv.reader(f), start=1):
    example_data.append([line[3]])
    if row_number > 100:
      break
# Shared value read by plot().
state = {}
# Input widgets, listed in the positional order of gradio_fn's parameters:
# text, audio, target, actual_audio.
_input_components = [
    gr.components.Textbox(label="Text"),
    gr.components.Audio(
        label="Speech Translation",
        source="microphone",
        type="filepath",
    ),
    gr.components.Textbox(label="Target Text to Recite"),
    gr.components.Audio(
        label="Read Text Above for Score",
        source="microphone",
        type="filepath",
    ),
]
# Build the UI and start serving it; `interface` is bound to the value
# returned by launch().
interface = gr.Interface(
    fn=gradio_fn,
    inputs=_input_components,
    outputs=["text", "text", "text", "text"],
    theme="huggingface",
    description="Enter text or speak into your microphone to have your text analyzed!",
    rounded=True,
    container=True,
    examples=example_data,
    examples_per_page=3,
).launch(debug=True)