# Hugging Face Space: short-answer grading via SBERT / Jaccard / word-vector
# similarity. (The original header lines "Spaces:" / "Build error" were
# page-scrape residue, not code.)
# Standard library.
import os

# Third-party.
import gradio as gr
import numpy as np
import pandas as pd
import spacy
import textdistance
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm

# NOTE(review): the original file also imported `matplotlib.dates.SU` and
# `regex.F`. Both names were never used (they look like IDE auto-import
# accidents) and pull in packages the Space may not have installed, which
# breaks the build — so they are disabled rather than kept.
# from matplotlib.dates import SU
# from regex import F

# Disabled PostgreSQL connection (credentials were blank; kept for reference).
# import psycopg2
# connection = psycopg2.connect(user="db_admin",
#                               password="",
#                               host="127.0.0.1",
#                               port="5432",
#                               database="my_db")

# Shared spaCy pipeline with word vectors, used by rm_stop / text_processing
# / sim below. Loaded once at import time.
nlp = spacy.load("en_core_web_md")
def listToString(s):
    """Join the items of *s* into one space-separated string.

    Args:
        s: an iterable of strings.

    Returns:
        str: the items concatenated with a single space between them
        (empty string for an empty iterable).
    """
    return " ".join(s)
def rm_stop(my_doc):
    """Return the token texts of *my_doc* with spaCy stop words removed.

    Args:
        my_doc: a spaCy ``Doc`` (any iterable of tokens with a ``.text``
            attribute works).

    Returns:
        list[str]: the surviving token texts, original order preserved.
    """
    # A token survives when its vocab entry is not flagged as a stop word.
    return [tok.text for tok in my_doc if not nlp.vocab[tok.text].is_stop]
def text_processing(sentence):
    """Tokenise *sentence* into lower-cased lemmas.

    Non-alphabetic tokens and stop words are discarded.

    Args:
        sentence: raw text to process with the module-level ``nlp`` pipeline.

    Returns:
        list[str]: lower-cased lemmas of the kept tokens.
    """
    lemmas = []
    for tok in nlp(sentence):
        if tok.is_alpha and not tok.is_stop:
            lemmas.append(tok.lemma_.lower())
    return lemmas
def jaccard_sim(sent1, sent2):
    """Normalised Jaccard similarity between two sentences.

    Each sentence is first lemmatised and stop-word-filtered via
    ``text_processing``; similarity is computed on the resulting token lists.

    Returns:
        float: similarity in [0.0, 1.0].
    """
    tokens_a = text_processing(sent1)
    tokens_b = text_processing(sent2)
    return textdistance.jaccard.normalized_similarity(tokens_a, tokens_b)
def sim(Ideal_Answer, Submitted_Answer):
    """Score a submitted answer against an ideal answer.

    Three similarity measures are computed, each scaled to the 0-10 range:
      * SBERT cosine similarity (all-MiniLM-L6-v2 sentence embeddings);
      * Jaccard similarity over lemmatised tokens;
      * spaCy average-word-vector similarity with stop words removed.

    Args:
        Ideal_Answer: reference answer text.
        Submitted_Answer: student answer text.

    Returns:
        str: a report like
        "SBERT:8, Jaccard:5, Word2Vec:7,final_score:7", where
        final_score = int(0.8 * SBERT + 0.2 * Word2Vec).
    """
    # Strip quote characters that would otherwise pollute tokenisation.
    text1 = Ideal_Answer.replace("\"", "").replace("\'", "")
    text2 = Submitted_Answer.replace("\"", "").replace("\'", "")
    output = []

    # Load the SBERT model once and cache it on the function object; the
    # original instantiated the model on every request, which is slow.
    model = getattr(sim, "_model", None)
    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        sim._model = model

    # Embed both texts and take their cosine similarity (1x1 tensor).
    embedding_1 = model.encode(text1, convert_to_tensor=True)
    embedding_2 = model.encode(text2, convert_to_tensor=True)
    score = util.pytorch_cos_sim(embedding_1, embedding_2)
    # BUGFIX: read the scalar with .item() instead of parsing str(tensor)
    # ("str(score).split('[')[2]..."), which breaks whenever the tensor
    # repr format changes.
    sbert = int(float(score.item()) * 10.0)
    output.append("SBERT:" + str(sbert) + ",")

    # Jaccard similarity over processed tokens.
    output.append("Jaccard:" + str(int(jaccard_sim(text1, text2) * 10.0)) + ",")

    # spaCy average word-vector similarity. Reuse the module-level `nlp`
    # pipeline instead of re-running spacy.load("en_core_web_md") per call.
    doc1 = listToString(rm_stop(nlp(text1)))
    doc2 = listToString(rm_stop(nlp(text2)))
    w2v = int(nlp(doc1).similarity(nlp(doc2)) * 10.0)

    # Weighted blend: SBERT dominates, word vectors refine.
    final_score = int(0.8 * sbert + 0.2 * w2v)
    output.append("Word2Vec:" + str(w2v) + ",final_score:" + str(final_score))

    # Space-join the three report fragments into the final string.
    return str(listToString(output))
# Gradio UI: two free-text inputs (ideal answer, submitted answer) mapped to
# one textbox holding the similarity report from sim().
# BUGFIX: the legacy `gr.outputs.Textbox` API was removed in Gradio 3.x/4.x
# and makes the Space build fail; the top-level `gr.Textbox` component is the
# supported replacement.
iface = gr.Interface(
    fn=sim,
    inputs=["text", "text"],
    outputs=gr.Textbox(),
)
iface.launch()