import gradio as gr
from sentence_transformers import SentenceTransformer, util
import spacy
import textdistance
#import psycopg2


'''connection = psycopg2.connect(user="db_admin",
                              password="",
                              host="127.0.0.1",
                              port="5432",
                              database="my_db")'''


# Load models once at startup so each request doesn't re-initialize them
nlp = spacy.load("en_core_web_md")  # medium English model with word vectors
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def listToString(s):
    # Join a list of strings into a single space-separated string
    return " ".join(s)

def rm_stop(my_doc):
    # Return the document's token texts with stopwords filtered out
    return [token.text for token in my_doc if not token.is_stop]

def text_processing(sentence):
    # Lemmatize and lowercase, keeping only alphabetic, non-stopword tokens
    return [token.lemma_.lower()
            for token in nlp(sentence)
            if token.is_alpha and not token.is_stop]

def jaccard_sim(sent1, sent2):
    # Process both sentences, then compare the resulting token lists
    sentence1 = text_processing(sent1)
    sentence2 = text_processing(sent2)

    # Normalized Jaccard similarity in [0, 1]
    return textdistance.jaccard.normalized_similarity(sentence1, sentence2)
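
# Illustrative check (assumed example sentences; the exact value depends on the
# spaCy model's lemmatization and stopword list):
#   jaccard_sim("The cat sat on the mat", "A cat sat on a mat")
# Both sides reduce to ["cat", "sit", "mat"] (assuming "sat" lemmatizes to
# "sit" and the articles/prepositions are stopwords), so the score is 1.0.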

def sim(Ideal_Answer, Submitted_Answer):
    # Strip quote characters so they don't affect tokenization or scoring
    text1 = Ideal_Answer.replace("\"", "").replace("\'", "")
    text2 = Submitted_Answer.replace("\"", "").replace("\'", "")
    output = []

    # SBERT: cosine similarity between the two sentence embeddings, scaled to 0-10
    embedding_1 = model.encode(text1, convert_to_tensor=True)
    embedding_2 = model.encode(text2, convert_to_tensor=True)
    score = util.pytorch_cos_sim(embedding_1, embedding_2)
    sbert = int(score.item() * 10.0)
    output.append("SBERT:" + str(sbert) + ",")

    # Jaccard similarity on the processed token lists, scaled to 0-10
    output.append("Jaccard:" + str(int(jaccard_sim(text1, text2) * 10.0)) + ",")

    # spaCy average word-vector similarity after stopword removal, scaled to 0-10
    doc1 = listToString(rm_stop(nlp(text1)))
    doc2 = listToString(rm_stop(nlp(text2)))
    w2v = int(nlp(doc1).similarity(nlp(doc2)) * 10.0)

    # Final score: weighted blend favoring the SBERT score
    final_score = int(0.8 * sbert + 0.2 * w2v)
    output.append("Word2Vec:" + str(w2v) + ",final_score:" + str(final_score))

    return listToString(output)
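
# Usage sketch (hypothetical inputs; actual scores depend on the downloaded
# model weights), callable without launching the UI:
#   sim("Photosynthesis converts light energy into chemical energy.",
#       "Plants turn sunlight into chemical energy.")
#   -> "SBERT:<0-10>, Jaccard:<0-10>, Word2Vec:<0-10>,final_score:<0-10>"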


iface = gr.Interface(fn=sim,
                     inputs=["text", "text"],
                     outputs="text")
iface.launch()