import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import util

class SentenceSimilarity:

    def __init__(self, sentence1, sentence2):
        self.sentence1 = sentence1
        self.sentence2 = sentence2

        # The model has to be chosen before the tokenizer and weights can be
        # loaded; otherwise self.model_name is undefined at this point.
        self.model_selection()
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
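
    # Note (optional): Streamlit reruns this script on every interaction, so the
    # tokenizer and model above are reloaded each time. A common pattern, shown
    # here only as a sketch and not part of the original app, is to cache the
    # load with Streamlit's st.cache_resource:
    #
    #     @st.cache_resource
    #     def load_model(model_name):
    #         return AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name)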
    
    def model_selection(self):
        available_models = [
            "distilbert-base-uncased",
            "bert-base-uncased",
            "sentence-transformers/all-MiniLM-L6-v2",
            "sentence-transformers/all-mpnet-base-v2",
            "intfloat/multilingual-e5-base",
            "togethercomputer/m2-bert-80M-32k-retrieval",
            "togethercomputer/m2-bert-80M-8k-retrieval",
            "togethercomputer/m2-bert-80M-2k-retrieval",
        ]
        self.model_name = st.sidebar.selectbox(
            label="Select Your Models",
            options=available_models,
        )


    def tokenize(self):
        tokenized1 = self.tokenizer(
            self.sentence1,
            return_tensors='pt',
            padding=True,
            truncation=True
        )
        tokenized2 = self.tokenizer(
            self.sentence2,
            return_tensors='pt',
            padding=True,
            truncation=True
        )
        return tokenized1, tokenized2

    def get_embeddings(self):
        tokenized1, tokenized2 = self.tokenize()
        with torch.no_grad():
            # Mean-pool the token embeddings into a single vector per sentence.
            embeddings1 = self.model(**tokenized1).last_hidden_state.mean(dim=1)
            embeddings2 = self.model(**tokenized2).last_hidden_state.mean(dim=1)
        return embeddings1, embeddings2
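
    # Optional alternative (a sketch, not used by the app): weight the mean by the
    # attention mask so that padding tokens are ignored when inputs are batched.
    # The plain .mean(dim=1) above averages over every token position, padding included.
    def get_masked_embeddings(self):
        tokenized1, tokenized2 = self.tokenize()

        def masked_mean(tokens):
            hidden = self.model(**tokens).last_hidden_state        # (batch, tokens, hidden)
            mask = tokens["attention_mask"].unsqueeze(-1).float()  # (batch, tokens, 1)
            return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

        with torch.no_grad():
            return masked_mean(tokenized1), masked_mean(tokenized2)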
    
    def get_similarity_scores(self):
        embeddings1, embeddings2 = self.get_embeddings()
        scores = util.cos_sim(embeddings1, embeddings2)
        return scores

    
    def results(self):
        scores = self.get_similarity_scores()
        statement = f"The sentences are {scores.item() * 100:.2f}% similar"
        return statement
    

class UI:

    def __init__(self):
        st.title("Sentence Similarity Checker")
        st.caption("You can use this to check the similarity between a resume and a job description.")
    
    def get(self):
        self.sentence1 = st.text_area(
            label="Sentence 1",
            help="The reference text; the second text will be compared against it"
        )
        self.sentence2 = st.text_area(
            label="Sentence 2",
            help="The text to compare against the first one"
        )
        self.button = st.button(
            label="Check",
            help='Check Sentence Similarity'
        )

    def result(self):
        self.get()
        ss = SentenceSimilarity(self.sentence1, self.sentence2)

        if self.button:
            st.text(ss.results())


ui = UI()
ui.result()
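
# To run the app locally (assuming this file is saved as app.py):
#
#     streamlit run app.py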