srikanththirumani
committed on
Commit
•
8c61de9
1
Parent(s):
5ae9fd6
Update app.py
Browse files
app.py
CHANGED
@@ -10,8 +10,6 @@ import spacy
|
|
10 |
import matplotlib.pyplot as plt
|
11 |
import io
|
12 |
import base64
|
13 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
14 |
-
import numpy as np
|
15 |
|
16 |
# Load spaCy model for semantic analysis
|
17 |
nlp = spacy.load("en_core_web_md")
|
@@ -37,11 +35,7 @@ def preprocess_text(text):
|
|
37 |
tokens = word_tokenize(text)
|
38 |
return [word for word in tokens if word not in stop_words]
|
39 |
|
40 |
-
def
|
41 |
-
words1 = preprocess_text(text1)
|
42 |
-
words2 = preprocess_text(text2)
|
43 |
-
vec1 = Counter(words1)
|
44 |
-
vec2 = Counter(words2)
|
45 |
intersection = set(vec1.keys()) & set(vec2.keys())
|
46 |
numerator = sum([vec1[x] * vec2[x] for x in intersection])
|
47 |
sum1 = sum([vec1[x]**2 for x in vec1.keys()])
|
@@ -50,17 +44,29 @@ def calculate_word_similarity(text1, text2):
|
|
50 |
if not denominator:
|
51 |
return 0.0
|
52 |
else:
|
53 |
-
return float(numerator) / denominator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
def calculate_sentence_similarity(text1, text2):
|
56 |
sentences1 = sent_tokenize(text1)
|
57 |
sentences2 = sent_tokenize(text2)
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
64 |
|
65 |
def semantic_similarity(text1, text2):
|
66 |
doc1 = nlp(text1)
|
@@ -74,15 +80,15 @@ def longest_common_subsequence(text1, text2):
|
|
74 |
L = [[0] * (n + 1) for _ in range(m + 1)]
|
75 |
for i in range(1, m + 1):
|
76 |
for j in range(1, n + 1):
|
77 |
-
if
|
78 |
L[i][j] = L[i-1][j-1] + 1
|
79 |
else:
|
80 |
L[i][j] = max(L[i-1][j], L[i][j-1])
|
81 |
lcs = []
|
82 |
i, j = m, n
|
83 |
while i > 0 and j > 0:
|
84 |
-
if
|
85 |
-
lcs.append(
|
86 |
i -= 1
|
87 |
j -= 1
|
88 |
elif L[i-1][j] > L[i][j-1]:
|
|
|
10 |
import matplotlib.pyplot as plt
|
11 |
import io
|
12 |
import base64
|
|
|
|
|
13 |
|
14 |
# Load spaCy model for semantic analysis
|
15 |
nlp = spacy.load("en_core_web_md")
|
|
|
35 |
tokens = word_tokenize(text)
|
36 |
return [word for word in tokens if word not in stop_words]
|
37 |
|
38 |
+
def cosine_similarity(vec1, vec2):
|
|
|
|
|
|
|
|
|
39 |
intersection = set(vec1.keys()) & set(vec2.keys())
|
40 |
numerator = sum([vec1[x] * vec2[x] for x in intersection])
|
41 |
sum1 = sum([vec1[x]**2 for x in vec1.keys()])
|
|
|
44 |
if not denominator:
|
45 |
return 0.0
|
46 |
else:
|
47 |
+
return float(numerator) / denominator
|
48 |
+
|
49 |
+
def calculate_word_similarity(text1, text2):
    """Word-overlap similarity between two texts, as a 0-100 percentage.

    Both texts are reduced to stop-word-free token lists via
    ``preprocess_text``, turned into bag-of-words frequency vectors, and
    compared with ``cosine_similarity``; the raw 0.0-1.0 score is scaled
    to a percentage for display.
    """
    bag1 = Counter(preprocess_text(text1))
    bag2 = Counter(preprocess_text(text2))
    # cosine_similarity returns a value in [0.0, 1.0]; report it as a percent.
    return cosine_similarity(bag1, bag2) * 100
|
56 |
|
57 |
def calculate_sentence_similarity(text1, text2):
    """Average best-match sentence similarity of text1 against text2.

    Each sentence of ``text1`` is scored against every sentence of
    ``text2`` with ``calculate_word_similarity`` (a 0-100 percentage) and
    only its best match is kept; the function returns the mean of those
    best-match scores, or 0.0 when ``text1`` has no sentences.
    """
    def _best_match_score(sentence, candidates):
        # Highest word-level similarity between `sentence` and any candidate
        # sentence; 0 when `candidates` is empty.
        best = 0
        for candidate in candidates:
            best = max(best, calculate_word_similarity(sentence, candidate))
        return best

    source_sentences = sent_tokenize(text1)
    target_sentences = sent_tokenize(text2)
    scores = [_best_match_score(s, target_sentences) for s in source_sentences]
    return sum(scores) / len(scores) if scores else 0.0
|
70 |
|
71 |
def semantic_similarity(text1, text2):
|
72 |
doc1 = nlp(text1)
|
|
|
80 |
L = [[0] * (n + 1) for _ in range(m + 1)]
|
81 |
for i in range(1, m + 1):
|
82 |
for j in range(1, n + 1):
|
83 |
+
if sentences2[j-1] in sentences1:
|
84 |
L[i][j] = L[i-1][j-1] + 1
|
85 |
else:
|
86 |
L[i][j] = max(L[i-1][j], L[i][j-1])
|
87 |
lcs = []
|
88 |
i, j = m, n
|
89 |
while i > 0 and j > 0:
|
90 |
+
if sentences2[j-1] in sentences1:
|
91 |
+
lcs.append(sentences2[j-1])
|
92 |
i -= 1
|
93 |
j -= 1
|
94 |
elif L[i-1][j] > L[i][j-1]:
|