Spaces:
Running
Running
eljanmahammadli
committed on
Commit
•
caa635d
1
Parent(s):
c38b78d
changed similarity to sentence transformers
Browse files- __pycache__/utils.cpython-311.pyc +0 -0
- requirements.txt +2 -1
- utils.py +10 -1
__pycache__/utils.cpython-311.pyc
ADDED
Binary file (13.7 kB). View file
|
|
requirements.txt
CHANGED
@@ -20,4 +20,5 @@ spacy
|
|
20 |
textstat
|
21 |
plotly
|
22 |
tqdm
|
23 |
-
pymupdf
|
|
|
|
20 |
textstat
|
21 |
plotly
|
22 |
tqdm
|
23 |
+
pymupdf
|
24 |
+
sentence-transformers
|
utils.py
CHANGED
@@ -9,10 +9,12 @@ from collections import Counter
|
|
9 |
import numpy as np
|
10 |
import asyncio
|
11 |
import nltk
|
|
|
12 |
|
13 |
nltk.download('punkt')
|
14 |
|
15 |
WORD = re.compile(r"\w+")
|
|
|
16 |
|
17 |
|
18 |
# returns cosine similarity of two vectors
|
@@ -53,6 +55,13 @@ def cosineSim(text1, text2):
|
|
53 |
cosine = get_cosine(vector1, vector2)
|
54 |
return cosine
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
def get_soup_requests(url):
|
57 |
page = requests.get(url)
|
58 |
if page.status_code == 200:
|
@@ -130,7 +139,7 @@ def googleSearch(
|
|
130 |
urlList.append(url)
|
131 |
scoreArray.append([0] * len(sentences))
|
132 |
urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
|
133 |
-
scoreArray[urlList.index(url)][i] =
|
134 |
sentence, snippet
|
135 |
)
|
136 |
else:
|
|
|
9 |
import numpy as np
|
10 |
import asyncio
|
11 |
import nltk
|
12 |
+
from sentence_transformers import SentenceTransformer, util
|
13 |
|
14 |
nltk.download('punkt')
|
15 |
|
16 |
WORD = re.compile(r"\w+")
|
17 |
+
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
18 |
|
19 |
|
20 |
# returns cosine similarity of two vectors
|
|
|
55 |
cosine = get_cosine(vector1, vector2)
|
56 |
return cosine
|
57 |
|
58 |
+
def sentence_similarity(text1, text2):
|
59 |
+
embedding_1= model.encode(text1, convert_to_tensor=True)
|
60 |
+
embedding_2 = model.encode(text2, convert_to_tensor=True)
|
61 |
+
|
62 |
+
o = util.pytorch_cos_sim(embedding_1, embedding_2)
|
63 |
+
return round(o.item(), 2)
|
64 |
+
|
65 |
def get_soup_requests(url):
|
66 |
page = requests.get(url)
|
67 |
if page.status_code == 200:
|
|
|
139 |
urlList.append(url)
|
140 |
scoreArray.append([0] * len(sentences))
|
141 |
urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
|
142 |
+
scoreArray[urlList.index(url)][i] = sentence_similarity(
|
143 |
sentence, snippet
|
144 |
)
|
145 |
else:
|