Commit
•
a3697ff
1
Parent(s):
45e0954
Update find_similar_news.py
Browse files- find_similar_news.py +12 -5
find_similar_news.py
CHANGED
@@ -43,17 +43,16 @@ def get_milvus_collection():
|
|
43 |
logger.warning('Exiting get_milvus_collection()')
|
44 |
return collection
|
45 |
|
46 |
-
def find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model, top_n: int=
|
47 |
logger.warning('Entering find_similar_news')
|
48 |
search_params = {"metric_type": "IP"}
|
49 |
-
# search_vec = vectorizer.vectorize_(text)
|
50 |
logger.warning('Querying Milvus for most similar results')
|
51 |
results = collection.search([search_vec],
|
52 |
anns_field='article_embed', # name of the vector field (from the schema definition) to run the ANN search against
|
53 |
param=search_params,
|
54 |
limit=top_n,
|
55 |
guarantee_timestamp=1,
|
56 |
-
output_fields=['article_title', '
|
57 |
|
58 |
logger.warning('retrieved search results from Milvus')
|
59 |
logger.warning('Computing cross encoder similarity scores')
|
@@ -64,11 +63,19 @@ def find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_m
|
|
64 |
|
65 |
logger.warning('Generating HTML output')
|
66 |
html_output = ""
|
|
|
67 |
for n, i in enumerate(similarity_idxs):
|
68 |
title_ = results[i].entity.get('article_title')
|
69 |
url_ = results[i].entity.get('article_url')
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
logger.warning('Successfully generated HTML output')
|
73 |
logger.warning('Exiting find_similar_news')
|
74 |
return html_output
|
|
|
43 |
logger.warning('Exiting get_milvus_collection()')
|
44 |
return collection
|
45 |
|
46 |
+
def find_similar_news(text, search_vec, collection, vectorizer, sent_model, ce_model, top_n: int=10):
|
47 |
logger.warning('Entering find_similar_news')
|
48 |
search_params = {"metric_type": "IP"}
|
|
|
49 |
logger.warning('Querying Milvus for most similar results')
|
50 |
results = collection.search([search_vec],
|
51 |
anns_field='article_embed', # name of the vector field (from the schema definition) to run the ANN search against
|
52 |
param=search_params,
|
53 |
limit=top_n,
|
54 |
guarantee_timestamp=1,
|
55 |
+
output_fields=['article_title', 'article_url'])[0] # which fields to return in output
|
56 |
|
57 |
logger.warning('retrieved search results from Milvus')
|
58 |
logger.warning('Computing cross encoder similarity scores')
|
|
|
63 |
|
64 |
logger.warning('Generating HTML output')
|
65 |
html_output = ""
|
66 |
+
article_count = 0
|
67 |
for n, i in enumerate(similarity_idxs):
|
68 |
title_ = results[i].entity.get('article_title')
|
69 |
url_ = results[i].entity.get('article_url')
|
70 |
+
if title_ != text:
|
71 |
+
html_output += f'''<a class="similar-news-item" href="{url_}" target="_blank">{title_}</a><br>
|
72 |
+
'''
|
73 |
+
article_count += 1
|
74 |
+
|
75 |
+
if article_count == 5 :
|
76 |
+
break
|
77 |
+
|
78 |
+
|
79 |
logger.warning('Successfully generated HTML output')
|
80 |
logger.warning('Exiting find_similar_news')
|
81 |
return html_output
|