Upload 3 files
- README.md +40 -7
- app.py +85 -0
- requirements.txt +3 -0
README.md
CHANGED
@@ -1,13 +1,46 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Utafiti
+emoji: π
+colorFrom: red
+colorTo: blue
 sdk: streamlit
-sdk_version: 1.
+sdk_version: 1.17.0
 app_file: app.py
 pinned: false
-license:
+license: apache-2.0
 ---
 
-
+
+# Utafiti (Swahili for Research)
+
+- This is a Streamlit app that uses the sentence-transformers library to find similar sentences in a corpus of text. It is based on the [Sentence-BERT](https://www.sbert.net/) model.
+- The corpus is a collection of research paper titles in the realm of mathematics and computer science. Only the titles are used for similarity search; the abstracts are then fetched by matching title indices to abstract (summary) indices stored as pickle files in the resources directory.
+- For query optimization, the faiss (Facebook AI Similarity Search) library is used to index the corpus. Additionally, clustering using Voronoi cells has been implemented to further optimize the search (see the indexing sketch after this file's diff).
+- Hosted on HuggingFace Spaces at: https://huggingface.co/spaces/eolang/utafiti
+
+## Additional details:
+- Encoding size: 768
+- Corpus size: 28,000
+- Libraries: Sentence-BERT (Sentence Transformers), Faiss, Torch, NumPy, Streamlit
+- Model: multi-qa-MiniLM-L6-cos-v1
+
+### Possible Fixes (*This is a hobby project, so no promises* :-)
+1. Inclusion of direct links to arXiv, Papers with Code, and Google Scholar for the papers
+2. An API of some sort to allow for easy integration with other projects (maybe in the infamous version 2.0)
+3. A more robust search engine (maybe in the infamous version 2.0)
+
+
+### Random things to note
+- I'm not running this on Streamlit Sharing due to an issue with the faiss-cpu library. I **may** migrate it if I find a fix.
+- I'm not the best developer, so the code is not the best. I'm still learning and open to suggestions and improvements.
+- I'm not a professional mathematician and/or researcher and/or scientist, so the corpus, algorithms, etc. may not be the best. I'm still learning and open to suggestions and improvements.
+
+### Acknowledgements
+- [Sentence-BERT](https://www.sbert.net/)
+- [multi-qa-MiniLM-L6-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1)
+
+
+## For feedback and suggestions, please contact me on:
+- [LinkedIn](https://www.linkedin.com/in/eolang/)
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
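The "Voronoi cells" the README refers to are faiss's IVF (inverted file) indexes: the corpus embeddings are partitioned into `nlist` cells around k-means centroids, and a query only scans the `nprobe` nearest cells instead of the whole corpus. The indexing script itself is not part of this commit, so the following is only a minimal sketch of how such an index could be built over sentence-transformer embeddings; the placeholder corpus and the `nlist`/`nprobe` values are illustrative assumptions, and only the model name comes from the README.

```python
# Minimal sketch (not from this commit): building a Voronoi-cell (IVF)
# faiss index over sentence-transformer embeddings. The corpus is a
# placeholder; only the model name comes from the README.
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
corpus = [f"placeholder paper title {i}" for i in range(256)]

# Encode the corpus; faiss expects a float32 matrix of shape (n, d).
embeddings = model.encode(corpus).astype(np.float32)
d = embeddings.shape[1]

# IndexIVFFlat partitions the embedding space into nlist Voronoi cells
# around centroids learned by k-means during train().
nlist = 8  # assumed value; tuned to the real corpus size in practice
quantizer = faiss.IndexFlatL2(d)  # used to assign vectors to cells
index = faiss.IndexIVFFlat(quantizer, d, nlist)
index.train(embeddings)
index.add(embeddings)

# At query time only the nprobe nearest cells are scanned, trading a
# little recall for much faster search than an exhaustive flat index.
index.nprobe = 2
query_vec = model.encode(["natural language processing"]).astype(np.float32)
D, I = index.search(query_vec, 5)  # distances and corpus indices
print([corpus[i] for i in I[0]])
```

Since app.py below only loads a pre-built index from resources/index.sav, treating it as an IndexIVFFlat is an inference from the README's wording, not something the diff confirms.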
app.py
ADDED
@@ -0,0 +1,85 @@
+import pickle
+
+import streamlit as st
+
+
+# Load pre-computed resources (aligned by corpus index) using pickle
+with open('resources/titles.pkl', 'rb') as f:
+    titles = pickle.load(f)
+
+with open('resources/embeddings.pkl', 'rb') as f:
+    embeddings = pickle.load(f)
+
+with open('resources/authors.pkl', 'rb') as f:
+    authors = pickle.load(f)
+
+with open('resources/years.pkl', 'rb') as f:
+    years = pickle.load(f)
+
+with open('resources/summary.pkl', 'rb') as f:
+    summary = pickle.load(f)
+
+with open('resources/titles_urls.pkl', 'rb') as f:
+    title_urls = pickle.load(f)
+
+
+# Pre-built faiss index and sentence-transformer model
+index = pickle.load(open('resources/index.sav', 'rb'))
+model = pickle.load(open('resources/model.sav', 'rb'))
+
+
+# Query function: encode the query and fetch the k nearest corpus entries
+def query(query_text, k=5):
+    enc = model.encode([query_text])
+    D, I = index.search(enc, k)  # D: distances, I: corpus indices
+
+    results = []
+    for i in range(k):
+        results.append(
+            {
+                "Title": titles[I[0][i]],
+                "Author": authors[I[0][i]],
+                "Year": years[I[0][i]],
+                "Summary": summary[I[0][i]],
+                "Title URL": title_urls[I[0][i]]
+            }
+        )
+
+    return results
+
+
+def main():
+    # Streamlit app
+    st.title("Research Paper/Publication Search Engine")
+
+    # App description
+    st.markdown("<p style='text-align: justify;'>This is a search engine that uses a pre-trained sentence transformer model to encode the user query and the abstracts of the papers. The encoded vectors are then used to find the most similar papers to the user query.</p>", unsafe_allow_html=True)
+
+    # DISCLAIMER: data capture (currently disabled)
+    # st.markdown("<p style='text-align: justify;'>Please note that the search engine captures the user query and stores it in a text file. This is done to build a collection of user queries for future use when building a bigger data pool for the search engine.</p>", unsafe_allow_html=True)
+
+    query_text = st.text_input("Enter your query e.g. NLP")
+
+    if st.button("Search"):
+        results = query(query_text)
+
+        for result in results:
+            st.write("Title: " + result["Title"])
+            st.write("Author: " + result["Author"])
+            st.write("Year Published: " + str(result["Year"]))
+
+            # Justified text for the summary
+            st.write("Abstract")
+            st.markdown(f"<p style='text-align: justify;'>{result['Summary']}</p>", unsafe_allow_html=True)
+            st.markdown(f"<a href='{result['Title URL']}'>Google Search</a>", unsafe_allow_html=True)
+            st.write("--" * 50)
+
+    # Store the user query in a text file (currently disabled) to build a
+    # collection of user queries for a bigger data pool for the search engine
+    # with open('resources/user_query.txt', 'a') as f:
+    #     f.write(query_text + "\n")
+
+
+if __name__ == "__main__":
+    main()
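A note on the resource files app.py unpickles: the search only works because every list (titles, authors, years, summaries, URLs) shares one ordering, so a position returned by the faiss search indexes all of them consistently. The preparation script is not in this commit; below is a minimal, hypothetical sketch of how such aligned pickles could be produced. The two-paper dataset is made up, and the index/model pickles (index.sav, model.sav) are left out.

```python
# Hypothetical sketch (not from this commit): producing the aligned pickle
# files that app.py loads. The two papers are made-up examples; only the
# file names and the align-by-index scheme come from app.py itself.
import os
import pickle

from sentence_transformers import SentenceTransformer

papers = [
    {"title": "Attention Is All You Need", "author": "Vaswani et al.",
     "year": 2017, "summary": "Proposes the Transformer architecture.",
     "url": "https://www.google.com/search?q=Attention+Is+All+You+Need"},
    {"title": "Deep Residual Learning for Image Recognition",
     "author": "He et al.", "year": 2015,
     "summary": "Introduces residual connections for very deep networks.",
     "url": "https://www.google.com/search?q=Deep+Residual+Learning"},
]

# One list per field, all in the same order, so a corpus index returned by
# the faiss search addresses every list consistently.
fields = {
    "titles": [p["title"] for p in papers],
    "authors": [p["author"] for p in papers],
    "years": [p["year"] for p in papers],
    "summary": [p["summary"] for p in papers],
    "titles_urls": [p["url"] for p in papers],
}

# Only the titles are embedded, matching the README's description.
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
fields["embeddings"] = model.encode(fields["titles"])

os.makedirs("resources", exist_ok=True)
for name, data in fields.items():
    with open(f"resources/{name}.pkl", "wb") as f:
        pickle.dump(data, f)
```

Keeping one list per field, rather than one list of dicts, matches what app.py expects when it looks up titles[I[0][i]], authors[I[0][i]], and so on.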
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+faiss-cpu
+numpy
+sentence-transformers