import nltk
import streamlit as st
import ujson
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Make sure the NLTK data used below (`stopwords.words`, `word_tokenize`) is
# available locally.
# NOTE(review): recent NLTK releases are reported to need the 'punkt_tab'
# resource for `word_tokenize` -- confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

# Shared text-processing objects used by `search_data`.
stemmer = PorterStemmer()
stop_words = stopwords.words('english')
tfidf = TfidfVectorizer()


def _load_json(path):
    """Return the parsed contents of the JSON file at *path*.

    The file is read as UTF-8 explicitly so that loading does not depend on
    the platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return ujson.load(f)


# Search data. `search_data` looks a stemmed term up in an index to get
# document ids, then uses each id to fetch that document's stemmed text.
pub_list_first_stem = _load_json('publication_list_stemmed.json')
pub_index = _load_json('publication_indexed_dictionary.json')
author_list_first_stem = _load_json('author_list_stemmed.json')
author_index = _load_json('author_indexed_dictionary.json')

# Display data. `show_results` indexes each of these by document id to build
# a result card.
author_name = _load_json('author_names.json')
pub_name = _load_json('pub_name.json')
pub_url = _load_json('pub_url.json')
pub_cu_author = _load_json('pub_cu_author.json')
pub_date = _load_json('pub_date.json')


def search_data(input_text, operator_val, search_type):
    """Search the publication or author index and score the matching documents.

    The query is lower-cased and split on whitespace. Each word is tokenised,
    stripped of English stop words and stemmed to form an index key; words
    that reduce to nothing (for example pure stop words) are ignored.

    Args:
        input_text: Raw query string typed by the user.
        operator_val: ``2`` returns documents that match *any* query term;
            every other value returns only documents that match *every*
            query term.
        search_type: ``"publication"`` or ``"author"``; selects which index
            and stemmed corpus are searched. Any other value yields no
            results.

    Returns:
        A dict mapping each matching document id to a one-element array that
        holds the TF-IDF cosine similarity between that document and the
        stemmed query. The dict is empty when the query has fewer than two
        words or when nothing matches.
    """
    tokens = input_text.lower().split()
    if len(tokens) < 2:
        # An empty query stays silent; a single word gets the hint.
        if tokens:
            st.warning("Please enter at least 2 words to apply the operator.")
        return {}

    if search_type == "publication":
        index, corpus = pub_index, pub_list_first_stem
    elif search_type == "author":
        index, corpus = author_index, author_list_first_stem
    else:
        return {}

    # Drop words that stem to nothing so they can neither veto an
    # "every term" match nor zero out the relevance scores.
    terms = [term for term in map(_stem_query_token, tokens) if term]
    if not terms:
        return {}
    postings = [index.get(term) or [] for term in terms]

    if operator_val == 2:
        # Any term: union of all postings, first-seen order, no duplicates.
        doc_ids = list(dict.fromkeys(
            doc_id for hits in postings for doc_id in hits
        ))
    else:
        # Every term: a document must appear in the postings of each term.
        common = set(postings[0]).intersection(*postings[1:])
        doc_ids = [
            doc_id for doc_id in dict.fromkeys(postings[0]) if doc_id in common
        ]

    if not doc_ids:
        return {}
    # Score against the whole query so every term contributes to the ranking.
    return _score_documents(doc_ids, corpus, " ".join(terms))


def _stem_query_token(token):
    """Return the index key for one query word: its non-stop-word stems."""
    return " ".join(
        stemmer.stem(word)
        for word in word_tokenize(token)
        if word not in stop_words
    )


def _score_documents(doc_ids, corpus, query):
    """Map each id in *doc_ids* to its TF-IDF cosine similarity with *query*."""
    documents = [corpus[doc_id] for doc_id in doc_ids]
    # The vectorizer is fitted on the candidate documents only, so the
    # weights are relative to this result set.
    doc_matrix = tfidf.fit_transform(documents)
    similarities = cosine_similarity(doc_matrix, tfidf.transform([query]))
    # Each row of `similarities` is a one-element array, one row per document.
    return dict(zip(doc_ids, similarities))


def app():
    """Render the search page.

    Draws the logo and tagline, the query box and the two radio filters, runs
    the search and shows its results when the SEARCH button is pressed, and
    finishes with the credits footer.
    """
    # Header: logo followed by the centred tagline.
    logo = Image.open('Fordham-University-Logo.png')
    st.image(logo)
    st.markdown(
        "<p style='text-align: center;'> Uncover the brilliance: Explore profiles, groundbreaking work, and cutting-edge research by the exceptional minds of Fordham University.</p>",
        unsafe_allow_html=True,
    )

    # Query controls.
    input_text = st.text_input("Search research:", key="query_input")
    operator_val = st.radio(
        "Search Filters",
        ['Exact', 'Relevant'],
        index=1,
        key="operator_input",
        horizontal=True,
    )
    search_type = st.radio(
        "Search in:",
        ['Publications', 'Authors'],
        index=0,
        key="search_type_input",
        horizontal=True,
    )

    if st.button("SEARCH"):
        # search_data takes a numeric operator: 1 for 'Exact', 2 otherwise.
        operator_code = 1 if operator_val == 'Exact' else 2
        # Radio label -> search_type value understood by search_data.
        targets = {"Publications": "publication", "Authors": "author"}
        if search_type in targets:
            output_data = search_data(
                input_text, operator_code, targets[search_type]
            )
        else:
            output_data = {}

        show_results(output_data, search_type)

    st.markdown(
        "<p style='text-align: center;'> Brought to you with ❤ by <a href='https://github.com/maladeep'>Mala Deep</a> | Data © Coventry University </p>",
        unsafe_allow_html=True,
    )


def show_results(output_data, search_type):
    """Display search results as cards, three per row, best score first.

    Args:
        output_data: Mapping of document id to a score object; the score's
            first element is shown as the ranking and the mapping's values
            are used as the sort key.
        search_type: ``"Publications"`` or ``"Authors"``; selects which name
            field is shown on each card. For any other value only the
            ranking line is drawn.
    """
    ranked = sorted(
        output_data.items(), key=lambda entry: entry[1], reverse=True
    )

    st.info(f"Showing results for: {len(ranked)}")

    cards_per_row = 3
    shown = 0
    for position, (doc_id, score) in enumerate(ranked):
        slot = position % cards_per_row
        if slot == 0:
            # Start a new row of cards, preceded by a divider.
            st.write("---")
            row = st.columns(cards_per_row, gap="large")

        with row[slot]:
            # Publication and author cards differ only in the name field.
            if search_type == "Publications":
                byline = pub_cu_author
            elif search_type == "Authors":
                byline = author_name
            else:
                byline = None

            if byline is not None:
                st.caption(f"{pub_date[doc_id].strip()}")
                st.markdown(f"**{byline[doc_id].strip()}**")
                st.markdown(f"*{pub_name[doc_id].strip()}*")
                st.markdown(f"**{pub_url[doc_id]}**")
            st.markdown(f"Ranking: {score[0]:.2f}")

        shown += 1

    if not shown:
        st.info("No results found. Please try again.")
    else:
        st.info(f"Results shown for: {shown}")


# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    app()