paragraphs / app.py
Neal Caren
fonting
9b176b9
import streamlit as st
import numpy as np
import re
import pickle
from collections import OrderedDict
import io
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
from nltk.tokenize import sent_tokenize
import nltk
import gdown
import requests
from PIL import Image
# Trying to figure out some CSS stuff
st.markdown(
"""
<style>
.streamlit-expanderHeader {
font-size: medium;
}
</style>
""",
unsafe_allow_html=True,
)
nltk.download('punkt')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import pandas as pd
purl = st.secrets["graphs_url"]
print(purl)
@st.cache
def load_embeddings():
url = "https://drive.google.com/uc?export=download&id=1z9eoBI07p_YtrdK1ZWZeCRT5T5mu5nhV"
output = "embeddings.npy"
gdown.download(url, output, quiet=False)
corpus_embeddings = np.load(output)
return corpus_embeddings
@st.cache
def load_data(url):
#url = "https://drive.google.com/uc?export=download&id=1nIBS9is8YCeiPBqA7MifVC5xeaKWH8uL"
output = "passages.jsonl"
gdown.download(url, output, quiet=False)
df = pd.read_json(output, lines=True)
df.reset_index(inplace=True, drop=True)
return df
st.title('Sociology EMERAC')
st.write('This project is a work-in-progress that searches the text of recently-published articles from a few sociology journals and retrieves the most relevant paragraphs.')
with st.spinner(text="Loading data..."):
df = load_data(purl)
passages = df['text'].values
no_of_graphs=len(df)
no_of_articles = len(df['cite'].value_counts())
notes = f'''Notes:
* I have found three types of searches work best:
* Phrases or specific topics, such as "inequality in latin america", "race color skin tone measurement", "audit study experiment gender", or "logistic regression or linear probability model".
* Citations to well-known works, either using author year ("bourdieu 1984") or author idea ("Crenshaw intersectionality")
* Questions, like "What is a topic model?" or "How did Weber define bureaucracy?"
* The search expands beyond exact matching, so "asia social movements" may return paragraphs on Asian-Americans politics and South Korean labor unions.
* The first search can take up to 10 seconds as the files load. After that, it's quicker to respond.
* The most relevant paragraph to your search is returned first, along with up to four other related paragraphs from that article.
* The most relevant sentence within each paragraph, as determined by math, is displayed. Click on it to see the full paragraph.
* The results are not exhaustive, and seem to drift off even when you suspect there are more relevant articles :man-shrugging:.
* The dataset currently includes {no_of_graphs:,} paragraphs from {no_of_articles:,} published in the last five years in *Mobilization*, *Social Forces*, *Social Problems*, *Sociology of Race and Ethnicity*, *Gender and Society*, *Socius*, *JHSB*, *Annual Review of Sociology*, and the *American Sociological Review*.
* Behind the scenes, the semantic search uses [text embeddings](https://www.sbert.net) with a [retrieve & re-rank](https://colab.research.google.com/github/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank/retrieve_rerank_simple_wikipedia.ipynb) process to find the best matches.
* Let [me](mailto:neal.caren@unc.edu) know what you think or it looks broken.
'''
st.markdown(notes)
def sent_trans_load():
#We use the Bi-Encoder to encode all passages, so that we can use it with sematic search
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256 #Truncate long passages to 256 tokens, max 512
return bi_encoder
def sent_cross_load():
#We use the Bi-Encoder to encode all passages, so that we can use it with sematic search
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
return cross_encoder
with st.spinner(text="Loading embeddings..."):
corpus_embeddings = load_embeddings()
def search(query, top_k=50):
##### Sematic Search #####
# Encode the query using the bi-encoder and find potentially relevant passages
question_embedding = bi_encoder.encode(query, convert_to_tensor=True).to(device)
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
hits = hits[0] # Get the hits for the first query
##### Re-Ranking #####
# Now, score all retrieved passages with the cross_encoder
cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
cross_scores = cross_encoder.predict(cross_inp)
# Sort results by the cross-encoder scores
for idx in range(len(cross_scores)):
hits[idx]['cross-score'] = cross_scores[idx]
# Output of top-5 hits from re-ranker
print("\n-------------------------\n")
print("Search Results")
hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
hd = OrderedDict()
for hit in hits[0:30]:
row_id = hit['corpus_id']
cite = df.loc[row_id]['cite']
#graph = passages[row_id]
graph = df.loc[row_id]['text']
# Find best sentence
ab_sentences= [s for s in sent_tokenize(graph)]
cross_inp = [[query, s] for s in ab_sentences]
cross_scores = cross_encoder.predict(cross_inp)
thesis = pd.Series(cross_scores, ab_sentences).sort_values().index[-1]
graph = graph.replace(thesis, f'**{thesis}**')
if cite in hd:
hd[cite].append(graph)
else:
hd[cite] = [graph]
for cite, graphs in hd.items():
cite = cite.replace(", ", '. "').replace(', Social ', '", Social ')
st.write(cite)
for graph in graphs[:5]:
# refind the Thesis
thesis = re.findall('\*\*(.*?)\*\*', graph)[0]
with st.expander(thesis):
st.write(f'> {graph}')
st.write('')
# print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))
search_query = st.text_input('Enter your search phrase:')
if search_query!='':
with st.spinner(text="Searching and sorting results."):
placeholder = st.empty()
with placeholder.container():
st.image('https://www.dropbox.com/s/yndn6lkesjga9a6/emerac.png?raw=1')
bi_encoder = sent_trans_load()
cross_encoder = sent_cross_load()
search(search_query)
placeholder.empty()