import io
import pickle
import re
from collections import OrderedDict

import gdown
import nltk
import numpy as np
import pandas as pd
import requests
import streamlit as st
import torch
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, CrossEncoder, util

# Sentence tokenizer used to split paragraphs into sentences
nltk.download('punkt')

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Location of the passages file, stored in Streamlit secrets; printed for debugging
purl = st.secrets["graphs_url"]
print(purl)
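# Streamlit reads these secrets from .streamlit/secrets.toml locally (or from the
# app's settings on Streamlit Cloud). A hypothetical entry for this key might look like:
#
#     graphs_url = "https://drive.google.com/uc?export=download&id=..."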


@st.cache
def load_embeddings():
    url = "https://drive.google.com/uc?export=download&id=1z9eoBI07p_YtrdK1ZWZeCRT5T5mu5nhV"
    output = "embeddings.npy"
    gdown.download(url, output, quiet=False)

    corpus_embeddings = np.load(output)
    return corpus_embeddings

@st.cache
def load_data(url):
    #url = "https://drive.google.com/uc?export=download&id=1nIBS9is8YCeiPBqA7MifVC5xeaKWH8uL"
    output = "passages.jsonl"
    gdown.download(url, output, quiet=False)

    df = pd.read_json(output, lines=True)

    df.reset_index(inplace=True, drop=True)
    return df
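# The passages file is expected to be JSON lines with at least a 'text' column
# (the paragraph) and a 'cite' column (the article citation), since those are the
# columns used below. A hypothetical row, for illustration only:
#
#     {"cite": "Doe. 2021. \"An Article.\" Social Forces.", "text": "We argue that..."}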


st.title('Sociology EMERAC')

st.write('This project is a work-in-progress that searches the text of recently published articles from a few sociology journals and retrieves the most relevant paragraphs.')


with st.spinner(text="Loading data..."):
    df = load_data(purl)
    passages = df['text'].values

no_of_graphs = len(df)
no_of_articles = df['cite'].nunique()


notes = f'''Notes:
* I have found three types of searches work best:
    * Phrases or specific topics, such as "inequality in latin america", "race color skin tone measurement", "audit study experiment gender", or "logistic regression or linear probability model".
    * Citations to well-known works, either using author year ("bourdieu 1984") or author idea ("Crenshaw intersectionality").
    * Questions, like "What is a topic model?" or "How did Weber define bureaucracy?"
* The search expands beyond exact matching, so "asia social movements" may return paragraphs on Asian-American politics and South Korean labor unions.
* The first search can take up to 10 seconds as the files load. After that, it's quicker to respond.
* The paragraph most relevant to your search is returned first, along with up to four other related paragraphs from that article.
* The most relevant sentence within each paragraph, as determined by math, is displayed. Click on it to see the full paragraph.
* The results are not exhaustive, and seem to drift off even when you suspect there are more relevant articles :man-shrugging:.
* The dataset currently includes {no_of_graphs:,} paragraphs from {no_of_articles:,} articles published in the last five years in *Mobilization*, *Social Forces*, *Social Problems*, *Sociology of Race and Ethnicity*, *Gender and Society*, *Socius*, *JHSB*, *Annual Review of Sociology*, and the *American Sociological Review*.
* Behind the scenes, the semantic search uses [text embeddings](https://www.sbert.net) with a [retrieve & re-rank](https://colab.research.google.com/github/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank/retrieve_rerank_simple_wikipedia.ipynb) process to find the best matches.
* Let [me](mailto:neal.caren@unc.edu) know what you think or if it looks broken.
'''

st.markdown(notes)


def sent_trans_load():
    # The bi-encoder maps the query into the same vector space as the passage
    # embeddings so it can be used for semantic search
    bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens (model max is 512)
    return bi_encoder
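# Illustrative sketch (not called anywhere in the app): the pre-computed
# embeddings.npy downloaded above was presumably built with this same bi-encoder
# over the same passages, roughly along these lines. The exact settings used to
# generate that file are an assumption.
#
#     bi_encoder = sent_trans_load()
#     corpus_embeddings = bi_encoder.encode(passages, show_progress_bar=True)
#     np.save("embeddings.npy", corpus_embeddings)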

def sent_cross_load():
    # The cross-encoder scores (query, passage) pairs directly and is used to
    # re-rank the candidates retrieved by the bi-encoder
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return cross_encoder
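# Illustrative sketch of how the cross-encoder is used in search() below: predict()
# takes a list of [query, passage] string pairs and returns one relevance score per
# pair, with higher scores meaning more relevant. The example strings are made up.
#
#     cross_encoder = sent_cross_load()
#     scores = cross_encoder.predict([
#         ["What is a topic model?", "Topic models are an automated text-analysis method..."],
#         ["What is a topic model?", "We interviewed 40 nurses about shift work..."],
#     ])
#     # scores[0] should be higher than scores[1]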







with st.spinner(text="Loading embeddings..."):
    corpus_embeddings = load_embeddings()





def search(query, top_k=50):

    ##### Sematic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True).to(device)


    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query
    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Output of top-5 hits from re-ranker
    print("\n-------------------------\n")
    print("Search Results")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

    # Group the top 30 re-ranked paragraphs by article citation
    hd = OrderedDict()
    for hit in hits[0:30]:

        row_id = hit['corpus_id']
        cite = df.loc[row_id]['cite']
        #graph = passages[row_id]
        graph = df.loc[row_id]['text']

        # Find the sentence in the paragraph that best matches the query and bold it
        ab_sentences = sent_tokenize(graph)
        cross_inp = [[query, s] for s in ab_sentences]
        cross_scores = cross_encoder.predict(cross_inp)
        thesis = pd.Series(cross_scores, index=ab_sentences).idxmax()
        graph = graph.replace(thesis, f'**{thesis}**')

        if cite in hd:
            hd[cite].append(graph)
        else:
            hd[cite] = [graph]

    for cite, graphs in hd.items():
        # Light cleanup of the citation string before displaying it
        cite = cite.replace(",  ", '. "').replace(', Social ', '", Social ')
        st.write(cite)

        for graph in graphs[:5]:
            # Recover the bolded thesis sentence to use as the expander label
            thesis = re.findall(r'\*\*(.*?)\*\*', graph)[0]

            with st.expander(thesis):
                st.write(f'* {graph}')
        st.write('')



search_query = st.text_input('Enter your search phrase:')
if search_query != '':
    with st.spinner(text="Searching and sorting results."):
        bi_encoder = sent_trans_load()
        cross_encoder = sent_cross_load()
        search(search_query)
        # st.image handles local files; markdown image links only work with URLs
        st.image('emerac.png')