NealCaren commited on
Commit
3390aef
1 Parent(s): c66477c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -52,12 +52,14 @@ no_of_articles = len(df['cite'].value_counts())
52
 
53
 
54
  notes = f'''Notes:
55
- * To get the best results, search like you are using Google. My best luck comes from phrases like "social movements and public opinion", "inequality in latin america", "race color skin tone measurement", "audit study experiment gender", "Crenshaw intersectionality", or "logistic regression or linear probability model". You can also use questions like "What is a topic model?" or "How did Weber define bureaucracy?"
56
- * The dataset currently includes articles published in the last five years in *Mobilization*, *Social Forces*, *Social Problems*, *Sociology of Race and Ethnicity*, *Gender and Society*, *Socius*, *JHSB*, *Annual Review of Sociology*, and the *American Sociological Review*, totaling {no_of_graphs:,} paragraphs from a {no_of_articles:,} articles.
 
 
57
  * The most relevant paragraph to your search is returned first, along with up to four other related paragraphs from that article.
58
  * The most relevant sentence within each paragraph, as determined by math, is bolded.
 
59
  * Behind the scenes, the semantic search uses [text embeddings](https://www.sbert.net) with a [retrieve & re-rank](https://colab.research.google.com/github/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank/retrieve_rerank_simple_wikipedia.ipynb) process to find the best matches.
60
- * The first search can take up to 30 seconds as the files load. After that, it's quicker to respond.
61
  * Let [me](mailto:neal.caren@unc.edu) know what you think or if it looks broken.
62
  '''
63
 
@@ -102,7 +104,7 @@ with st.spinner(text="Loading embeddings..."):
102
 
103
 
104
 
105
- def search(query, top_k=40):
106
 
107
  ##### Semantic Search #####
108
  # Encode the query using the bi-encoder and find potentially relevant passages
@@ -126,7 +128,7 @@ def search(query, top_k=40):
126
  hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
127
 
128
  hd = OrderedDict()
129
- for hit in hits[0:20]:
130
 
131
  row_id = hit['corpus_id']
132
  cite = df.loc[row_id]['cite']
 
52
 
53
 
54
  notes = f'''Notes:
55
+ * To get the best results, search like you are using Google rather than looking for exact phrases.
56
+ * My best luck comes from searches like "social movements and public opinion", "inequality in latin america", "race color skin tone measurement", "audit study experiment gender", "Crenshaw intersectionality", or "logistic regression or linear probability model".
57
+ * You can also use questions like "What is a topic model?" or "How did Weber define bureaucracy?"
58
+ * The first search can take up to 10 seconds as the files load. After that, it's quicker to respond.
59
  * The most relevant paragraph to your search is returned first, along with up to four other related paragraphs from that article.
60
  * The most relevant sentence within each paragraph, as determined by math, is bolded.
61
+ * The dataset currently includes articles published in the last five years in *Mobilization*, *Social Forces*, *Social Problems*, *Sociology of Race and Ethnicity*, *Gender and Society*, *Socius*, *JHSB*, *Annual Review of Sociology*, and the *American Sociological Review*, totaling {no_of_graphs:,} paragraphs from {no_of_articles:,} articles.
62
  * Behind the scenes, the semantic search uses [text embeddings](https://www.sbert.net) with a [retrieve & re-rank](https://colab.research.google.com/github/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank/retrieve_rerank_simple_wikipedia.ipynb) process to find the best matches.
 
63
  * Let [me](mailto:neal.caren@unc.edu) know what you think or if it looks broken.
64
  '''
65
 
 
104
 
105
 
106
 
107
+ def search(query, top_k=50):
108
 
109
  ##### Semantic Search #####
110
  # Encode the query using the bi-encoder and find potentially relevant passages
 
128
  hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
129
 
130
  hd = OrderedDict()
131
+ for hit in hits[0:30]:
132
 
133
  row_id = hit['corpus_id']
134
  cite = df.loc[row_id]['cite']