NealCaren commited on
Commit
113691b
1 Parent(s): 9608e05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -18
app.py CHANGED
@@ -22,19 +22,46 @@ nltk.download('punkt')
22
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
 
24
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  st.title('Sociology Paragraph Search')
26
 
27
  st.write('This project is a work-in-progress that searches the text of recently-published articles from a few sociology journals and retrieves the most relevant paragraphs.')
28
 
29
- st.markdown('''Notes:
 
 
 
 
 
 
 
 
 
30
  * To get the best results, search like you are using Google. My best luck comes from phrases like "social movements and public opinion", "inequality in latin america", "race color skin tone measurement", "audit study experiment gender", "Crenshaw intersectionality", or "logistic regression or linear probability model". You can also use questions like "What is a topic model?" or "How did Weber define bureaucracy?"
31
- * The dataset currently includes articles published in the last five years in *Mobilization*, Social Forces, Social Problems, Sociology of Race and Ethnicity, Gender and Society, Socius, JHSB, Annual Review of Sociology, and the American Sociological Review, totaling more than 100,000 paragraphs from a few thousand articles.
32
  * The most relevant paragraph to your search is returned first, along with up to four other related paragraphs from that article.
33
  * The most relevant sentence within each paragraph, as determined by math, is bolded.
34
  * Behind the scenes, the semantic search uses [text embeddings](https://www.sbert.net) with a [retrieve & re-rank](https://colab.research.google.com/github/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank/retrieve_rerank_simple_wikipedia.ipynb) process to find the best matches.
35
  * The first search can take up to 30 seconds as the files load. After that, it's quicker to respond.
36
  * Let [me](mailto:neal.caren@unc.edu) know what you think or it looks broken.
37
- ''')
 
 
38
 
39
 
40
  def sent_trans_load():
@@ -49,24 +76,9 @@ def sent_cross_load():
49
  return cross_encoder
50
 
51
 
52
- @st.cache
53
- def load_data():
54
- #df = pd.read_json('https://www.dropbox.com/s/82lwbaym3b1o6uq/passages.jsonl?raw=1', lines=True)
55
-
56
- url = "https://drive.google.com/uc?export=download&id=1nIBS9is8YCeiPBqA7MifVC5xeaKWH8uL"
57
- output = "passages.jsonl"
58
- gdown.download(url, output, quiet=False)
59
-
60
- df = pd.read_json(output, lines=True)
61
 
62
- df.reset_index(inplace=True, drop=True)
63
- return df
64
 
65
 
66
- with st.spinner(text="Loading data..."):
67
- df = load_data()
68
- passages = df['text'].values
69
-
70
  @st.cache
71
  def load_embeddings():
72
  #efs = [np.load(f'embeddings_{i}.pt.npy') for i in range(0,5)]
 
22
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
 
24
  import pandas as pd
25
+
26
+
27
@st.cache
def load_data():
    """Download the passages corpus from Google Drive and load it as a DataFrame.

    Returns:
        pandas.DataFrame: one row per paragraph, index reset to 0..n-1.

    The result is memoized by Streamlit so the download and parse happen
    only once per app process, not on every rerun.
    """
    # NOTE(review): `st.cache` is deprecated in newer Streamlit releases in
    # favor of `st.cache_data` — confirm the pinned Streamlit version.

    # Direct-download link to the pre-built passages.jsonl corpus.
    url = "https://drive.google.com/uc?export=download&id=1nIBS9is8YCeiPBqA7MifVC5xeaKWH8uL"
    output = "passages.jsonl"
    gdown.download(url, output, quiet=False)

    # One JSON object per line -> one paragraph per DataFrame row.
    df = pd.read_json(output, lines=True)
    df.reset_index(inplace=True, drop=True)
    return df
39
+
40
+
41
st.title('Sociology Paragraph Search')

st.write('This project is a work-in-progress that searches the text of recently-published articles from a few sociology journals and retrieves the most relevant paragraphs.')


# Load the corpus once (cached) and keep the raw paragraph texts handy
# for the embedding search.
with st.spinner(text="Loading data..."):
    df = load_data()
    passages = df['text'].values

# Corpus-size figures interpolated into the notes below.
no_of_graphs = len(df)
# BUG FIX: the committed code called .value_count(), which does not exist
# on a pandas Series and raises AttributeError at startup. The pandas API
# is .value_counts(); its length is the number of distinct citations,
# i.e. the number of articles.
no_of_articles = len(df['cite'].value_counts())


notes = f'''Notes:
* To get the best results, search like you are using Google. My best luck comes from phrases like "social movements and public opinion", "inequality in latin america", "race color skin tone measurement", "audit study experiment gender", "Crenshaw intersectionality", or "logistic regression or linear probability model". You can also use questions like "What is a topic model?" or "How did Weber define bureaucracy?"
* The dataset currently includes articles published in the last five years in *Mobilization*, Social Forces, Social Problems, Sociology of Race and Ethnicity, Gender and Society, Socius, JHSB, Annual Review of Sociology, and the American Sociological Review, totaling {no_of_graphs} paragraphs from {no_of_articles} articles.
* The most relevant paragraph to your search is returned first, along with up to four other related paragraphs from that article.
* The most relevant sentence within each paragraph, as determined by math, is bolded.
* Behind the scenes, the semantic search uses [text embeddings](https://www.sbert.net) with a [retrieve & re-rank](https://colab.research.google.com/github/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank/retrieve_rerank_simple_wikipedia.ipynb) process to find the best matches.
* The first search can take up to 30 seconds as the files load. After that, it's quicker to respond.
* Let [me](mailto:neal.caren@unc.edu) know what you think or it looks broken.
'''

st.markdown(notes)
65
 
66
 
67
  def sent_trans_load():
 
76
  return cross_encoder
77
 
78
 
 
 
 
 
 
 
 
 
 
79
 
 
 
80
 
81
 
 
 
 
 
82
  @st.cache
83
  def load_embeddings():
84
  #efs = [np.load(f'embeddings_{i}.pt.npy') for i in range(0,5)]