Nick Sorros committed on
Commit
b493a01
•
1 Parent(s): cacf814

Tag more grants and implement most common

Browse files
Files changed (4) hide show
  1. app.py +18 -11
  2. preprocess.py +8 -1
  3. tag.py +2 -2
  4. tagged_grants.jsonl +0 -0
app.py CHANGED
@@ -1,39 +1,46 @@
 
1
  import streamlit as st
2
  import srsly
3
 
 
4
  def search(query):
5
  results = []
6
  for grant in grants:
7
  if query in grant["tags"]:
8
- results.append({
9
- "title": grant["title"],
10
- "tags": grant["tags"]
11
- })
12
  st.session_state["results"] = results
13
 
 
14
  st.header("Search 🔎 grants using MeSH 🔖")
15
  st.sidebar.header("Information ℹ")
16
- st.sidebar.write("A complete list of MeSH tags can be found here https://meshb.nlm.nih.gov/treeView")
 
 
17
  st.sidebar.write("The grants data can be found https://www.threesixtygiving.org/")
18
- st.sidebar.write("The model used to tag grants is https://huggingface.co/Wellcome/WellcomeBertMesh")
 
 
19
 
20
  if "grants" not in st.session_state:
21
  st.session_state["grants"] = list(srsly.read_jsonl("tagged_grants.jsonl"))
22
 
23
  grants = st.session_state["grants"]
24
 
25
- query = st.text_input("", value="Humans")
26
  st.button("Search 🔎", on_click=search, kwargs={"query": query})
27
 
28
  if "results" in st.session_state:
29
  st.caption("Related MeSH terms")
30
- unique_tags = list(set(list([tag for res in st.session_state["results"] for tag in res["tags"]])))
 
 
 
31
  columns = st.columns(5)
32
  for row_i in range(3):
33
  for col_i, col in enumerate(columns):
34
  with col:
35
- tag_i = row_i*5 + col_i
36
- if tag_i < len(unique_tags):
37
- tag = unique_tags[tag_i]
38
  st.button(tag, on_click=search, kwargs={"query": tag})
39
  st.table(st.session_state["results"])
 
1
+ from collections import Counter
2
  import streamlit as st
3
  import srsly
4
 
5
+
6
  def search(query):
7
  results = []
8
  for grant in grants:
9
  if query in grant["tags"]:
10
+ results.append({"title": grant["title"], "tags": grant["tags"]})
 
 
 
11
  st.session_state["results"] = results
12
 
13
+
14
  st.header("Search 🔎 grants using MeSH 🔖")
15
  st.sidebar.header("Information ℹ")
16
+ st.sidebar.write(
17
+ "A complete list of MeSH tags can be found here https://meshb.nlm.nih.gov/treeView"
18
+ )
19
  st.sidebar.write("The grants data can be found https://www.threesixtygiving.org/")
20
+ st.sidebar.write(
21
+ "The model used to tag grants is https://huggingface.co/Wellcome/WellcomeBertMesh"
22
+ )
23
 
24
  if "grants" not in st.session_state:
25
  st.session_state["grants"] = list(srsly.read_jsonl("tagged_grants.jsonl"))
26
 
27
  grants = st.session_state["grants"]
28
 
29
+ query = st.text_input("", value="Malaria")
30
  st.button("Search 🔎", on_click=search, kwargs={"query": query})
31
 
32
  if "results" in st.session_state:
33
  st.caption("Related MeSH terms")
34
+
35
+ retrieved_tags = [tag for res in st.session_state["results"] for tag in res["tags"]]
36
+ most_common_tags = [tag for tag, _ in Counter(retrieved_tags).most_common(20)]
37
+
38
  columns = st.columns(5)
39
  for row_i in range(3):
40
  for col_i, col in enumerate(columns):
41
  with col:
42
+ tag_i = row_i * 5 + col_i
43
+ if tag_i < len(most_common_tags):
44
+ tag = most_common_tags[tag_i]
45
  st.button(tag, on_click=search, kwargs={"query": tag})
46
  st.table(st.session_state["results"])
preprocess.py CHANGED
@@ -3,14 +3,21 @@ import json
3
  from tqdm import tqdm
4
  import typer
5
 
 
6
  def preprocess(data_path, processed_data_path):
7
  with open(data_path) as f:
8
  data = json.loads(f.read())
9
 
10
  with open(processed_data_path, "w") as f:
11
  for grant in tqdm(data["grants"]):
12
- if any([org["name"] == "The Wellcome Trust" for org in grant["fundingOrganization"]]):
 
 
 
 
 
13
  f.write(json.dumps(grant) + "\n")
14
 
 
15
  if __name__ == "__main__":
16
  typer.run(preprocess)
 
3
  from tqdm import tqdm
4
  import typer
5
 
6
+
7
  def preprocess(data_path, processed_data_path):
8
  with open(data_path) as f:
9
  data = json.loads(f.read())
10
 
11
  with open(processed_data_path, "w") as f:
12
  for grant in tqdm(data["grants"]):
13
+ if any(
14
+ [
15
+ org["name"] == "The Wellcome Trust"
16
+ for org in grant["fundingOrganization"]
17
+ ]
18
+ ):
19
  f.write(json.dumps(grant) + "\n")
20
 
21
+
22
  if __name__ == "__main__":
23
  typer.run(preprocess)
tag.py CHANGED
@@ -24,13 +24,13 @@ def tag(data_path, tagged_data_path, sample_size: int = 10):
24
 
25
  texts = [grant["title_and_description"] for grant in data]
26
  for batch_index in tqdm(range(0, len(texts), 10)):
27
- batch_texts = texts[batch_index:batch_index+10]
28
 
29
  inputs = tokenizer(batch_texts, padding="max_length")
30
  labels = model(**inputs, return_labels=True)
31
 
32
  for i, tags in enumerate(labels):
33
- data[batch_index+i]["tags"] = tags
34
 
35
  srsly.write_jsonl(tagged_data_path, data)
36
 
 
24
 
25
  texts = [grant["title_and_description"] for grant in data]
26
  for batch_index in tqdm(range(0, len(texts), 10)):
27
+ batch_texts = texts[batch_index : batch_index + 10]
28
 
29
  inputs = tokenizer(batch_texts, padding="max_length")
30
  labels = model(**inputs, return_labels=True)
31
 
32
  for i, tags in enumerate(labels):
33
+ data[batch_index + i]["tags"] = tags
34
 
35
  srsly.write_jsonl(tagged_data_path, data)
36
 
tagged_grants.jsonl CHANGED
The diff for this file is too large to render. See raw diff