Commit b8d16b2 by mattupson
Parent: eb457d3

chg: Extract locations from Wellcome examples
.dvc/.gitignore ADDED
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
.dvc/config ADDED
@@ -0,0 +1,5 @@
+[core]
+    remote = s3
+    autostage = true
+['remote "s3"']
+    url = s3://mantisnlp-blogs/ner
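
Note: this config registers an S3 bucket as the default DVC remote, so `dvc push`/`dvc pull` move tracked data in and out of s3://mantisnlp-blogs/ner. As a hedged sketch (the repo URL below is a placeholder, not part of this commit), DVC's Python API can also read a tracked artifact straight from the remote:

import dvc.api

# Sketch: stream a DVC-tracked file from the configured remote without
# a full `dvc pull`. The repo URL is an assumed placeholder.
with dvc.api.open(
    "data/processed/entities.jsonl",
    repo="https://github.com/example/ner-demo",  # placeholder
) as f:
    print(f.readline())  # first JSON line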
.dvcignore ADDED
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
.gitignore CHANGED
@@ -1,2 +1,3 @@
 __pycache__/
-images/
+images/
+.venv/
app.py CHANGED
@@ -1,40 +1,45 @@
+import random
+
 import spacy
+import srsly
 import streamlit as st
 
-# from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
+nlp = spacy.load("en_core_web_trf")
 
+# Load pre-processed grants from disk.
 
-def render_entities(entities):
-    colors = {"LOCATION": "#5cff84"}
-    options = {"ents": ["LOCATION"], "colors": colors}
-    html = spacy.displacy.render(entities, style="ent", options=options, manual=True)
-    html = html.replace("\n", " ")
+grants = list(srsly.read_jsonl("data/processed/entities.jsonl"))
 
-    return html
-
+colors = {"GPE": "#5cff84", "LOC": "#5cff84"}
+options = {"ents": ["GPE", "LOC"], "colors": colors}
 
 HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
 
-st.header("Location Entity Recognition Demo 🔎🌆🌍")
-threshold = st.sidebar.slider("Threshold", value=0.5, min_value=0.0, max_value=1.0)
-display_probabilities = st.sidebar.checkbox("Display probabilities")
 
-text = st.text_area("Text input", value="This text is about Malaria", height=400)
+def render_entities(doc, colors: dict, options: dict) -> str:
+    """
+    Render the entities in a spaCy Doc as displaCy HTML.
+    """
 
-nlp = spacy.load("en_core_web_trf")
+    html = spacy.displacy.render(doc, style="ent", options=options)
+    html = html.replace("\n", " ")
 
-doc = nlp(text)
+    return html
 
-ents = [
-    {"start": ent.start_char, "end": ent.end_char, "label": "LOCATION"}
-    for ent in doc.ents
-]
-foo = {"text": text, "ents": ents}
 
+st.header("Location Entity Recognition Demo 🔎🌆🌍")
 
-print(ents)
-print(doc.ents)
+st.subheader("Look for Locations")
 
-html = render_entities(foo)
-st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+if st.button("Show new example", key="text"):
+    sample = random.choice(grants)
+    doc = nlp(sample["text"])
+    html = render_entities(doc, colors, options)
+    text = st.text_area("Text input", value=sample["text"], height=200)
+    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+else:
+    sample = random.choice(grants)
+    doc = nlp(sample["text"])
+    html = render_entities(doc, colors, options)
+    text = st.text_area("Text input", value=sample["text"], height=200, help="Click 'Show new example' to load a new grant description.")
+    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
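
The rewrite drops displaCy's manual mode (hand-built entity dicts passed with `manual=True`) in favour of rendering the spaCy Doc directly, so character offsets no longer have to be assembled by hand. For reference, a minimal sketch of the dict payload manual mode expects; the example sentence is made up:

import spacy

# displaCy "manual" mode renders pre-computed spans instead of a Doc.
example = {
    "text": "The trial ran in Kenya and Uganda.",  # made-up example text
    "ents": [
        {"start": 17, "end": 22, "label": "GPE"},
        {"start": 27, "end": 33, "label": "GPE"},
    ],
}
html = spacy.displacy.render(example, style="ent", manual=True)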
data/processed/.gitignore CHANGED
@@ -1 +1,2 @@
 /wellcome_grant_descriptions.csv
+/entities.jsonl
dvc.lock CHANGED
@@ -7,9 +7,25 @@ stages:
       md5: 5c0d0e532709648b61625e7e130dfaa4
       size: 31028261
     - path: src/subset_data.py
-      md5: 3b6059867baea4de020776bcfdc9c2a4
-      size: 604
+      md5: f4cffd497cb8341cf05728e89cbb0871
+      size: 1008
+    params:
+      params.yaml:
+        n_docs: 500
     outs:
     - path: data/processed/wellcome_grant_descriptions.csv
-      md5: bb28282adc17ccd209ed370bc4557e40
-      size: 1307583
+      md5: 18dd6a7611d7f53b1067def7ba075cba
+      size: 644736
+  entities:
+    cmd: python src/process_docs.py
+    deps:
+    - path: data/processed/wellcome_grant_descriptions.csv
+      md5: 18dd6a7611d7f53b1067def7ba075cba
+      size: 644736
+    - path: src/process_docs.py
+      md5: 1f570b1aa0f44b0bb131317c305deff5
+      size: 1309
+    outs:
+    - path: data/processed/entities.jsonl
+      md5: 26846cdd657a516281b24c376a93f018
+      size: 214902
dvc.yaml CHANGED
@@ -4,5 +4,14 @@ stages:
     deps:
     - src/subset_data.py
    - data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
+    params:
+    - n_docs
     outs:
     - data/processed/wellcome_grant_descriptions.csv
+  entities:
+    cmd: python src/process_docs.py
+    deps:
+    - src/process_docs.py
+    - data/processed/wellcome_grant_descriptions.csv
+    outs:
+    - data/processed/entities.jsonl
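
With the new `entities` stage, dvc.yaml now describes a two-stage DAG, subset then entities, chained through data/processed/wellcome_grant_descriptions.csv. A minimal sketch of driving a reproduction from Python (plain `dvc repro` on the command line does the same):

import subprocess

# Re-run only stages whose deps or params changed: editing n_docs in
# params.yaml invalidates `subset`, and `entities` re-runs because it
# depends on the subset stage's output CSV.
subprocess.run(["dvc", "repro"], check=True)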
params.yaml ADDED
@@ -0,0 +1 @@
+n_docs: 500
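
`n_docs` is the only tracked parameter; dvc.yaml declares it and src/subset_data.py reads it with the `load_config` helper shown further down. A hedged alternative (assuming a recent DVC 2.x, which this commit's requirements don't pin) is to let DVC resolve the params file itself:

import dvc.api

# Alternative sketch: dvc.api.params_show() parses params.yaml for you.
params = dvc.api.params_show()
n_docs = params["n_docs"]  # 500 in this commit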
requirements.txt CHANGED
@@ -14,10 +14,8 @@ confection==0.0.3
 cymem==2.0.6
 decorator==5.1.1
 entrypoints==0.4
-filelock==3.8.0
 gitdb==4.0.9
 GitPython==3.1.29
-huggingface-hub==0.10.1
 idna==3.4
 importlib-metadata==5.0.0
 importlib-resources==5.10.0
@@ -31,7 +29,7 @@ packaging==21.3
 pandas==1.5.0
 pathy==0.6.2
 Pillow==9.2.0
-pkgutil_resolve_name==1.3.10
+pkgutil-resolve-name==1.3.10
 preshed==3.0.7
 protobuf==3.20.3
 pyarrow==9.0.0
@@ -44,8 +42,6 @@ pyrsistent==0.18.1
 python-dateutil==2.8.2
 pytz==2022.4
 pytz-deprecation-shim==0.1.0.post0
-PyYAML==6.0
-regex==2022.9.13
 requests==2.28.1
 rich==12.6.0
 semver==2.13.0
@@ -58,15 +54,12 @@ spacy-loggers==1.0.3
 srsly==2.4.4
 streamlit==1.13.0
 thinc==8.1.3
-tokenizers==0.13.1
 toml==0.10.2
 toolz==0.12.0
-torch==1.12.1
 tornado==6.2
 tqdm==4.64.1
-transformers==4.23.1
 typer==0.4.2
-typing_extensions==4.4.0
+typing-extensions==4.4.0
 tzdata==2022.4
 tzlocal==4.2
 urllib3==1.26.12
src/process_docs.py ADDED
@@ -0,0 +1,61 @@
+import csv
+
+import spacy
+import srsly
+import tqdm
+
+nlp = spacy.load("en_core_web_trf")
+
+INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
+OUTPUT_FILE = "data/processed/entities.jsonl"
+ENTITY_SUBSET = ["GPE", "LOC"]
+
+
+def process_documents(input_file: str, output_file: str):
+
+    data = []
+
+    print(f"Reading data from {input_file}...")
+
+    with open(input_file, "r") as f:
+        reader = csv.reader(f)
+        next(reader)  # skip the header row
+
+        for row in reader:
+            data.append(row[0])
+
+    print(f"Processing {len(data)} documents...")
+
+    entities = []
+
+    for doc_ in tqdm.tqdm(data):
+        doc = nlp(doc_)
+
+        # Keep only entities whose label is in ENTITY_SUBSET (GPE, LOC)
+
+        ents = [
+            {
+                "text": ent.text,
+                "label": ent.label_,
+                "start": ent.start_char,
+                "end": ent.end_char,
+            }
+            for ent in doc.ents
+            if ent.label_ in ENTITY_SUBSET
+        ]
+
+        if ents:
+            entities.append(
+                {
+                    "text": doc.text,
+                    "ents": ents,
+                }
+            )
+
+    print(f"Writing {len(entities)} documents to {output_file}...")
+
+    srsly.write_jsonl(output_file, entities)
+
+
+if __name__ == "__main__":
+    process_documents(INPUT_FILE, OUTPUT_FILE)
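
Each JSONL record written here ({"text": ..., "ents": [...]}) matches the shape displaCy can render, which is what lets app.py feed samples straight through. A small sketch for sanity-checking the output:

import collections

import srsly

# Tally entity labels in the extraction output.
counts = collections.Counter(
    ent["label"]
    for record in srsly.read_jsonl("data/processed/entities.jsonl")
    for ent in record["ents"]
)
print(counts)  # expect only GPE and LOC, per ENTITY_SUBSET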
src/subset_data.py CHANGED
@@ -1,25 +1,41 @@
 import numpy as np
 import pandas as pd
+import yaml
+
+
+def load_config(config_file: str) -> dict:
+    with open(config_file) as f:
+        config = yaml.safe_load(f)
+
+    return config
+
 
 INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
 OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
 
-print(f"Reading data from {INPUT_FILE}")
 
-data = pd.read_csv(INPUT_FILE)
+def subset_docs(input_file: str, output_file: str, sample: int):
 
-data = (
-    data[["Description"]]
-    .replace("Not available", np.nan)
-    .dropna()
-    .drop_duplicates()
-    .reset_index(drop=True)
-    .sample(1000)
-)
+    print(f"Reading data from {input_file}")
+
+    data = pd.read_csv(input_file)
+
+    data = (
+        data[["Description"]]
+        .replace("Not available", np.nan)
+        .dropna()
+        .drop_duplicates()
+        .reset_index(drop=True)
+        .sample(sample)
+    )
 
-print(f"Number of rows: {data.shape[0]}")
-print(f"Number of unique rows: {data['Description'].nunique()}")
+    print(f"Number of rows: {data.shape[0]}")
+    print(f"Number of unique rows: {data['Description'].nunique()}")
 
-print(f"Saving file to {OUTPUT_FILE}")
-data.to_csv(OUTPUT_FILE, index=False)
+    print(f"Saving file to {output_file}")
+    data.to_csv(output_file, index=False)
+
+
+if __name__ == "__main__":
+    params = load_config("params.yaml")
+    subset_docs(INPUT_FILE, OUTPUT_FILE, sample=params["n_docs"])
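
One caveat: `.sample(sample)` is unseeded, so each run of the stage draws a different subset and DVC records a new output hash. A hedged variant, not in this commit, that would make the stage deterministic:

import pandas as pd

# Assumed tweak: fix the RNG so repeated runs yield an identical CSV.
df = pd.DataFrame({"Description": ["a", "b", "c", "d"]})
subset = df.sample(n=2, random_state=42)  # random_state is the assumed addition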
unpinned_requirements.txt CHANGED
@@ -1,2 +1,3 @@
 streamlit
-spacy
+spacy
+pandas