Spaces:
Runtime error
Runtime error
chg: Extract locations from Wellcome examples
Browse files- .dvc/.gitignore +3 -0
- .dvc/config +5 -0
- .dvcignore +3 -0
- .gitignore +2 -1
- app.py +32 -24
- data/processed/.gitignore +1 -0
- dvc.lock +20 -4
- dvc.yaml +9 -0
- params.yaml +1 -0
- requirements.txt +2 -9
- src/process_docs.py +61 -0
- src/subset_data.py +30 -14
- unpinned_requirements.txt +2 -1
.dvc/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
/config.local
|
2 |
+
/tmp
|
3 |
+
/cache
|
.dvc/config
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[core]
|
2 |
+
remote = s3
|
3 |
+
autostage = true
|
4 |
+
['remote "s3"']
|
5 |
+
url = s3://mantisnlp-blogs/ner
|
.dvcignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Add patterns of files dvc should ignore, which could improve
|
2 |
+
# the performance. Learn more at
|
3 |
+
# https://dvc.org/doc/user-guide/dvcignore
|
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
__pycache__/
|
2 |
-
images/
|
|
|
|
1 |
__pycache__/
|
2 |
+
images/
|
3 |
+
.venv/
|
app.py
CHANGED
@@ -1,40 +1,48 @@
|
|
|
|
|
|
1 |
import spacy
|
|
|
2 |
import streamlit as st
|
3 |
|
4 |
-
|
5 |
-
|
6 |
|
7 |
-
|
8 |
-
colors = {"LOCATION": "#5cff84"}
|
9 |
-
options = {"ents": ["LOCATION"], "colors": colors}
|
10 |
-
html = spacy.displacy.render(entities, style="ent", options=options, manual=True)
|
11 |
-
html = html.replace("\n", " ")
|
12 |
|
13 |
-
|
14 |
|
|
|
|
|
15 |
|
16 |
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
|
17 |
|
18 |
-
st.header("Location Entity Recognition Demo 🔎🌆🌍")
|
19 |
-
threshold = st.sidebar.slider("Threshold", value=0.5, min_value=0.0, max_value=1.0)
|
20 |
-
display_probabilities = st.sidebar.checkbox("Display probabilities")
|
21 |
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
|
|
|
24 |
|
25 |
-
|
26 |
-
|
27 |
-
doc = nlp(text)
|
28 |
|
29 |
-
|
30 |
-
{"start": ent.start_char, "end": ent.end_char, "label": "LOCATION"}
|
31 |
-
for ent in doc.ents
|
32 |
-
]
|
33 |
-
foo = {"text": text, "ents": ents}
|
34 |
|
35 |
|
36 |
-
|
37 |
-
print(doc.ents)
|
38 |
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import random

import spacy
import srsly
import streamlit as st

nlp = spacy.load("en_core_web_trf")

# Load pre-processed grants from disk.
grants = list(srsly.read_jsonl("data/processed/entities.jsonl"))

# GPE (countries/cities/states) and LOC (non-GPE locations) share one colour.
colors = {"GPE": "#5cff84", "LOC": "#5cff84"}
options = {"ents": ["GPE", "LOC"], "colors": colors}

HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""


def render_entities(doc, colors: dict, options: dict) -> str:
    """Render a spaCy doc's entities as single-line displaCy HTML.

    Args:
        doc: A spaCy Doc (the output of ``nlp(text)``).
        colors: Unused; kept for backward compatibility — the colour map
            is already embedded in ``options``.
        options: displaCy options dict (entity labels and colours).

    Returns:
        The displaCy HTML with newlines replaced by spaces so Streamlit
        renders it cleanly inside the wrapper div.
    """
    html = spacy.displacy.render(doc, style="ent", options=options)
    return html.replace("\n", " ")


def _show_random_example(help_text=None):
    """Pick a random grant, run NER on it, and render text + highlights."""
    sample = random.choice(grants)
    doc = nlp(sample["text"])
    html = render_entities(doc, colors, options)
    # NOTE(review): edits typed into the text area are not re-processed;
    # the highlighted entities always come from the sampled text.
    st.text_area("Text input", value=sample["text"], height=200, help=help_text)
    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)


st.header("Location Entity Recognition Demo 🔎🌆🌍")

st.subheader("Look for Locations")

# Both branches of the original code did the same thing (the button merely
# triggers a Streamlit rerun); deduplicated into one helper, preserving the
# original per-branch help text.
if st.button("Show new example", key="text"):
    _show_random_example()
else:
    _show_random_example(
        help_text="Enter text here and click the 'Find Locations' button to search for entities."
    )
data/processed/.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
/wellcome_grant_descriptions.csv
|
|
|
|
1 |
/wellcome_grant_descriptions.csv
|
2 |
+
/entities.jsonl
|
dvc.lock
CHANGED
@@ -7,9 +7,25 @@ stages:
|
|
7 |
md5: 5c0d0e532709648b61625e7e130dfaa4
|
8 |
size: 31028261
|
9 |
- path: src/subset_data.py
|
10 |
-
md5:
|
11 |
-
size:
|
|
|
|
|
|
|
12 |
outs:
|
13 |
- path: data/processed/wellcome_grant_descriptions.csv
|
14 |
-
md5:
|
15 |
-
size:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
md5: 5c0d0e532709648b61625e7e130dfaa4
|
8 |
size: 31028261
|
9 |
- path: src/subset_data.py
|
10 |
+
md5: f4cffd497cb8341cf05728e89cbb0871
|
11 |
+
size: 1008
|
12 |
+
params:
|
13 |
+
params.yaml:
|
14 |
+
n_docs: 500
|
15 |
outs:
|
16 |
- path: data/processed/wellcome_grant_descriptions.csv
|
17 |
+
md5: 18dd6a7611d7f53b1067def7ba075cba
|
18 |
+
size: 644736
|
19 |
+
entities:
|
20 |
+
cmd: python src/process_docs.py
|
21 |
+
deps:
|
22 |
+
- path: data/processed/wellcome_grant_descriptions.csv
|
23 |
+
md5: 18dd6a7611d7f53b1067def7ba075cba
|
24 |
+
size: 644736
|
25 |
+
- path: src/process_docs.py
|
26 |
+
md5: 1f570b1aa0f44b0bb131317c305deff5
|
27 |
+
size: 1309
|
28 |
+
outs:
|
29 |
+
- path: data/processed/entities.jsonl
|
30 |
+
md5: 26846cdd657a516281b24c376a93f018
|
31 |
+
size: 214902
|
dvc.yaml
CHANGED
@@ -4,5 +4,14 @@ stages:
|
|
4 |
deps:
|
5 |
- src/subset_data.py
|
6 |
- data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
|
|
|
|
|
7 |
outs:
|
8 |
- data/processed/wellcome_grant_descriptions.csv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
deps:
|
5 |
- src/subset_data.py
|
6 |
- data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
|
7 |
+
params:
|
8 |
+
- n_docs
|
9 |
outs:
|
10 |
- data/processed/wellcome_grant_descriptions.csv
|
11 |
+
entities:
|
12 |
+
cmd: python src/process_docs.py
|
13 |
+
deps:
|
14 |
+
- src/process_docs.py
|
15 |
+
- data/processed/wellcome_grant_descriptions.csv
|
16 |
+
outs:
|
17 |
+
- data/processed/entities.jsonl
|
params.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
n_docs: 500
|
requirements.txt
CHANGED
@@ -14,10 +14,8 @@ confection==0.0.3
|
|
14 |
cymem==2.0.6
|
15 |
decorator==5.1.1
|
16 |
entrypoints==0.4
|
17 |
-
filelock==3.8.0
|
18 |
gitdb==4.0.9
|
19 |
GitPython==3.1.29
|
20 |
-
huggingface-hub==0.10.1
|
21 |
idna==3.4
|
22 |
importlib-metadata==5.0.0
|
23 |
importlib-resources==5.10.0
|
@@ -31,7 +29,7 @@ packaging==21.3
|
|
31 |
pandas==1.5.0
|
32 |
pathy==0.6.2
|
33 |
Pillow==9.2.0
|
34 |
-
|
35 |
preshed==3.0.7
|
36 |
protobuf==3.20.3
|
37 |
pyarrow==9.0.0
|
@@ -44,8 +42,6 @@ pyrsistent==0.18.1
|
|
44 |
python-dateutil==2.8.2
|
45 |
pytz==2022.4
|
46 |
pytz-deprecation-shim==0.1.0.post0
|
47 |
-
PyYAML==6.0
|
48 |
-
regex==2022.9.13
|
49 |
requests==2.28.1
|
50 |
rich==12.6.0
|
51 |
semver==2.13.0
|
@@ -58,15 +54,12 @@ spacy-loggers==1.0.3
|
|
58 |
srsly==2.4.4
|
59 |
streamlit==1.13.0
|
60 |
thinc==8.1.3
|
61 |
-
tokenizers==0.13.1
|
62 |
toml==0.10.2
|
63 |
toolz==0.12.0
|
64 |
-
torch==1.12.1
|
65 |
tornado==6.2
|
66 |
tqdm==4.64.1
|
67 |
-
transformers==4.23.1
|
68 |
typer==0.4.2
|
69 |
-
|
70 |
tzdata==2022.4
|
71 |
tzlocal==4.2
|
72 |
urllib3==1.26.12
|
|
|
14 |
cymem==2.0.6
|
15 |
decorator==5.1.1
|
16 |
entrypoints==0.4
|
|
|
17 |
gitdb==4.0.9
|
18 |
GitPython==3.1.29
|
|
|
19 |
idna==3.4
|
20 |
importlib-metadata==5.0.0
|
21 |
importlib-resources==5.10.0
|
|
|
29 |
pandas==1.5.0
|
30 |
pathy==0.6.2
|
31 |
Pillow==9.2.0
|
32 |
+
pkgutil-resolve-name==1.3.10
|
33 |
preshed==3.0.7
|
34 |
protobuf==3.20.3
|
35 |
pyarrow==9.0.0
|
|
|
42 |
python-dateutil==2.8.2
|
43 |
pytz==2022.4
|
44 |
pytz-deprecation-shim==0.1.0.post0
|
|
|
|
|
45 |
requests==2.28.1
|
46 |
rich==12.6.0
|
47 |
semver==2.13.0
|
|
|
54 |
srsly==2.4.4
|
55 |
streamlit==1.13.0
|
56 |
thinc==8.1.3
|
|
|
57 |
toml==0.10.2
|
58 |
toolz==0.12.0
|
|
|
59 |
tornado==6.2
|
60 |
tqdm==4.64.1
|
|
|
61 |
typer==0.4.2
|
62 |
+
typing-extensions==4.4.0
|
63 |
tzdata==2022.4
|
64 |
tzlocal==4.2
|
65 |
urllib3==1.26.12
|
src/process_docs.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import csv

import spacy
import srsly
import tqdm

nlp = spacy.load("en_core_web_trf")

INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
OUTPUT_FILE = "data/processed/entities.jsonl"
# Only geopolitical (GPE) and non-GPE location (LOC) entities are kept.
ENTITY_SUBSET = ["GPE", "LOC"]


def process_documents(input_file: str, output_file: str):
    """Run NER over grant descriptions and persist the location entities.

    Reads one description per row from the first column of ``input_file``
    (skipping the header), extracts GPE/LOC entities with spaCy, and writes
    one JSON record per document that contains at least one such entity to
    ``output_file`` as JSONL: ``{"text": ..., "ents": [...]}``.
    """
    print(f"Reading data from {input_file}...")

    with open(input_file, "r") as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        data = [row[0] for row in reader]

    print(f"Processing {len(data)} documents...")

    entities = []

    # nlp.pipe batches documents through the pipeline, which is much faster
    # than calling nlp() once per document (identical results).
    for doc in tqdm.tqdm(nlp.pipe(data), total=len(data)):
        # Keep only the entity labels we care about.
        ents = [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
            }
            for ent in doc.ents
            if ent.label_ in ENTITY_SUBSET
        ]

        if ents:
            entities.append(
                {
                    "text": doc.text,
                    "ents": ents,
                }
            )

    print(f"Writing {len(entities)} documents to {output_file}...")

    srsly.write_jsonl(output_file, entities)


if __name__ == "__main__":
    process_documents(INPUT_FILE, OUTPUT_FILE)
src/subset_data.py
CHANGED
@@ -1,25 +1,41 @@
|
|
1 |
import numpy as np
|
2 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
|
5 |
OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
|
6 |
|
7 |
-
print(f"Reading data from {INPUT_FILE}")
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
|
|
11 |
|
12 |
-
|
13 |
-
data
|
14 |
-
.replace("Not available", np.nan)
|
15 |
-
.dropna()
|
16 |
-
.drop_duplicates()
|
17 |
-
.reset_index(drop=True)
|
18 |
-
.sample(1000)
|
19 |
-
)
|
20 |
|
21 |
-
print(f"Number of rows: {data.shape[0]}")
|
22 |
-
print(f"Number of unique rows: {data['Description'].nunique()}")
|
23 |
|
24 |
-
|
25 |
-
|
|
|
|
import numpy as np
import pandas as pd
import yaml


def load_config(config_file: str) -> dict:
    """Read a YAML configuration file and return its parsed contents."""
    with open(config_file) as stream:
        return yaml.safe_load(stream)
|
12 |
|
INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"


def subset_docs(input_file: str, output_file: str, sample: int, random_state=None):
    """Subset grant descriptions to a random sample and save as CSV.

    Args:
        input_file: Path to the raw Wellcome grants CSV; must contain a
            "Description" column.
        output_file: Path the sampled single-column CSV is written to.
        sample: Number of rows to draw. Must not exceed the number of
            unique, non-missing descriptions (pandas raises ValueError).
        random_state: Optional seed forwarded to ``DataFrame.sample`` for
            reproducible subsets. The default (None) keeps the original
            non-deterministic behaviour.
    """
    print(f"Reading data from {input_file}")

    data = pd.read_csv(input_file)

    data = (
        data[["Description"]]
        # "Not available" is the raw data's missing-value marker.
        .replace("Not available", np.nan)
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
        .sample(sample, random_state=random_state)
    )

    print(f"Number of rows: {data.shape[0]}")
    print(f"Number of unique rows: {data['Description'].nunique()}")

    print(f"Saving file to {output_file}")
    data.to_csv(output_file, index=False)


if __name__ == "__main__":
    params = load_config("params.yaml")
    subset_docs(INPUT_FILE, OUTPUT_FILE, sample=params["n_docs"])
|
unpinned_requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
streamlit
|
2 |
-
spacy
|
|
|
|
1 |
streamlit
|
2 |
+
spacy
|
3 |
+
pandas
|