Spaces:
Runtime error
Runtime error
chg: Remove grants with PERSON ents
Browse files- README.md +5 -1
- app.py +10 -9
- data/processed/entities.jsonl +0 -0
- dvc.lock +4 -4
- params.yaml +1 -0
- src/process_docs.py +22 -8
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
title: LocationFinder
|
3 |
-
emoji:
|
4 |
colorFrom: red
|
5 |
colorTo: gray
|
6 |
sdk: streamlit
|
@@ -10,3 +10,7 @@ pinned: false
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
title: LocationFinder
|
3 |
+
emoji: π
|
4 |
colorFrom: red
|
5 |
colorTo: gray
|
6 |
sdk: streamlit
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
13 |
+
|
14 |
+
Demo of Location Detection to accompany the blog [Extracting useful information from documents with Named Entity Recognition]()
|
15 |
+
|
16 |
+
Data from The Wellcome Trust are taken from data that are published openly at [360 Giving](https://data.threesixtygiving.org/). They are published under a [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.
|
app.py
CHANGED
@@ -27,8 +27,15 @@ def render_entities(doc, colors: dict, options: dict) -> str:
|
|
27 |
return html
|
28 |
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
st.header("Location Recognition Demo πππ")
|
31 |
-
st.sidebar.header("Information
|
32 |
st.sidebar.markdown(
|
33 |
"""
|
34 |
This example application accompanies the blog post: [Extracting useful information from documents with Named Entity Recognition]().
|
@@ -37,17 +44,11 @@ The application will extract the following types of location entity:
|
|
37 |
|
38 |
* __GPE__: Geopolitical entities (countries, cities, states)
|
39 |
* __LOC__: Locations (mountains, rivers, lakes)
|
|
|
|
|
40 |
"""
|
41 |
)
|
42 |
|
43 |
-
|
44 |
-
def show_example(text):
|
45 |
-
html = render_entities(doc, colors, options)
|
46 |
-
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
|
47 |
-
|
48 |
-
return text
|
49 |
-
|
50 |
-
|
51 |
if st.button("Show Wellcome example", key="text"):
|
52 |
sample = random.choice(grants)
|
53 |
text = st.text_area(
|
|
|
27 |
return html
|
28 |
|
29 |
|
30 |
+
def show_example(text):
|
31 |
+
html = render_entities(doc, colors, options)
|
32 |
+
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
|
33 |
+
|
34 |
+
return text
|
35 |
+
|
36 |
+
|
37 |
st.header("Location Recognition Demo πππ")
|
38 |
+
st.sidebar.header("Information ℹ️ ")
|
39 |
st.sidebar.markdown(
|
40 |
"""
|
41 |
This example application accompanies the blog post: [Extracting useful information from documents with Named Entity Recognition]().
|
|
|
44 |
|
45 |
* __GPE__: Geopolitical entities (countries, cities, states)
|
46 |
* __LOC__: Locations (mountains, rivers, lakes)
|
47 |
+
|
48 |
+
This model will inevitably make some mistakes; it was trained on a large generic corpus of text, and the Wellcome Trust grant applications come from a very specific domain. We could improve this model by fine-tuning it on data from this domain.
|
49 |
"""
|
50 |
)
|
51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
if st.button("Show Wellcome example", key="text"):
|
53 |
sample = random.choice(grants)
|
54 |
text = st.text_area(
|
data/processed/entities.jsonl
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
dvc.lock
CHANGED
@@ -23,9 +23,9 @@ stages:
|
|
23 |
md5: 18dd6a7611d7f53b1067def7ba075cba
|
24 |
size: 644736
|
25 |
- path: src/process_docs.py
|
26 |
-
md5:
|
27 |
-
size:
|
28 |
outs:
|
29 |
- path: data/processed/entities.jsonl
|
30 |
-
md5:
|
31 |
-
size:
|
|
|
23 |
md5: 18dd6a7611d7f53b1067def7ba075cba
|
24 |
size: 644736
|
25 |
- path: src/process_docs.py
|
26 |
+
md5: 54d0e1cf9a85cba745fe80206b7c71d0
|
27 |
+
size: 1723
|
28 |
outs:
|
29 |
- path: data/processed/entities.jsonl
|
30 |
+
md5: ca8a907b4d66d5541bc1b6b508abd7eb
|
31 |
+
size: 94862
|
params.yaml
CHANGED
@@ -1 +1,2 @@
|
|
1 |
n_docs: 500
|
|
|
|
1 |
n_docs: 500
|
2 |
+
max_docs: 50
|
src/process_docs.py
CHANGED
@@ -1,14 +1,19 @@
|
|
1 |
import csv
|
|
|
2 |
|
3 |
import spacy
|
4 |
import srsly
|
5 |
import tqdm
|
|
|
|
|
|
|
6 |
|
7 |
nlp = spacy.load("en_core_web_trf")
|
8 |
|
9 |
INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
|
10 |
OUTPUT_FILE = "data/processed/entities.jsonl"
|
11 |
-
|
|
|
12 |
|
13 |
|
14 |
def process_documents(input_file: str, output_file: str):
|
@@ -41,16 +46,25 @@ def process_documents(input_file: str, output_file: str):
|
|
41 |
"end": ent.end_char,
|
42 |
}
|
43 |
for ent in doc.ents
|
44 |
-
if ent.label_ in ENTITY_SUBSET
|
45 |
]
|
46 |
|
47 |
if ents:
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
print(f"Writing {len(entities)} documents to {output_file}...")
|
56 |
|
|
|
1 |
import csv
|
2 |
+
import random
|
3 |
|
4 |
import spacy
|
5 |
import srsly
|
6 |
import tqdm
|
7 |
+
import yaml
|
8 |
+
|
9 |
+
params = yaml.safe_load(open("params.yaml"))
|
10 |
|
11 |
nlp = spacy.load("en_core_web_trf")
|
12 |
|
13 |
INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
|
14 |
OUTPUT_FILE = "data/processed/entities.jsonl"
|
15 |
+
INCLUDE_ENTS = {"GPE", "LOC"}
|
16 |
+
EXCLUDE_ENTS = {"PERSON"}
|
17 |
|
18 |
|
19 |
def process_documents(input_file: str, output_file: str):
|
|
|
46 |
"end": ent.end_char,
|
47 |
}
|
48 |
for ent in doc.ents
|
|
|
49 |
]
|
50 |
|
51 |
if ents:
|
52 |
+
found_ents = set([ent["label"] for ent in ents])
|
53 |
+
|
54 |
+
if found_ents.intersection(INCLUDE_ENTS) and not found_ents.intersection(
|
55 |
+
EXCLUDE_ENTS
|
56 |
+
):
|
57 |
+
entities.append(
|
58 |
+
{
|
59 |
+
"text": doc.text,
|
60 |
+
"ents": ents,
|
61 |
+
}
|
62 |
+
)
|
63 |
+
|
64 |
+
print(f"Randomly selecting {params['max_docs']} documents...")
|
65 |
+
|
66 |
+
random.shuffle(entities)
|
67 |
+
entities = entities[: params["max_docs"]]
|
68 |
|
69 |
print(f"Writing {len(entities)} documents to {output_file}...")
|
70 |
|