mattupson committed on
Commit
59135d9
•
1 Parent(s): 6de1e1a

chg: Remove grants with PERSON ents

Browse files
Files changed (6) hide show
  1. README.md +5 -1
  2. app.py +10 -9
  3. data/processed/entities.jsonl +0 -0
  4. dvc.lock +4 -4
  5. params.yaml +1 -0
  6. src/process_docs.py +22 -8
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: LocationFinder
3
- emoji: 🦀
4
  colorFrom: red
5
  colorTo: gray
6
  sdk: streamlit
@@ -10,3 +10,7 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
1
  ---
2
  title: LocationFinder
3
+ emoji: 🌍
4
  colorFrom: red
5
  colorTo: gray
6
  sdk: streamlit
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+
14
+ Demo of Location Detection to accompany the blog [Extracting useful information from documents with Named Entity Recognition]()
15
+
16
+ Data from The Wellcome Trust are taken from data that are published openly at [360 Giving](https://data.threesixtygiving.org/). They are published under a [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.
app.py CHANGED
@@ -27,8 +27,15 @@ def render_entities(doc, colors: dict, options: dict) -> str:
27
  return html
28
 
29
 
 
 
 
 
 
 
 
30
  st.header("Location Recognition Demo 🔎🌆🌍")
31
- st.sidebar.header("Information ℹ")
32
  st.sidebar.markdown(
33
  """
34
  This example application accompanies the blog post: [Extracting useful information from documents with Named Entity Recognition]().
@@ -37,17 +44,11 @@ The application will extract the following types of location entity:
37
 
38
  * __GPE__: Geopolitical entities (countries, cities, states)
39
  * __LOC__: Locations (mountains, rivers, lakes)
 
 
40
  """
41
  )
42
 
43
-
44
- def show_example(text):
45
- html = render_entities(doc, colors, options)
46
- st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
47
-
48
- return text
49
-
50
-
51
  if st.button("Show Wellcome example", key="text"):
52
  sample = random.choice(grants)
53
  text = st.text_area(
27
  return html
28
 
29
 
30
+ def show_example(text):
31
+ html = render_entities(doc, colors, options)
32
+ st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
33
+
34
+ return text
35
+
36
+
37
  st.header("Location Recognition Demo 🔎🌆🌍")
38
+ st.sidebar.header("Information ℹ️ ")
39
  st.sidebar.markdown(
40
  """
41
  This example application accompanies the blog post: [Extracting useful information from documents with Named Entity Recognition]().
44
 
45
  * __GPE__: Geopolitical entities (countries, cities, states)
46
  * __LOC__: Locations (mountains, rivers, lakes)
47
+
48
+ This model will innevitably make some mistakes; it was trained on a large generic corpus of text, and the Wellcome Trust grant applications come from a very specific domain. We could improve this model by fine-tuning it on data from this domain.
49
  """
50
  )
51
 
 
 
 
 
 
 
 
 
52
  if st.button("Show Wellcome example", key="text"):
53
  sample = random.choice(grants)
54
  text = st.text_area(
data/processed/entities.jsonl CHANGED
The diff for this file is too large to render. See raw diff
dvc.lock CHANGED
@@ -23,9 +23,9 @@ stages:
23
  md5: 18dd6a7611d7f53b1067def7ba075cba
24
  size: 644736
25
  - path: src/process_docs.py
26
- md5: 1f570b1aa0f44b0bb131317c305deff5
27
- size: 1309
28
  outs:
29
  - path: data/processed/entities.jsonl
30
- md5: 26846cdd657a516281b24c376a93f018
31
- size: 214902
23
  md5: 18dd6a7611d7f53b1067def7ba075cba
24
  size: 644736
25
  - path: src/process_docs.py
26
+ md5: 54d0e1cf9a85cba745fe80206b7c71d0
27
+ size: 1723
28
  outs:
29
  - path: data/processed/entities.jsonl
30
+ md5: ca8a907b4d66d5541bc1b6b508abd7eb
31
+ size: 94862
params.yaml CHANGED
@@ -1 +1,2 @@
1
  n_docs: 500
 
1
  n_docs: 500
2
+ max_docs: 50
src/process_docs.py CHANGED
@@ -1,14 +1,19 @@
1
  import csv
 
2
 
3
  import spacy
4
  import srsly
5
  import tqdm
 
 
 
6
 
7
  nlp = spacy.load("en_core_web_trf")
8
 
9
  INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
10
  OUTPUT_FILE = "data/processed/entities.jsonl"
11
- ENTITY_SUBSET = ["GPE", "LOC"]
 
12
 
13
 
14
  def process_documents(input_file: str, output_file: str):
@@ -41,16 +46,25 @@ def process_documents(input_file: str, output_file: str):
41
  "end": ent.end_char,
42
  }
43
  for ent in doc.ents
44
- if ent.label_ in ENTITY_SUBSET
45
  ]
46
 
47
  if ents:
48
- entities.append(
49
- {
50
- "text": doc.text,
51
- "ents": ents,
52
- }
53
- )
 
 
 
 
 
 
 
 
 
 
54
 
55
  print(f"Writing {len(entities)} documents to {output_file}...")
56
 
1
  import csv
2
+ import random
3
 
4
  import spacy
5
  import srsly
6
  import tqdm
7
+ import yaml
8
+
9
+ params = yaml.safe_load(open("params.yaml"))
10
 
11
  nlp = spacy.load("en_core_web_trf")
12
 
13
  INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
14
  OUTPUT_FILE = "data/processed/entities.jsonl"
15
+ INCLUDE_ENTS = {"GPE", "LOC"}
16
+ EXCLUDE_ENTS = {"PERSON"}
17
 
18
 
19
  def process_documents(input_file: str, output_file: str):
46
  "end": ent.end_char,
47
  }
48
  for ent in doc.ents
 
49
  ]
50
 
51
  if ents:
52
+ found_ents = set([ent["label"] for ent in ents])
53
+
54
+ if found_ents.intersection(INCLUDE_ENTS) and not found_ents.intersection(
55
+ EXCLUDE_ENTS
56
+ ):
57
+ entities.append(
58
+ {
59
+ "text": doc.text,
60
+ "ents": ents,
61
+ }
62
+ )
63
+
64
+ print(f"Randomly selecting {params['max_docs']} documents...")
65
+
66
+ random.shuffle(entities)
67
+ entities = entities[: params["max_docs"]]
68
 
69
  print(f"Writing {len(entities)} documents to {output_file}...")
70