Spaces:

mantisnlp
/

LocationFinder

Runtime error

mattupson committed on Oct 12, 2022

Commit

59135d9

unverified ·

1 Parent(s): 6de1e1a

chg: Remove grants with PERSON ents

Files changed (6) hide show

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: LocationFinder
-emoji: 🦀
 colorFrom: red
 colorTo: gray
 sdk: streamlit
@@ -10,3 +10,7 @@ pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: LocationFinder
+emoji: 🌍
 colorFrom: red
 colorTo: gray
 sdk: streamlit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Demo of Location Detection to accompany the blog [Extracting useful information from documents with Named Entity Recognition]()
+Data from The Wellcome Trust are taken from data that are published openly at [360 Giving](https://data.threesixtygiving.org/). They are published under a [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.

app.py CHANGED Viewed

@@ -27,8 +27,15 @@ def render_entities(doc, colors: dict, options: dict) -> str:
     return html
 st.header("Location Recognition Demo 🔎🌆🌍")
-st.sidebar.header("Information ℹ")
 st.sidebar.markdown(
     """
 This example application accompanies the blog post: [Extracting useful information from documents with Named Entity Recognition]().
@@ -37,17 +44,11 @@ The application will extract the following types of location entity:
 * __GPE__: Geopolitical entities (countries, cities, states)
 * __LOC__: Locations (mountains, rivers, lakes)
 """
 )
-def show_example(text):
-    html = render_entities(doc, colors, options)
-    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
-    return text
 if st.button("Show Wellcome example", key="text"):
     sample = random.choice(grants)
     text = st.text_area(

     return html
+def show_example(text):
+    html = render_entities(doc, colors, options)
+    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+    return text
 st.header("Location Recognition Demo 🔎🌆🌍")
+st.sidebar.header("Information ℹ️ ")
 st.sidebar.markdown(
     """
 This example application accompanies the blog post: [Extracting useful information from documents with Named Entity Recognition]().
 * __GPE__: Geopolitical entities (countries, cities, states)
 * __LOC__: Locations (mountains, rivers, lakes)
+This model will inevitably make some mistakes; it was trained on a large generic corpus of text, and the Wellcome Trust grant applications come from a very specific domain. We could improve this model by fine-tuning it on data from this domain.
 """
 )
 if st.button("Show Wellcome example", key="text"):
     sample = random.choice(grants)
     text = st.text_area(

data/processed/entities.jsonl CHANGED Viewed

The diff for this file is too large to render. See raw diff

dvc.lock CHANGED Viewed

@@ -23,9 +23,9 @@ stages:
       md5: 18dd6a7611d7f53b1067def7ba075cba
       size: 644736
     - path: src/process_docs.py
-      md5: 1f570b1aa0f44b0bb131317c305deff5
-      size: 1309
     outs:
     - path: data/processed/entities.jsonl
-      md5: 26846cdd657a516281b24c376a93f018
-      size: 214902

       md5: 18dd6a7611d7f53b1067def7ba075cba
       size: 644736
     - path: src/process_docs.py
+      md5: 54d0e1cf9a85cba745fe80206b7c71d0
+      size: 1723
     outs:
     - path: data/processed/entities.jsonl
+      md5: ca8a907b4d66d5541bc1b6b508abd7eb
+      size: 94862

params.yaml CHANGED Viewed

src/process_docs.py CHANGED Viewed

@@ -1,14 +1,19 @@
 import csv
 import spacy
 import srsly
 import tqdm
 nlp = spacy.load("en_core_web_trf")
 INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
 OUTPUT_FILE = "data/processed/entities.jsonl"
-ENTITY_SUBSET = ["GPE", "LOC"]
 def process_documents(input_file: str, output_file: str):
@@ -41,16 +46,25 @@ def process_documents(input_file: str, output_file: str):
                 "end": ent.end_char,
             }
             for ent in doc.ents
-            if ent.label_ in ENTITY_SUBSET
         ]
         if ents:
-            entities.append(
-                {
-                    "text": doc.text,
-                    "ents": ents,
-                }
-            )
     print(f"Writing {len(entities)} documents to {output_file}...")

 import csv
+import random
 import spacy
 import srsly
 import tqdm
+import yaml
+params = yaml.safe_load(open("params.yaml"))
 nlp = spacy.load("en_core_web_trf")
 INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
 OUTPUT_FILE = "data/processed/entities.jsonl"
+INCLUDE_ENTS = {"GPE", "LOC"}
+EXCLUDE_ENTS = {"PERSON"}
 def process_documents(input_file: str, output_file: str):
                 "end": ent.end_char,
             }
             for ent in doc.ents
         ]
         if ents:
+            found_ents = set([ent["label"] for ent in ents])
+            if found_ents.intersection(INCLUDE_ENTS) and not found_ents.intersection(
+                EXCLUDE_ENTS
+            ):
+                entities.append(
+                    {
+                        "text": doc.text,
+                        "ents": ents,
+                    }
+                )
+    print(f"Randomly selecting {params['max_docs']} documents...")
+    random.shuffle(entities)
+    entities = entities[: params["max_docs"]]
     print(f"Writing {len(entities)} documents to {output_file}...")