Commit b8d16b2 by mattupson
Parent: eb457d3

chg: Extract locations from Wellcome examples
.dvc/.gitignore ADDED
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
.dvc/config ADDED
@@ -0,0 +1,5 @@
+[core]
+    remote = s3
+    autostage = true
+['remote "s3"']
+    url = s3://mantisnlp-blogs/ner
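
Note: this config registers an S3 bucket as the default DVC remote, so `dvc push`/`dvc pull` move tracked data in and out of s3://mantisnlp-blogs/ner. As a hedged sketch (the repo URL below is a placeholder, not part of this commit), DVC's Python API can also read a tracked artifact straight from the remote:

import dvc.api

# Sketch: stream a DVC-tracked file from the configured remote without
# a full `dvc pull`. The repo URL is an assumed placeholder.
with dvc.api.open(
    "data/processed/entities.jsonl",
    repo="https://github.com/example/ner-demo",  # placeholder
) as f:
    print(f.readline())  # first JSON line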
.dvcignore ADDED
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
.gitignore CHANGED
@@ -1,2 +1,3 @@
 __pycache__/
-images/
+images/
+.venv/
app.py CHANGED
@@ -1,40 +1,45 @@
+import random
+
 import spacy
+import srsly
 import streamlit as st
 
-# from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
+nlp = spacy.load("en_core_web_trf")
 
+# Load pre-processed grants from disk.
 
-def render_entities(entities):
-    colors = {"LOCATION": "#5cff84"}
-    options = {"ents": ["LOCATION"], "colors": colors}
-    html = spacy.displacy.render(entities, style="ent", options=options, manual=True)
-    html = html.replace("\n", " ")
+grants = list(srsly.read_jsonl("data/processed/entities.jsonl"))
 
-    return html
-
+colors = {"GPE": "#5cff84", "LOC": "#5cff84"}
+options = {"ents": ["GPE", "LOC"], "colors": colors}
 
 HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
 
-st.header("Location Entity Recognition Demo 🔎🌆🌍")
-threshold = st.sidebar.slider("Threshold", value=0.5, min_value=0.0, max_value=1.0)
-display_probabilities = st.sidebar.checkbox("Display probabilities")
 
-text = st.text_area("Text input", value="This text is about Malaria", height=400)
+def render_entities(doc, colors: dict, options: dict) -> str:
+    """
+    Render the entities in a spaCy Doc as displaCy HTML.
+    """
 
-nlp = spacy.load("en_core_web_trf")
+    html = spacy.displacy.render(doc, style="ent", options=options)
+    html = html.replace("\n", " ")
 
-doc = nlp(text)
+    return html
 
-ents = [
-    {"start": ent.start_char, "end": ent.end_char, "label": "LOCATION"}
-    for ent in doc.ents
-]
-foo = {"text": text, "ents": ents}
 
+st.header("Location Entity Recognition Demo 🔎🌆🌍")
 
-print(ents)
-print(doc.ents)
+st.subheader("Look for Locations")
 
-html = render_entities(foo)
-st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+if st.button("Show new example", key="text"):
+    sample = random.choice(grants)
+    doc = nlp(sample["text"])
+    html = render_entities(doc, colors, options)
+    text = st.text_area("Text input", value=sample["text"], height=200)
+    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+else:
+    sample = random.choice(grants)
+    doc = nlp(sample["text"])
+    html = render_entities(doc, colors, options)
+    text = st.text_area("Text input", value=sample["text"], height=200, help="Click 'Show new example' to load a new grant description.")
+    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
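
The rewrite drops displaCy's manual mode (hand-built entity dicts passed with `manual=True`) in favour of rendering the spaCy Doc directly, so character offsets no longer have to be assembled by hand. For reference, a minimal sketch of the dict payload manual mode expects; the example sentence is made up:

import spacy

# displaCy "manual" mode renders pre-computed spans instead of a Doc.
example = {
    "text": "The trial ran in Kenya and Uganda.",  # made-up example text
    "ents": [
        {"start": 17, "end": 22, "label": "GPE"},
        {"start": 27, "end": 33, "label": "GPE"},
    ],
}
html = spacy.displacy.render(example, style="ent", manual=True)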
data/processed/.gitignore CHANGED
@@ -1 +1,2 @@
 /wellcome_grant_descriptions.csv
+/entities.jsonl
dvc.lock CHANGED
@@ -7,9 +7,25 @@ stages:
       md5: 5c0d0e532709648b61625e7e130dfaa4
       size: 31028261
     - path: src/subset_data.py
-      md5: 3b6059867baea4de020776bcfdc9c2a4
-      size: 604
+      md5: f4cffd497cb8341cf05728e89cbb0871
+      size: 1008
+    params:
+      params.yaml:
+        n_docs: 500
     outs:
     - path: data/processed/wellcome_grant_descriptions.csv
-      md5: bb28282adc17ccd209ed370bc4557e40
-      size: 1307583
+      md5: 18dd6a7611d7f53b1067def7ba075cba
+      size: 644736
+  entities:
+    cmd: python src/process_docs.py
+    deps:
+    - path: data/processed/wellcome_grant_descriptions.csv
+      md5: 18dd6a7611d7f53b1067def7ba075cba
+      size: 644736
+    - path: src/process_docs.py
+      md5: 1f570b1aa0f44b0bb131317c305deff5
+      size: 1309
+    outs:
+    - path: data/processed/entities.jsonl
+      md5: 26846cdd657a516281b24c376a93f018
+      size: 214902
dvc.yaml CHANGED
@@ -4,5 +4,14 @@ stages:
     deps:
     - src/subset_data.py
    - data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv
+    params:
+    - n_docs
     outs:
     - data/processed/wellcome_grant_descriptions.csv
+  entities:
+    cmd: python src/process_docs.py
+    deps:
+    - src/process_docs.py
+    - data/processed/wellcome_grant_descriptions.csv
+    outs:
+    - data/processed/entities.jsonl
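
With the new `entities` stage, dvc.yaml now describes a two-stage DAG, subset then entities, chained through data/processed/wellcome_grant_descriptions.csv. A minimal sketch of driving a reproduction from Python (plain `dvc repro` on the command line does the same):

import subprocess

# Re-run only stages whose deps or params changed: editing n_docs in
# params.yaml invalidates `subset`, and `entities` re-runs because it
# depends on the subset stage's output CSV.
subprocess.run(["dvc", "repro"], check=True)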
params.yaml ADDED
@@ -0,0 +1 @@
+n_docs: 500
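
`n_docs` is the only tracked parameter; dvc.yaml declares it and src/subset_data.py reads it with the `load_config` helper shown further down. A hedged alternative (assuming a recent DVC 2.x, which this commit's requirements don't pin) is to let DVC resolve the params file itself:

import dvc.api

# Alternative sketch: dvc.api.params_show() parses params.yaml for you.
params = dvc.api.params_show()
n_docs = params["n_docs"]  # 500 in this commit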
requirements.txt CHANGED
@@ -14,10 +14,8 @@ confection==0.0.3
 cymem==2.0.6
 decorator==5.1.1
 entrypoints==0.4
-filelock==3.8.0
 gitdb==4.0.9
 GitPython==3.1.29
-huggingface-hub==0.10.1
 idna==3.4
 importlib-metadata==5.0.0
 importlib-resources==5.10.0
@@ -31,7 +29,7 @@ packaging==21.3
 pandas==1.5.0
 pathy==0.6.2
 Pillow==9.2.0
-pkgutil_resolve_name==1.3.10
+pkgutil-resolve-name==1.3.10
 preshed==3.0.7
 protobuf==3.20.3
 pyarrow==9.0.0
@@ -44,8 +42,6 @@ pyrsistent==0.18.1
 python-dateutil==2.8.2
 pytz==2022.4
 pytz-deprecation-shim==0.1.0.post0
-PyYAML==6.0
-regex==2022.9.13
 requests==2.28.1
 rich==12.6.0
 semver==2.13.0
@@ -58,15 +54,12 @@ spacy-loggers==1.0.3
 srsly==2.4.4
 streamlit==1.13.0
 thinc==8.1.3
-tokenizers==0.13.1
 toml==0.10.2
 toolz==0.12.0
-torch==1.12.1
 tornado==6.2
 tqdm==4.64.1
-transformers==4.23.1
 typer==0.4.2
-typing_extensions==4.4.0
+typing-extensions==4.4.0
 tzdata==2022.4
 tzlocal==4.2
 urllib3==1.26.12
src/process_docs.py ADDED
@@ -0,0 +1,61 @@
+import csv
+
+import spacy
+import srsly
+import tqdm
+
+nlp = spacy.load("en_core_web_trf")
+
+INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
+OUTPUT_FILE = "data/processed/entities.jsonl"
+ENTITY_SUBSET = ["GPE", "LOC"]
+
+
+def process_documents(input_file: str, output_file: str):
+
+    data = []
+
+    print(f"Reading data from {input_file}...")
+
+    with open(input_file, "r") as f:
+        reader = csv.reader(f)
+        next(reader)  # skip the header row
+
+        for row in reader:
+            data.append(row[0])
+
+    print(f"Processing {len(data)} documents...")
+
+    entities = []
+
+    for doc_ in tqdm.tqdm(data):
+        doc = nlp(doc_)
+
+        # Keep only entities whose label is in ENTITY_SUBSET (GPE, LOC)
+
+        ents = [
+            {
+                "text": ent.text,
+                "label": ent.label_,
+                "start": ent.start_char,
+                "end": ent.end_char,
+            }
+            for ent in doc.ents
+            if ent.label_ in ENTITY_SUBSET
+        ]
+
+        if ents:
+            entities.append(
+                {
+                    "text": doc.text,
+                    "ents": ents,
+                }
+            )
+
+    print(f"Writing {len(entities)} documents to {output_file}...")
+
+    srsly.write_jsonl(output_file, entities)
+
+
+if __name__ == "__main__":
+    process_documents(INPUT_FILE, OUTPUT_FILE)
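
Each JSONL record written here ({"text": ..., "ents": [...]}) matches the shape displaCy can render, which is what lets app.py feed samples straight through. A small sketch for sanity-checking the output:

import collections

import srsly

# Tally entity labels in the extraction output.
counts = collections.Counter(
    ent["label"]
    for record in srsly.read_jsonl("data/processed/entities.jsonl")
    for ent in record["ents"]
)
print(counts)  # expect only GPE and LOC, per ENTITY_SUBSET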
src/subset_data.py CHANGED
@@ -1,25 +1,41 @@
 import numpy as np
 import pandas as pd
+import yaml
+
+
+def load_config(config_file: str) -> dict:
+    with open(config_file) as f:
+        config = yaml.safe_load(f)
+
+    return config
+
 
 INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
 OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
 
-print(f"Reading data from {INPUT_FILE}")
 
-data = pd.read_csv(INPUT_FILE)
+def subset_docs(input_file: str, output_file: str, sample: int):
 
-data = (
-    data[["Description"]]
-    .replace("Not available", np.nan)
-    .dropna()
-    .drop_duplicates()
-    .reset_index(drop=True)
-    .sample(1000)
-)
+    print(f"Reading data from {input_file}")
+
+    data = pd.read_csv(input_file)
+
+    data = (
+        data[["Description"]]
+        .replace("Not available", np.nan)
+        .dropna()
+        .drop_duplicates()
+        .reset_index(drop=True)
+        .sample(sample)
+    )
 
-print(f"Number of rows: {data.shape[0]}")
-print(f"Number of unique rows: {data['Description'].nunique()}")
+    print(f"Number of rows: {data.shape[0]}")
+    print(f"Number of unique rows: {data['Description'].nunique()}")
 
-print(f"Saving file to {OUTPUT_FILE}")
-data.to_csv(OUTPUT_FILE, index=False)
+    print(f"Saving file to {output_file}")
+    data.to_csv(output_file, index=False)
+
+
+if __name__ == "__main__":
+    params = load_config("params.yaml")
+    subset_docs(INPUT_FILE, OUTPUT_FILE, sample=params["n_docs"])
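
One caveat: `.sample(sample)` is unseeded, so each run of the stage draws a different subset and DVC records a new output hash. A hedged variant, not in this commit, that would make the stage deterministic:

import pandas as pd

# Assumed tweak: fix the RNG so repeated runs yield an identical CSV.
df = pd.DataFrame({"Description": ["a", "b", "c", "d"]})
subset = df.sample(n=2, random_state=42)  # random_state is the assumed addition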
unpinned_requirements.txt CHANGED
@@ -1,2 +1,3 @@
 streamlit
-spacy
+spacy
+pandas