# Streamlit demo: highlight location entities (GPE / LOC) found by a
# pre-trained spaCy NER model, either in text typed by the user or in a
# randomly chosen sample grant loaded from disk.
import random

import spacy
import srsly
import streamlit as st

# NOTE(review): Streamlit re-executes the script on each interaction, so the
# model and the grants file below are presumably reloaded every time; consider
# Streamlit's caching decorators if that cost matters - confirm against the
# Streamlit version in use.
nlp = spacy.load("en_core_web_trf")

# Load pre-processed grants from disk.
grants = list(srsly.read_jsonl("data/processed/entities.jsonl"))

# Both location labels share one highlight colour.
colors = {"GPE": "#5cff84", "LOC": "#5cff84"}
options = {"ents": ["GPE", "LOC"], "colors": colors}

# Template the rendered entity markup is inserted into before display.
HTML_WRAPPER = """
{}
"""

TEXT_AREA_LABEL = "Add your own text or click the button to see a Wellcome example"
TEXT_AREA_HELP = "Enter your own text and press CTRL + ENTER to search for entities"
PLACEHOLDER_TEXT = "Enter your text here"

# Kept at column 0 so the markdown list items start their own lines and are
# rendered as a list rather than as literal asterisks.
SIDEBAR_INFO = """
This example application accompanies the blog post: [Extracting useful information from documents with Named Entity Recognition]().
It uses a pre-trained Named Entity Recognition (NER) model from the [spaCy](https://spacy.io/) library to extract locations from your own examples, or a sample of grant applications from The Wellcome Trust.

The application will extract the following types of location entity:

* __GPE__: Geopolitical entities (countries, cities, states)
* __LOC__: Locations (mountains, rivers, lakes)
"""


def render_entities(doc, colors: dict, options: dict) -> str:
    """Render the entities of a spaCy doc as a single-line HTML string.

    Args:
        doc: Processed spaCy document.
        colors: Not used by this function; kept so existing callers keep
            working. The colours applied are the ones inside ``options``.
        options: displaCy options forwarded to ``spacy.displacy.render``.

    Returns:
        The displaCy "ent" markup with every newline replaced by a space.
    """
    html = spacy.displacy.render(doc, style="ent", options=options)
    return html.replace("\n", " ")


def show_example(text):
    """Run the NER model on ``text`` and write the highlighted result.

    The document is built here from ``text`` instead of being read from a
    module-level ``doc`` variable, so the function no longer depends on the
    caller having assigned that global beforehand.

    Args:
        text: Raw text to analyse.

    Returns:
        ``text``, unchanged.
    """
    doc = nlp(text)
    html = render_entities(doc, colors, options)
    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
    return text


st.header("Location Recognition Demo 🔎🌆🌍")
st.sidebar.header("Information ℹ")
st.sidebar.markdown(SIDEBAR_INFO)

# The button only decides which default text the text area starts with; the
# widget and the entity rendering are otherwise the same in both cases.
# NOTE(review): because the default value differs between the run in which the
# button was clicked and the following rerun, edits made to a sample are
# presumably discarded on the next rerun - confirm, and consider keeping the
# text in st.session_state if so.
if st.button("Show Wellcome example", key="text"):
    default_text = random.choice(grants)["text"]
else:
    default_text = PLACEHOLDER_TEXT

text = st.text_area(
    TEXT_AREA_LABEL,
    value=default_text,
    height=200,
    help=TEXT_AREA_HELP,
)
show_example(text)

st.markdown(
    "Examples from The Wellcome Trust are taken from data that are published "
    "openly at [360 Giving](https://data.threesixtygiving.org/). They are "
    "published under a [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) "
    "license."
)