Spaces:

WordLift
/

entity-linking

Running

File size: 7,803 Bytes

bbcf937
59c3f8c
bbcf937
44b938c
24d58c0
117cafd
 
cb76a4b
 
 
 
 
 
 
 
 
 
 
 
0bec8b3
542aecd
d24252f
4ac935a
bbcf937
8e7625e
d24252f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e7625e
 
 
 
 
4ac935a
8e7625e
4ac935a
 
 
 
 
 
 
8e7625e
4ac935a
8e7625e
42d1bed
dedd775
9494755
 
 
40806d0
a168d09
fdfd405
9494755
74fb24f
d24252f
 
 
 
 
9494755
 
 
 
dedd775
320ee5a
9494755
bbcf937
320ee5a
 
c9574f5
 
b126447
c9574f5
971e940
44b938c
 
b126447
 
9d9274e
44b938c
 
 
 
9d9274e
bbcf937
 
3dac3c5
c9574f5
bbcf937
dd4ee36
 
 
 
bbcf937
74fb24f
fe49e8e
 
b126447
 
890d925
d24252f
9e9596c
890d925
 
d8a2dff
890d925
 
117cafd
fdfd405
0fe6ed0
dd4ee36
117cafd
0fe6ed0
117cafd
 
 
 
0fe6ed0
 
117cafd
98acdc3
 
 
117cafd
98acdc3
 
117cafd
 
 
 
 
98acdc3
a4303d6
9d9274e
 
 
 
a4303d6
 
 
98acdc3
117cafd
98acdc3
 
 
 
 
 
 
 
 
 
 
 
 
 
117cafd
98acdc3
 
117cafd
98acdc3
 
9d9274e
 
5cb9d08
 
 
44b938c
5cb9d08
 
 
 
 
 
542aecd
5cb9d08
542aecd
 
5cb9d08
24d58c0
49703d7
31c00d2

import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import json
import spacy

# Page config
# Entries shown in the app's menu; the keys are the menu item names and the
# values are the link target / markdown text passed through to Streamlit.
page_menu_items = {
    'Get Help': 'https://wordlift.io/book-a-demo/',
    'About': "# This is a demo app for NEL/NED/NER and SEO",
}

# Title, icon, layout and initial sidebar state of the page.
st.set_page_config(
    menu_items=page_menu_items,
    initial_sidebar_state="collapsed",
    layout="wide",
    page_icon="fav-ico.png",
    page_title="Entity Linking by WordLift",
)

# Sidebar
st.sidebar.image("logo-wordlift.png")

# Keep the options in a list rather than a set: a set has no guaranteed
# iteration order, so `index=0` would pick an arbitrary default language that
# can differ from one process start to the next. With a list the default is
# always "English" and the options are always shown in this order.
language_options = ["English", "English - spaCy", "German"]
selected_language = st.sidebar.selectbox("Select the Language", language_options, index=0)

# Based on selected language, configure model, entity set, and citation options
if selected_language in ("German", "English - spaCy"):
    # These two languages go through the spaCy pipeline, which takes no
    # model-name / entity-set choice, so both selections stay unset.
    selected_model_name = None
    selected_entity_set = None

    entity_fishing_citation = """
    @misc{entity-fishing,
    title = {entity-fishing},
    publisher = {GitHub},
    year = {2016--2023},
    archivePrefix = {swh},
    eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
    }
    """

    with st.sidebar.expander('Citations'):
        st.markdown(entity_fishing_citation)
else:
    # Plain "English" uses the ReFinED model; let the user pick which
    # pretrained model and which entity set to load.
    model_options = ["aida_model", "wikipedia_model_with_numbers"]
    entity_set_options = ["wikidata", "wikipedia"]

    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)

    refined_citation = """
    @inproceedings{ayoola-etal-2022-refined,
    title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
    author = "Tom Ayoola, Shubhi Tyagi, Joseph Fisher, Christos Christodoulopoulos, Andrea Pierleoni",
    booktitle = "NAACL",
    year = "2022"
    }
    """

    with st.sidebar.expander('Citations'):
        st.markdown(refined_citation)

@st.cache_resource  # cache the loaded model as a Streamlit resource
def load_model(selected_language, model_name=None, entity_set=None):
    """Load the entity-linking model that matches the selected language.

    Args:
        selected_language: Language label chosen in the sidebar.
        model_name: Pretrained model name, forwarded to
            ``Refined.from_pretrained`` when no spaCy pipeline applies.
        entity_set: Entity set name, forwarded to ``Refined.from_pretrained``
            when no spaCy pipeline applies.

    Returns:
        For "German" and "English - spaCy", a spaCy pipeline with the
        "entityfishing" component added; for any other language, the object
        returned by ``Refined.from_pretrained``.
    """
    # spaCy package to load for each language handled by the spaCy pipeline.
    spacy_packages = {
        "German": "de_core_news_lg",
        "English - spaCy": "en_core_web_sm",
    }

    package_name = spacy_packages.get(selected_language)
    if package_name is None:
        # Every other language falls back to the pretrained ReFinED model.
        return Refined.from_pretrained(model_name=model_name, entity_set=entity_set)

    nlp_pipeline = spacy.load(package_name)
    nlp_pipeline.add_pipe("entityfishing")
    return nlp_pipeline

# Use the cached model
# load_model is decorated with st.cache_resource, and it receives the three
# sidebar selections (language, model name, entity set) as its arguments.
# The model name and entity set are None for the spaCy-backed languages.
model = load_model(selected_language, selected_model_name, selected_entity_set)

# Helper functions
def get_wikidata_id(entity_string):
    """Turn a ``key=value`` style string into a Wikidata id and entity link.

    The id is the text between the first "=" and the next "=" (or the end of
    the string when there is no second "=").

    Args:
        entity_string: String containing at least one "=".

    Returns:
        A dict with "id" (the extracted id) and "link" (the id appended to
        "http://www.wikidata.org/entity/").

    Raises:
        IndexError: If ``entity_string`` contains no "=".
    """
    wikidata_id = entity_string.split("=")[1]
    return {
        "id": wikidata_id,
        "link": f"http://www.wikidata.org/entity/{wikidata_id}",
    }
    
def get_entity_data(entity_link, timeout=10):
    """Fetch the JSON description of an entity from the WordLift API.

    This is a best-effort lookup: any request failure is reported on stdout
    and turned into ``None`` so that one failing entity does not stop the app.

    Args:
        entity_link: Entity URL, e.g. "http://www.wikidata.org/entity/Q312".
        timeout: Seconds to wait for the API before giving up. Without a
            timeout a stalled connection would block the script indefinitely.

    Returns:
        The decoded JSON body of the response, or ``None`` when the link is
        empty or not a string, the request fails, the API answers with an
        HTTP error status, or the body is not valid JSON.
    """
    if not isinstance(entity_link, str) or not entity_link:
        return None

    # The link is embedded in the request path with "http://" rewritten to
    # "http/" (presumably the form the endpoint expects - confirm with the API).
    formatted_link = entity_link.replace("http://", "http/")
    try:
        response = requests.get(
            f'https://api.wordlift.io/id/{formatted_link}',
            timeout=timeout,
        )
        # Do not treat the body of a 4xx/5xx answer as entity data.
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors, timeouts and HTTP errors;
        # ValueError covers a body that cannot be decoded as JSON.
        print(f"Exception when fetching data for entity: {entity_link}. Exception: {e}")
        return None
            
# Create the form
with st.form(key='my_form'):
    text_input = st.text_area(label='Enter a sentence')
    submit_button = st.form_submit_button(label='Analyze')

# Colours handed to annotated_text, keyed by the "@type" found in the data
# fetched for an entity; anything else gets the default colour.
ENTITY_TYPE_COLORS = {
    "Place": "#8AC7DB",
    "Organization": "#ADD8E6",
    "Person": "#67B7D1",
    "Product": "#2ea3f2",
    "CreativeWork": "#00BFFF",
    "Event": "#1E90FF",
}
DEFAULT_ENTITY_COLOR = "#8ef"


def _is_linked(entity_info):
    """Return True when *entity_info* has a usable Wikidata id and link.

    get_wikidata_id always prefixes the link with the Wikidata base URL, so a
    missing id shows up as the literal id "None", never as a link equal to
    "None"; both fields are therefore checked.
    """
    return all(entity_info.get(field) not in (None, "None") for field in ("id", "link"))


def _color_for(entity_data):
    """Return the annotation colour for the fetched *entity_data* (may be None)."""
    entity_type = entity_data.get("@type") if isinstance(entity_data, dict) else None
    # "@type" may be something other than a plain string (e.g. a list), which
    # cannot be used as a dictionary key; those fall back to the default.
    if isinstance(entity_type, str):
        return ENTITY_TYPE_COLORS.get(entity_type, DEFAULT_ENTITY_COLOR)
    return DEFAULT_ENTITY_COLOR


def _build_annotated_segments(text, annotations):
    """Split *text* into plain strings and ``(mention, label, colour)`` tuples.

    Args:
        text: The text entered by the user.
        annotations: Mapping of mention text to a ``(label, colour)`` pair.

    Returns:
        A list suitable for ``annotated_text(*segments)``. For every mention,
        the first occurrence that does not overlap an earlier annotation is
        turned into a tuple; everything else is kept as plain text.
    """
    claimed = []  # (start, end, mention, label, colour)
    for mention, (label, color) in annotations.items():
        if not mention:
            continue
        start = text.find(mention)
        while start != -1:
            end = start + len(mention)
            if all(end <= s or start >= e for s, e, *_ in claimed):
                claimed.append((start, end, mention, label, color))
                break
            start = text.find(mention, start + 1)

    segments = []
    cursor = 0
    for start, end, mention, label, color in sorted(claimed, key=lambda span: span[0]):
        if start > cursor:
            segments.append(text[cursor:start])
        segments.append((mention, label, color))
        cursor = end
    if cursor < len(text):
        segments.append(text[cursor:])
    return segments


# Initialization
entities_map = {}
entities_data = {}

# Nothing is displayed unless the form was submitted, so the model run and the
# per-entity network lookups are only done on submit.
if submit_button and text_input:
    if selected_language in ["German", "English - spaCy"]:
        doc = model(text_input)
        for ent in doc.ents:
            wikidata_url = ent._.url_wikidata
            if not wikidata_url:
                continue
            # Ensure correct format for the German and English model
            formatted_wikidata_url = wikidata_url.replace(
                "https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/"
            )
            entities_map[ent.text] = {"id": ent._.kb_qid, "link": formatted_wikidata_url}
            entity_data = get_entity_data(formatted_wikidata_url)
            if entity_data is not None:
                entities_data[ent.text] = entity_data
    else:
        for entity in model.process_text(text_input):
            # NOTE(review): this parses the string form of each result, so it
            # depends on how the library prints its spans - confirm whether
            # the span object exposes the text and id directly.
            single_entity_list = str(entity).strip('][').replace("\'", "").split(', ')
            if len(single_entity_list) >= 2 and "wikidata" in single_entity_list[1]:
                mention = single_entity_list[0].strip()
                entities_map[mention] = get_wikidata_id(single_entity_list[1])
                entity_data = get_entity_data(entities_map[mention]["link"])
                if entity_data is not None:
                    entities_data[mention] = entity_data

    # mention -> [id/link info, fetched entity data or None]
    combined_entity_info_dictionary = {
        mention: [entity_info, entities_data.get(mention)]
        for mention, entity_info in entities_map.items()
    }

    # JSON-LD data
    json_ld_data = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "mentions": [],
    }

    # Collect the label/colour for each linked entity and its JSON-LD entry.
    annotations = {}
    for entity_string, entity_info in entities_map.items():
        if not _is_linked(entity_info):
            continue  # skip entities without a valid Wikidata link

        entity_data = entities_data.get(entity_string)
        annotations[entity_string] = (entity_info["id"], _color_for(entity_data))

        # Add the entity to JSON-LD data
        if entity_data and isinstance(entity_data, dict) and entity_data.get("link") != "None":
            json_ld_data["mentions"].append(entity_data)

    # The segments are built directly from the text. The user's input is never
    # passed to eval(), and braces typed by the user are shown as plain text.
    annotated_text(*_build_annotated_segments(text_input, annotations))

    with st.expander("See annotations"):
        st.write(combined_entity_info_dictionary)

    with st.expander("Here is the final JSON-LD"):
        st.json(json_ld_data)  # Output JSON-LD