import html

import gradio as gr
from transformers import pipeline, AutoTokenizer
# Identifier of the Impresso NER checkpoint passed to both loaders below
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

# Build the tokenizer first, then hand it to the pipeline factory so both
# are created from the same checkpoint name.
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline(
    task="generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    # NOTE(review): presumably required because "generic-ner" is a custom
    # task shipped with the model repository -- confirm before removing.
    trust_remote_code=True,
    device="cpu",
)
def format_entities_as_html(entities):
    """Render entity dicts as a simple HTML listing.

    Every entity becomes its own ``<div>``; inside it each field is written
    as ``Key: value`` followed by ``<br>``. The keys ``start``, ``end`` and
    ``index`` are skipped, and ``float`` values are shown with two decimals.
    Keys and values are HTML-escaped before being embedded.

    NOTE(review): the string literals in the previous version were broken
    across lines and carried no tags, so the exact original markup is
    unknown; the ``<div>`` / ``<br>`` structure here follows the surviving
    "each entity in a separate div" comment -- confirm it matches the
    intended layout.

    Args:
        entities: Iterable of dicts mapping string field names to values.

    Returns:
        A single HTML string wrapped in an outer ``<div>``.
    """
    excluded_keys = {"start", "end", "index"}  # Keys to exclude from the output

    entity_blocks = []
    for entity in entities:
        rows = []
        # Dynamically add all fields except the excluded ones
        for key, value in entity.items():
            if key in excluded_keys:
                continue
            # Format scores (floats) with two decimals, everything else as-is
            shown = f"{value:.2f}" if isinstance(value, float) else f"{value}"
            rows.append(
                f"{html.escape(key.capitalize())}: {html.escape(shown)}<br>"
            )
        # Each entity in a separate div
        entity_blocks.append("<div>" + "".join(rows) + "</div>")

    return "<div>" + "".join(entity_blocks) + "</div>"
# Function to process the sentence and extract entities
def extract_entities(sentence):
    """Run the NER pipeline on *sentence* and shape the result for display.

    Args:
        sentence: Text to analyse. ``None``, empty, or whitespace-only input
            is answered with an empty entity list without calling the model.

    Returns:
        A dict ``{"text": ..., "entities": [...]}``. Each entity is a copy
        of one pipeline result with four keys added or overwritten:
        ``start`` / ``end`` (taken from ``lOffset`` / ``rOffset``),
        ``surface`` (the stripped slice of *sentence* between those offsets)
        and ``entity`` (the label built from ``type`` plus any ``title``,
        ``name`` and ``function`` fields).

    Raises:
        KeyError: If a pipeline result lacks ``lOffset``, ``rOffset`` or
            ``type``.
    """
    # Nothing to tag: skip inference for blank input.
    if not sentence or not sentence.strip():
        return {"text": sentence or "", "entities": []}

    results = ner_pipeline(sentence)

    # Debug output: the input and the raw pipeline result
    print(f"Original text: {sentence}")
    print(f"NER results: {results}")

    entities = []
    seen_spans = set()  # Exact (start, end) pairs that were already emitted
    for result in results:
        span = (result["lOffset"], result["rOffset"])
        # Only exact duplicate spans are dropped; spans that merely overlap
        # an earlier one are still kept.
        if span in seen_spans:
            continue
        seen_spans.add(span)

        # Work on a copy so the pipeline's own result dicts stay untouched
        entity = dict(result)
        entity["start"], entity["end"] = span
        # The offsets index into the original text; trim stray whitespace
        entity["surface"] = sentence[span[0]:span[1]].strip()

        label = f"{entity['type']}"
        for field in ("title", "name", "function"):
            if field in entity:
                label += f" - {field.capitalize()}: {entity[field]}"
        entity["entity"] = label

        entities.append(entity)

    print(f"Entities: {entities}")
    return {"text": sentence, "entities": entities}
# Create Gradio interface
def ner_app_interface(share=True):
    """Build the Gradio demo around ``extract_entities`` and launch it.

    The interface has one multi-line text box as input and a
    ``gr.HighlightedText`` component as output, plus a list of French and
    English example sentences.

    Args:
        share: Passed straight to ``Interface.launch(share=...)``. Defaults
            to ``True``, which is what this function always used before the
            parameter existed.
    """
    input_sentence = gr.Textbox(
        lines=5, label="Input Sentence", placeholder="Enter a sentence for NER:"
    )

    # One example per inner list because the interface has a single input
    example_sentences = [
        [
            "Des chercheurs de l'Université de Cambridge ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
        ],
        [
            "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. (Reuters)"
        ],
        ["In the year 1789, the Estates-General was convened in France."],
        [
            "The event was held at the Palace of Versailles, a symbol of French monarchy."
        ],
        [
            "At Versailles, Marie Antoinette, the Queen of France, was involved in discussions."
        ],
        [
            "Maximilien Robespierre, a leading member of the National Assembly, also participated."
        ],
        [
            "Jean-Jacques Rousseau, the famous philosopher, was a significant figure in the debate."
        ],
        [
            "Another important participant was Charles de Talleyrand, the Bishop of Autun."
        ],
        [
            "Meanwhile, across the Atlantic, George Washington, the first President of the United States, was shaping policies."
        ],
        [
            "Thomas Jefferson, the nation's Secretary of State, played a key role in drafting policies for the new American government."
        ],
    ]

    # Interface definition
    interface = gr.Interface(
        fn=extract_entities,
        inputs=input_sentence,
        outputs=[gr.HighlightedText(label="Text with mentions")],
        title="Named Entity Recognition",
        description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
        examples=example_sentences,
        live=False,
    )
    interface.launch(share=share)
# Run the app only when this file is executed as a script (not on import)
if __name__ == "__main__":
    ner_app_interface()