"""Gradio demo that highlights named entities found by the Impresso NER model."""

import html
import numbers

import gradio as gr
from transformers import AutoTokenizer, pipeline

# Define the model name
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

# Load the tokenizer and model using the pipeline.
# NOTE(review): trust_remote_code=True allows transformers to execute Python
# code shipped in the model repository; presumably required for the
# "generic-ner" task -- confirm, and consider pinning a model revision.
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline(
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    trust_remote_code=True,
    device="cpu",
)

# Sentences offered as one-click examples in the interface.
_EXAMPLE_SENTENCES = [
    [
        "Des chercheurs de l'Université de Cambridge ont développé une nouvelle technique de calcul quantique qui promet d'augmenter exponentiellement les vitesses de calcul."
    ],
    [
        "Le rapport complet sur ces découvertes a été publié dans la prestigieuse revue 'Nature Physics'. (Reuters)"
    ],
    ["In the year 1789, the Estates-General was convened in France."],
    [
        "The event was held at the Palace of Versailles, a symbol of French monarchy."
    ],
    [
        "At Versailles, Marie Antoinette, the Queen of France, was involved in discussions."
    ],
    [
        "Maximilien Robespierre, a leading member of the National Assembly, also participated."
    ],
    [
        "Jean-Jacques Rousseau, the famous philosopher, was a significant figure in the debate."
    ],
    [
        "Another important participant was Charles de Talleyrand, the Bishop of Autun."
    ],
    [
        "Meanwhile, across the Atlantic, George Washington, the first President of the United States, was shaping policies."
    ],
    [
        "Thomas Jefferson, the nation's Secretary of State, played a key role in drafting policies for the new American government."
    ],
]


def format_entities_as_html(entities):
    """Render entity dictionaries as an HTML fragment, one ``<div>`` per entity.

    Every key/value pair of an entity is emitted as ``Key: value`` followed by
    a line break, except the positional keys ``start``, ``end`` and ``index``.
    Non-integer real numbers are shown with two decimals; the check uses the
    ``numbers`` ABCs so that NumPy scalars such as ``np.float32`` (which are
    not ``float`` subclasses) are formatted too. Keys and values are
    HTML-escaped because they carry model output and user-typed text.

    Args:
        entities: Iterable of entity dictionaries.

    Returns:
        The HTML fragment as a single string.
    """
    excluded_keys = {"start", "end", "index"}  # Keys to exclude from the output
    # NOTE(review): plain <div>/<br> markup without styling -- confirm this
    # matches the intended presentation.
    parts = ["<div>"]
    for entity in entities:
        parts.append("<div>")  # Each entity in a separate div
        for key, value in entity.items():
            if key in excluded_keys:
                continue
            field = html.escape(str(key).capitalize())
            is_fractional = isinstance(value, numbers.Real) and not isinstance(
                value, numbers.Integral
            )
            if is_fractional:
                parts.append(f"{field}: {value:.2f}<br>")
            else:
                parts.append(f"{field}: {html.escape(str(value))}<br>")
        parts.append("</div>")
    parts.append("</div>")
    return "".join(parts)


def _spans_conflict(first, second):
    """Return True when two (start, end) spans are identical or overlap."""
    return first == second or (first[0] < second[1] and second[0] < first[1])


def _entity_label(entity):
    """Build the display label: the type plus any title/name/function."""
    label = f"{entity['type']}"
    for key in ("title", "name", "function"):
        if key in entity:
            label += f" - {key.capitalize()}: {entity[key]}"
    return label


def extract_entities(sentence):
    """Run the NER pipeline on *sentence* and shape the result for display.

    Each pipeline result is expected to provide ``lOffset``, ``rOffset`` and
    ``type``, and may provide ``title``, ``name`` and ``function``. The
    returned entities are copies of those results with four keys added:
    ``start``/``end`` (the offsets), ``surface`` (the stripped text slice) and
    ``entity`` (the display label).

    A span that is identical to, or overlaps, a span kept earlier is skipped,
    because a flat highlighted text cannot represent overlapping mentions.
    Entities are kept in the order the pipeline returned them.

    Args:
        sentence: The text to analyse. Empty, blank or ``None`` input yields
            an empty entity list without calling the model.

    Returns:
        A dict ``{"text": <input text>, "entities": [<entity dict>, ...]}``.
    """
    if not sentence or not sentence.strip():
        return {"text": sentence or "", "entities": []}

    results = ner_pipeline(sentence)

    # Debug output: the raw pipeline result and the text it was computed from.
    print(f"NER results: {results}")
    print(f"Original text: {sentence}")

    entities = []
    kept_spans = []  # Spans already accepted, used to reject overlaps
    for raw in results:
        span = (raw["lOffset"], raw["rOffset"])
        if any(_spans_conflict(span, kept) for kept in kept_spans):
            continue
        kept_spans.append(span)

        # Work on a copy so the pipeline's own result objects stay untouched.
        entity = dict(raw)
        entity["start"], entity["end"] = span
        entity["surface"] = sentence[span[0] : span[1]].strip()
        entity["entity"] = _entity_label(entity)
        entities.append(entity)

    print(f"Entities: {entities}")
    return {"text": sentence, "entities": entities}


def ner_app_interface():
    """Build the Gradio interface around ``extract_entities`` and launch it.

    The interface takes a multi-line text box as input, shows the mentions in
    a highlighted-text component and is launched with ``share=True``.
    """
    input_sentence = gr.Textbox(
        lines=5, label="Input Sentence", placeholder="Enter a sentence for NER:"
    )

    interface = gr.Interface(
        fn=extract_entities,
        inputs=input_sentence,
        outputs=[gr.HighlightedText(label="Text with mentions")],
        title="Named Entity Recognition",
        description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
        examples=_EXAMPLE_SENTENCES,
        live=False,
    )

    interface.launch(share=True)


# Run the app
if __name__ == "__main__":
    ner_app_interface()