Spaces:
Sleeping
Sleeping
File size: 2,153 Bytes
cf00609 c5922b9 d25649c c5922b9 507724b c5922b9 28094fc fb510e6 d25649c e607aa8 d25649c cf00609 28094fc d25649c 28094fc c5922b9 28094fc c5922b9 28094fc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# Updated NamedEntityRecognitionTool in ner_tool.py
from transformers import pipeline
from transformers import Tool
class NamedEntityRecognitionTool(Tool):
    """Tool that runs a ``"ner"`` pipeline and reports word-level entities.

    The pipeline output is read as a sequence of per-token dicts with the
    keys ``"entity"`` (label), ``"word"`` (token text), ``"start"`` and
    ``"end"`` (character offsets into the input).  Tokens whose text starts
    with ``"##"`` are treated as continuations of the preceding token and are
    merged into it, so each reported entry describes a whole word.
    """

    name = "ner_tool"
    description = "Identifies and labels various entities in a given text."
    inputs = ["text"]
    outputs = ["text"]

    # Pipeline instance, created on first use and then reused so the model is
    # not rebuilt on every call.
    _ner_analyzer = None

    def __call__(self, text: str):
        """Run named entity recognition on *text*.

        Args:
            text: The text to analyse.

        Returns:
            A dict ``{"entities": [...]}`` where each item is a dict with the
            keys ``"word"`` (merged token text), ``"label"`` (label of the
            word's first token) and ``"entity_text"`` (the matching slice of
            *text* when offsets are available, otherwise the merged token
            text).
        """
        if self._ner_analyzer is None:
            self._ner_analyzer = pipeline("ner")

        entities = self._ner_analyzer(text)
        word_entities = self._merge_subword_tokens(text, entities)

        # Print the identified word-level entities
        print(f"Word-level Entities: {word_entities}")
        return {"entities": word_entities}

    @staticmethod
    def _merge_subword_tokens(text, entities):
        """Fold ``"##"``-prefixed sub-word tokens into the word they continue.

        Args:
            text: The original input text the offsets refer to.
            entities: Iterable of per-token dicts as described on the class.

        Returns:
            A list of ``{"word", "label", "entity_text"}`` dicts, one per word.
        """
        word_entities = []
        word_start = None  # offset where the word currently being built starts
        prev_end = None  # offset where the previous token ended

        for entity in entities:
            label = entity.get("entity", "UNKNOWN")
            token = entity.get("word", "")
            start = entity.get("start")
            end = entity.get("end")

            # Offsets may be missing or None; slicing with None would return
            # the whole text, so only trust a well-formed span.
            has_span = (
                start is not None and end is not None and 0 <= start <= end
            )

            is_subword = token.startswith("##")
            piece = token[2:] if is_subword else token

            # A sub-word piece continues the previous entry only if one
            # exists and, when offsets are known on both sides, the piece
            # starts exactly where the previous token ended.  Otherwise its
            # head token was not reported and it is emitted as its own word.
            continues_previous = (
                is_subword
                and bool(word_entities)
                and (not has_span or prev_end is None or start == prev_end)
            )

            if continues_previous:
                current = word_entities[-1]
                current["word"] += piece
                if has_span and word_start is not None:
                    current["entity_text"] = text[word_start:end].strip()
                else:
                    # No reliable offsets: extend by concatenation, and keep
                    # doing so for any further pieces of this word.
                    current["entity_text"] += piece
                    word_start = None
            else:
                entity_text = text[start:end].strip() if has_span else piece
                word_entities.append(
                    {"word": piece, "label": label, "entity_text": entity_text}
                )
                word_start = start if has_span else None

            prev_end = end if has_span else None

        return word_entities
|