#!/usr/bin/env python3
"""
Data Scientist: Dr. Eddy Giusepe Chirinos Isidro

Purpose: this script uses a pre-trained model to extract named entities
and uses Python's logging package to record its logs.
"""
import logging

from transformers import pipeline
class EntityRecognizer:
    """Named-entity recognizer built on a Hugging Face ``"ner"`` pipeline.

    On construction the model is loaded and a file logger is configured.
    ``recognize_entities`` runs the model on a text and returns
    ``(token, tag)`` pairs; ``process_classification_result`` folds those
    pairs into a ``{entity_text: entity_type}`` mapping.
    """

    # Name given to the file handler so repeated instantiation can detect
    # that the shared module logger has already been configured.
    _LOG_HANDLER_NAME = "entity_recognizer_file_handler"

    def __init__(self, model_name="Babelscape/wikineural-multilingual-ner"):
        """Load the NER model and set up logging.

        Args:
            model_name: Hugging Face model identifier. Model card of the
                default: https://huggingface.co/Babelscape/wikineural-multilingual-ner
        """
        self.model = self.load_model(model_name)
        self.logger = self.setup_logger()

    def load_model(self, model_name="Babelscape/wikineural-multilingual-ner"):
        """Return a ``"ner"`` pipeline using ``model_name`` as model and tokenizer."""
        return pipeline("ner", model=model_name, tokenizer=model_name)

    def setup_logger(self):
        """Return the module logger, writing INFO records to a log file.

        The logger returned by ``logging.getLogger(__name__)`` is shared by
        every instance, so the file handler is attached only once; attaching
        it on every call would write each record once per instance created.
        """
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)

        already_configured = any(
            handler.name == self._LOG_HANDLER_NAME for handler in logger.handlers
        )
        if not already_configured:
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            file_handler = logging.FileHandler('reconhecimento_de_entidade.log')
            file_handler.name = self._LOG_HANDLER_NAME
            file_handler.setLevel(logging.INFO)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)
        return logger

    def recognize_entities(self, text):
        """Run the NER model on ``text`` and log what it found.

        Args:
            text: Input passed directly to the model callable.

        Returns:
            A list of ``(word, entity_tag)`` tuples, one per item returned
            by the model, read from its ``'word'`` and ``'entity'`` keys.
        """
        recognized_entities = [
            (entity['word'], entity['entity']) for entity in self.model(text)
        ]
        # Lazy %-formatting: the message is only built if the record is emitted.
        self.logger.info("Entidades reconhecidas: %s", recognized_entities)
        return recognized_entities

    def process_classification_result(self, tokens_and_tags):
        """Group BIO-tagged tokens into whole entities.

        Args:
            tokens_and_tags: Iterable of ``(token, tag)`` pairs where tags
                look like ``"B-PER"`` / ``"I-PER"``; any other tag (for
                example ``"O"``) is treated as outside an entity.

        Returns:
            A dict mapping each entity's text to its type (the tag without
            the ``B-``/``I-`` prefix). Tokens of one entity are joined with
            a single space, except ``##``-prefixed continuation pieces,
            which are glued directly onto the previous token. If the same
            entity text occurs more than once, the last type wins.
        """
        result = {}
        current_type = None
        current_entity = ""

        for token, tag in tokens_and_tags:
            prefix, _, label = tag.partition("-")

            # An I- tag only extends the open entity when the types agree.
            if prefix == "I" and current_type is not None and label == current_type:
                if token.startswith("##"):
                    current_entity += token[2:]
                else:
                    current_entity += " " + token
                continue

            # Anything else ends the entity that was being built.
            if current_type is not None and current_entity:
                result[current_entity] = current_type

            if prefix in ("B", "I") and label:
                # B- starts a new entity; an I- with no matching open entity
                # is treated as a start as well instead of being lost or
                # merged into an unrelated entity.
                current_type = label
                current_entity = token
            else:
                current_type = None
                current_entity = ""

        if current_type is not None and current_entity:
            result[current_entity] = current_type
        return result
if __name__ == "__main__":
    # Usage example. EntityRecognizer() uses its default model; a different
    # Hugging Face model name can be passed as the first argument.
    text = "Eddy e Karina compraram uns tênis na loja Nike."

    entity_recognizer = EntityRecognizer()
    recognized = entity_recognizer.recognize_entities(text)
    print(recognized)
    print("🤗🤗🤗")

    result = entity_recognizer.process_classification_result(recognized)
    # Remove the " ##" left in a key when a "##"-prefixed token was joined
    # to the previous token with a space.
    result = {k.replace(" ##", ""): v for k, v in result.items()}
    print(result)