File size: 3,313 Bytes
dbb4ab1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
from transformers import pipeline
import gradio as gr
import docx
import pandas as pd
# Load the pretrained NER model (BERT fine-tuned on the Chinese
# People's Daily corpus); `get_completion` is a token-classification
# pipeline used for every text block.
model_name = "johnyyhk/bert-finetuned-ner-chinese-people-daily"
get_completion = pipeline("ner", model=model_name)
# Función para fusionar tokens
def merge_tokens(tokens):
    """Merge wordpiece tokens that continue the previous entity.

    A token tagged ``I-XXX`` whose type matches the entity of the last
    merged token is folded into it: word pieces are concatenated with
    the ``##`` wordpiece prefix stripped, the span end is extended, and
    the scores are averaged pairwise. Any other token starts a new
    entry. Returns the merged token list.
    """
    merged = []
    for tok in tokens:
        continues_previous = (
            merged
            and tok['entity'].startswith('I-')
            and merged[-1]['entity'].endswith(tok['entity'][2:])
        )
        if continues_previous:
            prev = merged[-1]
            prev['word'] += tok['word'].replace('##', '')
            prev['end'] = tok['end']
            prev['score'] = (prev['score'] + tok['score']) / 2
        else:
            merged.append(tok)
    return merged
# Extraer nombres de personas de la respuesta del modelo
def extract_person_names(tokens):
    """Collect unique person names from NER-tagged tokens.

    A ``B-PER`` token starts a name; subsequent ``I-PER`` tokens extend
    it; any other tag terminates it. Returns the unique names in order
    of first appearance.

    Fix: the original returned ``list(set(names))``, whose ordering is
    nondeterministic across runs; ``dict.fromkeys`` deduplicates while
    keeping insertion order, so the output (and the Excel file built
    from it) is reproducible.
    """
    names = []
    current_name = ""
    for token in tokens:
        entity = token['entity']
        if entity == 'B-PER':
            # A new name begins; flush any name in progress.
            if current_name:
                names.append(current_name)
            current_name = token['word']
        elif entity == 'I-PER' and current_name:
            current_name += token['word']
        else:
            # Non-person tag ends the current name, if any.
            if current_name:
                names.append(current_name)
                current_name = ""
    if current_name:
        names.append(current_name)
    # Deduplicate preserving first-seen order (deterministic output).
    return list(dict.fromkeys(names))
# Procesar el archivo DOCX
def process_docx(file_path):
    """Read a DOCX file and return its non-empty paragraph texts.

    Paragraphs containing embedded newlines are split into separate
    entries; leading/trailing whitespace is stripped everywhere.
    """
    document = docx.Document(file_path)
    collected = []
    for paragraph in document.paragraphs:
        stripped = paragraph.text.strip()
        if not stripped:
            continue
        # A single DOCX paragraph may contain internal line breaks.
        collected.extend(
            piece.strip() for piece in stripped.split("\n") if piece.strip()
        )
    return collected
# Crear bloques de párrafos
def create_paragraph_blocks(paragraphs, block_size=4):
    """Join consecutive paragraphs into newline-separated blocks.

    Each block holds up to ``block_size`` paragraphs; the final block
    may be shorter. An empty input yields an empty list.
    """
    blocks = []
    for start in range(0, len(paragraphs), block_size):
        blocks.append("\n".join(paragraphs[start:start + block_size]))
    return blocks
# Función principal de procesamiento
def process_ner(file):
    """Run NER over a DOCX file and export unique person names to Excel.

    Parameters:
        file: an uploaded-file object exposing ``.name`` (as Gradio
            provides) or, now also accepted, a plain path string.

    Returns the path of the generated ``ner_output.xlsx``.

    Fix: deduplication previously used ``list(set(...))``, whose
    ordering is nondeterministic across runs; ``dict.fromkeys`` keeps
    first-seen order so the spreadsheet is reproducible.
    """
    # Accept both a raw path and a Gradio file wrapper (backward compatible).
    path = file if isinstance(file, str) else file.name
    paragraphs = process_docx(path)
    all_names = []
    # Batch paragraphs so each pipeline call gets more context at once.
    for block in create_paragraph_blocks(paragraphs):
        tokens = get_completion(block)
        all_names.extend(extract_person_names(merge_tokens(tokens)))
    # Deduplicate preserving first-seen order (deterministic output).
    unique_names = list(dict.fromkeys(all_names))
    # Save the results to an Excel file for download.
    df = pd.DataFrame({'Person Names': unique_names})
    output_path = "ner_output.xlsx"
    df.to_excel(output_path, index=False)
    return output_path
# Interfaz de Gradio
def ner_interface(file):
    """Gradio callback: run NER on the uploaded file.

    Returns a status message plus the Excel path twice so the UI can
    show the text and offer the file for download.
    """
    result_path = process_ner(file)
    message = f"NER completado. Archivo guardado en: {result_path}"
    return message, result_path
# Build the Gradio UI: a single DOCX upload in, a status textbox and a
# downloadable Excel file out.
demo = gr.Interface(
    fn=ner_interface,
    inputs=gr.File(label="Sube un archivo DOCX"),
    outputs=[gr.Textbox(label="Resultado"), gr.File(label="Descargar archivo")],
    title="NER de Nombres de Personas",
    description="Extrae nombres de personas desde un archivo DOCX usando NER y guarda los resultados en un archivo Excel.",
    allow_flagging="never"  # hide the flagging button
)
# inline=False: open in a browser tab rather than embedding in a notebook.
demo.launch(inline=False)