"""Extract person names from a DOCX file using a Chinese NER model.

A DOCX file is split into paragraph blocks, each block is run through a
token-classification pipeline, word-piece tokens are merged back into
whole entities, and the deduplicated person names are written to an
Excel file. A Gradio interface drives the whole flow.
"""

from transformers import pipeline
import gradio as gr
import docx
import pandas as pd

# Load the NER model once at import time (downloads on first use).
model_name = "johnyyhk/bert-finetuned-ner-chinese-people-daily"
get_completion = pipeline("ner", model=model_name)


def merge_tokens(tokens):
    """Merge consecutive word-piece tokens belonging to the same entity.

    An ``I-XXX`` token is folded into the previous merged token when that
    token's entity label ends with the same ``XXX`` type (this covers both
    ``B-XXX`` -> ``I-XXX`` and ``I-XXX`` -> ``I-XXX`` transitions).

    The merged token's ``score`` is the true mean of its pieces' scores
    (the original pairwise averaging over-weighted later pieces).

    Input dicts are not mutated; merged entries are copies.
    """
    merged = []
    piece_counts = []  # number of word-pieces folded into each merged token
    for token in tokens:
        if (merged
                and token['entity'].startswith('I-')
                and merged[-1]['entity'].endswith(token['entity'][2:])):
            last = merged[-1]
            # Strip the word-piece continuation marker before joining.
            last['word'] += token['word'].replace('##', '')
            last['end'] = token['end']
            n = piece_counts[-1]
            last['score'] = (last['score'] * n + token['score']) / (n + 1)
            piece_counts[-1] = n + 1
        else:
            # Copy so we never mutate the pipeline's output in place.
            merged.append(dict(token))
            piece_counts.append(1)
    return merged


def extract_person_names(tokens):
    """Collect PER entities from merged tokens.

    Concatenates a ``B-PER`` token with any immediately following
    ``I-PER`` tokens into one name. Returns a deduplicated, sorted list
    so the output order is deterministic.
    """
    names = []
    current_name = ""
    for token in tokens:
        entity = token['entity']
        if entity == 'B-PER':
            # A new person entity starts; flush any name in progress.
            if current_name:
                names.append(current_name)
            current_name = token['word']
        elif entity == 'I-PER' and current_name:
            current_name += token['word']
        else:
            if current_name:
                names.append(current_name)
            current_name = ""
    if current_name:
        names.append(current_name)
    return sorted(set(names))  # dedupe with a stable, reproducible order


def process_docx(file_path):
    """Return the non-empty paragraphs of a DOCX file as a list of strings.

    Paragraphs containing internal newlines are split into separate
    entries; surrounding whitespace is stripped.
    """
    doc = docx.Document(file_path)
    paragraphs = []
    for p in doc.paragraphs:
        text = p.text.strip()
        if not text:
            continue
        # Split on internal line breaks, if any, and drop blank pieces.
        for sub_p in text.split("\n"):
            sub_p = sub_p.strip()
            if sub_p:
                paragraphs.append(sub_p)
    return paragraphs


def create_paragraph_blocks(paragraphs, block_size=4):
    """Join paragraphs into newline-separated blocks of *block_size* each.

    Batching paragraphs keeps the number of pipeline calls low while
    staying within the model's input-length limits.
    """
    return ["\n".join(paragraphs[i:i + block_size])
            for i in range(0, len(paragraphs), block_size)]


def process_ner(file):
    """Run NER over an uploaded DOCX file and save names to an Excel file.

    Accepts either a file-like object with a ``.name`` attribute (older
    Gradio) or a plain filepath string (newer Gradio). Returns the path
    of the generated Excel file.
    """
    file_path = file if isinstance(file, str) else file.name
    paragraphs = process_docx(file_path)
    all_names = []
    for block in create_paragraph_blocks(paragraphs):
        tokens = get_completion(block)
        merged_tokens = merge_tokens(tokens)
        all_names.extend(extract_person_names(merged_tokens))
    all_names = sorted(set(all_names))  # dedupe across blocks, stable order

    # Save the names to an Excel file next to the working directory.
    df = pd.DataFrame({'Person Names': all_names})
    output_path = "ner_output.xlsx"
    df.to_excel(output_path, index=False)
    return output_path


def ner_interface(file):
    """Gradio callback: run the pipeline and return (message, file path)."""
    output_path = process_ner(file)
    return f"NER completado. Archivo guardado en: {output_path}", output_path


demo = gr.Interface(
    fn=ner_interface,
    inputs=gr.File(label="Sube un archivo DOCX"),
    outputs=[gr.Textbox(label="Resultado"), gr.File(label="Descargar archivo")],
    title="NER de Nombres de Personas",
    description="Extrae nombres de personas desde un archivo DOCX usando NER y guarda los resultados en un archivo Excel.",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch(inline=False)