File size: 3,313 Bytes
dbb4ab1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from transformers import pipeline
import gradio as gr
import docx
import pandas as pd

# Load the pretrained Chinese NER model (BERT fine-tuned on the People's Daily corpus).
# `get_completion` is the HF token-classification pipeline used by process_ner below.
model_name = "johnyyhk/bert-finetuned-ner-chinese-people-daily"
get_completion = pipeline("ner", model=model_name)

# Merge WordPiece sub-tokens into whole-entity tokens
def merge_tokens(tokens):
    """Fold consecutive ``I-<TYPE>`` tokens into the preceding token of the
    same entity type.

    A token continues the previous entity when its tag starts with ``I-`` and
    the previous merged token's tag ends with the same ``<TYPE>`` (i.e. it is
    the ``B-`` piece or an earlier ``I-`` piece of that entity).

    Args:
        tokens: output of ``pipeline("ner")`` — dicts with at least
            ``entity``, ``word``, ``end`` and ``score`` keys.

    Returns:
        A new list of merged token dicts. The caller's dicts are never
        mutated: each merged entry starts as a copy (the original version
        appended references and then mutated them in place, corrupting the
        input list).
    """
    merged_tokens = []
    for token in tokens:
        continues_entity = (
            merged_tokens
            and token['entity'].startswith('I-')
            and merged_tokens[-1]['entity'].endswith(token['entity'][2:])
        )
        if continues_entity:
            last_token = merged_tokens[-1]
            # Strip the WordPiece continuation marker when gluing subwords.
            last_token['word'] += token['word'].replace('##', '')
            last_token['end'] = token['end']
            # Running pairwise average — matches the original behavior;
            # note this is NOT a true mean over all merged pieces.
            last_token['score'] = (last_token['score'] + token['score']) / 2
        else:
            # Copy so later merges cannot mutate the caller's data.
            merged_tokens.append(dict(token))
    return merged_tokens

# Extract person names from the model's token stream
def extract_person_names(tokens):
    """Collect unique person names from NER tokens.

    A name starts at a ``B-PER`` token and is extended by any directly
    following ``I-PER`` tokens; any other tag closes the name in progress.

    Args:
        tokens: dicts carrying ``entity`` and ``word`` keys.

    Returns:
        De-duplicated list of name strings (order unspecified).
    """
    found = set()
    pending = ""
    for tok in tokens:
        label = tok['entity']
        if label == 'B-PER':
            # A new name begins; flush whatever was being built.
            if pending:
                found.add(pending)
            pending = tok['word']
        elif label == 'I-PER' and pending:
            pending += tok['word']
        else:
            # Non-person tag (or orphan I-PER): close the current name.
            if pending:
                found.add(pending)
            pending = ""
    if pending:
        found.add(pending)
    return list(found)

# Read paragraphs out of a DOCX file
def process_docx(file_path):
    """Return the non-empty paragraphs of a DOCX document.

    Paragraphs containing embedded newlines are split into one entry per
    line; surrounding whitespace is stripped and blank pieces dropped.

    Args:
        file_path: path to a ``.docx`` file readable by python-docx.

    Returns:
        List of cleaned paragraph strings, in document order.
    """
    result = []
    for paragraph in docx.Document(file_path).paragraphs:
        stripped = paragraph.text.strip()
        if not stripped:
            continue
        # Honor manual line breaks inside a single DOCX paragraph.
        for piece in stripped.split("\n"):
            piece = piece.strip()
            if piece:
                result.append(piece)
    return result

# Group paragraphs into fixed-size text blocks
def create_paragraph_blocks(paragraphs, block_size=4):
    """Join consecutive paragraphs into newline-separated blocks.

    Args:
        paragraphs: list of paragraph strings.
        block_size: number of paragraphs per block (the last block may be
            shorter).

    Returns:
        List of block strings, each ``block_size`` paragraphs joined by "\\n".
    """
    blocks = []
    for start in range(0, len(paragraphs), block_size):
        blocks.append("\n".join(paragraphs[start:start + block_size]))
    return blocks

# Main processing pipeline
def process_ner(file):
    """Run NER over a DOCX upload and export unique person names to Excel.

    Reads the document, batches its paragraphs into blocks, runs the NER
    pipeline on each block, merges sub-tokens, extracts person names and
    writes the de-duplicated set to ``ner_output.xlsx``.

    Args:
        file: uploaded-file object exposing a ``.name`` path (Gradio File).

    Returns:
        Path of the written Excel file.
    """
    blocks = create_paragraph_blocks(process_docx(file.name))

    unique_names = set()
    for text_block in blocks:
        tokens = get_completion(text_block)
        unique_names.update(extract_person_names(merge_tokens(tokens)))

    # Persist the results as a one-column spreadsheet.
    output_path = "ner_output.xlsx"
    pd.DataFrame({'Person Names': list(unique_names)}).to_excel(output_path, index=False)

    return output_path

# Gradio callback
def ner_interface(file):
    """Adapter between the Gradio UI and process_ner.

    Args:
        file: uploaded DOCX file object from the ``gr.File`` input.

    Returns:
        Tuple of (status message, path to the generated Excel file).
    """
    result_path = process_ner(file)
    message = f"NER completado. Archivo guardado en: {result_path}"
    return message, result_path

# Build the Gradio UI: upload a DOCX, get back a status message and the
# generated Excel file for download. UI labels are intentionally in Spanish.
demo = gr.Interface(
    fn=ner_interface,
    inputs=gr.File(label="Sube un archivo DOCX"),
    outputs=[gr.Textbox(label="Resultado"), gr.File(label="Descargar archivo")],
    title="NER de Nombres de Personas",
    description="Extrae nombres de personas desde un archivo DOCX usando NER y guarda los resultados en un archivo Excel.",
    allow_flagging="never"  # hide the flagging button (NOTE(review): deprecated in Gradio 4.x in favor of flagging_mode)
)

# inline=False: open the app in its own browser tab rather than embedding it
demo.launch(inline=False)