lik07 committed on
Commit
dbb4ab1
1 Parent(s): eb3ed0e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -0
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import gradio as gr
3
+ import docx
4
+ import pandas as pd
5
+
6
+ # Cargar el modelo NER
7
# Hugging Face Hub checkpoint used for entity tagging (judging by its name, a
# BERT model fine-tuned on Chinese People's Daily data -- confirm on the Hub).
model_name = "johnyyhk/bert-finetuned-ner-chinese-people-daily"

# NER pipeline built once at import time; the rest of this file calls it on
# raw text and reads 'entity', 'word', 'score' and 'end' from each result.
get_completion = pipeline(task="ner", model=model_name)
9
+
10
+ # Función para fusionar tokens
11
def merge_tokens(tokens):
    """Fuse continuation (``I-``) tokens into the entity token that precedes them.

    A token is folded into the previous merged entry when its label starts
    with ``I-`` and the previous entry's label ends with the same entity type
    (e.g. ``I-PER`` after ``B-PER`` or after another ``I-PER``). The merged
    entry keeps the label of its first token, concatenates the words (with
    any ``##`` word-piece marker removed), takes the ``end`` of the last
    token, and carries the arithmetic mean of all fused scores.

    Args:
        tokens: Sequence of dicts with at least the keys ``'entity'``,
            ``'word'``, ``'end'`` and ``'score'``.

    Returns:
        A new list of merged token dicts. The input dicts are copied, so the
        caller's ``tokens`` are left untouched.
    """
    merged_tokens = []
    # Parallel to merged_tokens: how many raw tokens each entry already holds,
    # needed to keep the score a true mean instead of a pairwise running one.
    piece_counts = []
    for token in tokens:
        label = token['entity']
        continues_previous = (
            bool(merged_tokens)
            and label.startswith('I-')
            and merged_tokens[-1]['entity'].endswith(label[2:])
        )
        if continues_previous:
            last_token = merged_tokens[-1]
            seen = piece_counts[-1]
            last_token['word'] += token['word'].replace('##', '')
            last_token['end'] = token['end']
            # Incremental mean over every piece folded in so far.
            last_token['score'] = (last_token['score'] * seen + token['score']) / (seen + 1)
            piece_counts[-1] = seen + 1
        else:
            # Shallow copy so later merges never write into the caller's dict.
            merged_tokens.append(dict(token))
            piece_counts.append(1)
    return merged_tokens
24
+
25
+ # Extraer nombres de personas de la respuesta del modelo
26
def extract_person_names(tokens):
    """Collect the distinct person names found in a sequence of NER tokens.

    A name starts at a ``B-PER`` token and is extended by each ``I-PER``
    token that follows while a name is open. Any other token closes the name
    in progress; an ``I-PER`` token with no open name is ignored.

    Args:
        tokens: Iterable of dicts with at least the keys ``'entity'`` and
            ``'word'``.

    Returns:
        List of unique names, in order of first appearance.
    """
    names = []
    current_name = ""
    for token in tokens:
        label = token['entity']
        if label == 'B-PER':
            # A new name begins; flush the one that was still open.
            if current_name:
                names.append(current_name)
            current_name = token['word']
        elif label == 'I-PER' and current_name:
            current_name += token['word']
        else:
            if current_name:
                names.append(current_name)
            current_name = ""
    if current_name:
        names.append(current_name)
    # dict.fromkeys drops duplicates but, unlike set(), keeps a stable
    # first-seen order, so the result is reproducible between runs.
    return list(dict.fromkeys(names))
43
+
44
+ # Procesar el archivo DOCX
45
def process_docx(file_path):
    """Read a DOCX file and return its non-empty lines of text.

    Each paragraph's text is split on embedded newlines; every resulting
    piece is stripped of surrounding whitespace and blank pieces are dropped.

    Args:
        file_path: Location of the DOCX file, passed to ``docx.Document``.

    Returns:
        List of cleaned text lines in document order.
    """
    document = docx.Document(file_path)
    paragraphs = []
    for paragraph in document.paragraphs:
        # An empty paragraph splits into a single empty piece, which the
        # emptiness check below discards.
        for piece in paragraph.text.strip().split("\n"):
            cleaned = piece.strip()
            if cleaned:
                paragraphs.append(cleaned)
    return paragraphs
55
+
56
+ # Crear bloques de párrafos
57
def create_paragraph_blocks(paragraphs, block_size=4):
    """Group consecutive paragraphs into newline-joined blocks.

    Args:
        paragraphs: Sequence of paragraph strings.
        block_size: Maximum number of paragraphs per block; must be >= 1.

    Returns:
        List of strings, each holding up to ``block_size`` paragraphs joined
        with ``"\\n"``. The last block may be shorter. Empty input yields an
        empty list.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    # Without this check a negative size yields an empty range and silently
    # returns [], discarding every paragraph.
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")
    return [
        "\n".join(paragraphs[start:start + block_size])
        for start in range(0, len(paragraphs), block_size)
    ]
59
+
60
+ # Función principal de procesamiento
61
def process_ner(file):
    """Extract the person names from an uploaded DOCX and save them to Excel.

    The document's text is split into blocks of paragraphs, each block is run
    through the NER pipeline, and the person names found are de-duplicated
    and written to a one-column workbook.

    Args:
        file: The upload as delivered by the Gradio file input. Either an
            object exposing the file's path as ``.name`` or the path itself
            as a string (which of the two Gradio passes depends on its
            version and the component's ``type`` setting).

    Returns:
        Path of the workbook that was written (``"ner_output.xlsx"``,
        relative to the current working directory).

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("No se proporcionó ningún archivo DOCX.")

    # Accept both a plain path string and a tempfile-like wrapper.
    file_path = file if isinstance(file, str) else file.name

    paragraphs = process_docx(file_path)
    paragraph_blocks = create_paragraph_blocks(paragraphs)

    all_names = []
    for block in paragraph_blocks:
        tokens = get_completion(block)
        merged_tokens = merge_tokens(tokens)
        all_names.extend(extract_person_names(merged_tokens))

    # De-duplicate across blocks; sorting makes the row order reproducible
    # (a bare set() would order the names differently from run to run).
    all_names = sorted(set(all_names))

    # Save the result as a single-column Excel sheet.
    # NOTE(review): the fixed file name is shared by all requests, so
    # simultaneous users would overwrite each other's output.
    df = pd.DataFrame({'Person Names': all_names})
    output_path = "ner_output.xlsx"
    df.to_excel(output_path, index=False)

    return output_path
80
+
81
+ # Gradio interfaz
82
def ner_interface(file):
    """Gradio callback: run the NER extraction and report where the result is.

    Args:
        file: The uploaded file, forwarded unchanged to ``process_ner``.

    Returns:
        Tuple ``(status_message, output_path)`` feeding the text box and the
        download component respectively.
    """
    output_path = process_ner(file)
    status_message = f"NER completado. Archivo guardado en: {output_path}"
    return status_message, output_path
85
+
86
# UI components: one DOCX upload in; a status message and the generated
# Excel file out (matching the two values returned by ner_interface).
docx_upload = gr.File(label="Sube un archivo DOCX")
result_components = [
    gr.Textbox(label="Resultado"),
    gr.File(label="Descargar archivo"),
]

demo = gr.Interface(
    fn=ner_interface,
    inputs=docx_upload,
    outputs=result_components,
    title="NER de Nombres de Personas",
    description="Extrae nombres de personas desde un archivo DOCX usando NER y guarda los resultados en un archivo Excel.",
    allow_flagging="never",
)

# Start the web app as soon as the module is executed.
demo.launch(inline=False)