|
|
import html
import time
from collections import Counter

import gradio as gr
from transformers import pipeline
|
|
|
|
|
|
|
|
|
|
|
# Model identifiers offered in the UI dropdown, mapped to a short
# human-readable description of each model.
MODELS = {
    "dslim/bert-base-NER": "Английская NER (4 типа сущностей)",
    "Davlan/xlm-roberta-large-ner-hrl": "Многоязычная (крупная, медленная)",
    "Babelscape/wikineural-multilingual-ner": "Многоязычная (9 языков)",
}

# Model loaded at start-up and pre-selected in the dropdown.
DEFAULT_MODEL = "dslim/bert-base-NER"

# Highlight colour per entity group; groups missing from this table are
# rendered with a neutral grey by the highlighting code.
ENTITY_COLORS = {
    "PER": "#FF6B6B",
    "ORG": "#4ECDC4",
    "LOC": "#FFD166",
    "MISC": "#06D6A0",
}

# Upper bound on the number of characters analysed per request; longer
# input is truncated to this length.
MAX_CHARS = 1500
|
|
|
|
|
|
|
|
|
|
|
# Load the NER pipeline once at import time. `pipe` and `current_model_name`
# are module-level state that extract_entities() reads and may replace when
# the user picks another model. If neither model can be loaded both are set
# to None so the UI can still start and report the failure.
#
# The broad `except Exception` is deliberate: this is the start-up boundary,
# and any failure (missing dependency, download error, out of memory) must
# fall through to the next option instead of crashing the app.
try:
    print(f"Загрузка модели: {DEFAULT_MODEL}")
    pipe = pipeline(
        "ner",
        model=DEFAULT_MODEL,
        # Merge word pieces into whole entity spans ("entity_group" records).
        aggregation_strategy="simple",
        # -1 selects the CPU.
        device=-1,
    )
    current_model_name = DEFAULT_MODEL
    print("Модель успешно загружена")
except Exception as primary_error:
    print(f"Ошибка загрузки основной модели: {primary_error}")
    print("Попытка загрузки резервной модели...")
    try:
        pipe = pipeline(
            "ner",
            model="Babelscape/wikineural-multilingual-ner",
            aggregation_strategy="simple",
            device=-1,
        )
        current_model_name = "Babelscape/wikineural-multilingual-ner"
        print("Резервная модель загружена")
    except Exception as fallback_error:
        print(f"Все модели не загрузились: {fallback_error}")
        pipe = None
        current_model_name = None
|
|
|
|
|
|
|
|
def _render_highlight_html(text, entities, length_warning):
    """Return *text* as an HTML fragment with each entity wrapped in ``<mark>``.

    Everything taken from the user input or the model output is HTML-escaped
    before being embedded, so markup typed by the user is displayed literally
    instead of being interpreted by the browser.
    """
    parts = []
    last_end = 0
    for entity in sorted(entities, key=lambda item: item["start"]):
        start, end = entity["start"], entity["end"]
        if start < last_end:
            # The span overlaps one that is already rendered; emitting it
            # would repeat part of the text in the output.
            continue

        # Plain text between the previous entity and this one.
        parts.append(html.escape(text[last_end:start]))

        label = html.escape(str(entity["entity_group"]))
        color = ENTITY_COLORS.get(entity["entity_group"], "#CCCCCC")
        parts.append(
            f'<mark style="background-color: {color}; padding: 2px 4px; '
            f'border-radius: 3px; margin: 1px; border: 1px solid {color}80;" '
            f'title="{label} (уверенность: {entity["score"]:.2f})">'
            f'{html.escape(text[start:end])}<sup>{label}</sup></mark>'
        )
        last_end = end

    # Remaining text after the last entity (empty string if none).
    parts.append(html.escape(text[last_end:]))

    return (
        '<div style="line-height: 1.6; padding: 10px; background: #f5f5f5; '
        'border-radius: 5px;">'
        + "".join(parts)
        + f"<br><small>{length_warning}</small></div>"
    )


def extract_entities(text, model_choice=None):
    """Run NER over *text* and build the values for the five output widgets.

    Args:
        text: Raw user input. It is stripped and truncated to ``MAX_CHARS``
            characters before analysis.
        model_choice: Optional model identifier. When it differs from the
            currently loaded model, that model is loaded and replaces the
            module-level ``pipe`` / ``current_model_name``.

    Returns:
        A 5-tuple ``(status, entities, html, stats, latency)``:
        a status message, a list of ``{"Текст", "Тип", "Уверенность"}``
        dicts, an HTML fragment with highlighted entities, an HTML summary
        of counts per entity type, and the inference time as ``"<ms> мс"``.
        On any failure the status carries the error message and the other
        four items are ``None``.
    """
    global pipe, current_model_name

    if pipe is None:
        return "❌ Модель не загрузилась. Проверьте логи Space.", None, None, None, None

    if not text or not text.strip():
        return "⚠️ Введите текст для анализа", None, None, None, None

    text = text.strip()
    if len(text) > MAX_CHARS:
        text = text[:MAX_CHARS]
        length_warning = f" (обрезано до {MAX_CHARS} символов)"
    else:
        length_warning = ""

    if model_choice and model_choice != current_model_name:
        try:
            new_pipe = pipeline(
                "ner",
                model=model_choice,
                aggregation_strategy="simple",
                device=-1,
            )
        except Exception as e:
            return f"❌ Не удалось загрузить модель {model_choice}: {str(e)}", None, None, None, None
        # Swap the globals only after a successful load, so a failed switch
        # leaves the previously working model in place.
        pipe = new_pipe
        current_model_name = model_choice

    # perf_counter is monotonic, unlike wall-clock time, so the measured
    # interval cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()

    # Broad catch on purpose: this is the UI boundary and any inference or
    # formatting failure should surface as a status message, not a traceback.
    try:
        entities = pipe(text)
        latency = round((time.perf_counter() - start_time) * 1000, 1)

        if not entities:
            # Must be bound here as well: the return statement below reads it.
            formatted_result = []
            html_output = f"<p>Сущности не обнаружены{length_warning}</p>"
            stats = "Нет сущностей"
        else:
            formatted_result = [
                {
                    "Текст": entity["word"],
                    "Тип": entity["entity_group"],
                    # Convert to a built-in float before rounding so the
                    # value is a plain JSON number with clean digits.
                    "Уверенность": round(float(entity["score"]), 3),
                }
                for entity in entities
            ]
            html_output = _render_highlight_html(text, entities, length_warning)
            # Counter keeps first-seen order, so types are listed in the
            # order they first occur in the model output.
            counts = Counter(entity["entity_group"] for entity in entities)
            stats = " | ".join(
                f"<b>{html.escape(str(etype))}</b>: {count}"
                for etype, count in counts.items()
            )

        return (
            f"✅ Анализ завершен ({len(entities) if entities else 0} сущностей)",
            formatted_result,
            html_output,
            stats,
            f"{latency} мс",
        )
    except Exception as e:
        return f"❌ Ошибка при обработке: {str(e)}", None, None, None, None
|
|
|
|
|
|
|
|
def anonymize_text(text, entities_json):
    """Replace recognised entities in *text* with type placeholders.

    Args:
        text: The text to anonymise.
        entities_json: List of entity dicts with keys ``'Текст'`` (surface
            form) and ``'Тип'`` (entity type), plus an optional numeric
            ``'Позиция'`` used only for ordering.

    Returns:
        The text with the first remaining occurrence of each entity's
        surface form replaced by a placeholder; a hint message when *text*
        or *entities_json* is empty; *text* unchanged when *entities_json*
        is not a list.
    """
    if not text or not entities_json:
        return "Сначала выполните анализ текста"

    if not isinstance(entities_json, list):
        return text

    placeholders = {
        "PER": "[ЛИЦО]",
        "PERSON": "[ЛИЦО]",
        "ORG": "[ОРГАНИЗАЦИЯ]",
        "ORGANIZATION": "[ОРГАНИЗАЦИЯ]",
        "LOC": "[МЕСТО]",
        "LOCATION": "[МЕСТО]",
    }

    def position_key(entity):
        # Entries without a numeric position all get the same key, so the
        # stable sort keeps them in their incoming order; a single numeric
        # key type also avoids comparing str with int.
        position = entity.get("Позиция")
        return position if isinstance(position, (int, float)) else -1

    # Ignore anything that is not an entity record.
    records = [entity for entity in entities_json if isinstance(entity, dict)]

    result = text
    for entity in sorted(records, key=position_key, reverse=True):
        entity_text = entity.get("Текст", "")
        if not isinstance(entity_text, str) or not entity_text:
            # str.replace("", x, 1) would insert the placeholder at the very
            # start of the text, so empty surface forms must be skipped.
            continue
        etype = entity.get("Тип", "")
        replacement = placeholders.get(etype, f"[{etype}]")
        result = result.replace(entity_text, replacement, 1)

    return result
|
|
|
|
|
def create_example(text):
    """Analyse *text* with the default model.

    Returns the same 5-tuple as ``extract_entities``.
    """
    return extract_entities(text, DEFAULT_MODEL)
|
|
|
|
|
|
|
|
# Gradio UI: input controls on the left, results on the right, usage notes
# at the bottom.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from statement order — confirm it matches the intended layout.
with gr.Blocks(title="NER App", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🔍 Извлечение именованных сущностей (NER)
    **Распознавание персон, организаций, локаций и других сущностей в тексте**
    """)

    if pipe is None:
        # gr.Error is an exception class: merely constructing it shows
        # nothing. Render the warning as a visible banner instead.
        gr.Markdown("⚠️ Модель не загрузилась. Проверьте файл requirements.txt и логи Space.")

    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value=DEFAULT_MODEL,
                label="Модель для анализа",
                info="Для английского текста используйте bert-base-NER",
            )

            text_input = gr.Textbox(
                label="Введите текст",
                placeholder="Пример: Apple announced new iPhone at Steve Jobs Theater in Cupertino.",
                lines=6,
                max_length=MAX_CHARS,
            )

            with gr.Row():
                analyze_btn = gr.Button("Анализировать", variant="primary")
                clear_btn = gr.Button("Очистить")

            gr.Examples(
                examples=[
                    ["Apple CEO Tim Cook announced new products in California."],
                    ["Microsoft was founded by Bill Gates and Paul Allen in Albuquerque."],
                    ["Elon Musk is the CEO of Tesla and SpaceX, both based in the United States."],
                    ["The president of France will visit Berlin next month for EU summit."],
                ],
                inputs=text_input,
                label="Примеры текстов",
            )

            gr.Markdown(f"""
            **Ограничения:**
            - Макс. длина: {MAX_CHARS} символов
            - Только текст (без файлов)
            - CPU-режим (может быть медленно)
            """)

        with gr.Column(scale=2):
            status = gr.Textbox(label="Статус", interactive=False)

            with gr.Tab("📊 Результаты"):
                result_json = gr.JSON(label="Найденные сущности")

            with gr.Tab("🎨 Визуализация"):
                result_html = gr.HTML(label="Текст с выделением сущностей")

            with gr.Tab("📈 Статистика"):
                stats_output = gr.HTML(label="Статистика по типам")
                latency_output = gr.Textbox(label="Время обработки")

            with gr.Accordion("🛡️ Анонимизация", open=False):
                anonymized_output = gr.Textbox(
                    label="Анонимизированный текст",
                    lines=4,
                    interactive=False,
                )
                anonymize_btn = gr.Button("Анонимизировать текст")

    # The five outputs line up with the 5-tuple returned by extract_entities.
    analyze_btn.click(
        fn=extract_entities,
        inputs=[text_input, model_dropdown],
        outputs=[status, result_json, result_html, stats_output, latency_output],
    )

    # Reset the input and every result widget, including the latency field,
    # so no stale value from a previous run stays on screen.
    clear_btn.click(
        fn=lambda: ["", None, None, None, None, None, ""],
        outputs=[
            text_input,
            status,
            result_json,
            result_html,
            stats_output,
            latency_output,
            anonymized_output,
        ],
    )

    # Anonymisation reuses the entity list produced by the last analysis.
    anonymize_btn.click(
        fn=anonymize_text,
        inputs=[text_input, result_json],
        outputs=anonymized_output,
    )

    gr.Markdown("""
    ---
    ### 🎯 Как пользоваться:
    1. Введите текст в поле слева
    2. Нажмите **"Анализировать"**
    3. Смотрите результаты во вкладках

    ### 🏷️ Обозначения цветов:
    - <mark style="background-color: #FF6B6B; padding: 2px 6px; border-radius: 3px;">PER</mark> — Персона
    - <mark style="background-color: #4ECDC4; padding: 2px 6px; border-radius: 3px;">ORG</mark> — Организация
    - <mark style="background-color: #FFD166; padding: 2px 6px; border-radius: 3px;">LOC</mark> — Локация
    - <mark style="background-color: #06D6A0; padding: 2px 6px; border-radius: 3px;">MISC</mark> — Прочее

    ### ⚠️ Важно:
    - Модель `dslim/bert-base-NER` работает только с **английским** текстом
    - Для русского текста выберите другую модель в выпадающем списке
    - Большие тексты обрабатываются дольше
    """)
|
|
|
|
|
|
|
|
# Start the web server only when the file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    demo.launch(debug=False)