|
|
"""Entity extraction utilities.""" |
|
|
import os |
|
|
import json |
|
|
import pandas as pd |
|
|
from extract_entities import extract_entities_from_transcripts |
|
|
from data.manager import ENTITIES_PATH |
|
|
from data.loader import get_train_dataframe |
|
|
|
|
|
|
|
|
def _notify(progress, fraction, desc):
    """Forward one progress update to *progress* if a callback was supplied."""
    # Compare against None rather than relying on truthiness: a callback
    # object that defines __len__ or __bool__ may evaluate as falsy (or
    # raise) even though it is perfectly callable.
    if progress is not None:
        progress(fraction, desc=desc)


def _build_entities_payload(entities, extraction_params):
    """Build the JSON-serialisable result dict for the extracted entities."""
    # Materialise once: `entities` may be any iterable, and the partitions
    # below must see the same items as the full list.
    entities_list = sorted(entities)
    # Filtering an already-sorted list keeps each partition sorted.
    single_word = [e for e in entities_list if ' ' not in e]
    multi_word = [e for e in entities_list if ' ' in e]
    return {
        'entities': entities_list,
        'single_word_entities': single_word,
        'multi_word_entities': multi_word,
        'count': len(entities_list),
        'count_single_word': len(single_word),
        'count_multi_word': len(multi_word),
        'extraction_params': dict(extraction_params),
    }


def extract_entities_progress(progress=None):
    """Extract Caribbean entities from training data with progress tracking.

    Loads the training dataframe, runs ``extract_entities_from_transcripts``
    on it, writes the result as JSON to ``ENTITIES_PATH`` and returns a
    Markdown summary together with the same JSON as a string.

    Args:
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            with ``fraction`` going from 0 to 1.0. ``None`` disables
            progress reporting.

    Returns:
        A ``(summary, json_text)`` tuple. On success ``summary`` is a Markdown
        report and ``json_text`` is the serialised payload. On failure
        ``summary`` is an error message (including a traceback for unexpected
        errors) and ``json_text`` is ``"{}"``. Exceptions are not propagated.
    """
    # Single source of truth for the thresholds: the same values are passed
    # to the extractor and recorded in the saved file, so they cannot drift.
    extraction_params = {
        'min_frequency': 50,
        'min_frequency_multiword': 20,
        'capitalization_threshold': 0.7,
    }

    try:
        _notify(progress, 0, "Starting entity extraction...")
        _notify(progress, 0.2, "Loading training data from dataset...")

        try:
            train_df = get_train_dataframe()
        except ValueError as e:
            # A ValueError from the loader is reported as-is, without a
            # traceback. (\u274c is U+274C CROSS MARK.)
            return f"\u274c {e}", "{}"

        _notify(progress, 0.4, f"Analyzing {len(train_df):,} transcripts...")

        entities = extract_entities_from_transcripts(
            train_df,
            verbose=False,
            **extraction_params,
        )

        _notify(progress, 0.9, "Saving entities...")

        output_data = _build_entities_payload(entities, extraction_params)
        # Serialise once; the identical text is written to disk and returned.
        output_json = json.dumps(output_data, indent=2)

        with open(ENTITIES_PATH, 'w', encoding="utf-8") as f:
            f.write(output_json)

        _notify(progress, 1.0, "Complete!")

        # The lists are sorted alphabetically, so these are the first 15
        # entries in sort order, not the 15 most frequent.
        top_single = output_data['single_word_entities'][:15]
        top_multi = output_data['multi_word_entities'][:15]

        # \u2705 is U+2705 WHITE HEAVY CHECK MARK.
        summary = "\n".join([
            "",
            "## \u2705 Entity Extraction Complete",
            "",
            f"**Total Entities:** {output_data['count']} "
            f"({output_data['count_single_word']} single-word + "
            f"{output_data['count_multi_word']} multi-word)",
            "",
            "**Top 15 Single-Word Entities:**",
            ', '.join(top_single) if top_single else 'None',
            "",
            "**Top 15 Multi-Word Entities:**",
            ', '.join(top_multi) if top_multi else 'None',
            "",
            f"**Saved to:** `{ENTITIES_PATH}`",
            "",
        ])

        return summary, output_json

    except Exception as e:
        # Top-level boundary: callers expect a (message, json) tuple rather
        # than an exception, so report the failure with its traceback.
        import traceback
        error_msg = f"\u274c Error: {e}\n\n{traceback.format_exc()}"
        return error_msg, "{}"
|
|
|
|
|
|