shaun3141's picture
Migrate all data loading to use Hugging Face dataset directly
e3aec0d
"""Entity extraction utilities."""
import os
import json
import pandas as pd
from extract_entities import extract_entities_from_transcripts
from data.manager import ENTITIES_PATH
from data.loader import get_train_dataframe
def extract_entities_progress(
    progress=None,
    *,
    min_frequency=50,
    min_frequency_multiword=20,
    capitalization_threshold=0.7,
):
    """Extract Caribbean entities from training data with progress tracking.

    Loads the training dataframe, runs ``extract_entities_from_transcripts``
    on it, writes the result to ``ENTITIES_PATH`` as JSON and returns a
    Markdown summary together with the same JSON as a string.

    Args:
        progress: Optional callable invoked as ``progress(fraction, desc=...)``
            to report progress. ``None`` disables reporting.
        min_frequency: Forwarded to ``extract_entities_from_transcripts`` and
            recorded in the saved ``extraction_params``.
        min_frequency_multiword: Forwarded and recorded likewise.
        capitalization_threshold: Forwarded and recorded likewise.

    Returns:
        A ``(summary, json_text)`` tuple of strings. On failure the first
        element is an error message and the second is ``"{}"``; this function
        does not raise for ordinary errors.
    """

    def report(fraction, desc):
        # Compare against None instead of relying on truthiness: a callable
        # object that defines __len__/__bool__ may not be safely truth-tested.
        if progress is not None:
            progress(fraction, desc=desc)

    # Single source of truth for the thresholds, so the values passed to the
    # extractor and the values recorded in the output file cannot diverge.
    extraction_params = {
        'min_frequency': min_frequency,
        'min_frequency_multiword': min_frequency_multiword,
        'capitalization_threshold': capitalization_threshold,
    }

    try:
        report(0, "Starting entity extraction...")
        report(0.2, "Loading training data from dataset...")

        try:
            train_df = get_train_dataframe()
        except ValueError as e:
            # Loader problems are reported without a traceback.
            return f"❌ {e}", "{}"

        report(0.4, f"Analyzing {len(train_df):,} transcripts...")

        # Run extraction
        entities = extract_entities_from_transcripts(
            train_df,
            verbose=False,  # Suppress prints in Gradio app
            **extraction_params,
        )

        report(0.9, "Saving entities...")

        # Sort once, then partition; filtering a sorted list keeps it sorted.
        entities_list = sorted(entities)
        single_word = [e for e in entities_list if ' ' not in e]
        multi_word = [e for e in entities_list if ' ' in e]

        output_data = {
            'entities': entities_list,
            'single_word_entities': single_word,
            'multi_word_entities': multi_word,
            'count': len(entities_list),
            'count_single_word': len(single_word),
            'count_multi_word': len(multi_word),
            'extraction_params': extraction_params,
        }

        # Save to JSON; explicit encoding keeps the file independent of the
        # platform's default locale.
        with open(ENTITIES_PATH, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2)

        report(1.0, "Complete!")

        # NOTE: the lists are sorted alphabetically, so "Top 15" here means
        # the first 15 in alphabetical order, not the 15 most frequent.
        top_single = single_word[:15]
        top_multi = multi_word[:15]

        summary = "\n".join([
            "",
            "## ✅ Entity Extraction Complete",
            f"**Total Entities:** {len(entities_list)} "
            f"({len(single_word)} single-word + {len(multi_word)} multi-word)",
            "**Top 15 Single-Word Entities:**",
            ', '.join(top_single) if top_single else 'None',
            "**Top 15 Multi-Word Entities:**",
            ', '.join(top_multi) if top_multi else 'None',
            f"**Saved to:** `{ENTITIES_PATH}`",
            "",
        ])
        return summary, json.dumps(output_data, indent=2)
    except Exception as e:
        # UI boundary: surface the failure (with traceback) to the caller
        # as text instead of letting it propagate.
        import traceback
        error_msg = f"❌ Error: {e}\n\n{traceback.format_exc()}"
        return error_msg, "{}"