Spaces:
Running
Running
# /scripts/data_explorer.py
"""Exploratory analysis of a sampled text subset (plots + JSON stats)."""
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
def analyze_subset(file_path, keywords_path, output_dir="analysis", subset_name=None):
    """Analyze the quality and distribution of a sampled data subset.

    Loads a CSV that must contain a ``clean_text`` column, reports basic
    statistics, counts keyword matches, renders three plots (keyword
    distribution bar chart, text-length histogram, keyword co-occurrence
    heatmap) under ``<output_dir>/plots``, and writes summary statistics
    as JSON under ``<output_dir>/stats``.

    Args:
        file_path: Path to the subset CSV file.
        keywords_path: Text file with one keyword per line; blank lines
            are ignored.
        output_dir: Root directory for generated artifacts.
        subset_name: Label embedded in output file names. Defaults to the
            stem of ``file_path``, resolving the former hard-coded
            "emergency_subset" TODOs while staying backward-compatible.
    """
    file_path = Path(file_path)
    if subset_name is None:
        subset_name = file_path.stem
    print(f"Analyzing: {file_path}")

    # Load data; replace missing text with "" so the vectorized string
    # ops and the per-row .lower() below never see NaN.
    df = pd.read_csv(file_path)
    df['clean_text'] = df['clean_text'].fillna('')
    output_dir = Path(output_dir)

    # 1. Basic statistics
    print(f"Total records: {len(df)}")
    df['text_length'] = df['clean_text'].str.len()
    print(f"Average text length: {df['text_length'].mean():.2f}")

    # 2. Keyword frequency analysis
    with open(keywords_path, 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]
    keyword_stats = {}
    for keyword in keywords:
        # regex=False: keywords are literal substrings, so characters
        # like '+' or '(' must not be treated as regex syntax.
        # int(...) converts numpy's int64 — which json.dump cannot
        # serialize — into a plain Python int.
        count = int(df['clean_text'].str.contains(keyword, case=False, regex=False).sum())
        keyword_stats[keyword] = count
        print(f"{keyword}: {count} records")

    # 3. Visualization
    plots_dir = output_dir / "plots"
    plots_dir.mkdir(parents=True, exist_ok=True)

    # 3.1 Keyword match distribution
    plt.figure(figsize=(15, 8))
    plt.bar(keyword_stats.keys(), keyword_stats.values())
    plt.xticks(rotation=45, ha='right')
    plt.title('Keyword match distribution')
    plt.xlabel('Keyword')
    plt.ylabel('Match count')
    plt.savefig(plots_dir / f"keyword_distribution_{subset_name}.png", bbox_inches='tight')
    plt.close()

    # 3.2 Text length distribution
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title('Text length distribution')
    plt.xlabel('Text length')
    plt.ylabel('Frequency')
    plt.savefig(plots_dir / "text_length_dist.png", bbox_inches='tight')
    plt.close()

    # 3.3 Keyword co-occurrence heatmap
    # Hoist the lowered keywords and an index map out of the row loop so
    # the scan avoids repeated list.index() lookups and per-keyword
    # text.lower() calls.
    index_of = {k: i for i, k in enumerate(keywords)}
    lowered = [(k, k.lower()) for k in keywords]
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    for text in df['clean_text']:
        text_lower = text.lower()
        present = [k for k, kl in lowered if kl in text_lower]
        for i, k1 in enumerate(present):
            for j, k2 in enumerate(present):
                if i != j:
                    cooccurrence_matrix[index_of[k1], index_of[k2]] += 1
    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix,
                xticklabels=keywords,
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title('Keyword co-occurrence heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(plots_dir / f"keyword_cooccurrence_{subset_name}.png", bbox_inches='tight')
    plt.close()

    # 4. Persist summary statistics
    stats_dir = output_dir / "stats"
    stats_dir.mkdir(parents=True, exist_ok=True)
    stats = {
        'basic_stats': {
            'total_records': len(df),
            'avg_text_length': float(df['text_length'].mean()),
            # float(...) keeps every describe() value JSON-serializable.
            'text_length_percentiles': {
                k: float(v) for k, v in df['text_length'].describe().to_dict().items()
            },
        },
        'keyword_stats': keyword_stats,
    }
    with open(stats_dir / f"analysis_stats_{subset_name}.json", 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)