# /scripts/data_explorer.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # added: co-occurrence heatmap
import numpy as np  # added: co-occurrence matrix
from pathlib import Path
import json  # added: stats export


def analyze_subset(file_path, keywords_path, output_dir="analysis"):
    """Analyze subset data quality and distribution."""
    print(f"Analyzing: {file_path}")

    # Load data
    df = pd.read_csv(file_path)
    output_dir = Path(output_dir)

    # 1. Basic statistics (kept from the original)
    print(f"Total records: {len(df)}")
    df['text_length'] = df['clean_text'].str.len()  # moved here so later steps can reuse it
    print(f"Average text length: {df['text_length'].mean():.2f}")
    # 2. Keyword analysis (kept from the original)
    with open(keywords_path, 'r') as f:
        keywords = [line.strip() for line in f if line.strip()]

    keyword_stats = {}
    for keyword in keywords:
        # Match keywords as literal substrings (regex=False) and treat missing
        # text as a non-match (na=False); cast to int so the count is JSON-serializable.
        count = int(df['clean_text'].str.contains(keyword, case=False, regex=False, na=False).sum())
        keyword_stats[keyword] = count
        print(f"{keyword}: {count} records")
    # 3. Visualization
    output_path = output_dir / "plots"
    output_path.mkdir(parents=True, exist_ok=True)

    # 3.1 Keyword distribution plot (original)
    plt.figure(figsize=(15, 8))
    plt.bar(keyword_stats.keys(), keyword_stats.values())
    plt.xticks(rotation=45, ha='right')
    plt.title('Keyword Match Distribution')
    plt.xlabel('Keyword')
    plt.ylabel('Match Count')
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_distribution_emergency_subset.png", bbox_inches='tight')
    plt.close()
    # 3.2 Text length distribution (new)
    plt.figure(figsize=(10, 6))
    df['text_length'].hist(bins=50)
    plt.title('Text Length Distribution')
    plt.xlabel('Text Length')
    plt.ylabel('Frequency')
    plt.savefig(output_path / "text_length_dist.png", bbox_inches='tight')
    plt.close()
    # 3.3 Keyword co-occurrence analysis (new)
    # cooccurrence_matrix[i][j] counts records in which keywords i and j both appear.
    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
    keywords_lower = [k.lower() for k in keywords]
    for text in df['clean_text'].dropna():
        text_lower = text.lower()
        present = [i for i, k in enumerate(keywords_lower) if k in text_lower]
        for i in present:
            for j in present:
                if i != j:
                    cooccurrence_matrix[i][j] += 1

    plt.figure(figsize=(12, 8))
    sns.heatmap(cooccurrence_matrix,
                xticklabels=keywords,
                yticklabels=keywords,
                cmap='YlOrRd')
    plt.title('Keyword Co-occurrence Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # TODO: change the name of the file to the name of the subset
    plt.savefig(output_path / "keyword_cooccurrence_emergency_subset.png", bbox_inches='tight')
    plt.close()
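
    # NOTE: one possible way to resolve the TODOs in this file (an untested
    # sketch, assuming file_path points at something like "emergency_subset.csv"):
    #     subset_name = Path(file_path).stem  # -> "emergency_subset"
    #     plt.savefig(output_path / f"keyword_distribution_{subset_name}.png", ...)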
    # 4. Save statistics (extended from the original)
    stats_path = output_dir / "stats"
    stats_path.mkdir(parents=True, exist_ok=True)

    stats = {
        'basic_stats': {
            'total_records': len(df),
            'avg_text_length': float(df['text_length'].mean()),
            'text_length_summary': df['text_length'].describe().to_dict()
        },
        'keyword_stats': keyword_stats
    }
    # TODO: change the name of the file to the name of the subset
    with open(stats_path / "analysis_stats_emergency_subset.json", 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
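

# Example invocation (a minimal sketch; the paths below are assumptions based
# on the repo layout, not confirmed by the source).
if __name__ == "__main__":
    analyze_subset(
        file_path="dataset/emergency_subset.csv",        # hypothetical subset CSV
        keywords_path="dataset/keywords/emergency.txt",  # hypothetical keyword list
        output_dir="analysis",
    )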