| |
| import json |
| import os |
| from datasets import Dataset, Features, Value |
| import pandas as pd |
| from pathlib import Path |
|
|
def parse_kokorochat_with_context(json_file_path, context_window=4, max_history_tokens=1500):
    """
    Parse one KokoroChat dialogue file into (client, counselor) training
    pairs, each carrying a window of preceding conversation history.

    Args:
        json_file_path: Path to a KokoroChat dialogue JSON file.
        context_window: Number of previous turns to include as history
            (default: 4 = two client/counselor exchanges).
        max_history_tokens: Approximate token budget for the history; pairs
            whose history exceeds it are skipped to avoid overlong sequences.

    Returns:
        (conversations, total_score): a list of example dicts and the
        client's overall review score for the dialogue (0 if absent).
        Returns ([], 0) when the file cannot be read or parsed.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError, UnicodeDecodeError):
        # Best-effort: unreadable or corrupt files contribute nothing.
        return [], 0

    conversations = []
    dialogue = data.get('dialogue', [])

    # Overall quality score from the client's English-language review
    # (0-100 scale; compared against min_score by the caller).
    review_en = data.get('review_by_client_en', {})
    total_score = review_en.get('score', 0)

    # Topic labels: English main topic plus the raw sub-topic string.
    topic = data.get('topic', {})
    main_topic = topic.get('main_en', '')
    sub_topic = topic.get('sub', '')

    for i in range(len(dialogue) - 1):
        current = dialogue[i]
        next_turn = dialogue[i + 1]

        # Only adjacent client -> counselor exchanges become examples.
        # .get() skips malformed turns (missing keys) instead of raising,
        # so one bad turn no longer discards the whole file upstream.
        if current.get('role') == 'client' and next_turn.get('role') == 'counselor':
            client_msg = current.get('utterance', '').strip()
            counselor_msg = next_turn.get('utterance', '').strip()

            # Drop trivially short utterances (single acknowledgements etc.).
            if len(client_msg) > 5 and len(counselor_msg) > 5:
                start_idx = max(0, i - context_window)
                history = dialogue[start_idx:i]

                # Rough length gate assuming ~3 characters per token;
                # prevents overlong training sequences.
                history_text = ''.join(h.get('utterance', '') for h in history)
                if len(history_text) < max_history_tokens * 3:
                    conversations.append({
                        'history': history,
                        'client': client_msg,
                        'counselor': counselor_msg,
                        'quality_score': total_score,
                        'topic_main': main_topic,
                        'topic_sub': sub_topic,
                        'dialogue_id': Path(json_file_path).stem
                    })

    return conversations, total_score
|
|
def format_conversation_for_lfm2(conversation):
    """
    Render one conversation example (history plus the final client/counselor
    exchange) as a single LFM2 ChatML-formatted training string.
    """
    role_tags = {'client': 'user', 'counselor': 'assistant'}

    parts = [
        "<|im_start|>system\n"
        "あなたは経験豊富な心理カウンセラーです。クライアントの話を傾聴し、共感的で支援的な応答をしてください。<|im_end|>\n"
    ]

    # Replay prior turns, mapping dataset roles onto ChatML roles; turns
    # with any other role are skipped.
    for turn in conversation['history']:
        tag = role_tags.get(turn['role'])
        if tag is not None:
            parts.append(f"<|im_start|>{tag}\n{turn['utterance']}<|im_end|>\n")

    # The target exchange: client message, then the counselor reply to learn.
    parts.append(f"<|im_start|>user\n{conversation['client']}<|im_end|>\n")
    parts.append(f"<|im_start|>assistant\n{conversation['counselor']}<|im_end|><|endoftext|>")

    return ''.join(parts)
|
|
def create_training_dataset_multiturn(
    data_dir="./KokoroChat/data",
    min_score=70,
    context_window=4
):
    """
    Build, split, and save a multi-turn counseling dataset for LFM2 training.

    Recursively scans `data_dir` for dialogue JSON files, keeps dialogues
    whose client review score is at least `min_score`, formats each example
    with conversation context, and saves a 90/10 train/test split to
    ./kokorochat_processed_multiturn.

    Args:
        data_dir: Directory containing JSON files (searched recursively).
        min_score: Minimum quality score (0-100, recommend 85 for top quality).
        context_window: Number of previous turns to include as history.

    Returns:
        The split DatasetDict, or None when no usable conversations exist.
    """
    json_files = list(Path(data_dir).rglob("*.json"))
    print(f"Found {len(json_files)} JSON files")

    all_conversations = []
    score_distribution = []

    print("\nProcessing files with multi-turn context...")
    for idx, json_file in enumerate(json_files):
        if idx % 1000 == 0:
            print(f"Processed {idx}/{len(json_files)} files...")

        try:
            convs, score = parse_kokorochat_with_context(
                json_file,
                context_window=context_window
            )
            score_distribution.append(score)

            if score >= min_score:
                all_conversations.extend(convs)
        except Exception:
            # Best-effort: one malformed file must not abort the whole run.
            continue

    print(f"\n=== Processing Results ===")
    print(f"High-quality files (>= {min_score}): {sum(1 for s in score_distribution if s >= min_score)}")
    print(f"Total conversation examples: {len(all_conversations)}")

    if len(all_conversations) == 0:
        print(f"❌ No conversations found! Try lowering min_score (current: {min_score})")
        return None

    # Render each example into ChatML training text plus metadata columns
    # used for later filtering/analysis.
    formatted_data = [
        {
            'text': format_conversation_for_lfm2(conv),
            'quality_score': conv['quality_score'],
            'topic_main': conv['topic_main'],
            'topic_sub': conv['topic_sub'],
            'has_context': len(conv['history']) > 0
        }
        for conv in all_conversations
    ]

    # Explicit schema so Arrow does not have to infer column types.
    features = Features({
        'text': Value('string'),
        'quality_score': Value('int64'),
        'topic_main': Value('string'),
        'topic_sub': Value('string'),
        'has_context': Value('bool')
    })

    df = pd.DataFrame(formatted_data)
    dataset = Dataset.from_pandas(df, features=features)
    dataset = dataset.train_test_split(test_size=0.1, seed=42)

    print(f"\n=== Final Dataset ===")
    print(f"Training samples: {len(dataset['train'])}")
    print(f"Validation samples: {len(dataset['test'])}")
    print(f"Examples with context: {sum(df['has_context'])}")

    dataset.save_to_disk("./kokorochat_processed_multiturn")
    print("\n✅ Multi-turn dataset saved to ./kokorochat_processed_multiturn")

    # Show one training example, truncated for readability.  Clamp the
    # index so tiny datasets (< 6 train rows) don't raise IndexError.
    print("\n=== Sample Training Example (with context) ===")
    sample_idx = min(5, len(dataset['train']) - 1)
    sample = dataset['train'][sample_idx]['text']
    print(sample[:1000] + "\n..." if len(sample) > 1000 else sample)

    return dataset
|
|
if __name__ == "__main__":
    # Build the dataset from the KokoroChat dialogue dump.  min_score=60
    # trades some quality for a larger training corpus.
    dataset = create_training_dataset_multiturn(
        data_dir="./KokoroChat/kokorochat_dialogues",
        min_score=60,
        context_window=4,
    )
|
|