import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Dict
import numpy as np
from tqdm import tqdm
import evaluate
from datasets import load_dataset
import matplotlib.pyplot as plt


class CounselorBenchmark:
    def __init__(self, base_model_path: str, finetuned_model_path: str):
        """
        Initialize the benchmark suite for counselor models.
        """
        self.base_model_path = base_model_path
        self.finetuned_model_path = finetuned_model_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Load reference-based evaluation metrics (kept available for optional
        # scoring; the heuristic metrics below do not depend on them)
        self.bleu = evaluate.load("sacrebleu")
        self.rouge = evaluate.load("rouge")
        self.bertscore = evaluate.load("bertscore")

    def load_models(self):
        """Load both base and fine-tuned models for comparison."""
        # Load base model
        print("Loading base model...")
        self.base_tokenizer = AutoTokenizer.from_pretrained(self.base_model_path)
        self.base_model = AutoModelForCausalLM.from_pretrained(
            self.base_model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        # Load fine-tuned model
        print("Loading fine-tuned model...")
        self.ft_tokenizer = AutoTokenizer.from_pretrained(self.finetuned_model_path)
        self.ft_model = AutoModelForCausalLM.from_pretrained(
            self.finetuned_model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
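        # Note: both models stay resident at once for side-by-side comparison;
        # with device_map="auto" they are sharded across the available GPUs,
        # so plan memory for two bf16 copies.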

    def generate_response(self, model, tokenizer, prompt: str, max_new_tokens: int = 256) -> str:
        """Generate a response from the given model."""
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                repetition_penalty=1.1
            )
        # Decode only the newly generated tokens: slicing the decoded string by
        # len(prompt) is unreliable because detokenization does not always
        # reproduce the prompt verbatim
        generated = outputs[0][inputs["input_ids"].shape[1]:]
        return tokenizer.decode(generated, skip_special_tokens=True).strip()
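
    # Usage sketch for generate_response (assumes load_models() has been
    # called; the prompt below is illustrative of the '### Response:' format
    # used in the test data):
    #   reply = benchmark.generate_response(
    #       benchmark.ft_model, benchmark.ft_tokenizer,
    #       "### Input:\n最近眠れません。\n\n### Response:\n")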

    def evaluate_empathy_score(self, response: str) -> float:
        """
        Evaluate empathy in a counselor response.
        Custom metric based on Japanese counseling keywords.
        """
        empathy_keywords = [
            'わかります', '理解', '共感', '気持ち', '感じ',
            'つらい', '大変', 'お察し', '心配', '支援'
        ]
        score = sum(1 for keyword in empathy_keywords if keyword in response)
        return min(score / len(empathy_keywords), 1.0)
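
    # Worked example: a response containing 「お気持ちはわかります。おつらいですね。」
    # matches 'わかります', '気持ち', and 'つらい', so the score is 3 / 10 = 0.3.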

    def evaluate_response_quality(self, response: str) -> Dict[str, float]:
        """
        Comprehensive response quality evaluation.
        """
        metrics = {}
        # Length appropriateness (not too short, not too long); character
        # counts are a reasonable length proxy for Japanese text
        response_length = len(response)
        if 50 <= response_length <= 300:
            metrics['length_score'] = 1.0
        elif response_length < 50:
            metrics['length_score'] = response_length / 50
        else:
            metrics['length_score'] = max(0, 1 - (response_length - 300) / 500)
        # Question engagement (does the counselor ask clarifying questions?);
        # check both half-width and full-width question marks
        metrics['question_score'] = 1.0 if ('?' in response or '?' in response) else 0.0
        # Supportive language
        support_phrases = ['大丈夫', '一緒に', '支援', 'サポート', '助け']
        metrics['support_score'] = sum(1 for phrase in support_phrases if phrase in response) / len(support_phrases)
        # Empathy score
        metrics['empathy_score'] = self.evaluate_empathy_score(response)
        return metrics
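
    # Example output shape (values are illustrative):
    #   {'length_score': 1.0, 'question_score': 1.0,
    #    'support_score': 0.2, 'empathy_score': 0.3}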

    def benchmark_on_test_set(self, test_data_path: str, num_samples: int = 100):
        """
        Run a comprehensive benchmark on the test set.
        """
        # Load test data
        test_dataset = load_dataset('json', data_files=test_data_path, split='train')
        test_samples = test_dataset.select(range(min(num_samples, len(test_dataset))))
        results = {
            'base_model': {'responses': [], 'metrics': []},
            'finetuned_model': {'responses': [], 'metrics': []}
        }
        print(f"Evaluating on {len(test_samples)} test samples...")
        for sample in tqdm(test_samples):
            # Everything up to and including the marker is the prompt; the
            # remainder is the reference response (currently unused by the
            # heuristic metrics)
            prompt = sample['text'].split('### Response:')[0] + '### Response:'
            reference = sample['text'].split('### Response:')[1].strip() if '### Response:' in sample['text'] else ""
            # Generate responses
            base_response = self.generate_response(self.base_model, self.base_tokenizer, prompt)
            ft_response = self.generate_response(self.ft_model, self.ft_tokenizer, prompt)
            # Store responses
            results['base_model']['responses'].append(base_response)
            results['finetuned_model']['responses'].append(ft_response)
            # Evaluate quality
            base_metrics = self.evaluate_response_quality(base_response)
            ft_metrics = self.evaluate_response_quality(ft_response)
            results['base_model']['metrics'].append(base_metrics)
            results['finetuned_model']['metrics'].append(ft_metrics)
        return results
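
    # Each JSONL line is expected to carry a 'text' field containing the
    # '### Response:' marker, e.g. (illustrative):
    #   {"text": "### Input:\n眠れません。\n\n### Response:\nおつらいですね。..."}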

    def calculate_aggregate_metrics(self, results: Dict) -> Dict:
        """Calculate aggregate metrics for comparison."""
        aggregate = {}
        for model_name in ['base_model', 'finetuned_model']:
            model_metrics = results[model_name]['metrics']
            aggregate[model_name] = {}
            # Calculate summary statistics for each metric
            metric_names = model_metrics[0].keys() if model_metrics else []
            for metric in metric_names:
                values = [m[metric] for m in model_metrics]
                aggregate[model_name][metric] = {
                    'mean': np.mean(values),
                    'std': np.std(values),
                    'min': np.min(values),
                    'max': np.max(values)
                }
        return aggregate
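
    # Resulting structure (values illustrative):
    #   {'base_model': {'empathy_score': {'mean': 0.21, 'std': 0.08,
    #                                     'min': 0.0, 'max': 0.5}, ...},
    #    'finetuned_model': {...}}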

    def generate_comparison_report(self, results: Dict, aggregate: Dict):
        """Generate a detailed comparison report."""
        report = []
        report.append("=" * 80)
        report.append("COUNSELOR MODEL BENCHMARK REPORT")
        report.append("=" * 80)
        report.append("")
        # Overall performance comparison
        report.append("PERFORMANCE COMPARISON:")
        report.append("-" * 40)
        for metric in aggregate['base_model'].keys():
            base_score = aggregate['base_model'][metric]['mean']
            ft_score = aggregate['finetuned_model'][metric]['mean']
            improvement = ((ft_score - base_score) / base_score * 100) if base_score > 0 else 0
            report.append(f"\n{metric.upper()}:")
            report.append(f"  Base Model:       {base_score:.3f} (±{aggregate['base_model'][metric]['std']:.3f})")
            report.append(f"  Fine-tuned Model: {ft_score:.3f} (±{aggregate['finetuned_model'][metric]['std']:.3f})")
            report.append(f"  Improvement:      {improvement:+.1f}%")
        # Calculate overall score
        base_overall = np.mean([aggregate['base_model'][m]['mean'] for m in aggregate['base_model']])
        ft_overall = np.mean([aggregate['finetuned_model'][m]['mean'] for m in aggregate['finetuned_model']])
        overall_improvement = ((ft_overall - base_overall) / base_overall * 100) if base_overall > 0 else 0
        report.append("\n" + "=" * 40)
        report.append("OVERALL PERFORMANCE:")
        report.append(f"  Base Model:          {base_overall:.3f}")
        report.append(f"  Fine-tuned Model:    {ft_overall:.3f}")
        report.append(f"  Overall Improvement: {overall_improvement:+.1f}%")
        report.append("=" * 40)
        return "\n".join(report)

    def visualize_results(self, aggregate: Dict):
        """Create a visualization of benchmark results."""
        # Prepare data for plotting
        metrics = list(aggregate['base_model'].keys())
        base_scores = [aggregate['base_model'][m]['mean'] for m in metrics]
        ft_scores = [aggregate['finetuned_model'][m]['mean'] for m in metrics]
        # Create comparison plot
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        # Bar plot comparison
        x = np.arange(len(metrics))
        width = 0.35
        ax1.bar(x - width/2, base_scores, width, label='Base Model', color='lightblue')
        ax1.bar(x + width/2, ft_scores, width, label='Fine-tuned Model', color='darkblue')
        ax1.set_xlabel('Metrics')
        ax1.set_ylabel('Score')
        ax1.set_title('Model Performance Comparison')
        ax1.set_xticks(x)
        ax1.set_xticklabels(metrics, rotation=45, ha='right')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        # Improvement percentage plot
        improvements = [((ft - base) / base * 100) if base > 0 else 0
                        for base, ft in zip(base_scores, ft_scores)]
        colors = ['green' if imp > 0 else 'red' for imp in improvements]
        ax2.bar(metrics, improvements, color=colors, alpha=0.7)
        ax2.set_xlabel('Metrics')
        ax2.set_ylabel('Improvement (%)')
        ax2.set_title('Fine-tuning Improvement over Base Model')
        ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
        # Set tick positions explicitly before relabeling to avoid a
        # matplotlib warning
        ax2.set_xticks(x)
        ax2.set_xticklabels(metrics, rotation=45, ha='right')
        ax2.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig('benchmark_results.png', dpi=300, bbox_inches='tight')
        plt.show()
        print("Visualization saved as 'benchmark_results.png'")


# Run benchmarking
if __name__ == "__main__":
    # Initialize benchmark
    benchmark = CounselorBenchmark(
        base_model_path="./models/LFM2-2.6B",
        finetuned_model_path="./merged_counselor_mode_2b"
    )
    # Load models
    benchmark.load_models()
    # Run benchmark
    print("Running benchmark evaluation...")
    results = benchmark.benchmark_on_test_set("./processed_data_score80/test.jsonl", num_samples=100)
    # Calculate aggregate metrics
    aggregate = benchmark.calculate_aggregate_metrics(results)
    # Generate report
    report = benchmark.generate_comparison_report(results, aggregate)
    print(report)
    # Save report (utf-8 so the ± symbols survive on any platform)
    with open("benchmark_report_2b.txt", "w", encoding="utf-8") as f:
        f.write(report)
    # Visualize results
    benchmark.visualize_results(aggregate)
    print("\nBenchmarking completed! Check 'benchmark_report_2b.txt' for detailed results.")

####################
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel, PeftConfig
# import numpy as np
# from typing import List, Dict, Tuple, Optional
# import json
# from tqdm import tqdm
# import os
# import gc
# import warnings
# from datetime import datetime
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
# from rouge_score import rouge_scorer
# import nltk
# from collections import defaultdict
#
# # Download required NLTK data
# try:
#     nltk.download('punkt', quiet=True)
# except Exception:
#     pass
#
# warnings.filterwarnings('ignore')
#
# class AdvancedCounselorBenchmark:
#     def __init__(self,
#                  base_model_name: str = "LiquidAI/LFM2-1.2B",
#                  finetuned_model_path: str = "./counselor_model/best_model",
#                  merged_model_path: str = "./merged_counselor_model",
#                  test_data_path: str = "./processed_data_score70/test.jsonl",
#                  device: str = None):
#         """
#         Initialize advanced benchmark suite with BLEU and ROUGE metrics
#
#         Args:
#             base_model_name: Name/path of base model
#             finetuned_model_path: Path to fine-tuned LoRA adapter
#             merged_model_path: Path to save/load merged model
#             test_data_path: Path to test dataset with reference responses
#             device: Device to run on (cuda/cpu)
#         """
#         self.base_model_name = base_model_name
#         self.finetuned_model_path = finetuned_model_path
#         self.merged_model_path = merged_model_path
#         self.test_data_path = test_data_path
#         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
#         print("🔧 Initializing Advanced Benchmark Suite")
#         print(f"   Device: {self.device}")
#         if self.device == "cuda":
#             print(f"   GPU: {torch.cuda.get_device_name(0)}")
#             print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
#         # Initialize the ROUGE scorer with a character-level tokenizer:
#         # rouge_score has no 'lang' argument, and its default tokenizer drops
#         # non-ASCII characters, which would zero out Japanese text
#         class _JaCharTokenizer:
#             def tokenize(self, text):
#                 return [ch for ch in text if not ch.isspace()]
#         self.rouge_scorer = rouge_scorer.RougeScorer(
#             ['rouge1', 'rouge2', 'rougeL'],
#             use_stemmer=False,  # stemming does not apply to Japanese
#             tokenizer=_JaCharTokenizer()
#         )
#         # Smoothing function for BLEU scores
#         self.smoothing = SmoothingFunction().method1
#         self.results = {}
#
#     def load_test_data(self) -> List[Dict]:
#         """Load test dataset with reference responses"""
#         print(f"\n📚 Loading test data from {self.test_data_path}")
#         test_data = []
#         if os.path.exists(self.test_data_path):
#             with open(self.test_data_path, 'r', encoding='utf-8') as f:
#                 for line in f:
#                     data = json.loads(line)
#                     test_data.append(data)
#             print(f"   Loaded {len(test_data)} test examples")
#         else:
#             print("⚠️ Test data not found. Creating synthetic test data...")
#             test_data = self.create_synthetic_test_data()
#         return test_data
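#
#     # Each JSONL record is a dict; the 'input' and 'reference' fields are
#     # used directly when present, otherwise they are recovered from the
#     # '### Input:' / '### Response:' markers in 'text'
#     # (see run_bleu_rouge_benchmark below).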
#
#     def create_synthetic_test_data(self) -> List[Dict]:
#         """Create synthetic test data if real data is not available"""
#         synthetic_data = [
#             {
#                 "text": "### Input:\n最近ストレスを感じています。\n\n### Response:\nストレスを感じているのですね。それは大変つらいことだと思います。どのような状況でストレスを感じることが多いですか?お話を聞かせていただければ、一緒に対処法を考えることができます。",
#                 "input": "最近ストレスを感じています。",
#                 "reference": "ストレスを感じているのですね。それは大変つらいことだと思います。どのような状況でストレスを感じることが多いですか?お話を聞かせていただければ、一緒に対処法を考えることができます。"
#             },
#             {
#                 "text": "### Input:\n仕事がうまくいかなくて悩んでいます。\n\n### Response:\n仕事でお悩みなのですね。うまくいかないと感じると、本当に辛いですよね。具体的にどのような点で困難を感じていらっしゃいますか?一緒に整理してみましょう。",
#                 "input": "仕事がうまくいかなくて悩んでいます。",
#                 "reference": "仕事でお悩みなのですね。うまくいかないと感じると、本当に辛いですよね。具体的にどのような点で困難を感じていらっしゃいますか?一緒に整理してみましょう。"
#             },
#             {
#                 "text": "### Input:\n人間関係で困っています。\n\n### Response:\n人間関係の悩みは本当に心が疲れますよね。お気持ちお察しします。どのような関係性でお困りでしょうか?職場、家族、友人関係など、もう少し詳しくお聞かせいただけますか?",
#                 "input": "人間関係で困っています。",
#                 "reference": "人間関係の悩みは本当に心が疲れますよね。お気持ちお察しします。どのような関係性でお困りでしょうか?職場、家族、友人関係など、もう少し詳しくお聞かせいただけますか?"
#             },
#             {
#                 "text": "### Input:\n将来が不安です。\n\n### Response:\n将来への不安を抱えていらっしゃるのですね。先が見えない不安は、とても重く感じられることと思います。特にどのような点について不安を感じていらっしゃいますか?",
#                 "input": "将来が不安です。",
#                 "reference": "将来への不安を抱えていらっしゃるのですね。先が見えない不安は、とても重く感じられることと思います。特にどのような点について不安を感じていらっしゃいますか?"
#             },
#             {
#                 "text": "### Input:\n自信が持てません。\n\n### Response:\n自信が持てないというお気持ち、よくわかります。多くの方が同じような悩みを抱えています。どのような場面で特に自信が持てないと感じますか?あなたの強みも一緒に見つけていきましょう。",
#                 "input": "自信が持てません。",
#                 "reference": "自信が持てないというお気持ち、よくわかります。多くの方が同じような悩みを抱えています。どのような場面で特に自信が持てないと感じますか?あなたの強みも一緒に見つけていきましょう。"
#             }
#         ]
#         return synthetic_data
#
#     def merge_and_save_model(self, force_merge: bool = False):
#         """Merge LoRA weights with base model and save"""
#         if os.path.exists(self.merged_model_path) and not force_merge:
#             print(f"✅ Merged model already exists at {self.merged_model_path}")
#             return
#         print("\n🔄 Merging LoRA adapter with base model...")
#         try:
#             # Load base model
#             print("   Loading base model...")
#             base_model = AutoModelForCausalLM.from_pretrained(
#                 self.base_model_name,
#                 torch_dtype=torch.float16,
#                 device_map="auto" if self.device == "cuda" else None,
#                 trust_remote_code=True,
#                 low_cpu_mem_usage=True
#             )
#             # Check if adapter exists
#             adapter_config_path = os.path.join(self.finetuned_model_path, "adapter_config.json")
#             if not os.path.exists(adapter_config_path):
#                 print(f"⚠️ No LoRA adapter found at {self.finetuned_model_path}")
#                 model = base_model
#             else:
#                 # Load LoRA adapter
#                 print("   Loading LoRA adapter...")
#                 model = PeftModel.from_pretrained(
#                     base_model,
#                     self.finetuned_model_path,
#                     torch_dtype=torch.float16
#                 )
#                 # Merge weights
#                 print("   Merging weights...")
#                 model = model.merge_and_unload()
#             # Save merged model
#             print(f"   Saving merged model to {self.merged_model_path}...")
#             model.save_pretrained(self.merged_model_path)
#             # Save tokenizer
#             tokenizer = AutoTokenizer.from_pretrained(
#                 self.finetuned_model_path
#                 if os.path.exists(os.path.join(self.finetuned_model_path, "tokenizer_config.json"))
#                 else self.base_model_name
#             )
#             tokenizer.save_pretrained(self.merged_model_path)
#             print("✅ Model merged and saved successfully!")
#             # Clean up memory
#             del base_model, model
#             gc.collect()
#             torch.cuda.empty_cache()
#         except Exception as e:
#             print(f"❌ Error during merging: {e}")
#             raise
#
#     def load_models(self):
#         """Load base and fine-tuned models for comparison"""
#         print("\n📚 Loading models for benchmarking...")
#         # Load tokenizer
#         self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
#         if self.tokenizer.pad_token is None:
#             self.tokenizer.pad_token = self.tokenizer.eos_token
#         # Load base model
#         print("   Loading base model...")
#         self.base_model = AutoModelForCausalLM.from_pretrained(
#             self.base_model_name,
#             torch_dtype=torch.float16,
#             device_map="auto" if self.device == "cuda" else None,
#             trust_remote_code=True,
#             low_cpu_mem_usage=True
#         )
#         self.base_model.eval()
#         # Load merged fine-tuned model
#         if os.path.exists(self.merged_model_path):
#             print("   Loading merged fine-tuned model...")
#             self.finetuned_model = AutoModelForCausalLM.from_pretrained(
#                 self.merged_model_path,
#                 torch_dtype=torch.float16,
#                 device_map="auto" if self.device == "cuda" else None,
#                 trust_remote_code=True,
#                 low_cpu_mem_usage=True
#             )
#         else:
#             print("   Loading fine-tuned model (attempting PEFT)...")
#             try:
#                 base_for_peft = AutoModelForCausalLM.from_pretrained(
#                     self.base_model_name,
#                     torch_dtype=torch.float16,
#                     device_map="auto" if self.device == "cuda" else None,
#                     trust_remote_code=True,
#                     low_cpu_mem_usage=True
#                 )
#                 self.finetuned_model = PeftModel.from_pretrained(
#                     base_for_peft,
#                     self.finetuned_model_path,
#                     torch_dtype=torch.float16
#                 )
#             except Exception:
#                 # Fall back to loading the path as a full standalone model
#                 self.finetuned_model = AutoModelForCausalLM.from_pretrained(
#                     self.finetuned_model_path,
#                     torch_dtype=torch.float16,
#                     device_map="auto" if self.device == "cuda" else None,
#                     trust_remote_code=True,
#                     low_cpu_mem_usage=True
#                 )
#         self.finetuned_model.eval()
#         print("✅ Models loaded successfully!")
#
#     def generate_response(self, model, prompt: str, max_length: int = 150) -> str:
#         """Generate response from model"""
#         inputs = self.tokenizer(
#             prompt,
#             return_tensors="pt",
#             truncation=True,
#             max_length=512
#         )
#         if self.device == "cuda":
#             inputs = {k: v.cuda() for k, v in inputs.items()}
#         with torch.no_grad():
#             outputs = model.generate(
#                 **inputs,
#                 max_new_tokens=max_length,
#                 temperature=0.7,
#                 do_sample=True,
#                 top_p=0.9,
#                 pad_token_id=self.tokenizer.pad_token_id,
#                 eos_token_id=self.tokenizer.eos_token_id
#             )
#         response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
#         # Extract only the generated response
#         if "### Response:" in response:
#             response = response.split("### Response:")[-1].strip()
#         elif "Response:" in response:
#             response = response.split("Response:")[-1].strip()
#         else:
#             # Remove the input prompt; note this slice assumes detokenization
#             # reproduces the prompt verbatim, which is not always true
#             response = response[len(prompt):].strip()
#         return response
#
#     def tokenize_japanese(self, text: str) -> List[str]:
#         """Tokenize Japanese text for BLEU calculation"""
#         # Simple character-based tokenization for Japanese
#         # In production, use MeCab or similar for better tokenization
#         import re
#         # Remove special characters and split
#         text = re.sub(r'[。、!?\n]', ' ', text)
#         tokens = text.strip().split()
#         # Character-level tokenization as fallback
#         if not tokens:
#             tokens = list(text.strip())
#         return tokens
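#
#     # Note: since Japanese is written without spaces, the split above mostly
#     # yields clause-level chunks. A hedged alternative using the fugashi
#     # MeCab wrapper, if installed:
#     #     from fugashi import Tagger
#     #     tokens = [word.surface for word in Tagger()(text)]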
#
#     def calculate_bleu_scores(self, reference: str, hypothesis: str) -> Dict[str, float]:
#         """Calculate BLEU-1, BLEU-2, BLEU-3, BLEU-4 scores"""
#         # Tokenize texts
#         ref_tokens = self.tokenize_japanese(reference)
#         hyp_tokens = self.tokenize_japanese(hypothesis)
#         # Calculate BLEU scores with different n-grams
#         scores = {}
#         # BLEU-1 (unigram)
#         scores['BLEU-1'] = sentence_bleu(
#             [ref_tokens], hyp_tokens,
#             weights=(1.0, 0, 0, 0),
#             smoothing_function=self.smoothing
#         )
#         # BLEU-2 (bigram)
#         scores['BLEU-2'] = sentence_bleu(
#             [ref_tokens], hyp_tokens,
#             weights=(0.5, 0.5, 0, 0),
#             smoothing_function=self.smoothing
#         )
#         # BLEU-3 (trigram)
#         scores['BLEU-3'] = sentence_bleu(
#             [ref_tokens], hyp_tokens,
#             weights=(0.33, 0.33, 0.34, 0),
#             smoothing_function=self.smoothing
#         )
#         # BLEU-4 (4-gram)
#         scores['BLEU-4'] = sentence_bleu(
#             [ref_tokens], hyp_tokens,
#             weights=(0.25, 0.25, 0.25, 0.25),
#             smoothing_function=self.smoothing
#         )
#         return scores
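#
#     # Note: the weight tuples set the n-gram mix; e.g. BLEU-2 averages
#     # unigram and bigram log-precisions equally (0.5, 0.5), while BLEU-4
#     # weights all four n-gram orders at 0.25 each.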
#
#     def calculate_rouge_scores(self, reference: str, hypothesis: str) -> Dict[str, float]:
#         """Calculate ROUGE-1, ROUGE-2, ROUGE-L scores"""
#         scores = self.rouge_scorer.score(reference, hypothesis)
#         return {
#             'ROUGE-1': scores['rouge1'].fmeasure,
#             'ROUGE-2': scores['rouge2'].fmeasure,
#             'ROUGE-L': scores['rougeL'].fmeasure
#         }
#
#     def run_bleu_rouge_benchmark(self, num_samples: int = None):
#         """Run comprehensive BLEU and ROUGE benchmark"""
#         print("\n" + "="*70)
#         print("🏃 RUNNING BLEU & ROUGE BENCHMARK")
#         print("="*70)
#         # Load test data
#         test_data = self.load_test_data()
#         if num_samples:
#             test_data = test_data[:num_samples]
#             print(f"   Using {num_samples} samples for benchmarking")
#         # Initialize score collectors
#         base_scores = defaultdict(list)
#         finetuned_scores = defaultdict(list)
#         # Metrics to calculate
#         metrics = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4',
#                    'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
#         print(f"\n📊 Evaluating {len(test_data)} test examples...")
#         print("-" * 70)
#         detailed_results = []
#         for i, example in enumerate(tqdm(test_data, desc="Evaluating")):
#             # Extract input and reference
#             if 'input' in example:
#                 input_text = example['input']
#             else:
#                 # Try to extract from the text field
#                 if "### Input:" in example['text']:
#                     input_text = example['text'].split("### Input:")[1].split("### Response:")[0].strip()
#                 else:
#                     input_text = example['text'].split("\n")[0].strip()
#             if 'reference' in example:
#                 reference = example['reference']
#             else:
#                 # Try to extract from the text field
#                 if "### Response:" in example['text']:
#                     reference = example['text'].split("### Response:")[1].strip()
#                 else:
#                     parts = example['text'].split("\n")
#                     reference = parts[1] if len(parts) > 1 else parts[0]
#             # Format input for models
#             formatted_input = f"### Instruction:\nあなたは思いやりのある心理カウンセラーです。\n\n### Input:\n{input_text}\n\n### Response:\n"
#             # Generate responses
#             base_response = self.generate_response(self.base_model, formatted_input)
#             finetuned_response = self.generate_response(self.finetuned_model, formatted_input)
#             # Calculate BLEU scores
#             base_bleu = self.calculate_bleu_scores(reference, base_response)
#             finetuned_bleu = self.calculate_bleu_scores(reference, finetuned_response)
#             # Calculate ROUGE scores
#             base_rouge = self.calculate_rouge_scores(reference, base_response)
#             finetuned_rouge = self.calculate_rouge_scores(reference, finetuned_response)
#             # Combine scores
#             base_all_scores = {**base_bleu, **base_rouge}
#             finetuned_all_scores = {**finetuned_bleu, **finetuned_rouge}
#             # Collect scores
#             for metric in metrics:
#                 base_scores[metric].append(base_all_scores[metric])
#                 finetuned_scores[metric].append(finetuned_all_scores[metric])
#             # Store detailed results
#             detailed_results.append({
#                 'input': input_text,
#                 'reference': reference,
#                 'base_response': base_response,
#                 'finetuned_response': finetuned_response,
#                 'base_scores': base_all_scores,
#                 'finetuned_scores': finetuned_all_scores
#             })
#             # Print sample results
#             if i < 3:  # Show first 3 examples
#                 print(f"\n📝 Example {i+1}:")
#                 print(f"   Input: {input_text[:50]}...")
#                 print(f"   Reference: {reference[:50]}...")
#                 print(f"   Base response: {base_response[:50]}...")
#                 print(f"   Fine-tuned response: {finetuned_response[:50]}...")
#                 print(f"   Base BLEU-4: {base_bleu['BLEU-4']:.3f}")
#                 print(f"   Fine-tuned BLEU-4: {finetuned_bleu['BLEU-4']:.3f}")
#         # Calculate aggregate statistics
#         print("\n" + "="*70)
#         print("📈 BENCHMARK RESULTS")
#         print("="*70)
#         self.results = {
#             'detailed_results': detailed_results,
#             'aggregate_scores': {},
#             'improvements': {}
#         }
#         # Print and store results
#         print("\n" + "-"*70)
#         print(f"{'Metric':<12} {'Base Model':<20} {'Fine-tuned Model':<20} {'Improvement':<15}")
#         print("-"*70)
#         for metric in metrics:
#             base_mean = np.mean(base_scores[metric])
#             base_std = np.std(base_scores[metric])
#             finetuned_mean = np.mean(finetuned_scores[metric])
#             finetuned_std = np.std(finetuned_scores[metric])
#             # Calculate improvement
#             if base_mean > 0:
#                 improvement = ((finetuned_mean - base_mean) / base_mean) * 100
#             else:
#                 improvement = 0
#             # Store results
#             self.results['aggregate_scores'][metric] = {
#                 'base_mean': base_mean,
#                 'base_std': base_std,
#                 'finetuned_mean': finetuned_mean,
#                 'finetuned_std': finetuned_std
#             }
#             self.results['improvements'][metric] = improvement
#             # Print results
#             base_str = f"{base_mean:.3f} (±{base_std:.3f})"
#             finetuned_str = f"{finetuned_mean:.3f} (±{finetuned_std:.3f})"
#             imp_str = f"{improvement:+.1f}%"
#             # Color code improvement
#             if improvement > 0:
#                 imp_str = f"✅ {imp_str}"
#             elif improvement < 0:
#                 imp_str = f"⚠️ {imp_str}"
#             else:
#                 imp_str = f"➖ {imp_str}"
#             print(f"{metric:<12} {base_str:<20} {finetuned_str:<20} {imp_str:<15}")
#         # Calculate overall scores
#         print("\n" + "="*70)
#         print("🎯 OVERALL PERFORMANCE")
#         print("="*70)
#         # Average BLEU score
#         bleu_metrics = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']
#         base_bleu_avg = np.mean([np.mean(base_scores[m]) for m in bleu_metrics])
#         finetuned_bleu_avg = np.mean([np.mean(finetuned_scores[m]) for m in bleu_metrics])
#         bleu_improvement = ((finetuned_bleu_avg - base_bleu_avg) / base_bleu_avg) * 100 if base_bleu_avg > 0 else 0
#         # Average ROUGE score
#         rouge_metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
#         base_rouge_avg = np.mean([np.mean(base_scores[m]) for m in rouge_metrics])
#         finetuned_rouge_avg = np.mean([np.mean(finetuned_scores[m]) for m in rouge_metrics])
#         rouge_improvement = ((finetuned_rouge_avg - base_rouge_avg) / base_rouge_avg) * 100 if base_rouge_avg > 0 else 0
#         # Overall average
#         base_overall = np.mean([np.mean(base_scores[m]) for m in metrics])
#         finetuned_overall = np.mean([np.mean(finetuned_scores[m]) for m in metrics])
#         overall_improvement = ((finetuned_overall - base_overall) / base_overall) * 100 if base_overall > 0 else 0
#         self.results['summary'] = {
#             'bleu_average': {
#                 'base': base_bleu_avg,
#                 'finetuned': finetuned_bleu_avg,
#                 'improvement': bleu_improvement
#             },
#             'rouge_average': {
#                 'base': base_rouge_avg,
#                 'finetuned': finetuned_rouge_avg,
#                 'improvement': rouge_improvement
#             },
#             'overall': {
#                 'base': base_overall,
#                 'finetuned': finetuned_overall,
#                 'improvement': overall_improvement
#             }
#         }
#         print("\n📊 Average BLEU Score:")
#         print(f"   Base Model: {base_bleu_avg:.3f}")
#         print(f"   Fine-tuned Model: {finetuned_bleu_avg:.3f}")
#         print(f"   Improvement: {bleu_improvement:+.1f}%")
#         print("\n📊 Average ROUGE Score:")
#         print(f"   Base Model: {base_rouge_avg:.3f}")
#         print(f"   Fine-tuned Model: {finetuned_rouge_avg:.3f}")
#         print(f"   Improvement: {rouge_improvement:+.1f}%")
#         print("\n🎯 Overall Average:")
#         print(f"   Base Model: {base_overall:.3f}")
#         print(f"   Fine-tuned Model: {finetuned_overall:.3f}")
#         print(f"   Improvement: {overall_improvement:+.1f}%")
#         print("="*70)
#         return self.results
#
#     def visualize_results(self, save_path: str = "bleu_rouge_benchmark.png"):
#         """Create comprehensive visualization of BLEU and ROUGE results"""
#         if 'aggregate_scores' not in self.results:
#             print("❌ No results to visualize. Run benchmark first.")
#             return
#         print("\n📊 Creating visualizations...")
#         fig, axes = plt.subplots(2, 3, figsize=(18, 12))
#         # Color scheme
#         base_color = '#3498db'
#         finetuned_color = '#e74c3c'
#         improvement_positive = '#27ae60'
#         improvement_negative = '#c0392b'
#         # 1. BLEU Scores Comparison
#         bleu_metrics = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']
#         bleu_base = [self.results['aggregate_scores'][m]['base_mean'] for m in bleu_metrics]
#         bleu_finetuned = [self.results['aggregate_scores'][m]['finetuned_mean'] for m in bleu_metrics]
#         x = np.arange(len(bleu_metrics))
#         width = 0.35
#         axes[0, 0].bar(x - width/2, bleu_base, width, label='Base Model',
#                        color=base_color, alpha=0.8)
#         axes[0, 0].bar(x + width/2, bleu_finetuned, width, label='Fine-tuned Model',
#                        color=finetuned_color, alpha=0.8)
#         axes[0, 0].set_xlabel('BLEU Metrics')
#         axes[0, 0].set_ylabel('Score')
#         axes[0, 0].set_title('BLEU Score Comparison')
#         axes[0, 0].set_xticks(x)
#         axes[0, 0].set_xticklabels(bleu_metrics)
#         axes[0, 0].legend()
#         axes[0, 0].grid(True, alpha=0.3)
#         axes[0, 0].set_ylim([0, max(max(bleu_base), max(bleu_finetuned)) * 1.2])
#         # 2. ROUGE Scores Comparison
#         rouge_metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
#         rouge_base = [self.results['aggregate_scores'][m]['base_mean'] for m in rouge_metrics]
#         rouge_finetuned = [self.results['aggregate_scores'][m]['finetuned_mean'] for m in rouge_metrics]
#         x = np.arange(len(rouge_metrics))
#         axes[0, 1].bar(x - width/2, rouge_base, width, label='Base Model',
#                        color=base_color, alpha=0.8)
#         axes[0, 1].bar(x + width/2, rouge_finetuned, width, label='Fine-tuned Model',
#                        color=finetuned_color, alpha=0.8)
#         axes[0, 1].set_xlabel('ROUGE Metrics')
#         axes[0, 1].set_ylabel('Score')
#         axes[0, 1].set_title('ROUGE Score Comparison')
#         axes[0, 1].set_xticks(x)
#         axes[0, 1].set_xticklabels(rouge_metrics)
#         axes[0, 1].legend()
#         axes[0, 1].grid(True, alpha=0.3)
#         axes[0, 1].set_ylim([0, max(max(rouge_base), max(rouge_finetuned)) * 1.2])
#         # 3. Improvement Percentages
#         all_metrics = bleu_metrics + rouge_metrics
#         improvements = [self.results['improvements'][m] for m in all_metrics]
#         colors = [improvement_positive if imp > 0 else improvement_negative for imp in improvements]
#         axes[0, 2].barh(range(len(all_metrics)), improvements, color=colors, alpha=0.7)
#         axes[0, 2].set_yticks(range(len(all_metrics)))
#         axes[0, 2].set_yticklabels(all_metrics)
#         axes[0, 2].set_xlabel('Improvement (%)')
#         axes[0, 2].set_title('Performance Improvement by Metric')
#         axes[0, 2].axvline(x=0, color='black', linestyle='-', linewidth=0.5)
#         axes[0, 2].grid(True, alpha=0.3, axis='x')
#         # 4. Line plot showing progression
#         axes[1, 0].plot(bleu_metrics, bleu_base, 'o-', label='Base Model',
#                         color=base_color, linewidth=2, markersize=8)
#         axes[1, 0].plot(bleu_metrics, bleu_finetuned, 's-', label='Fine-tuned Model',
#                         color=finetuned_color, linewidth=2, markersize=8)
#         axes[1, 0].set_xlabel('BLEU N-gram')
#         axes[1, 0].set_ylabel('Score')
#         axes[1, 0].set_title('BLEU Score Progression')
#         axes[1, 0].legend()
#         axes[1, 0].grid(True, alpha=0.3)
#         # 5. Summary Statistics
#         ax5 = axes[1, 1]
#         ax5.axis('off')
#         summary_text = f"""
# BENCHMARK SUMMARY
# {'='*30}
# BLEU Average:
#   Base: {self.results['summary']['bleu_average']['base']:.3f}
#   Fine-tuned: {self.results['summary']['bleu_average']['finetuned']:.3f}
#   Improvement: {self.results['summary']['bleu_average']['improvement']:+.1f}%
# ROUGE Average:
#   Base: {self.results['summary']['rouge_average']['base']:.3f}
#   Fine-tuned: {self.results['summary']['rouge_average']['finetuned']:.3f}
#   Improvement: {self.results['summary']['rouge_average']['improvement']:+.1f}%
# Overall Performance:
#   Base: {self.results['summary']['overall']['base']:.3f}
#   Fine-tuned: {self.results['summary']['overall']['finetuned']:.3f}
#   Improvement: {self.results['summary']['overall']['improvement']:+.1f}%
# Best Improvements:
# """
#         # Find best improvements
#         sorted_metrics = sorted(all_metrics,
#                                 key=lambda m: self.results['improvements'][m],
#                                 reverse=True)
#         for m in sorted_metrics[:2]:
#             summary_text += f"  • {m}: {self.results['improvements'][m]:+.1f}%\n"
#         if any(self.results['improvements'][m] < 0 for m in all_metrics):
#             summary_text += "\nNeeds Attention:\n"
#             for m in sorted_metrics[-2:]:
#                 if self.results['improvements'][m] < 0:
#                     summary_text += f"  • {m}: {self.results['improvements'][m]:+.1f}%\n"
#         ax5.text(0.1, 0.9, summary_text, transform=ax5.transAxes,
#                  fontsize=10, verticalalignment='top', fontfamily='monospace')
#         # 6. Heatmap of all scores
#         metrics_for_heatmap = all_metrics
#         models = ['Base', 'Fine-tuned']
#         heatmap_data = []
#         for metric in metrics_for_heatmap:
#             heatmap_data.append([
#                 self.results['aggregate_scores'][metric]['base_mean'],
#                 self.results['aggregate_scores'][metric]['finetuned_mean']
#             ])
#         im = axes[1, 2].imshow(heatmap_data, cmap='YlOrRd', aspect='auto')
#         axes[1, 2].set_xticks(np.arange(len(models)))
#         axes[1, 2].set_yticks(np.arange(len(metrics_for_heatmap)))
#         axes[1, 2].set_xticklabels(models)
#         axes[1, 2].set_yticklabels(metrics_for_heatmap)
#         axes[1, 2].set_title('Score Heatmap')
#         # Add text annotations
#         for i in range(len(metrics_for_heatmap)):
#             for j in range(len(models)):
#                 axes[1, 2].text(j, i, f'{heatmap_data[i][j]:.3f}',
#                                 ha="center", va="center", color="black", fontsize=8)
#         plt.colorbar(im, ax=axes[1, 2])
#         plt.suptitle('BLEU & ROUGE Benchmark Results', fontsize=16, fontweight='bold')
#         plt.tight_layout()
#         plt.savefig(save_path, dpi=300, bbox_inches='tight')
#         print(f"✅ Visualization saved to {save_path}")
#         plt.show()
#
#     def save_results(self, output_path: str = "bleu_rouge_results.json"):
#         """Save benchmark results to JSON"""
#         # Convert numpy types to Python native types for JSON serialization
#         def convert_to_native(obj):
#             if isinstance(obj, np.floating):
#                 return float(obj)
#             elif isinstance(obj, np.integer):
#                 return int(obj)
#             elif isinstance(obj, np.ndarray):
#                 return obj.tolist()
#             elif isinstance(obj, dict):
#                 return {k: convert_to_native(v) for k, v in obj.items()}
#             elif isinstance(obj, list):
#                 return [convert_to_native(item) for item in obj]
#             return obj
#         results_native = convert_to_native(self.results)
#         with open(output_path, 'w', encoding='utf-8') as f:
#             json.dump(results_native, f, ensure_ascii=False, indent=2)
#         print(f"✅ Results saved to {output_path}")
#
#     def generate_detailed_report(self, output_path: str = "bleu_rouge_report.md"):
#         """Generate detailed markdown report"""
#         if not self.results:
#             print("❌ No results to report. Run benchmark first.")
#             return
#         report = f"""# BLEU & ROUGE Benchmark Report
#
# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
#
# ## Executive Summary
#
# Comprehensive evaluation of the fine-tuned counseling model using BLEU and ROUGE metrics.
#
# ### Overall Performance
# - **Base Model Score**: {self.results['summary']['overall']['base']:.3f}
# - **Fine-tuned Model Score**: {self.results['summary']['overall']['finetuned']:.3f}
# - **Overall Improvement**: {self.results['summary']['overall']['improvement']:+.1f}%
#
# ## Detailed Metrics
#
# ### BLEU Scores
#
# | Metric | Base Model | Fine-tuned Model | Improvement |
# |--------|------------|------------------|-------------|
# """
#         for metric in ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']:
#             scores = self.results['aggregate_scores'][metric]
#             report += f"| {metric} | {scores['base_mean']:.3f} (±{scores['base_std']:.3f}) | "
#             report += f"{scores['finetuned_mean']:.3f} (±{scores['finetuned_std']:.3f}) | "
#             report += f"{self.results['improvements'][metric]:+.1f}% |\n"
#         report += f"""
# **BLEU Average**: {self.results['summary']['bleu_average']['improvement']:+.1f}% improvement
#
# ### ROUGE Scores
#
# | Metric | Base Model | Fine-tuned Model | Improvement |
# |--------|------------|------------------|-------------|
# """
#         for metric in ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']:
#             scores = self.results['aggregate_scores'][metric]
#             report += f"| {metric} | {scores['base_mean']:.3f} (±{scores['base_std']:.3f}) | "
#             report += f"{scores['finetuned_mean']:.3f} (±{scores['finetuned_std']:.3f}) | "
#             report += f"{self.results['improvements'][metric]:+.1f}% |\n"
#         report += f"""
# **ROUGE Average**: {self.results['summary']['rouge_average']['improvement']:+.1f}% improvement
#
# ## Sample Outputs
# """
#         # Add sample outputs
#         for i, result in enumerate(self.results['detailed_results'][:3]):
#             report += f"""### Example {i+1}
#
# **Input**: {result['input']}
#
# **Reference**: {result['reference'][:200]}...
#
# **Base Model Response**: {result['base_response'][:200]}...
#
# **Fine-tuned Model Response**: {result['finetuned_response'][:200]}...
#
# **Scores**:
# - Base BLEU-4: {result['base_scores']['BLEU-4']:.3f}, ROUGE-L: {result['base_scores']['ROUGE-L']:.3f}
# - Fine-tuned BLEU-4: {result['finetuned_scores']['BLEU-4']:.3f}, ROUGE-L: {result['finetuned_scores']['ROUGE-L']:.3f}
#
# ---
# """
#         report += """## Analysis & Recommendations
# """
#         overall_imp = self.results['summary']['overall']['improvement']
#         if overall_imp < -10:
#             report += """### ⚠️ Significant Performance Degradation
#
# The fine-tuned model shows significant degradation in BLEU/ROUGE scores. This indicates:
# 1. **Catastrophic Forgetting**: The model has lost its language generation capabilities
# 2. **Overfitting**: The model memorized training data instead of learning patterns
# 3. **Format Mismatch**: Training and inference formats may differ
#
# **Immediate Actions Required**:
# - ✅ Ensure proper model merging (LoRA weights with base model)
# - ✅ Reduce the learning rate (try 1e-5 or 2e-5)
# - ✅ Use a smaller LoRA rank (r=4 or r=8)
# - ✅ Mix general conversation data with counseling data (80/20 ratio)
# - ✅ Implement regularization (weight decay=0.1, dropout=0.1)
# - ✅ Use early stopping with patience=3
# """
#         elif overall_imp < 0:
#             report += """### ⚠️ Minor Performance Degradation
#
# The model shows slight degradation. Common causes:
# 1. **Aggressive Fine-tuning**: Parameters changed too much
# 2. **Limited Training Data**: Not enough diverse examples
# 3. **Domain Shift**: The counseling domain is too different from base training
#
# **Recommended Actions**:
# - ✅ Fine-tune for fewer epochs (1-2 instead of 3)
# - ✅ Use gradient accumulation for a larger effective batch size
# - ✅ Implement knowledge distillation from the base model
# - ✅ Add more diverse training examples
# """
#         elif overall_imp < 10:
#             report += """### 📊 Modest Improvement
#
# The model shows small but positive improvements.
#
# **To Further Improve**:
# - ✅ Increase training data quality and quantity
# - ✅ Experiment with different generation parameters
# - ✅ Fine-tune on domain-specific pre-training data
# - ✅ Use ensemble methods with the base model
# """
#         else:
#             report += """### ✅ Significant Improvement
#
# Excellent results! The fine-tuned model shows substantial improvements.
#
# **Next Steps**:
# - ✅ Deploy for A/B testing with users
# - ✅ Monitor performance on edge cases
# - ✅ Consider model compression for deployment
# - ✅ Collect user feedback for iterative improvement
# """
#         with open(output_path, 'w', encoding='utf-8') as f:
#             f.write(report)
#         print(f"✅ Detailed report saved to {output_path}")
#
# # Main execution
# if __name__ == "__main__":
#     import argparse
#     parser = argparse.ArgumentParser(description='Advanced BLEU & ROUGE Benchmark')
#     parser.add_argument('--base_model', type=str, default='LiquidAI/LFM2-2.6B',
#                         help='Base model name')
#     parser.add_argument('--finetuned_path', type=str, default='./counselor_model/best_model',
#                         help='Path to fine-tuned model')
#     parser.add_argument('--merged_path', type=str, default='./merged_counselor_mode_2b',
#                         help='Path to save/load merged model')
#     parser.add_argument('--test_data', type=str, default='./processed_data_score80/test.jsonl',
#                         help='Path to test data')
#     parser.add_argument('--num_samples', type=int, default=None,
#                         help='Number of samples to evaluate (None for all)')
#     parser.add_argument('--force_merge', action='store_true',
#                         help='Force re-merge even if merged model exists')
#     parser.add_argument('--skip_merge', action='store_true',
#                         help='Skip merging step')
#     parser.add_argument('--output_dir', type=str, default='./benchmark_results',
#                         help='Directory to save results')
#     args = parser.parse_args()
#     # Create output directory
#     os.makedirs(args.output_dir, exist_ok=True)
#     try:
#         # Initialize benchmark
#         print("🚀 Initializing Advanced BLEU & ROUGE Benchmark")
#         benchmark = AdvancedCounselorBenchmark(
#             base_model_name=args.base_model,
#             finetuned_model_path=args.finetuned_path,
#             merged_model_path=args.merged_path,
#             test_data_path=args.test_data
#         )
#         # Merge models if needed
#         if not args.skip_merge:
#             benchmark.merge_and_save_model(force_merge=args.force_merge)
#         # Load models
#         benchmark.load_models()
#         # Run BLEU & ROUGE benchmark
#         results = benchmark.run_bleu_rouge_benchmark(num_samples=args.num_samples)
#         # Save results
#         benchmark.save_results(os.path.join(args.output_dir, "bleu_rouge_results_2b.json"))
#         # Generate visualizations
#         benchmark.visualize_results(os.path.join(args.output_dir, "bleu_rouge_visualization_2b.png"))
#         # Generate detailed report
#         benchmark.generate_detailed_report(os.path.join(args.output_dir, "bleu_rouge_report_2b.md"))
#         print("\n✅ BLEU & ROUGE Benchmarking completed successfully!")
#         print(f"📁 Results saved to {args.output_dir}/")
#     except Exception as e:
#         print(f"\n❌ Error during benchmarking: {e}")
#         import traceback
#         traceback.print_exc()