#!/usr/bin/env python3
"""
OnCall.ai System - RAG vs Direct Latency Comparison Chart Generator
==================================================================

Compares RAG and Direct LLM system latency performance.
Reads statistics from latency_statistics_*.json and direct_llm_statistics_*.json

No LLM calls - pure data visualization.

Author: YanBo Chen  
Date: 2025-08-05
"""

import json
import os
from typing import Dict, Tuple
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
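
# Note: on headless machines (CI, remote servers) matplotlib may need a
# non-interactive backend selected before the pyplot import above; a minimal
# sketch, assuming the standard Agg backend is available:
#
#   import matplotlib
#   matplotlib.use("Agg")  # render charts to files only; no display required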


class RAGvsDirectLatencyChartGenerator:
    """Generate RAG vs Direct latency comparison charts"""
    
    def __init__(self):
        """Initialize chart generator"""
        print("πŸ“ˆ Initializing RAG vs Direct Latency Chart Generator...")
        
        # Set up professional chart style
        plt.style.use('default')
        sns.set_palette("husl")
        
        # Define system colors
        self.system_colors = {
            'rag': '#1f77b4',      # Blue
            'direct': '#ff7f0e'    # Orange
        }
        
        print("βœ… Chart Generator ready with professional medical styling")
    
    def find_latest_statistics_files(self) -> Tuple[str, str]:
        """
        Find the most recent RAG and Direct statistics files
        
        Returns:
            Tuple of (rag_file_path, direct_file_path)
        """
        results_dir = Path(__file__).parent / "results"
        
        # Find RAG statistics file
        rag_pattern = str(results_dir / "latency_statistics_*.json")
        rag_files = glob.glob(rag_pattern)
        
        if not rag_files:
            raise FileNotFoundError(f"No RAG latency statistics files found with pattern: {rag_pattern}")
        
        latest_rag_file = max(rag_files, key=os.path.getmtime)
        
        # Find Direct statistics file
        direct_pattern = str(results_dir / "direct_llm_statistics_*.json")
        direct_files = glob.glob(direct_pattern)
        
        if not direct_files:
            raise FileNotFoundError(f"No Direct LLM statistics files found with pattern: {direct_pattern}")
        
        latest_direct_file = max(direct_files, key=os.path.getmtime)
        
        print(f"πŸ“Š Found RAG statistics: {latest_rag_file}")
        print(f"πŸ“Š Found Direct statistics: {latest_direct_file}")
        
        return latest_rag_file, latest_direct_file
    
    def load_statistics(self, rag_file: str, direct_file: str) -> Tuple[Dict, Dict]:
        """
        Load statistics from both files
        
        Args:
            rag_file: Path to RAG statistics file
            direct_file: Path to Direct statistics file
            
        Returns:
            Tuple of (rag_stats, direct_stats)
        """
        print(f"πŸ“ Loading RAG statistics from: {rag_file}")
        with open(rag_file, 'r', encoding='utf-8') as f:
            rag_stats = json.load(f)
        
        print(f"πŸ“ Loading Direct statistics from: {direct_file}")
        with open(direct_file, 'r', encoding='utf-8') as f:
            direct_stats = json.load(f)
        
        return rag_stats, direct_stats
    
    def generate_comparison_charts(self, rag_stats: Dict, direct_stats: Dict) -> str:
        """
        Generate comprehensive RAG vs Direct latency comparison charts
        
        Creates 4-panel comparison:
        1. Category-wise latency comparison
        2. Overall performance comparison
        3. Target compliance comparison
        4. Success rate comparison
        
        Returns:
            Path to the saved chart PNG, or an empty string on failure
        """
        try:
            # Create figure with subplots
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('RAG vs Direct LLM - Latency Performance Comparison', 
                        fontsize=16, fontweight='bold')
            
            # Chart 1: Category-wise Latency Comparison
            ax1 = axes[0, 0]
            categories = ['diagnosis', 'treatment', 'mixed']
            rag_latencies = []
            direct_latencies = []
            
            for category in categories:
                rag_cat = rag_stats['category_results'].get(category, {})
                direct_cat = direct_stats['category_results'].get(category, {})
                
                rag_latencies.append(rag_cat.get('average_latency', 0))
                direct_latencies.append(direct_cat.get('average_latency', 0))
            
            x = np.arange(len(categories))
            width = 0.35
            
            bars1 = ax1.bar(x - width/2, rag_latencies, width, label='RAG', 
                           color=self.system_colors['rag'], alpha=0.8)
            bars2 = ax1.bar(x + width/2, direct_latencies, width, label='Direct LLM', 
                           color=self.system_colors['direct'], alpha=0.8)
            
            ax1.set_title('Latency by Category', fontweight='bold')
            ax1.set_ylabel('Average Latency (seconds)')
            ax1.set_xlabel('Query Category')
            ax1.set_xticks(x)
            ax1.set_xticklabels([cat.capitalize() for cat in categories])
            ax1.grid(True, alpha=0.3)
            
            # Add target line (the single legend call below includes it)
            ax1.axhline(y=60.0, color='red', linestyle='--', alpha=0.7, label='60s Target')
            ax1.legend()
            
            # Add value labels on bars
            for bars in [bars1, bars2]:
                for bar in bars:
                    height = bar.get_height()
                    if height > 0:
                        ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                                f'{height:.1f}s', ha='center', va='bottom', fontsize=9)
            
            # Chart 2: Overall Performance Comparison
            ax2 = axes[0, 1]
            
            systems = ['RAG', 'Direct LLM']
            overall_latencies = [
                rag_stats['overall_results']['average_latency'],
                direct_stats['overall_results']['average_latency']
            ]
            
            bars = ax2.bar(systems, overall_latencies, 
                          color=[self.system_colors['rag'], self.system_colors['direct']], 
                          alpha=0.8)
            
            ax2.set_title('Overall Average Latency', fontweight='bold')
            ax2.set_ylabel('Average Latency (seconds)')
            ax2.grid(True, alpha=0.3)
            
            # Add target line
            ax2.axhline(y=60.0, color='red', linestyle='--', alpha=0.7, label='60s Target')
            ax2.legend()
            
            # Add value labels
            for bar, value in zip(bars, overall_latencies):
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 1,
                        f'{value:.1f}s', ha='center', va='bottom', fontweight='bold')
            
            # Chart 3: Target Compliance Comparison
            ax3 = axes[1, 0]
            
            rag_compliance = rag_stats['overall_results']['target_compliance'] * 100
            direct_compliance = direct_stats['overall_results']['target_compliance'] * 100
            
            compliance_data = [rag_compliance, direct_compliance]
            
            bars = ax3.bar(systems, compliance_data, 
                          color=[self.system_colors['rag'], self.system_colors['direct']], 
                          alpha=0.8)
            
            ax3.set_title('60s Target Compliance Rate', fontweight='bold')
            ax3.set_ylabel('Compliance Rate (%)')
            ax3.set_ylim(0, 105)
            ax3.grid(True, alpha=0.3)
            
            # Add target line at 100%
            ax3.axhline(y=100.0, color='green', linestyle='--', alpha=0.7, label='100% Target')
            ax3.legend()
            
            # Add percentage labels
            for bar, value in zip(bars, compliance_data):
                height = bar.get_height()
                ax3.text(bar.get_x() + bar.get_width()/2., height + 1,
                        f'{value:.1f}%', ha='center', va='bottom', fontweight='bold')
            
            # Chart 4: Success Rate Comparison
            ax4 = axes[1, 1]
            
            rag_success_rate = rag_stats['overall_results']['successful_queries'] / rag_stats['overall_results']['total_queries'] * 100
            direct_success_rate = direct_stats['overall_results']['successful_queries'] / direct_stats['overall_results']['total_queries'] * 100
            
            success_data = [rag_success_rate, direct_success_rate]
            
            bars = ax4.bar(systems, success_data, 
                          color=[self.system_colors['rag'], self.system_colors['direct']], 
                          alpha=0.8)
            
            ax4.set_title('Query Success Rate', fontweight='bold')
            ax4.set_ylabel('Success Rate (%)')
            ax4.set_ylim(0, 105)
            ax4.grid(True, alpha=0.3)
            
            # Add target line at 100%
            ax4.axhline(y=100.0, color='green', linestyle='--', alpha=0.7, label='100% Target')
            ax4.legend()
            
            # Add percentage labels
            for bar, value in zip(bars, success_data):
                height = bar.get_height()
                ax4.text(bar.get_x() + bar.get_width()/2., height + 1,
                        f'{value:.1f}%', ha='center', va='bottom', fontweight='bold')
            
            # Adjust layout, reserving headroom for the suptitle
            # (tight_layout does not account for it by default)
            plt.tight_layout(rect=[0, 0, 1, 0.96])
            
            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"rag_vs_direct_latency_comparison_{timestamp}.png"
            
            # Ensure results directory exists
            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename
            
            plt.savefig(chart_path, dpi=300, bbox_inches='tight', 
                       facecolor='white', edgecolor='none')
            plt.close()
            
            print(f"πŸ“ˆ RAG vs Direct latency comparison charts saved to: {chart_path}")
            return str(chart_path)
            
        except Exception as e:
            print(f"❌ Chart generation failed: {e}")
            plt.close('all')  # release any partially-built figure
            return ""
    
    def print_comparison_summary(self, rag_stats: Dict, direct_stats: Dict):
        """Print formatted comparison summary to console"""
        print(f"\nπŸ“Š === RAG vs DIRECT LATENCY COMPARISON SUMMARY ===")
        
        # Overall comparison
        rag_overall = rag_stats['overall_results']
        direct_overall = direct_stats['overall_results']
        
        print(f"\nπŸ”„ Overall Performance:")
        print(f"   RAG System:")
        print(f"     β€’ Average Latency: {rag_overall['average_latency']:.2f}s")
        print(f"     β€’ Success Rate: {rag_overall['successful_queries']}/{rag_overall['total_queries']} ({rag_overall['successful_queries']/rag_overall['total_queries']*100:.1f}%)")
        print(f"     β€’ 60s Target Compliance: {rag_overall['target_compliance']*100:.1f}%")
        
        print(f"   Direct LLM System:")
        print(f"     β€’ Average Latency: {direct_overall['average_latency']:.2f}s")
        print(f"     β€’ Success Rate: {direct_overall['successful_queries']}/{direct_overall['total_queries']} ({direct_overall['success_rate']*100:.1f}%)")
        print(f"     β€’ 60s Target Compliance: {direct_overall['target_compliance']*100:.1f}%")
        
        # Performance winner
        if direct_overall['average_latency'] < rag_overall['average_latency']:
            latency_winner = "Direct LLM"
            latency_improvement = rag_overall['average_latency'] - direct_overall['average_latency']
        else:
            latency_winner = "RAG"
            latency_improvement = direct_overall['average_latency'] - rag_overall['average_latency']
        
        print(f"\nπŸ† Performance Winner:")
        print(f"   β€’ Faster System: {latency_winner}")
        print(f"   β€’ Performance Improvement: {latency_improvement:.2f}s ({latency_improvement/max(rag_overall['average_latency'], direct_overall['average_latency'])*100:.1f}%)")
        
        # Category breakdown
        print(f"\nπŸ“‹ Category Breakdown:")
        categories = ['diagnosis', 'treatment', 'mixed']
        
        for category in categories:
            rag_cat = rag_stats['category_results'].get(category, {})
            direct_cat = direct_stats['category_results'].get(category, {})
            
            if rag_cat.get('query_count', 0) > 0 and direct_cat.get('query_count', 0) > 0:
                rag_latency = rag_cat.get('average_latency', 0)
                direct_latency = direct_cat.get('average_latency', 0)
                
                winner = "Direct" if direct_latency < rag_latency else "RAG"
                difference = abs(rag_latency - direct_latency)
                
                print(f"   {category.capitalize()}:")
                print(f"     β€’ RAG: {rag_latency:.2f}s")
                print(f"     β€’ Direct: {direct_latency:.2f}s")
                print(f"     β€’ Winner: {winner} (faster by {difference:.2f}s)")

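# Programmatic usage (a minimal sketch; explicit paths can be passed to
# load_statistics to bypass file discovery, and the paths below are
# hypothetical examples):
#
#   gen = RAGvsDirectLatencyChartGenerator()
#   rag_stats, direct_stats = gen.load_statistics(
#       "results/latency_statistics_20250805_120000.json",
#       "results/direct_llm_statistics_20250805_120000.json",
#   )
#   gen.generate_comparison_charts(rag_stats, direct_stats)
#   gen.print_comparison_summary(rag_stats, direct_stats)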

# Independent execution interface
if __name__ == "__main__":
    """Independent chart generation interface"""
    
    print("πŸ“ˆ OnCall.ai RAG vs Direct Latency Comparison Chart Generator")
    
    # Initialize chart generator
    chart_gen = RAGvsDirectLatencyChartGenerator()
    
    try:
        # Find latest statistics files
        rag_file, direct_file = chart_gen.find_latest_statistics_files()
        
        # Load statistics
        rag_stats, direct_stats = chart_gen.load_statistics(rag_file, direct_file)
        
        # Generate comparison charts
        print(f"πŸ“ˆ Generating RAG vs Direct comparison charts...")
        chart_path = chart_gen.generate_comparison_charts(rag_stats, direct_stats)
        
        # Print comparison summary
        chart_gen.print_comparison_summary(rag_stats, direct_stats)
        
        print(f"\nβœ… RAG vs Direct latency comparison complete!")
        print(f"πŸ“ˆ Charts saved to: {chart_path}")
        print(f"πŸ’‘ Charts optimized for research presentations and publications")
        
    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("πŸ’‘ Please ensure both evaluators have been run:")
        print("   python latency_evaluator.py  # for RAG statistics")
        print("   python direct_llm_evaluator.py  # for Direct statistics")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")