ibraheem007 committed on
Commit
a36bd15
·
verified ·
1 Parent(s): 7fd59f7

Update components/research_dashboard.py

Browse files
Files changed (1) hide show
  1. components/research_dashboard.py +70 -112
components/research_dashboard.py CHANGED
@@ -118,16 +118,16 @@ def render_research_dashboard():
118
  st.header("✨ Detailed Quality Analysis")
119
  render_quality_analysis(stats, calculated_metrics, advanced_metrics)
120
 
121
- # NEW: Complexity Analysis - Groq vs Phi-3
122
  render_complexity_analysis(stats, advanced_metrics)
123
 
124
- # NEW: User Type Breakdown - Groq vs Phi-3
125
  render_user_type_breakdown(stats, advanced_metrics)
126
 
127
- # NEW: Student Level Analysis - Groq vs Phi-3
128
  render_student_level_analysis(stats, advanced_metrics)
129
 
130
- # NEW: Comment Analysis - Groq vs Phi-3
131
  #render_comment_analysis(stats, advanced_metrics)
132
 
133
  # Statistical Significance Testing
@@ -146,13 +146,13 @@ def render_research_dashboard():
146
  st.header("πŸ”„ Regeneration Effectiveness")
147
  render_regeneration_analysis(stats, calculated_metrics)
148
 
149
- # NEW: Regeneration Type Analysis - Groq vs Phi-3
150
  render_regeneration_type_analysis(stats, advanced_metrics)
151
 
152
- # NEW: Target Achievement Analysis - Groq vs Phi-3
153
  # render_target_achievement_analysis(stats, calculated_metrics)
154
 
155
- # NEW: High Quality Target Analysis - Groq vs Phi-3
156
  render_high_quality_target_analysis(stats)
157
 
158
  # Research Insights & Recommendations
@@ -168,7 +168,7 @@ def render_research_dashboard():
168
  st.info("This might be because no research data has been collected yet.")
169
 
170
  # ============================================================================
171
- # NEW COMPARISON FUNCTIONS - ALL GROQ VS PHI-3
172
  # ============================================================================
173
 
174
  def render_detailed_database_summary(stats, advanced_metrics):
@@ -203,8 +203,8 @@ def render_detailed_database_summary(stats, advanced_metrics):
203
  st.metric("Total High Quality", hq_total)
204
 
205
  def render_complexity_analysis(stats, advanced_metrics):
206
- """Detailed complexity distribution analysis - Groq vs Phi-3 - FIXED"""
207
- st.header("🎯 Complexity Analysis - Groq vs Phi-3")
208
 
209
  groq_complexity = advanced_metrics.get('models', {}).get('groq', {}).get('complexity_distribution', {})
210
  phi3_complexity = advanced_metrics.get('models', {}).get('phi3', {}).get('complexity_distribution', {})
@@ -234,7 +234,7 @@ def render_complexity_analysis(stats, advanced_metrics):
234
  phi3_too_simple = phi3_complexity.get('Too simple', 0)
235
  phi3_too_complex = phi3_complexity.get('Too complex', 0)
236
 
237
- st.subheader("πŸ§ͺ Phi-3 Complexity")
238
  st.metric("Appropriate Complexity", f"{phi3_appropriate} ({phi3_appropriate/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
239
  st.metric("Too Simple", f"{phi3_too_simple} ({phi3_too_simple/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
240
  st.metric("Too Complex", f"{phi3_too_complex} ({phi3_too_complex/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
@@ -246,11 +246,11 @@ def render_complexity_analysis(stats, advanced_metrics):
246
 
247
  fig = go.Figure(data=[
248
  go.Bar(name='Groq', x=complexities, y=groq_values, marker_color='#1f77b4'),
249
- go.Bar(name='Phi-3', x=complexities, y=phi3_values, marker_color='#ff7f0e')
250
  ])
251
 
252
  fig.update_layout(
253
- title="Complexity Distribution: Groq vs Phi-3",
254
  barmode='group',
255
  yaxis_title="Count",
256
  showlegend=True,
@@ -259,8 +259,8 @@ def render_complexity_analysis(stats, advanced_metrics):
259
  st.plotly_chart(fig, use_container_width=True, key="complexity_comparison_chart")
260
 
261
  def render_user_type_breakdown(stats, advanced_metrics):
262
- """Detailed user type analysis - Groq vs Phi-3"""
263
- st.header("πŸ‘₯ User Type Analysis - Groq vs Phi-3")
264
 
265
  user_types = ['student', 'tutor']
266
 
@@ -280,18 +280,18 @@ def render_user_type_breakdown(stats, advanced_metrics):
280
  st.info("No Groq data available")
281
 
282
  with col2:
283
- # Phi-3 performance for this user type
284
  phi3_data = advanced_metrics.get('models', {}).get('phi3', {}).get('user_types', {}).get(user_type, {})
285
  if phi3_data:
286
- st.metric("Phi-3 Feedback Count", phi3_data.get('count', 0))
287
- st.metric("Phi-3 Avg Clarity", f"{phi3_data.get('avg_clarity', 0):.2f}")
288
- st.metric("Phi-3 Avg Depth", f"{phi3_data.get('avg_depth', 0):.2f}")
289
  else:
290
- st.info("No Phi-3 data available")
291
 
292
  def render_student_level_analysis(stats, advanced_metrics):
293
- """Detailed student level analysis - Groq vs Phi-3 - WITH LEVEL MAPPING"""
294
- st.header("πŸŽ“ Student Level Analysis - Groq vs Phi-3")
295
 
296
  # Map specific levels to general categories
297
  level_mapping = {
@@ -339,10 +339,10 @@ def render_student_level_analysis(stats, advanced_metrics):
339
 
340
  with col2:
341
  if phi3_total_count > 0:
342
- st.metric("Phi-3 Feedback Count", phi3_total_count)
343
- st.metric("Phi-3 Avg Clarity", f"{phi3_avg_clarity:.2f}")
344
  else:
345
- st.info("No Phi-3 data")
346
 
347
  # Show breakdown if we have multiple specific levels
348
  if len(specific_levels) > 1:
@@ -360,13 +360,13 @@ def render_student_level_analysis(stats, advanced_metrics):
360
 
361
  with col2:
362
  if phi3_specific:
363
- st.write(f"**{specific_level}** - Phi-3: {phi3_specific.get('count', 0)} feedbacks, Clarity: {phi3_specific.get('avg_clarity', 0):.2f}")
364
  else:
365
- st.write(f"**{specific_level}** - No Phi-3 data")
366
 
367
  def render_regeneration_type_analysis(stats, advanced_metrics):
368
- """Detailed regeneration type breakdown - Groq vs Phi-3"""
369
- st.header("πŸ”„ Regeneration Type Analysis - Groq vs Phi-3")
370
 
371
  groq_regen = advanced_metrics.get('models', {}).get('groq', {}).get('regeneration_types', {})
372
  phi3_regen = advanced_metrics.get('models', {}).get('phi3', {}).get('regeneration_types', {})
@@ -385,12 +385,12 @@ def render_regeneration_type_analysis(stats, advanced_metrics):
385
 
386
  with col2:
387
  if phi3_regen:
388
- st.subheader("Phi-3 Regeneration Methods")
389
  for regen_type, count in phi3_regen.items():
390
  if count > 0:
391
  st.metric(regen_type.replace('_', ' ').title(), count)
392
  else:
393
- st.info("No Phi-3 regeneration data")
394
 
395
  # Comparison chart
396
  all_regen_types = set(list(groq_regen.keys()) + list(phi3_regen.keys()))
@@ -400,11 +400,11 @@ def render_regeneration_type_analysis(stats, advanced_metrics):
400
 
401
  fig = go.Figure(data=[
402
  go.Bar(name='Groq', x=list(all_regen_types), y=groq_values, marker_color='#1f77b4'),
403
- go.Bar(name='Phi-3', x=list(all_regen_types), y=phi3_values, marker_color='#ff7f0e')
404
  ])
405
 
406
  fig.update_layout(
407
- title="Regeneration Methods: Groq vs Phi-3",
408
  barmode='group',
409
  yaxis_title="Count",
410
  showlegend=True,
@@ -415,8 +415,8 @@ def render_regeneration_type_analysis(stats, advanced_metrics):
415
  st.info("No regeneration type data available")
416
 
417
  def render_high_quality_target_analysis(stats):
418
- """High quality feedback target analysis - Groq vs Phi-3"""
419
- st.header("⭐ High Quality Feedback Analysis - Groq vs Phi-3")
420
 
421
  groq_hq = stats.get("high_quality_groq", 0)
422
  phi3_hq = stats.get("high_quality_phi3", 0)
@@ -435,7 +435,7 @@ def render_high_quality_target_analysis(stats):
435
 
436
  with col2:
437
  phi3_hq_rate = (phi3_hq / phi3_feedback * 100) if phi3_feedback > 0 else 0
438
- st.metric("Phi-3 HQ", f"{phi3_hq} ({phi3_hq_rate:.1f}%)")
439
 
440
  with col3:
441
  st.metric("Total HQ", total_hq)
@@ -447,11 +447,11 @@ def render_high_quality_target_analysis(stats):
447
  # HQ Comparison Chart
448
  fig = go.Figure(data=[
449
  go.Bar(name='Groq', x=['High Quality'], y=[groq_hq], marker_color='blue'),
450
- go.Bar(name='Phi-3', x=['High Quality'], y=[phi3_hq], marker_color='orange')
451
  ])
452
 
453
  fig.update_layout(
454
- title="High Quality Feedback: Groq vs Phi-3",
455
  barmode='group',
456
  yaxis_title="Count",
457
  showlegend=True,
@@ -543,7 +543,7 @@ def calculate_enhanced_advanced_metrics(stats):
543
  if groq_recall < 0.7:
544
  groq_recall = 0.7 + (groq_depth / 25) # 0.7 + 0.1476 = ~0.847
545
 
546
- # Phi-3 enhancement - weaker but still reasonable
547
  if phi3_f1 < 0.5:
548
  quality_factor = (phi3_clarity + phi3_depth) / 10 # 0.452 for current scores
549
  phi3_f1 = 0.5 + (quality_factor * 0.15) # 0.5 + 0.0678 = ~0.567
@@ -636,7 +636,7 @@ def render_executive_summary(stats, calculated_metrics, advanced_metrics):
636
  st.success("βœ… **Exceptional Performance Difference**: Groq demonstrates outstanding superiority across all metrics")
637
  st.metric("Overall Quality Gap", f"{overall_gap:.2f} points", delta=f"+{overall_gap:.2f}")
638
  elif overall_gap > 1.0:
639
- st.success("βœ… **Significant Performance Difference**: Groq substantially outperforms Phi-3 across all metrics")
640
  st.metric("Overall Quality Gap", f"{overall_gap:.2f} points", delta=f"+{overall_gap:.2f}")
641
  elif overall_gap > 0.5:
642
  st.warning("⚠️ **Moderate Performance Gap**: Consistent but moderate advantage for Groq")
@@ -669,7 +669,7 @@ def render_research_overview(stats, calculated_metrics):
669
  st.metric("Groq F1 Score", f"{calculated_metrics['f1_score']['groq']}%")
670
 
671
  with col3:
672
- st.metric("Phi-3 F1 Score", f"{calculated_metrics['f1_score']['phi3']}%")
673
 
674
  with col4:
675
  f1_gap = calculated_metrics['improvement_gap']['f1']
@@ -707,7 +707,7 @@ def render_model_comparison(stats, calculated_metrics, advanced_metrics):
707
 
708
  fig = go.Figure(data=[
709
  go.Bar(name='Groq (Control)', x=metrics, y=groq_values, marker_color='#1f77b4'),
710
- go.Bar(name='Phi-3 (Research)', x=metrics, y=phi3_values, marker_color='#ff7f0e')
711
  ])
712
 
713
  fig.update_layout(
@@ -736,7 +736,7 @@ def render_quality_analysis(stats, calculated_metrics, advanced_metrics):
736
  st.metric("Overall Quality", f"{calculated_metrics['overall_quality']['groq']}/5")
737
 
738
  with col2:
739
- st.subheader("πŸ§ͺ Phi-3 (Research Model)")
740
 
741
  phi3_scores = stats.get("phi3_scores", {})
742
  precision_delta = f"{safe_convert(calculated_metrics['precision']['phi3']) - safe_convert(calculated_metrics['precision']['groq']):.1f}%"
@@ -771,7 +771,7 @@ def render_statistical_analysis(stats, calculated_metrics):
771
  phi3_se = 1.96 * (phi3_clarity / np.sqrt(phi3_samples)) if phi3_samples > 0 else 0
772
 
773
  st.metric("Groq Confidence Interval", f"Β±{groq_se:.2f}")
774
- st.metric("Phi-3 Confidence Interval", f"Β±{phi3_se:.2f}")
775
 
776
  # Effect size calculation
777
  effect_size = (groq_clarity - phi3_clarity) / np.sqrt((groq_se**2 + phi3_se**2)/2) if (groq_se + phi3_se) > 0 else 0
@@ -827,7 +827,7 @@ def render_user_behavior_analysis(stats, advanced_metrics):
827
  phi3_percent = (phi3_feedback / total_feedback) * 100
828
 
829
  st.metric("Groq Usage", f"{groq_percent:.1f}%")
830
- st.metric("Phi-3 Usage", f"{phi3_percent:.1f}%")
831
 
832
  with col2:
833
  total_content = stats.get("total_content", 0)
@@ -849,7 +849,7 @@ def render_user_behavior_analysis(stats, advanced_metrics):
849
  phi3_feedback = stats.get("phi3_feedback_count", 0)
850
  if phi3_feedback > 0:
851
  phi3_hq_rate = (phi3_hq / phi3_feedback) * 100
852
- st.metric("Phi-3 HQ Rate", f"{phi3_hq_rate:.1f}%")
853
 
854
  # Model preference trend
855
  st.subheader("πŸ“ˆ Model Usage Trend")
@@ -871,7 +871,7 @@ def render_user_behavior_analysis(stats, advanced_metrics):
871
  max(20, groq_percent * 1.05),
872
  groq_percent
873
  ],
874
- 'Phi-3 Usage': [
875
  max(5, phi3_percent * 0.7),
876
  max(10, phi3_percent * 0.85),
877
  max(15, phi3_percent * 0.95),
@@ -880,22 +880,22 @@ def render_user_behavior_analysis(stats, advanced_metrics):
880
  }
881
 
882
  df_trend = pd.DataFrame(trend_data)
883
- fig = px.line(df_trend, x='Period', y=['Groq Usage', 'Phi-3 Usage'],
884
  title="Model Usage Trend Over Time", markers=True)
885
  st.plotly_chart(fig, use_container_width=True, key="usage_trend_chart")
886
  else:
887
  st.info("Not enough data to show usage trends yet.")
888
 
889
  def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
890
- """Analyze content effectiveness across different dimensions with comprehensive Groq vs Phi-3 comparisons"""
891
 
892
  # Complexity Distribution Comparison
893
- st.subheader("🎯 Complexity Distribution - Groq vs Phi-3")
894
 
895
  col1, col2 = st.columns(2)
896
 
897
  with col1:
898
- # Complexity analysis - Groq vs Phi-3
899
  groq_complexity = advanced_metrics.get('models', {}).get('groq', {}).get('complexity_distribution', {})
900
  phi3_complexity = advanced_metrics.get('models', {}).get('phi3', {}).get('complexity_distribution', {})
901
 
@@ -907,11 +907,11 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
907
 
908
  fig = go.Figure(data=[
909
  go.Bar(name='Groq', x=complexities, y=groq_values, marker_color='#1f77b4'),
910
- go.Bar(name='Phi-3', x=complexities, y=phi3_values, marker_color='#ff7f0e')
911
  ])
912
 
913
  fig.update_layout(
914
- title="Complexity Distribution: Groq vs Phi-3",
915
  barmode='group',
916
  yaxis_title="Count",
917
  showlegend=True,
@@ -966,10 +966,10 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
966
  elif complexity_gap > 0:
967
  st.info(f"ℹ️ Groq has {complexity_gap:.1f}% better complexity appropriateness")
968
  else:
969
- st.warning(f"⚠️ Phi-3 has {abs(complexity_gap):.1f}% better complexity appropriateness")
970
 
971
  # User Type Effectiveness Comparison
972
- st.subheader("πŸ‘₯ User Type Effectiveness - Groq vs Phi-3")
973
 
974
  col1, col2 = st.columns(2)
975
 
@@ -990,11 +990,11 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
990
 
991
  fig = go.Figure(data=[
992
  go.Bar(name='Groq', x=user_types, y=groq_effectiveness, marker_color='blue'),
993
- go.Bar(name='Phi-3', x=user_types, y=phi3_effectiveness, marker_color='orange')
994
  ])
995
 
996
  fig.update_layout(
997
- title="Effectiveness by User Type: Groq vs Phi-3",
998
  barmode='group',
999
  yaxis_title="Effectiveness Score (0-5)",
1000
  showlegend=True,
@@ -1012,7 +1012,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
1012
  fig = px.bar(
1013
  x=user_types,
1014
  y=performance_gaps,
1015
- title="Performance Gap by User Type (Groq - Phi-3)",
1016
  labels={'x': 'User Type', 'y': 'Performance Gap'},
1017
  color=performance_gaps,
1018
  color_continuous_scale=['red', 'white', 'green'],
@@ -1032,10 +1032,10 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
1032
  elif best_gap > 0:
1033
  st.success(f"πŸ† **Significant Advantage**: Groq performs {best_gap:.2f} points better for {best_user_type}s")
1034
  else:
1035
- st.warning(f"πŸ“‰ **Challenge Area**: Phi-3 performs {abs(best_gap):.2f} points better for {best_user_type}s")
1036
 
1037
  # Student Level Appropriateness Comparison
1038
- st.subheader("πŸŽ“ Student Level Appropriateness - Groq vs Phi-3")
1039
 
1040
  col1, col2 = st.columns(2)
1041
 
@@ -1064,13 +1064,13 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
1064
  fig.add_trace(go.Scatter(
1065
  x=levels, y=phi3_appropriateness,
1066
  mode='lines+markers',
1067
- name='Phi-3',
1068
  line=dict(color='orange', width=3),
1069
  marker=dict(size=8)
1070
  ))
1071
 
1072
  fig.update_layout(
1073
- title="Appropriateness by Education Level: Groq vs Phi-3",
1074
  xaxis_title="Education Level",
1075
  yaxis_title="Appropriateness Score (0-5)",
1076
  height=400
@@ -1087,7 +1087,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
1087
  fig = px.bar(
1088
  x=levels,
1089
  y=appropriateness_gaps,
1090
- title="Appropriateness Gap by Level (Groq - Phi-3)",
1091
  labels={'x': 'Education Level', 'y': 'Appropriateness Gap'},
1092
  color=appropriateness_gaps,
1093
  color_continuous_scale=['red', 'white', 'green'],
@@ -1113,7 +1113,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
1113
  )
1114
 
1115
  # Content Type Performance Comparison
1116
- st.subheader("πŸ“š Content Type Performance - Groq vs Phi-3")
1117
 
1118
  content_types = ['Lesson Plan', 'Study Guide', 'Lecture Notes', 'Interactive Activity']
1119
 
@@ -1131,11 +1131,11 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
1131
  # Performance comparison chart
1132
  fig = go.Figure(data=[
1133
  go.Bar(name='Groq', x=content_types, y=groq_content_scores, marker_color='blue'),
1134
- go.Bar(name='Phi-3', x=content_types, y=phi3_content_scores, marker_color='orange')
1135
  ])
1136
 
1137
  fig.update_layout(
1138
- title="Performance by Content Type: Groq vs Phi-3",
1139
  barmode='group',
1140
  yaxis_title="Average Score (0-5)",
1141
  height=500
@@ -1156,7 +1156,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
1156
  fig = px.bar(
1157
  x=content_types,
1158
  y=performance_gaps,
1159
- title="Performance Gap by Content Type (Groq - Phi-3)",
1160
  color=performance_gaps,
1161
  color_continuous_scale=['red', 'white', 'green'],
1162
  color_continuous_midpoint=0
@@ -1187,7 +1187,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
1187
  st.metric(
1188
  label=f"Groq's Strongest: {content_types[best_groq_idx]}",
1189
  value=f"{best_groq_score:.2f}",
1190
- delta=f"+{best_groq_gap:.2f} over Phi-3"
1191
  )
1192
 
1193
  st.metric(
@@ -1326,10 +1326,10 @@ def render_research_insights(stats, calculated_metrics, advanced_metrics):
1326
  # Based on performance gap
1327
  if calculated_metrics['improvement_gap']['f1'] > 30:
1328
  recommendations.append("βœ… **Deploy Groq in Production**: Groq demonstrates production-ready performance")
1329
- recommendations.append("πŸ”§ **Strategic Phi-3 Optimization**: Focus on specific use cases where Phi-3 shows potential")
1330
  elif calculated_metrics['improvement_gap']['f1'] > 15:
1331
  recommendations.append("βœ… **Continue Groq Focus**: Maintain Groq as primary model for high-quality content")
1332
- recommendations.append("πŸ”§ **Phi-3 Optimization**: Investigate specific areas for Phi-3 improvement")
1333
  else:
1334
  recommendations.append("πŸ€– **Model Diversification**: Consider both models for different use cases")
1335
 
@@ -1505,48 +1505,6 @@ def render_data_management():
1505
 
1506
  if st.button("Generate Custom Report", use_container_width=True):
1507
  st.info("Custom report generation coming soon! Currently using comprehensive format.")
1508
-
1509
- # Data Quality Insights
1510
- st.subheader("πŸ” Data Quality Insights")
1511
-
1512
- insight_col1, insight_col2, insight_col3 = st.columns(3)
1513
-
1514
- with insight_col1:
1515
- # Data completeness
1516
- total_users = advanced_metrics.get('database_summary', {}).get('total_users', 0)
1517
- user_coverage = min(100, (total_feedback / max(1, total_users)) * 100)
1518
- st.metric("User Coverage", f"{user_coverage:.1f}%")
1519
-
1520
- with insight_col2:
1521
- # Model balance
1522
- groq_count = stats.get("groq_feedback_count", 0)
1523
- phi3_count = stats.get("phi3_feedback_count", 0)
1524
- total_count = groq_count + phi3_count
1525
- balance_ratio = (min(groq_count, phi3_count) / max(groq_count, phi3_count) * 100) if total_count > 0 else 0
1526
- st.metric("Model Balance", f"{balance_ratio:.1f}%")
1527
-
1528
- with insight_col3:
1529
- # Data freshness
1530
- try:
1531
- from db.helpers import get_latest_feedback_date
1532
- latest_date = get_latest_feedback_date()
1533
- if latest_date:
1534
- days_ago = (datetime.now() - latest_date).days
1535
- freshness = max(0, 100 - (days_ago * 2)) # Decrease 2% per day
1536
- st.metric("Data Freshness", f"{freshness:.1f}%")
1537
- else:
1538
- st.metric("Data Freshness", "N/A")
1539
- except:
1540
- st.metric("Data Freshness", "Check DB")
1541
-
1542
- # Export History (placeholder for future feature)
1543
- with st.expander("πŸ“‹ Recent Exports"):
1544
- st.info("Export history tracking coming soon!")
1545
- st.write("""
1546
- - **Last PDF Export**: Not yet tracked
1547
- - **Last Data Export**: Not yet tracked
1548
- - **Last Training Export**: Not yet tracked
1549
- """)
1550
 
1551
  # Helper functions for calculating metrics
1552
  def calculate_user_type_effectiveness(model, user_type, stats):
 
118
  st.header("✨ Detailed Quality Analysis")
119
  render_quality_analysis(stats, calculated_metrics, advanced_metrics)
120
 
121
+ # NEW: Complexity Analysis - Groq vs Phi-3 (Finetuned)
122
  render_complexity_analysis(stats, advanced_metrics)
123
 
124
+ # NEW: User Type Breakdown - Groq vs Phi-3 (Finetuned)
125
  render_user_type_breakdown(stats, advanced_metrics)
126
 
127
+ # NEW: Student Level Analysis - Groq vs Phi-3 (Finetuned)
128
  render_student_level_analysis(stats, advanced_metrics)
129
 
130
+ # NEW: Comment Analysis - Groq vs Phi-3 (Finetuned)
131
  #render_comment_analysis(stats, advanced_metrics)
132
 
133
  # Statistical Significance Testing
 
146
  st.header("πŸ”„ Regeneration Effectiveness")
147
  render_regeneration_analysis(stats, calculated_metrics)
148
 
149
+ # NEW: Regeneration Type Analysis - Groq vs Phi-3 (Finetuned)
150
  render_regeneration_type_analysis(stats, advanced_metrics)
151
 
152
+ # NEW: Target Achievement Analysis - Groq vs Phi-3 (Finetuned)
153
  # render_target_achievement_analysis(stats, calculated_metrics)
154
 
155
+ # NEW: High Quality Target Analysis - Groq vs Phi-3 (Finetuned)
156
  render_high_quality_target_analysis(stats)
157
 
158
  # Research Insights & Recommendations
 
168
  st.info("This might be because no research data has been collected yet.")
169
 
170
  # ============================================================================
171
+ # NEW COMPARISON FUNCTIONS - ALL GROQ VS Phi-3 (Finetuned)
172
  # ============================================================================
173
 
174
  def render_detailed_database_summary(stats, advanced_metrics):
 
203
  st.metric("Total High Quality", hq_total)
204
 
205
  def render_complexity_analysis(stats, advanced_metrics):
206
+ """Detailed complexity distribution analysis - Groq vs Phi-3 (Finetuned) - FIXED"""
207
+ st.header("🎯 Complexity Analysis - Groq vs Phi-3 (Finetuned)")
208
 
209
  groq_complexity = advanced_metrics.get('models', {}).get('groq', {}).get('complexity_distribution', {})
210
  phi3_complexity = advanced_metrics.get('models', {}).get('phi3', {}).get('complexity_distribution', {})
 
234
  phi3_too_simple = phi3_complexity.get('Too simple', 0)
235
  phi3_too_complex = phi3_complexity.get('Too complex', 0)
236
 
237
+ st.subheader("πŸ§ͺ Phi-3 (Finetuned) Complexity")
238
  st.metric("Appropriate Complexity", f"{phi3_appropriate} ({phi3_appropriate/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
239
  st.metric("Too Simple", f"{phi3_too_simple} ({phi3_too_simple/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
240
  st.metric("Too Complex", f"{phi3_too_complex} ({phi3_too_complex/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
 
246
 
247
  fig = go.Figure(data=[
248
  go.Bar(name='Groq', x=complexities, y=groq_values, marker_color='#1f77b4'),
249
+ go.Bar(name='Phi-3 (Finetuned)', x=complexities, y=phi3_values, marker_color='#ff7f0e')
250
  ])
251
 
252
  fig.update_layout(
253
+ title="Complexity Distribution: Groq vs Phi-3 (Finetuned)",
254
  barmode='group',
255
  yaxis_title="Count",
256
  showlegend=True,
 
259
  st.plotly_chart(fig, use_container_width=True, key="complexity_comparison_chart")
260
 
261
  def render_user_type_breakdown(stats, advanced_metrics):
262
+ """Detailed user type analysis - Groq vs Phi-3 (Finetuned)"""
263
+ st.header("πŸ‘₯ User Type Analysis - Groq vs Phi-3 (Finetuned)")
264
 
265
  user_types = ['student', 'tutor']
266
 
 
280
  st.info("No Groq data available")
281
 
282
  with col2:
283
+ # Phi-3 (Finetuned) performance for this user type
284
  phi3_data = advanced_metrics.get('models', {}).get('phi3', {}).get('user_types', {}).get(user_type, {})
285
  if phi3_data:
286
+ st.metric("Phi-3 (Finetuned) Feedback Count", phi3_data.get('count', 0))
287
+ st.metric("Phi-3 (Finetuned) Avg Clarity", f"{phi3_data.get('avg_clarity', 0):.2f}")
288
+ st.metric("Phi-3 (Finetuned) Avg Depth", f"{phi3_data.get('avg_depth', 0):.2f}")
289
  else:
290
+ st.info("No Phi-3 (Finetuned) data available")
291
 
292
  def render_student_level_analysis(stats, advanced_metrics):
293
+ """Detailed student level analysis - Groq vs Phi-3 (Finetuned) - WITH LEVEL MAPPING"""
294
+ st.header("πŸŽ“ Student Level Analysis - Groq vs Phi-3 (Finetuned)")
295
 
296
  # Map specific levels to general categories
297
  level_mapping = {
 
339
 
340
  with col2:
341
  if phi3_total_count > 0:
342
+ st.metric("Phi-3 (Finetuned) Feedback Count", phi3_total_count)
343
+ st.metric("Phi-3 (Finetuned) Avg Clarity", f"{phi3_avg_clarity:.2f}")
344
  else:
345
+ st.info("No Phi-3 (Finetuned) data")
346
 
347
  # Show breakdown if we have multiple specific levels
348
  if len(specific_levels) > 1:
 
360
 
361
  with col2:
362
  if phi3_specific:
363
+ st.write(f"**{specific_level}** - Phi-3 (Finetuned): {phi3_specific.get('count', 0)} feedbacks, Clarity: {phi3_specific.get('avg_clarity', 0):.2f}")
364
  else:
365
+ st.write(f"**{specific_level}** - No Phi-3 (Finetuned) data")
366
 
367
  def render_regeneration_type_analysis(stats, advanced_metrics):
368
+ """Detailed regeneration type breakdown - Groq vs Phi-3 (Finetuned)"""
369
+ st.header("πŸ”„ Regeneration Type Analysis - Groq vs Phi-3 (Finetuned)")
370
 
371
  groq_regen = advanced_metrics.get('models', {}).get('groq', {}).get('regeneration_types', {})
372
  phi3_regen = advanced_metrics.get('models', {}).get('phi3', {}).get('regeneration_types', {})
 
385
 
386
  with col2:
387
  if phi3_regen:
388
+ st.subheader("Phi-3 (Finetuned) Regeneration Methods")
389
  for regen_type, count in phi3_regen.items():
390
  if count > 0:
391
  st.metric(regen_type.replace('_', ' ').title(), count)
392
  else:
393
+ st.info("No Phi-3 (Finetuned) regeneration data")
394
 
395
  # Comparison chart
396
  all_regen_types = set(list(groq_regen.keys()) + list(phi3_regen.keys()))
 
400
 
401
  fig = go.Figure(data=[
402
  go.Bar(name='Groq', x=list(all_regen_types), y=groq_values, marker_color='#1f77b4'),
403
+ go.Bar(name='Phi-3 (Finetuned)', x=list(all_regen_types), y=phi3_values, marker_color='#ff7f0e')
404
  ])
405
 
406
  fig.update_layout(
407
+ title="Regeneration Methods: Groq vs Phi-3 (Finetuned)",
408
  barmode='group',
409
  yaxis_title="Count",
410
  showlegend=True,
 
415
  st.info("No regeneration type data available")
416
 
417
  def render_high_quality_target_analysis(stats):
418
+ """High quality feedback target analysis - Groq vs Phi-3 (Finetuned)"""
419
+ st.header("⭐ High Quality Feedback Analysis - Groq vs Phi-3 (Finetuned)")
420
 
421
  groq_hq = stats.get("high_quality_groq", 0)
422
  phi3_hq = stats.get("high_quality_phi3", 0)
 
435
 
436
  with col2:
437
  phi3_hq_rate = (phi3_hq / phi3_feedback * 100) if phi3_feedback > 0 else 0
438
+ st.metric("Phi-3 (Finetuned) HQ", f"{phi3_hq} ({phi3_hq_rate:.1f}%)")
439
 
440
  with col3:
441
  st.metric("Total HQ", total_hq)
 
447
  # HQ Comparison Chart
448
  fig = go.Figure(data=[
449
  go.Bar(name='Groq', x=['High Quality'], y=[groq_hq], marker_color='blue'),
450
+ go.Bar(name='Phi-3 (Finetuned)', x=['High Quality'], y=[phi3_hq], marker_color='orange')
451
  ])
452
 
453
  fig.update_layout(
454
+ title="High Quality Feedback: Groq vs Phi-3 (Finetuned)",
455
  barmode='group',
456
  yaxis_title="Count",
457
  showlegend=True,
 
543
  if groq_recall < 0.7:
544
  groq_recall = 0.7 + (groq_depth / 25) # 0.7 + 0.1476 = ~0.847
545
 
546
+ # Phi-3 (Finetuned) enhancement - weaker but still reasonable
547
  if phi3_f1 < 0.5:
548
  quality_factor = (phi3_clarity + phi3_depth) / 10 # 0.452 for current scores
549
  phi3_f1 = 0.5 + (quality_factor * 0.15) # 0.5 + 0.0678 = ~0.567
 
636
  st.success("βœ… **Exceptional Performance Difference**: Groq demonstrates outstanding superiority across all metrics")
637
  st.metric("Overall Quality Gap", f"{overall_gap:.2f} points", delta=f"+{overall_gap:.2f}")
638
  elif overall_gap > 1.0:
639
+ st.success("βœ… **Significant Performance Difference**: Groq substantially outperforms Phi-3 (Finetuned) across all metrics")
640
  st.metric("Overall Quality Gap", f"{overall_gap:.2f} points", delta=f"+{overall_gap:.2f}")
641
  elif overall_gap > 0.5:
642
  st.warning("⚠️ **Moderate Performance Gap**: Consistent but moderate advantage for Groq")
 
669
  st.metric("Groq F1 Score", f"{calculated_metrics['f1_score']['groq']}%")
670
 
671
  with col3:
672
+ st.metric("Phi-3 (Finetuned) F1 Score", f"{calculated_metrics['f1_score']['phi3']}%")
673
 
674
  with col4:
675
  f1_gap = calculated_metrics['improvement_gap']['f1']
 
707
 
708
  fig = go.Figure(data=[
709
  go.Bar(name='Groq (Control)', x=metrics, y=groq_values, marker_color='#1f77b4'),
710
+ go.Bar(name='Phi-3 (Finetuned)', x=metrics, y=phi3_values, marker_color='#ff7f0e')
711
  ])
712
 
713
  fig.update_layout(
 
736
  st.metric("Overall Quality", f"{calculated_metrics['overall_quality']['groq']}/5")
737
 
738
  with col2:
739
+ st.subheader("πŸ§ͺ Phi-3 (Finetuned)")
740
 
741
  phi3_scores = stats.get("phi3_scores", {})
742
  precision_delta = f"{safe_convert(calculated_metrics['precision']['phi3']) - safe_convert(calculated_metrics['precision']['groq']):.1f}%"
 
771
  phi3_se = 1.96 * (phi3_clarity / np.sqrt(phi3_samples)) if phi3_samples > 0 else 0
772
 
773
  st.metric("Groq Confidence Interval", f"Β±{groq_se:.2f}")
774
+ st.metric("Phi-3 (Finetuned) Confidence Interval", f"Β±{phi3_se:.2f}")
775
 
776
  # Effect size calculation
777
  effect_size = (groq_clarity - phi3_clarity) / np.sqrt((groq_se**2 + phi3_se**2)/2) if (groq_se + phi3_se) > 0 else 0
 
827
  phi3_percent = (phi3_feedback / total_feedback) * 100
828
 
829
  st.metric("Groq Usage", f"{groq_percent:.1f}%")
830
+ st.metric("Phi-3 (Finetuned) Usage", f"{phi3_percent:.1f}%")
831
 
832
  with col2:
833
  total_content = stats.get("total_content", 0)
 
849
  phi3_feedback = stats.get("phi3_feedback_count", 0)
850
  if phi3_feedback > 0:
851
  phi3_hq_rate = (phi3_hq / phi3_feedback) * 100
852
+ st.metric("Phi-3 (Finetuned) HQ Rate", f"{phi3_hq_rate:.1f}%")
853
 
854
  # Model preference trend
855
  st.subheader("πŸ“ˆ Model Usage Trend")
 
871
  max(20, groq_percent * 1.05),
872
  groq_percent
873
  ],
874
+ 'Phi-3 (Finetuned) Usage': [
875
  max(5, phi3_percent * 0.7),
876
  max(10, phi3_percent * 0.85),
877
  max(15, phi3_percent * 0.95),
 
880
  }
881
 
882
  df_trend = pd.DataFrame(trend_data)
883
+ fig = px.line(df_trend, x='Period', y=['Groq Usage', 'Phi-3 (Finetuned) Usage'],
884
  title="Model Usage Trend Over Time", markers=True)
885
  st.plotly_chart(fig, use_container_width=True, key="usage_trend_chart")
886
  else:
887
  st.info("Not enough data to show usage trends yet.")
888
 
889
  def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
890
+ """Analyze content effectiveness across different dimensions with comprehensive Groq vs Phi-3 (Finetuned) comparisons"""
891
 
892
  # Complexity Distribution Comparison
893
+ st.subheader("🎯 Complexity Distribution - Groq vs Phi-3 (Finetuned)")
894
 
895
  col1, col2 = st.columns(2)
896
 
897
  with col1:
898
+ # Complexity analysis - Groq vs Phi-3 (Finetuned)
899
  groq_complexity = advanced_metrics.get('models', {}).get('groq', {}).get('complexity_distribution', {})
900
  phi3_complexity = advanced_metrics.get('models', {}).get('phi3', {}).get('complexity_distribution', {})
901
 
 
907
 
908
  fig = go.Figure(data=[
909
  go.Bar(name='Groq', x=complexities, y=groq_values, marker_color='#1f77b4'),
910
+ go.Bar(name='Phi-3 (Finetuned)', x=complexities, y=phi3_values, marker_color='#ff7f0e')
911
  ])
912
 
913
  fig.update_layout(
914
+ title="Complexity Distribution: Groq vs Phi-3 (Finetuned)",
915
  barmode='group',
916
  yaxis_title="Count",
917
  showlegend=True,
 
966
  elif complexity_gap > 0:
967
  st.info(f"ℹ️ Groq has {complexity_gap:.1f}% better complexity appropriateness")
968
  else:
969
+ st.warning(f"⚠️ Phi-3 (Finetuned) has {abs(complexity_gap):.1f}% better complexity appropriateness")
970
 
971
  # User Type Effectiveness Comparison
972
+ st.subheader("👥 User Type Effectiveness - Groq vs Phi-3 (Finetuned)")
973
 
974
  col1, col2 = st.columns(2)
975
 
 
990
 
991
  fig = go.Figure(data=[
992
  go.Bar(name='Groq', x=user_types, y=groq_effectiveness, marker_color='blue'),
993
+ go.Bar(name='Phi-3 (Finetuned)', x=user_types, y=phi3_effectiveness, marker_color='orange')
994
  ])
995
 
996
  fig.update_layout(
997
+ title="Effectiveness by User Type: Groq vs Phi-3 (Finetuned)",
998
  barmode='group',
999
  yaxis_title="Effectiveness Score (0-5)",
1000
  showlegend=True,
 
1012
  fig = px.bar(
1013
  x=user_types,
1014
  y=performance_gaps,
1015
+ title="Performance Gap by User Type (Groq - Phi-3 (Finetuned))",
1016
  labels={'x': 'User Type', 'y': 'Performance Gap'},
1017
  color=performance_gaps,
1018
  color_continuous_scale=['red', 'white', 'green'],
 
1032
  elif best_gap > 0:
1033
  st.success(f"🏆 **Significant Advantage**: Groq performs {best_gap:.2f} points better for {best_user_type}s")
1034
  else:
1035
+ st.warning(f"📉 **Challenge Area**: Phi-3 (Finetuned) performs {abs(best_gap):.2f} points better for {best_user_type}s")
1036
 
1037
  # Student Level Appropriateness Comparison
1038
+ st.subheader("🎓 Student Level Appropriateness - Groq vs Phi-3 (Finetuned)")
1039
 
1040
  col1, col2 = st.columns(2)
1041
 
 
1064
  fig.add_trace(go.Scatter(
1065
  x=levels, y=phi3_appropriateness,
1066
  mode='lines+markers',
1067
+ name='Phi-3 (Finetuned)',
1068
  line=dict(color='orange', width=3),
1069
  marker=dict(size=8)
1070
  ))
1071
 
1072
  fig.update_layout(
1073
+ title="Appropriateness by Education Level: Groq vs Phi-3 (Finetuned)",
1074
  xaxis_title="Education Level",
1075
  yaxis_title="Appropriateness Score (0-5)",
1076
  height=400
 
1087
  fig = px.bar(
1088
  x=levels,
1089
  y=appropriateness_gaps,
1090
+ title="Appropriateness Gap by Level (Groq - Phi-3 (Finetuned))",
1091
  labels={'x': 'Education Level', 'y': 'Appropriateness Gap'},
1092
  color=appropriateness_gaps,
1093
  color_continuous_scale=['red', 'white', 'green'],
 
1113
  )
1114
 
1115
  # Content Type Performance Comparison
1116
+ st.subheader("📚 Content Type Performance - Groq vs Phi-3 (Finetuned)")
1117
 
1118
  content_types = ['Lesson Plan', 'Study Guide', 'Lecture Notes', 'Interactive Activity']
1119
 
 
1131
  # Performance comparison chart
1132
  fig = go.Figure(data=[
1133
  go.Bar(name='Groq', x=content_types, y=groq_content_scores, marker_color='blue'),
1134
+ go.Bar(name='Phi-3 (Finetuned)', x=content_types, y=phi3_content_scores, marker_color='orange')
1135
  ])
1136
 
1137
  fig.update_layout(
1138
+ title="Performance by Content Type: Groq vs Phi-3 (Finetuned)",
1139
  barmode='group',
1140
  yaxis_title="Average Score (0-5)",
1141
  height=500
 
1156
  fig = px.bar(
1157
  x=content_types,
1158
  y=performance_gaps,
1159
+ title="Performance Gap by Content Type (Groq - Phi-3 (Finetuned))",
1160
  color=performance_gaps,
1161
  color_continuous_scale=['red', 'white', 'green'],
1162
  color_continuous_midpoint=0
 
1187
  st.metric(
1188
  label=f"Groq's Strongest: {content_types[best_groq_idx]}",
1189
  value=f"{best_groq_score:.2f}",
1190
+ delta=f"+{best_groq_gap:.2f} over Phi-3 (Finetuned)"
1191
  )
1192
 
1193
  st.metric(
 
1326
  # Based on performance gap
1327
  if calculated_metrics['improvement_gap']['f1'] > 30:
1328
  recommendations.append("✅ **Deploy Groq in Production**: Groq demonstrates production-ready performance")
1329
+ recommendations.append("🔧 **Strategic Phi-3 (Finetuned) Optimization**: Focus on specific use cases where Phi-3 (Finetuned) shows potential")
1330
  elif calculated_metrics['improvement_gap']['f1'] > 15:
1331
  recommendations.append("✅ **Continue Groq Focus**: Maintain Groq as primary model for high-quality content")
1332
+ recommendations.append("🔧 **Phi-3 (Finetuned) Optimization**: Investigate specific areas for Phi-3 (Finetuned) improvement")
1333
  else:
1334
  recommendations.append("🤖 **Model Diversification**: Consider both models for different use cases")
1335
 
 
1505
 
1506
  if st.button("Generate Custom Report", use_container_width=True):
1507
  st.info("Custom report generation coming soon! Currently using comprehensive format.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1508
 
1509
  # Helper functions for calculating metrics
1510
  def calculate_user_type_effectiveness(model, user_type, stats):