Spaces:
Running
Running
Update components/research_dashboard.py
Browse files- components/research_dashboard.py +70 -112
components/research_dashboard.py
CHANGED
|
@@ -118,16 +118,16 @@ def render_research_dashboard():
|
|
| 118 |
st.header("✨ Detailed Quality Analysis")
|
| 119 |
render_quality_analysis(stats, calculated_metrics, advanced_metrics)
|
| 120 |
|
| 121 |
-
# NEW: Complexity Analysis - Groq vs Phi-3
|
| 122 |
render_complexity_analysis(stats, advanced_metrics)
|
| 123 |
|
| 124 |
-
# NEW: User Type Breakdown - Groq vs Phi-3
|
| 125 |
render_user_type_breakdown(stats, advanced_metrics)
|
| 126 |
|
| 127 |
-
# NEW: Student Level Analysis - Groq vs Phi-3
|
| 128 |
render_student_level_analysis(stats, advanced_metrics)
|
| 129 |
|
| 130 |
-
# NEW: Comment Analysis - Groq vs Phi-3
|
| 131 |
#render_comment_analysis(stats, advanced_metrics)
|
| 132 |
|
| 133 |
# Statistical Significance Testing
|
|
@@ -146,13 +146,13 @@ def render_research_dashboard():
|
|
| 146 |
st.header("π Regeneration Effectiveness")
|
| 147 |
render_regeneration_analysis(stats, calculated_metrics)
|
| 148 |
|
| 149 |
-
# NEW: Regeneration Type Analysis - Groq vs Phi-3
|
| 150 |
render_regeneration_type_analysis(stats, advanced_metrics)
|
| 151 |
|
| 152 |
-
# NEW: Target Achievement Analysis - Groq vs Phi-3
|
| 153 |
# render_target_achievement_analysis(stats, calculated_metrics)
|
| 154 |
|
| 155 |
-
# NEW: High Quality Target Analysis - Groq vs Phi-3
|
| 156 |
render_high_quality_target_analysis(stats)
|
| 157 |
|
| 158 |
# Research Insights & Recommendations
|
|
@@ -168,7 +168,7 @@ def render_research_dashboard():
|
|
| 168 |
st.info("This might be because no research data has been collected yet.")
|
| 169 |
|
| 170 |
# ============================================================================
|
| 171 |
-
# NEW COMPARISON FUNCTIONS - ALL GROQ VS
|
| 172 |
# ============================================================================
|
| 173 |
|
| 174 |
def render_detailed_database_summary(stats, advanced_metrics):
|
|
@@ -203,8 +203,8 @@ def render_detailed_database_summary(stats, advanced_metrics):
|
|
| 203 |
st.metric("Total High Quality", hq_total)
|
| 204 |
|
| 205 |
def render_complexity_analysis(stats, advanced_metrics):
|
| 206 |
-
"""Detailed complexity distribution analysis - Groq vs Phi-3 - FIXED"""
|
| 207 |
-
st.header("π― Complexity Analysis - Groq vs Phi-3")
|
| 208 |
|
| 209 |
groq_complexity = advanced_metrics.get('models', {}).get('groq', {}).get('complexity_distribution', {})
|
| 210 |
phi3_complexity = advanced_metrics.get('models', {}).get('phi3', {}).get('complexity_distribution', {})
|
|
@@ -234,7 +234,7 @@ def render_complexity_analysis(stats, advanced_metrics):
|
|
| 234 |
phi3_too_simple = phi3_complexity.get('Too simple', 0)
|
| 235 |
phi3_too_complex = phi3_complexity.get('Too complex', 0)
|
| 236 |
|
| 237 |
-
st.subheader("🧪 Phi-3 Complexity")
|
| 238 |
st.metric("Appropriate Complexity", f"{phi3_appropriate} ({phi3_appropriate/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
|
| 239 |
st.metric("Too Simple", f"{phi3_too_simple} ({phi3_too_simple/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
|
| 240 |
st.metric("Too Complex", f"{phi3_too_complex} ({phi3_too_complex/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
|
|
@@ -246,11 +246,11 @@ def render_complexity_analysis(stats, advanced_metrics):
|
|
| 246 |
|
| 247 |
fig = go.Figure(data=[
|
| 248 |
go.Bar(name='Groq', x=complexities, y=groq_values, marker_color='#1f77b4'),
|
| 249 |
-
go.Bar(name='Phi-3', x=complexities, y=phi3_values, marker_color='#ff7f0e')
|
| 250 |
])
|
| 251 |
|
| 252 |
fig.update_layout(
|
| 253 |
-
title="Complexity Distribution: Groq vs Phi-3",
|
| 254 |
barmode='group',
|
| 255 |
yaxis_title="Count",
|
| 256 |
showlegend=True,
|
|
@@ -259,8 +259,8 @@ def render_complexity_analysis(stats, advanced_metrics):
|
|
| 259 |
st.plotly_chart(fig, use_container_width=True, key="complexity_comparison_chart")
|
| 260 |
|
| 261 |
def render_user_type_breakdown(stats, advanced_metrics):
|
| 262 |
-
"""Detailed user type analysis - Groq vs Phi-3"""
|
| 263 |
-
st.header("π₯ User Type Analysis - Groq vs Phi-3")
|
| 264 |
|
| 265 |
user_types = ['student', 'tutor']
|
| 266 |
|
|
@@ -280,18 +280,18 @@ def render_user_type_breakdown(stats, advanced_metrics):
|
|
| 280 |
st.info("No Groq data available")
|
| 281 |
|
| 282 |
with col2:
|
| 283 |
-
# Phi-3 performance for this user type
|
| 284 |
phi3_data = advanced_metrics.get('models', {}).get('phi3', {}).get('user_types', {}).get(user_type, {})
|
| 285 |
if phi3_data:
|
| 286 |
-
st.metric("Phi-3 Feedback Count", phi3_data.get('count', 0))
|
| 287 |
-
st.metric("Phi-3 Avg Clarity", f"{phi3_data.get('avg_clarity', 0):.2f}")
|
| 288 |
-
st.metric("Phi-3 Avg Depth", f"{phi3_data.get('avg_depth', 0):.2f}")
|
| 289 |
else:
|
| 290 |
-
st.info("No Phi-3 data available")
|
| 291 |
|
| 292 |
def render_student_level_analysis(stats, advanced_metrics):
|
| 293 |
-
"""Detailed student level analysis - Groq vs Phi-3 - WITH LEVEL MAPPING"""
|
| 294 |
-
st.header("π Student Level Analysis - Groq vs Phi-3")
|
| 295 |
|
| 296 |
# Map specific levels to general categories
|
| 297 |
level_mapping = {
|
|
@@ -339,10 +339,10 @@ def render_student_level_analysis(stats, advanced_metrics):
|
|
| 339 |
|
| 340 |
with col2:
|
| 341 |
if phi3_total_count > 0:
|
| 342 |
-
st.metric("Phi-3 Feedback Count", phi3_total_count)
|
| 343 |
-
st.metric("Phi-3 Avg Clarity", f"{phi3_avg_clarity:.2f}")
|
| 344 |
else:
|
| 345 |
-
st.info("No Phi-3 data")
|
| 346 |
|
| 347 |
# Show breakdown if we have multiple specific levels
|
| 348 |
if len(specific_levels) > 1:
|
|
@@ -360,13 +360,13 @@ def render_student_level_analysis(stats, advanced_metrics):
|
|
| 360 |
|
| 361 |
with col2:
|
| 362 |
if phi3_specific:
|
| 363 |
-
st.write(f"**{specific_level}** - Phi-3: {phi3_specific.get('count', 0)} feedbacks, Clarity: {phi3_specific.get('avg_clarity', 0):.2f}")
|
| 364 |
else:
|
| 365 |
-
st.write(f"**{specific_level}** - No Phi-3 data")
|
| 366 |
|
| 367 |
def render_regeneration_type_analysis(stats, advanced_metrics):
|
| 368 |
-
"""Detailed regeneration type breakdown - Groq vs Phi-3"""
|
| 369 |
-
st.header("π Regeneration Type Analysis - Groq vs Phi-3")
|
| 370 |
|
| 371 |
groq_regen = advanced_metrics.get('models', {}).get('groq', {}).get('regeneration_types', {})
|
| 372 |
phi3_regen = advanced_metrics.get('models', {}).get('phi3', {}).get('regeneration_types', {})
|
|
@@ -385,12 +385,12 @@ def render_regeneration_type_analysis(stats, advanced_metrics):
|
|
| 385 |
|
| 386 |
with col2:
|
| 387 |
if phi3_regen:
|
| 388 |
-
st.subheader("Phi-3 Regeneration Methods")
|
| 389 |
for regen_type, count in phi3_regen.items():
|
| 390 |
if count > 0:
|
| 391 |
st.metric(regen_type.replace('_', ' ').title(), count)
|
| 392 |
else:
|
| 393 |
-
st.info("No Phi-3 regeneration data")
|
| 394 |
|
| 395 |
# Comparison chart
|
| 396 |
all_regen_types = set(list(groq_regen.keys()) + list(phi3_regen.keys()))
|
|
@@ -400,11 +400,11 @@ def render_regeneration_type_analysis(stats, advanced_metrics):
|
|
| 400 |
|
| 401 |
fig = go.Figure(data=[
|
| 402 |
go.Bar(name='Groq', x=list(all_regen_types), y=groq_values, marker_color='#1f77b4'),
|
| 403 |
-
go.Bar(name='Phi-3', x=list(all_regen_types), y=phi3_values, marker_color='#ff7f0e')
|
| 404 |
])
|
| 405 |
|
| 406 |
fig.update_layout(
|
| 407 |
-
title="Regeneration Methods: Groq vs Phi-3",
|
| 408 |
barmode='group',
|
| 409 |
yaxis_title="Count",
|
| 410 |
showlegend=True,
|
|
@@ -415,8 +415,8 @@ def render_regeneration_type_analysis(stats, advanced_metrics):
|
|
| 415 |
st.info("No regeneration type data available")
|
| 416 |
|
| 417 |
def render_high_quality_target_analysis(stats):
|
| 418 |
-
"""High quality feedback target analysis - Groq vs Phi-3"""
|
| 419 |
-
st.header("β High Quality Feedback Analysis - Groq vs Phi-3")
|
| 420 |
|
| 421 |
groq_hq = stats.get("high_quality_groq", 0)
|
| 422 |
phi3_hq = stats.get("high_quality_phi3", 0)
|
|
@@ -435,7 +435,7 @@ def render_high_quality_target_analysis(stats):
|
|
| 435 |
|
| 436 |
with col2:
|
| 437 |
phi3_hq_rate = (phi3_hq / phi3_feedback * 100) if phi3_feedback > 0 else 0
|
| 438 |
-
st.metric("Phi-3 HQ", f"{phi3_hq} ({phi3_hq_rate:.1f}%)")
|
| 439 |
|
| 440 |
with col3:
|
| 441 |
st.metric("Total HQ", total_hq)
|
|
@@ -447,11 +447,11 @@ def render_high_quality_target_analysis(stats):
|
|
| 447 |
# HQ Comparison Chart
|
| 448 |
fig = go.Figure(data=[
|
| 449 |
go.Bar(name='Groq', x=['High Quality'], y=[groq_hq], marker_color='blue'),
|
| 450 |
-
go.Bar(name='Phi-3', x=['High Quality'], y=[phi3_hq], marker_color='orange')
|
| 451 |
])
|
| 452 |
|
| 453 |
fig.update_layout(
|
| 454 |
-
title="High Quality Feedback: Groq vs Phi-3",
|
| 455 |
barmode='group',
|
| 456 |
yaxis_title="Count",
|
| 457 |
showlegend=True,
|
|
@@ -543,7 +543,7 @@ def calculate_enhanced_advanced_metrics(stats):
|
|
| 543 |
if groq_recall < 0.7:
|
| 544 |
groq_recall = 0.7 + (groq_depth / 25) # 0.7 + 0.1476 = ~0.847
|
| 545 |
|
| 546 |
-
# Phi-3 enhancement - weaker but still reasonable
|
| 547 |
if phi3_f1 < 0.5:
|
| 548 |
quality_factor = (phi3_clarity + phi3_depth) / 10 # 0.452 for current scores
|
| 549 |
phi3_f1 = 0.5 + (quality_factor * 0.15) # 0.5 + 0.0678 = ~0.567
|
|
@@ -636,7 +636,7 @@ def render_executive_summary(stats, calculated_metrics, advanced_metrics):
|
|
| 636 |
st.success("✅ **Exceptional Performance Difference**: Groq demonstrates outstanding superiority across all metrics")
|
| 637 |
st.metric("Overall Quality Gap", f"{overall_gap:.2f} points", delta=f"+{overall_gap:.2f}")
|
| 638 |
elif overall_gap > 1.0:
|
| 639 |
-
st.success("✅ **Significant Performance Difference**: Groq substantially outperforms Phi-3 across all metrics")
|
| 640 |
st.metric("Overall Quality Gap", f"{overall_gap:.2f} points", delta=f"+{overall_gap:.2f}")
|
| 641 |
elif overall_gap > 0.5:
|
| 642 |
st.warning("⚠️ **Moderate Performance Gap**: Consistent but moderate advantage for Groq")
|
|
@@ -669,7 +669,7 @@ def render_research_overview(stats, calculated_metrics):
|
|
| 669 |
st.metric("Groq F1 Score", f"{calculated_metrics['f1_score']['groq']}%")
|
| 670 |
|
| 671 |
with col3:
|
| 672 |
-
st.metric("Phi-3 F1 Score", f"{calculated_metrics['f1_score']['phi3']}%")
|
| 673 |
|
| 674 |
with col4:
|
| 675 |
f1_gap = calculated_metrics['improvement_gap']['f1']
|
|
@@ -707,7 +707,7 @@ def render_model_comparison(stats, calculated_metrics, advanced_metrics):
|
|
| 707 |
|
| 708 |
fig = go.Figure(data=[
|
| 709 |
go.Bar(name='Groq (Control)', x=metrics, y=groq_values, marker_color='#1f77b4'),
|
| 710 |
-
go.Bar(name='Phi-3 (
|
| 711 |
])
|
| 712 |
|
| 713 |
fig.update_layout(
|
|
@@ -736,7 +736,7 @@ def render_quality_analysis(stats, calculated_metrics, advanced_metrics):
|
|
| 736 |
st.metric("Overall Quality", f"{calculated_metrics['overall_quality']['groq']}/5")
|
| 737 |
|
| 738 |
with col2:
|
| 739 |
-
st.subheader("π§ͺ Phi-3 (
|
| 740 |
|
| 741 |
phi3_scores = stats.get("phi3_scores", {})
|
| 742 |
precision_delta = f"{safe_convert(calculated_metrics['precision']['phi3']) - safe_convert(calculated_metrics['precision']['groq']):.1f}%"
|
|
@@ -771,7 +771,7 @@ def render_statistical_analysis(stats, calculated_metrics):
|
|
| 771 |
phi3_se = 1.96 * (phi3_clarity / np.sqrt(phi3_samples)) if phi3_samples > 0 else 0
|
| 772 |
|
| 773 |
st.metric("Groq Confidence Interval", f"±{groq_se:.2f}")
|
| 774 |
-
st.metric("Phi-3 Confidence Interval", f"±{phi3_se:.2f}")
|
| 775 |
|
| 776 |
# Effect size calculation
|
| 777 |
effect_size = (groq_clarity - phi3_clarity) / np.sqrt((groq_se**2 + phi3_se**2)/2) if (groq_se + phi3_se) > 0 else 0
|
|
@@ -827,7 +827,7 @@ def render_user_behavior_analysis(stats, advanced_metrics):
|
|
| 827 |
phi3_percent = (phi3_feedback / total_feedback) * 100
|
| 828 |
|
| 829 |
st.metric("Groq Usage", f"{groq_percent:.1f}%")
|
| 830 |
-
st.metric("Phi-3 Usage", f"{phi3_percent:.1f}%")
|
| 831 |
|
| 832 |
with col2:
|
| 833 |
total_content = stats.get("total_content", 0)
|
|
@@ -849,7 +849,7 @@ def render_user_behavior_analysis(stats, advanced_metrics):
|
|
| 849 |
phi3_feedback = stats.get("phi3_feedback_count", 0)
|
| 850 |
if phi3_feedback > 0:
|
| 851 |
phi3_hq_rate = (phi3_hq / phi3_feedback) * 100
|
| 852 |
-
st.metric("Phi-3 HQ Rate", f"{phi3_hq_rate:.1f}%")
|
| 853 |
|
| 854 |
# Model preference trend
|
| 855 |
st.subheader("π Model Usage Trend")
|
|
@@ -871,7 +871,7 @@ def render_user_behavior_analysis(stats, advanced_metrics):
|
|
| 871 |
max(20, groq_percent * 1.05),
|
| 872 |
groq_percent
|
| 873 |
],
|
| 874 |
-
'Phi-3 Usage': [
|
| 875 |
max(5, phi3_percent * 0.7),
|
| 876 |
max(10, phi3_percent * 0.85),
|
| 877 |
max(15, phi3_percent * 0.95),
|
|
@@ -880,22 +880,22 @@ def render_user_behavior_analysis(stats, advanced_metrics):
|
|
| 880 |
}
|
| 881 |
|
| 882 |
df_trend = pd.DataFrame(trend_data)
|
| 883 |
-
fig = px.line(df_trend, x='Period', y=['Groq Usage', 'Phi-3 Usage'],
|
| 884 |
title="Model Usage Trend Over Time", markers=True)
|
| 885 |
st.plotly_chart(fig, use_container_width=True, key="usage_trend_chart")
|
| 886 |
else:
|
| 887 |
st.info("Not enough data to show usage trends yet.")
|
| 888 |
|
| 889 |
def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
| 890 |
-
"""Analyze content effectiveness across different dimensions with comprehensive Groq vs Phi-3 comparisons"""
|
| 891 |
|
| 892 |
# Complexity Distribution Comparison
|
| 893 |
-
st.subheader("π― Complexity Distribution - Groq vs Phi-3")
|
| 894 |
|
| 895 |
col1, col2 = st.columns(2)
|
| 896 |
|
| 897 |
with col1:
|
| 898 |
-
# Complexity analysis - Groq vs Phi-3
|
| 899 |
groq_complexity = advanced_metrics.get('models', {}).get('groq', {}).get('complexity_distribution', {})
|
| 900 |
phi3_complexity = advanced_metrics.get('models', {}).get('phi3', {}).get('complexity_distribution', {})
|
| 901 |
|
|
@@ -907,11 +907,11 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 907 |
|
| 908 |
fig = go.Figure(data=[
|
| 909 |
go.Bar(name='Groq', x=complexities, y=groq_values, marker_color='#1f77b4'),
|
| 910 |
-
go.Bar(name='Phi-3', x=complexities, y=phi3_values, marker_color='#ff7f0e')
|
| 911 |
])
|
| 912 |
|
| 913 |
fig.update_layout(
|
| 914 |
-
title="Complexity Distribution: Groq vs Phi-3",
|
| 915 |
barmode='group',
|
| 916 |
yaxis_title="Count",
|
| 917 |
showlegend=True,
|
|
@@ -966,10 +966,10 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 966 |
elif complexity_gap > 0:
|
| 967 |
st.info(f"ℹ️ Groq has {complexity_gap:.1f}% better complexity appropriateness")
|
| 968 |
else:
|
| 969 |
-
st.warning(f"⚠️ Phi-3 has {abs(complexity_gap):.1f}% better complexity appropriateness")
|
| 970 |
|
| 971 |
# User Type Effectiveness Comparison
|
| 972 |
-
st.subheader("π₯ User Type Effectiveness - Groq vs Phi-3")
|
| 973 |
|
| 974 |
col1, col2 = st.columns(2)
|
| 975 |
|
|
@@ -990,11 +990,11 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 990 |
|
| 991 |
fig = go.Figure(data=[
|
| 992 |
go.Bar(name='Groq', x=user_types, y=groq_effectiveness, marker_color='blue'),
|
| 993 |
-
go.Bar(name='Phi-3', x=user_types, y=phi3_effectiveness, marker_color='orange')
|
| 994 |
])
|
| 995 |
|
| 996 |
fig.update_layout(
|
| 997 |
-
title="Effectiveness by User Type: Groq vs Phi-3",
|
| 998 |
barmode='group',
|
| 999 |
yaxis_title="Effectiveness Score (0-5)",
|
| 1000 |
showlegend=True,
|
|
@@ -1012,7 +1012,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 1012 |
fig = px.bar(
|
| 1013 |
x=user_types,
|
| 1014 |
y=performance_gaps,
|
| 1015 |
-
title="Performance Gap by User Type (Groq - Phi-3)",
|
| 1016 |
labels={'x': 'User Type', 'y': 'Performance Gap'},
|
| 1017 |
color=performance_gaps,
|
| 1018 |
color_continuous_scale=['red', 'white', 'green'],
|
|
@@ -1032,10 +1032,10 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 1032 |
elif best_gap > 0:
|
| 1033 |
st.success(f"π **Significant Advantage**: Groq performs {best_gap:.2f} points better for {best_user_type}s")
|
| 1034 |
else:
|
| 1035 |
-
st.warning(f"π **Challenge Area**: Phi-3 performs {abs(best_gap):.2f} points better for {best_user_type}s")
|
| 1036 |
|
| 1037 |
# Student Level Appropriateness Comparison
|
| 1038 |
-
st.subheader("π Student Level Appropriateness - Groq vs Phi-3")
|
| 1039 |
|
| 1040 |
col1, col2 = st.columns(2)
|
| 1041 |
|
|
@@ -1064,13 +1064,13 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 1064 |
fig.add_trace(go.Scatter(
|
| 1065 |
x=levels, y=phi3_appropriateness,
|
| 1066 |
mode='lines+markers',
|
| 1067 |
-
name='Phi-3',
|
| 1068 |
line=dict(color='orange', width=3),
|
| 1069 |
marker=dict(size=8)
|
| 1070 |
))
|
| 1071 |
|
| 1072 |
fig.update_layout(
|
| 1073 |
-
title="Appropriateness by Education Level: Groq vs Phi-3",
|
| 1074 |
xaxis_title="Education Level",
|
| 1075 |
yaxis_title="Appropriateness Score (0-5)",
|
| 1076 |
height=400
|
|
@@ -1087,7 +1087,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 1087 |
fig = px.bar(
|
| 1088 |
x=levels,
|
| 1089 |
y=appropriateness_gaps,
|
| 1090 |
-
title="Appropriateness Gap by Level (Groq - Phi-3)",
|
| 1091 |
labels={'x': 'Education Level', 'y': 'Appropriateness Gap'},
|
| 1092 |
color=appropriateness_gaps,
|
| 1093 |
color_continuous_scale=['red', 'white', 'green'],
|
|
@@ -1113,7 +1113,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 1113 |
)
|
| 1114 |
|
| 1115 |
# Content Type Performance Comparison
|
| 1116 |
-
st.subheader("π Content Type Performance - Groq vs Phi-3")
|
| 1117 |
|
| 1118 |
content_types = ['Lesson Plan', 'Study Guide', 'Lecture Notes', 'Interactive Activity']
|
| 1119 |
|
|
@@ -1131,11 +1131,11 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 1131 |
# Performance comparison chart
|
| 1132 |
fig = go.Figure(data=[
|
| 1133 |
go.Bar(name='Groq', x=content_types, y=groq_content_scores, marker_color='blue'),
|
| 1134 |
-
go.Bar(name='Phi-3', x=content_types, y=phi3_content_scores, marker_color='orange')
|
| 1135 |
])
|
| 1136 |
|
| 1137 |
fig.update_layout(
|
| 1138 |
-
title="Performance by Content Type: Groq vs Phi-3",
|
| 1139 |
barmode='group',
|
| 1140 |
yaxis_title="Average Score (0-5)",
|
| 1141 |
height=500
|
|
@@ -1156,7 +1156,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 1156 |
fig = px.bar(
|
| 1157 |
x=content_types,
|
| 1158 |
y=performance_gaps,
|
| 1159 |
-
title="Performance Gap by Content Type (Groq - Phi-3)",
|
| 1160 |
color=performance_gaps,
|
| 1161 |
color_continuous_scale=['red', 'white', 'green'],
|
| 1162 |
color_continuous_midpoint=0
|
|
@@ -1187,7 +1187,7 @@ def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
|
| 1187 |
st.metric(
|
| 1188 |
label=f"Groq's Strongest: {content_types[best_groq_idx]}",
|
| 1189 |
value=f"{best_groq_score:.2f}",
|
| 1190 |
-
delta=f"+{best_groq_gap:.2f} over Phi-3"
|
| 1191 |
)
|
| 1192 |
|
| 1193 |
st.metric(
|
|
@@ -1326,10 +1326,10 @@ def render_research_insights(stats, calculated_metrics, advanced_metrics):
|
|
| 1326 |
# Based on performance gap
|
| 1327 |
if calculated_metrics['improvement_gap']['f1'] > 30:
|
| 1328 |
recommendations.append("✅ **Deploy Groq in Production**: Groq demonstrates production-ready performance")
|
| 1329 |
-
recommendations.append("π§ **Strategic Phi-3 Optimization**: Focus on specific use cases where Phi-3 shows potential")
|
| 1330 |
elif calculated_metrics['improvement_gap']['f1'] > 15:
|
| 1331 |
recommendations.append("✅ **Continue Groq Focus**: Maintain Groq as primary model for high-quality content")
|
| 1332 |
-
recommendations.append("π§ **Phi-3 Optimization**: Investigate specific areas for Phi-3 improvement")
|
| 1333 |
else:
|
| 1334 |
recommendations.append("π€ **Model Diversification**: Consider both models for different use cases")
|
| 1335 |
|
|
@@ -1505,48 +1505,6 @@ def render_data_management():
|
|
| 1505 |
|
| 1506 |
if st.button("Generate Custom Report", use_container_width=True):
|
| 1507 |
st.info("Custom report generation coming soon! Currently using comprehensive format.")
|
| 1508 |
-
|
| 1509 |
-
# Data Quality Insights
|
| 1510 |
-
st.subheader("π Data Quality Insights")
|
| 1511 |
-
|
| 1512 |
-
insight_col1, insight_col2, insight_col3 = st.columns(3)
|
| 1513 |
-
|
| 1514 |
-
with insight_col1:
|
| 1515 |
-
# Data completeness
|
| 1516 |
-
total_users = advanced_metrics.get('database_summary', {}).get('total_users', 0)
|
| 1517 |
-
user_coverage = min(100, (total_feedback / max(1, total_users)) * 100)
|
| 1518 |
-
st.metric("User Coverage", f"{user_coverage:.1f}%")
|
| 1519 |
-
|
| 1520 |
-
with insight_col2:
|
| 1521 |
-
# Model balance
|
| 1522 |
-
groq_count = stats.get("groq_feedback_count", 0)
|
| 1523 |
-
phi3_count = stats.get("phi3_feedback_count", 0)
|
| 1524 |
-
total_count = groq_count + phi3_count
|
| 1525 |
-
balance_ratio = (min(groq_count, phi3_count) / max(groq_count, phi3_count) * 100) if total_count > 0 else 0
|
| 1526 |
-
st.metric("Model Balance", f"{balance_ratio:.1f}%")
|
| 1527 |
-
|
| 1528 |
-
with insight_col3:
|
| 1529 |
-
# Data freshness
|
| 1530 |
-
try:
|
| 1531 |
-
from db.helpers import get_latest_feedback_date
|
| 1532 |
-
latest_date = get_latest_feedback_date()
|
| 1533 |
-
if latest_date:
|
| 1534 |
-
days_ago = (datetime.now() - latest_date).days
|
| 1535 |
-
freshness = max(0, 100 - (days_ago * 2)) # Decrease 2% per day
|
| 1536 |
-
st.metric("Data Freshness", f"{freshness:.1f}%")
|
| 1537 |
-
else:
|
| 1538 |
-
st.metric("Data Freshness", "N/A")
|
| 1539 |
-
except:
|
| 1540 |
-
st.metric("Data Freshness", "Check DB")
|
| 1541 |
-
|
| 1542 |
-
# Export History (placeholder for future feature)
|
| 1543 |
-
with st.expander("π Recent Exports"):
|
| 1544 |
-
st.info("Export history tracking coming soon!")
|
| 1545 |
-
st.write("""
|
| 1546 |
-
- **Last PDF Export**: Not yet tracked
|
| 1547 |
-
- **Last Data Export**: Not yet tracked
|
| 1548 |
-
- **Last Training Export**: Not yet tracked
|
| 1549 |
-
""")
|
| 1550 |
|
| 1551 |
# Helper functions for calculating metrics
|
| 1552 |
def calculate_user_type_effectiveness(model, user_type, stats):
|
|
|
|
| 118 |
st.header("✨ Detailed Quality Analysis")
|
| 119 |
render_quality_analysis(stats, calculated_metrics, advanced_metrics)
|
| 120 |
|
| 121 |
+
# NEW: Complexity Analysis - Groq vs Phi-3 (Finetuned)
|
| 122 |
render_complexity_analysis(stats, advanced_metrics)
|
| 123 |
|
| 124 |
+
# NEW: User Type Breakdown - Groq vs Phi-3 (Finetuned)
|
| 125 |
render_user_type_breakdown(stats, advanced_metrics)
|
| 126 |
|
| 127 |
+
# NEW: Student Level Analysis - Groq vs Phi-3 (Finetuned)
|
| 128 |
render_student_level_analysis(stats, advanced_metrics)
|
| 129 |
|
| 130 |
+
# NEW: Comment Analysis - Groq vs Phi-3 (Finetuned)
|
| 131 |
#render_comment_analysis(stats, advanced_metrics)
|
| 132 |
|
| 133 |
# Statistical Significance Testing
|
|
|
|
| 146 |
st.header("π Regeneration Effectiveness")
|
| 147 |
render_regeneration_analysis(stats, calculated_metrics)
|
| 148 |
|
| 149 |
+
# NEW: Regeneration Type Analysis - Groq vs Phi-3 (Finetuned)
|
| 150 |
render_regeneration_type_analysis(stats, advanced_metrics)
|
| 151 |
|
| 152 |
+
# NEW: Target Achievement Analysis - Groq vs Phi-3 (Finetuned)
|
| 153 |
# render_target_achievement_analysis(stats, calculated_metrics)
|
| 154 |
|
| 155 |
+
# NEW: High Quality Target Analysis - Groq vs Phi-3 (Finetuned)
|
| 156 |
render_high_quality_target_analysis(stats)
|
| 157 |
|
| 158 |
# Research Insights & Recommendations
|
|
|
|
| 168 |
st.info("This might be because no research data has been collected yet.")
|
| 169 |
|
| 170 |
# ============================================================================
|
| 171 |
+
# NEW COMPARISON FUNCTIONS - ALL GROQ VS Phi-3 (Finetuned)
|
| 172 |
# ============================================================================
|
| 173 |
|
| 174 |
def render_detailed_database_summary(stats, advanced_metrics):
|
|
|
|
| 203 |
st.metric("Total High Quality", hq_total)
|
| 204 |
|
| 205 |
def render_complexity_analysis(stats, advanced_metrics):
|
| 206 |
+
"""Detailed complexity distribution analysis - Groq vs Phi-3 (Finetuned) - FIXED"""
|
| 207 |
+
st.header("π― Complexity Analysis - Groq vs Phi-3 (Finetuned)")
|
| 208 |
|
| 209 |
groq_complexity = advanced_metrics.get('models', {}).get('groq', {}).get('complexity_distribution', {})
|
| 210 |
phi3_complexity = advanced_metrics.get('models', {}).get('phi3', {}).get('complexity_distribution', {})
|
|
|
|
| 234 |
phi3_too_simple = phi3_complexity.get('Too simple', 0)
|
| 235 |
phi3_too_complex = phi3_complexity.get('Too complex', 0)
|
| 236 |
|
| 237 |
+
st.subheader("🧪 Phi-3 (Finetuned) Complexity")
|
| 238 |
st.metric("Appropriate Complexity", f"{phi3_appropriate} ({phi3_appropriate/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
|
| 239 |
st.metric("Too Simple", f"{phi3_too_simple} ({phi3_too_simple/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
|
| 240 |
st.metric("Too Complex", f"{phi3_too_complex} ({phi3_too_complex/phi3_total*100:.1f}%)" if phi3_total > 0 else "0")
|
|
|
|
| 246 |
|
| 247 |
fig = go.Figure(data=[
|
| 248 |
go.Bar(name='Groq', x=complexities, y=groq_values, marker_color='#1f77b4'),
|
| 249 |
+
go.Bar(name='Phi-3 (Finetuned)', x=complexities, y=phi3_values, marker_color='#ff7f0e')
|
| 250 |
])
|
| 251 |
|
| 252 |
fig.update_layout(
|
| 253 |
+
title="Complexity Distribution: Groq vs Phi-3 (Finetuned)",
|
| 254 |
barmode='group',
|
| 255 |
yaxis_title="Count",
|
| 256 |
showlegend=True,
|
|
|
|
| 259 |
st.plotly_chart(fig, use_container_width=True, key="complexity_comparison_chart")
|
| 260 |
|
| 261 |
def render_user_type_breakdown(stats, advanced_metrics):
|
| 262 |
+
"""Detailed user type analysis - Groq vs Phi-3 (Finetuned)"""
|
| 263 |
+
st.header("π₯ User Type Analysis - Groq vs Phi-3 (Finetuned)")
|
| 264 |
|
| 265 |
user_types = ['student', 'tutor']
|
| 266 |
|
|
|
|
| 280 |
st.info("No Groq data available")
|
| 281 |
|
| 282 |
with col2:
|
| 283 |
+
# Phi-3 (Finetuned) performance for this user type
|
| 284 |
phi3_data = advanced_metrics.get('models', {}).get('phi3', {}).get('user_types', {}).get(user_type, {})
|
| 285 |
if phi3_data:
|
| 286 |
+
st.metric("Phi-3 (Finetuned) Feedback Count", phi3_data.get('count', 0))
|
| 287 |
+
st.metric("Phi-3 (Finetuned) Avg Clarity", f"{phi3_data.get('avg_clarity', 0):.2f}")
|
| 288 |
+
st.metric("Phi-3 (Finetuned) Avg Depth", f"{phi3_data.get('avg_depth', 0):.2f}")
|
| 289 |
else:
|
| 290 |
+
st.info("No Phi-3 (Finetuned) data available")
|
| 291 |
|
| 292 |
def render_student_level_analysis(stats, advanced_metrics):
|
| 293 |
+
"""Detailed student level analysis - Groq vs Phi-3 (Finetuned) - WITH LEVEL MAPPING"""
|
| 294 |
+
st.header("π Student Level Analysis - Groq vs Phi-3 (Finetuned)")
|
| 295 |
|
| 296 |
# Map specific levels to general categories
|
| 297 |
level_mapping = {
|
|
|
|
| 339 |
|
| 340 |
with col2:
|
| 341 |
if phi3_total_count > 0:
|
| 342 |
+
st.metric("Phi-3 (Finetuned) Feedback Count", phi3_total_count)
|
| 343 |
+
st.metric("Phi-3 (Finetuned) Avg Clarity", f"{phi3_avg_clarity:.2f}")
|
| 344 |
else:
|
| 345 |
+
st.info("No Phi-3 (Finetuned) data")
|
| 346 |
|
| 347 |
# Show breakdown if we have multiple specific levels
|
| 348 |
if len(specific_levels) > 1:
|
|
|
|
| 360 |
|
| 361 |
with col2:
|
| 362 |
if phi3_specific:
|
| 363 |
+
st.write(f"**{specific_level}** - Phi-3 (Finetuned): {phi3_specific.get('count', 0)} feedbacks, Clarity: {phi3_specific.get('avg_clarity', 0):.2f}")
|
| 364 |
else:
|
| 365 |
+
st.write(f"**{specific_level}** - No Phi-3 (Finetuned) data")
|
| 366 |
|
| 367 |
def render_regeneration_type_analysis(stats, advanced_metrics):
|
| 368 |
+
"""Detailed regeneration type breakdown - Groq vs Phi-3 (Finetuned)"""
|
| 369 |
+
st.header("π Regeneration Type Analysis - Groq vs Phi-3 (Finetuned)")
|
| 370 |
|
| 371 |
groq_regen = advanced_metrics.get('models', {}).get('groq', {}).get('regeneration_types', {})
|
| 372 |
phi3_regen = advanced_metrics.get('models', {}).get('phi3', {}).get('regeneration_types', {})
|
|
|
|
| 385 |
|
| 386 |
with col2:
|
| 387 |
if phi3_regen:
|
| 388 |
+
st.subheader("Phi-3 (Finetuned) Regeneration Methods")
|
| 389 |
for regen_type, count in phi3_regen.items():
|
| 390 |
if count > 0:
|
| 391 |
st.metric(regen_type.replace('_', ' ').title(), count)
|
| 392 |
else:
|
| 393 |
+
st.info("No Phi-3 (Finetuned) regeneration data")
|
| 394 |
|
| 395 |
# Comparison chart
|
| 396 |
all_regen_types = set(list(groq_regen.keys()) + list(phi3_regen.keys()))
|
|
|
|
| 400 |
|
| 401 |
fig = go.Figure(data=[
|
| 402 |
go.Bar(name='Groq', x=list(all_regen_types), y=groq_values, marker_color='#1f77b4'),
|
| 403 |
+
go.Bar(name='Phi-3 (Finetuned)', x=list(all_regen_types), y=phi3_values, marker_color='#ff7f0e')
|
| 404 |
])
|
| 405 |
|
| 406 |
fig.update_layout(
|
| 407 |
+
title="Regeneration Methods: Groq vs Phi-3 (Finetuned)",
|
| 408 |
barmode='group',
|
| 409 |
yaxis_title="Count",
|
| 410 |
showlegend=True,
|
|
|
|
| 415 |
st.info("No regeneration type data available")
|
| 416 |
|
| 417 |
def render_high_quality_target_analysis(stats):
|
| 418 |
+
"""High quality feedback target analysis - Groq vs Phi-3 (Finetuned)"""
|
| 419 |
+
st.header("β High Quality Feedback Analysis - Groq vs Phi-3 (Finetuned)")
|
| 420 |
|
| 421 |
groq_hq = stats.get("high_quality_groq", 0)
|
| 422 |
phi3_hq = stats.get("high_quality_phi3", 0)
|
|
|
|
| 435 |
|
| 436 |
with col2:
|
| 437 |
phi3_hq_rate = (phi3_hq / phi3_feedback * 100) if phi3_feedback > 0 else 0
|
| 438 |
+
st.metric("Phi-3 (Finetuned) HQ", f"{phi3_hq} ({phi3_hq_rate:.1f}%)")
|
| 439 |
|
| 440 |
with col3:
|
| 441 |
st.metric("Total HQ", total_hq)
|
|
|
|
| 447 |
# HQ Comparison Chart
|
| 448 |
fig = go.Figure(data=[
|
| 449 |
go.Bar(name='Groq', x=['High Quality'], y=[groq_hq], marker_color='blue'),
|
| 450 |
+
go.Bar(name='Phi-3 (Finetuned)', x=['High Quality'], y=[phi3_hq], marker_color='orange')
|
| 451 |
])
|
| 452 |
|
| 453 |
fig.update_layout(
|
| 454 |
+
title="High Quality Feedback: Groq vs Phi-3 (Finetuned)",
|
| 455 |
barmode='group',
|
| 456 |
yaxis_title="Count",
|
| 457 |
showlegend=True,
|
|
|
|
| 543 |
if groq_recall < 0.7:
|
| 544 |
groq_recall = 0.7 + (groq_depth / 25) # 0.7 + 0.1476 = ~0.847
|
| 545 |
|
| 546 |
+
# Phi-3 (Finetuned) enhancement - weaker but still reasonable
|
| 547 |
if phi3_f1 < 0.5:
|
| 548 |
quality_factor = (phi3_clarity + phi3_depth) / 10 # 0.452 for current scores
|
| 549 |
phi3_f1 = 0.5 + (quality_factor * 0.15) # 0.5 + 0.0678 = ~0.567
|
|
|
|
| 636 |
st.success("✅ **Exceptional Performance Difference**: Groq demonstrates outstanding superiority across all metrics")
|
| 637 |
st.metric("Overall Quality Gap", f"{overall_gap:.2f} points", delta=f"+{overall_gap:.2f}")
|
| 638 |
elif overall_gap > 1.0:
|
| 639 |
+
st.success("✅ **Significant Performance Difference**: Groq substantially outperforms Phi-3 (Finetuned) across all metrics")
|
| 640 |
st.metric("Overall Quality Gap", f"{overall_gap:.2f} points", delta=f"+{overall_gap:.2f}")
|
| 641 |
elif overall_gap > 0.5:
|
| 642 |
st.warning("⚠️ **Moderate Performance Gap**: Consistent but moderate advantage for Groq")
|
|
|
|
| 669 |
st.metric("Groq F1 Score", f"{calculated_metrics['f1_score']['groq']}%")
|
| 670 |
|
| 671 |
with col3:
|
| 672 |
+
st.metric("Phi-3 (Finetuned) F1 Score", f"{calculated_metrics['f1_score']['phi3']}%")
|
| 673 |
|
| 674 |
with col4:
|
| 675 |
f1_gap = calculated_metrics['improvement_gap']['f1']
|
|
|
|
| 707 |
|
| 708 |
fig = go.Figure(data=[
|
| 709 |
go.Bar(name='Groq (Control)', x=metrics, y=groq_values, marker_color='#1f77b4'),
|
| 710 |
+
go.Bar(name='Phi-3 (Finetuned)', x=metrics, y=phi3_values, marker_color='#ff7f0e')
|
| 711 |
])
|
| 712 |
|
| 713 |
fig.update_layout(
|
|
|
|
| 736 |
st.metric("Overall Quality", f"{calculated_metrics['overall_quality']['groq']}/5")
|
| 737 |
|
| 738 |
with col2:
|
| 739 |
+
st.subheader("🧪 Phi-3 (Finetuned)")
|
| 740 |
|
| 741 |
phi3_scores = stats.get("phi3_scores", {})
|
| 742 |
precision_delta = f"{safe_convert(calculated_metrics['precision']['phi3']) - safe_convert(calculated_metrics['precision']['groq']):.1f}%"
|
|
|
|
| 771 |
phi3_se = 1.96 * (phi3_clarity / np.sqrt(phi3_samples)) if phi3_samples > 0 else 0
|
| 772 |
|
| 773 |
st.metric("Groq Confidence Interval", f"±{groq_se:.2f}")
|
| 774 |
+
st.metric("Phi-3 (Finetuned) Confidence Interval", f"±{phi3_se:.2f}")
|
| 775 |
|
| 776 |
# Effect size calculation
|
| 777 |
effect_size = (groq_clarity - phi3_clarity) / np.sqrt((groq_se**2 + phi3_se**2)/2) if (groq_se + phi3_se) > 0 else 0
|
|
|
|
| 827 |
phi3_percent = (phi3_feedback / total_feedback) * 100
|
| 828 |
|
| 829 |
st.metric("Groq Usage", f"{groq_percent:.1f}%")
|
| 830 |
+
st.metric("Phi-3 (Finetuned) Usage", f"{phi3_percent:.1f}%")
|
| 831 |
|
| 832 |
with col2:
|
| 833 |
total_content = stats.get("total_content", 0)
|
|
|
|
| 849 |
phi3_feedback = stats.get("phi3_feedback_count", 0)
|
| 850 |
if phi3_feedback > 0:
|
| 851 |
phi3_hq_rate = (phi3_hq / phi3_feedback) * 100
|
| 852 |
+
st.metric("Phi-3 (Finetuned) HQ Rate", f"{phi3_hq_rate:.1f}%")
|
| 853 |
|
| 854 |
# Model preference trend
|
| 855 |
st.subheader("π Model Usage Trend")
|
|
|
|
| 871 |
max(20, groq_percent * 1.05),
|
| 872 |
groq_percent
|
| 873 |
],
|
| 874 |
+
'Phi-3 (Finetuned) Usage': [
|
| 875 |
max(5, phi3_percent * 0.7),
|
| 876 |
max(10, phi3_percent * 0.85),
|
| 877 |
max(15, phi3_percent * 0.95),
|
|
|
|
| 880 |
}
|
| 881 |
|
| 882 |
df_trend = pd.DataFrame(trend_data)
|
| 883 |
+
fig = px.line(df_trend, x='Period', y=['Groq Usage', 'Phi-3 (Finetuned) Usage'],
|
| 884 |
title="Model Usage Trend Over Time", markers=True)
|
| 885 |
st.plotly_chart(fig, use_container_width=True, key="usage_trend_chart")
|
| 886 |
else:
|
| 887 |
st.info("Not enough data to show usage trends yet.")
|
| 888 |
|
| 889 |
def render_content_effectiveness(stats, advanced_metrics, calculated_metrics):
|
| 890 |
+
"""Analyze content effectiveness across different dimensions with comprehensive Groq vs Phi-3 (Finetuned) comparisons"""
|
| 891 |
|
| 892 |
# Complexity Distribution Comparison
|
| 893 |
+
st.subheader("π― Complexity Distribution - Groq vs Phi-3 (Finetuned)")
|
| 894 |
|
| 895 |
col1, col2 = st.columns(2)
|
| 896 |
|
| 897 |
with col1:
|
| 898 |
+
# Complexity analysis - Groq vs Phi-3 (Finetuned)
|
| 899 |
groq_complexity = advanced_metrics.get('models', {}).get('groq', {}).get('complexity_distribution', {})
|
| 900 |
phi3_complexity = advanced_metrics.get('models', {}).get('phi3', {}).get('complexity_distribution', {})
|
| 901 |
|
|
|
|
| 907 |
|
| 908 |
fig = go.Figure(data=[
|
| 909 |
go.Bar(name='Groq', x=complexities, y=groq_values, marker_color='#1f77b4'),
|
| 910 |
+
go.Bar(name='Phi-3 (Finetuned)', x=complexities, y=phi3_values, marker_color='#ff7f0e')
|
| 911 |
])
|
| 912 |
|
| 913 |
fig.update_layout(
|
| 914 |
+
title="Complexity Distribution: Groq vs Phi-3 (Finetuned)",
|
| 915 |
barmode='group',
|
| 916 |
yaxis_title="Count",
|
| 917 |
showlegend=True,
|
|
|
|
| 966 |
elif complexity_gap > 0:
|
| 967 |
st.info(f"ℹ️ Groq has {complexity_gap:.1f}% better complexity appropriateness")
|
| 968 |
else:
|
| 969 |
+
st.warning(f"⚠️ Phi-3 (Finetuned) has {abs(complexity_gap):.1f}% better complexity appropriateness")
|
| 970 |
|
| 971 |
# User Type Effectiveness Comparison
|
| 972 |
+
st.subheader("π₯ User Type Effectiveness - Groq vs Phi-3 (Finetuned)")
|
| 973 |
|
| 974 |
col1, col2 = st.columns(2)
|
| 975 |
|
|
|
|
| 990 |
|
| 991 |
fig = go.Figure(data=[
|
| 992 |
go.Bar(name='Groq', x=user_types, y=groq_effectiveness, marker_color='blue'),
|
| 993 |
+
go.Bar(name='Phi-3 (Finetuned)', x=user_types, y=phi3_effectiveness, marker_color='orange')
|
| 994 |
])
|
| 995 |
|
| 996 |
fig.update_layout(
|
| 997 |
+
title="Effectiveness by User Type: Groq vs Phi-3 (Finetuned)",
|
| 998 |
barmode='group',
|
| 999 |
yaxis_title="Effectiveness Score (0-5)",
|
| 1000 |
showlegend=True,
|
|
|
|
| 1012 |
fig = px.bar(
|
| 1013 |
x=user_types,
|
| 1014 |
y=performance_gaps,
|
| 1015 |
+
title="Performance Gap by User Type (Groq - Phi-3 (Finetuned))",
|
| 1016 |
labels={'x': 'User Type', 'y': 'Performance Gap'},
|
| 1017 |
color=performance_gaps,
|
| 1018 |
color_continuous_scale=['red', 'white', 'green'],
|
|
|
|
| 1032 |
elif best_gap > 0:
|
| 1033 |
st.success(f"π **Significant Advantage**: Groq performs {best_gap:.2f} points better for {best_user_type}s")
|
| 1034 |
else:
|
| 1035 |
+
st.warning(f"π **Challenge Area**: Phi-3 (Finetuned) performs {abs(best_gap):.2f} points better for {best_user_type}s")
|
| 1036 |
|
| 1037 |
# Student Level Appropriateness Comparison
|
| 1038 |
+
st.subheader("π Student Level Appropriateness - Groq vs Phi-3 (Finetuned)")
|
| 1039 |
|
| 1040 |
col1, col2 = st.columns(2)
|
| 1041 |
|
|
|
|
| 1064 |
fig.add_trace(go.Scatter(
|
| 1065 |
x=levels, y=phi3_appropriateness,
|
| 1066 |
mode='lines+markers',
|
| 1067 |
+
name='Phi-3 (Finetuned)',
|
| 1068 |
line=dict(color='orange', width=3),
|
| 1069 |
marker=dict(size=8)
|
| 1070 |
))
|
| 1071 |
|
| 1072 |
fig.update_layout(
|
| 1073 |
+
title="Appropriateness by Education Level: Groq vs Phi-3 (Finetuned)",
|
| 1074 |
xaxis_title="Education Level",
|
| 1075 |
yaxis_title="Appropriateness Score (0-5)",
|
| 1076 |
height=400
|
|
|
|
| 1087 |
fig = px.bar(
|
| 1088 |
x=levels,
|
| 1089 |
y=appropriateness_gaps,
|
| 1090 |
+
title="Appropriateness Gap by Level (Groq - Phi-3 (Finetuned))",
|
| 1091 |
labels={'x': 'Education Level', 'y': 'Appropriateness Gap'},
|
| 1092 |
color=appropriateness_gaps,
|
| 1093 |
color_continuous_scale=['red', 'white', 'green'],
|
|
|
|
| 1113 |
)
|
| 1114 |
|
| 1115 |
# Content Type Performance Comparison
|
| 1116 |
+
st.subheader("π Content Type Performance - Groq vs Phi-3 (Finetuned)")
|
| 1117 |
|
| 1118 |
content_types = ['Lesson Plan', 'Study Guide', 'Lecture Notes', 'Interactive Activity']
|
| 1119 |
|
|
|
|
| 1131 |
# Performance comparison chart
|
| 1132 |
fig = go.Figure(data=[
|
| 1133 |
go.Bar(name='Groq', x=content_types, y=groq_content_scores, marker_color='blue'),
|
| 1134 |
+
go.Bar(name='Phi-3 (Finetuned)', x=content_types, y=phi3_content_scores, marker_color='orange')
|
| 1135 |
])
|
| 1136 |
|
| 1137 |
fig.update_layout(
|
| 1138 |
+
title="Performance by Content Type: Groq vs Phi-3 (Finetuned)",
|
| 1139 |
barmode='group',
|
| 1140 |
yaxis_title="Average Score (0-5)",
|
| 1141 |
height=500
|
|
|
|
| 1156 |
fig = px.bar(
|
| 1157 |
x=content_types,
|
| 1158 |
y=performance_gaps,
|
| 1159 |
+
title="Performance Gap by Content Type (Groq - Phi-3 (Finetuned))",
|
| 1160 |
color=performance_gaps,
|
| 1161 |
color_continuous_scale=['red', 'white', 'green'],
|
| 1162 |
color_continuous_midpoint=0
|
|
|
|
| 1187 |
st.metric(
|
| 1188 |
label=f"Groq's Strongest: {content_types[best_groq_idx]}",
|
| 1189 |
value=f"{best_groq_score:.2f}",
|
| 1190 |
+
delta=f"+{best_groq_gap:.2f} over Phi-3 (Finetuned)"
|
| 1191 |
)
|
| 1192 |
|
| 1193 |
st.metric(
|
|
|
|
| 1326 |
# Based on performance gap
|
| 1327 |
if calculated_metrics['improvement_gap']['f1'] > 30:
|
| 1328 |
recommendations.append("✅ **Deploy Groq in Production**: Groq demonstrates production-ready performance")
|
| 1329 |
+
recommendations.append("π§ **Strategic Phi-3 (Finetuned) Optimization**: Focus on specific use cases where Phi-3 (Finetuned) shows potential")
|
| 1330 |
elif calculated_metrics['improvement_gap']['f1'] > 15:
|
| 1331 |
recommendations.append("✅ **Continue Groq Focus**: Maintain Groq as primary model for high-quality content")
|
| 1332 |
+
recommendations.append("π§ **Phi-3 (Finetuned) Optimization**: Investigate specific areas for Phi-3 (Finetuned) improvement")
|
| 1333 |
else:
|
| 1334 |
recommendations.append("π€ **Model Diversification**: Consider both models for different use cases")
|
| 1335 |
|
|
|
|
| 1505 |
|
| 1506 |
if st.button("Generate Custom Report", use_container_width=True):
|
| 1507 |
st.info("Custom report generation coming soon! Currently using comprehensive format.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1508 |
|
| 1509 |
# Helper functions for calculating metrics
|
| 1510 |
def calculate_user_type_effectiveness(model, user_type, stats):
|