courtneyf2 committed on
Commit
d614182
·
verified ·
1 Parent(s): 9deb76c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -52
app.py CHANGED
@@ -309,11 +309,11 @@ with gr.Blocks() as demo:
309
  tokenize_btn = gr.Button("Tokenize Text")
310
 
311
  with gr.Row():
312
- token_display = gr.Textbox(label="Tokens", lines=3)
313
- token_count = gr.Number(label="Token Count")
314
 
315
  with gr.Row():
316
- token_info = gr.Textbox(label="Tokenization Info", lines=2)
317
 
318
  with gr.Tab("Context & Predictions"):
319
  gr.Markdown("### Next-word predictions and context understanding")
@@ -330,12 +330,13 @@ with gr.Blocks() as demo:
330
  predict_btn = gr.Button("Get Next Word Predictions")
331
 
332
  with gr.Row():
333
- predictions_output = gr.Textbox(label="Most Likely Next Words", lines=5)
334
 
335
  with gr.Row():
336
  context_window_info = gr.Textbox(
337
  label="Context Window Status",
338
- value="Click 'Get Predictions' to see token usage"
 
339
  )
340
 
341
  with gr.Tab("Attention Network"):
@@ -849,7 +850,7 @@ with gr.Blocks() as demo:
849
 
850
  with gr.Tabs() as week9_tabs:
851
 
852
- with gr.Tab("Generate for Evaluation"):
853
  gr.Markdown("""
854
  ### Generate Multiple Versions for Comparison
855
  Create three versions of a response with different temperature settings.
@@ -861,7 +862,7 @@ with gr.Blocks() as demo:
861
  label="Enter your prompt",
862
  placeholder="e.g., Summarise the main benefits of cloud computing for small businesses",
863
  lines=3,
864
- value="Explain the main differences between SQL and NoSQL databases."
865
  )
866
 
867
  with gr.Row():
@@ -900,7 +901,7 @@ with gr.Blocks() as demo:
900
  eval_output1 = gr.Textbox(
901
  label="Output 1",
902
  lines=6,
903
- interactive=False
904
  )
905
  gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
906
  with gr.Row():
@@ -914,7 +915,8 @@ with gr.Blocks() as demo:
914
  gr.Markdown("**Version 2** (Temp: 0.7)")
915
  eval_output2 = gr.Textbox(
916
  label="Output 2",
917
- lines=6
 
918
  )
919
  gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
920
  with gr.Row():
@@ -928,7 +930,8 @@ with gr.Blocks() as demo:
928
  gr.Markdown("**Version 3** (Temp: 1.0)")
929
  eval_output3 = gr.Textbox(
930
  label="Output 3",
931
- lines=6
 
932
  )
933
  gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
934
  with gr.Row():
@@ -944,33 +947,36 @@ with gr.Blocks() as demo:
944
  with gr.Row():
945
  ratings_summary = gr.Textbox(
946
  label="Ratings Summary",
947
- lines=6
 
948
  )
949
-
950
- with gr.Tab("Consistency Testing"):
951
  gr.Markdown("""
952
- ### Test Response Consistency
953
- Run the same prompt multiple times to see how consistent the model's outputs are.
954
- Look for patterns in what stays the same vs what changes.
 
 
955
  """)
956
 
957
  with gr.Row():
958
- consistency_prompt = gr.Textbox(
959
- label="Enter your prompt",
960
- placeholder="e.g., What are the key principles of user interface design?",
961
  lines=3,
962
- value="What are the three most important considerations when choosing a database system?"
963
  )
964
 
965
  with gr.Row():
966
- consistency_runs = gr.Slider(
967
- minimum=3,
968
- maximum=5,
969
- value=3,
970
- step=1,
971
- label="Number of times to run (3-5)"
972
  )
973
- consistency_temp = gr.Slider(
 
 
974
  minimum=0.1,
975
  maximum=1.0,
976
  value=0.7,
@@ -979,28 +985,54 @@ with gr.Blocks() as demo:
979
  )
980
 
981
  with gr.Row():
982
- generate_consistency_btn = gr.Button("Test Consistency", variant="primary")
983
 
984
- gr.Markdown("### Compare the Responses")
985
- gr.Markdown("Look for: What information appears in all responses? What varies? Any contradictions?")
986
 
987
  with gr.Row():
988
- consistency_output1 = gr.Textbox(label="Response 1", lines=5)
989
- consistency_output2 = gr.Textbox(label="Response 2", lines=5)
 
 
 
990
 
991
- with gr.Row():
992
- consistency_output3 = gr.Textbox(label="Response 3", lines=5)
993
- consistency_output4 = gr.Textbox(label="Response 4", lines=5, visible=False)
994
 
995
  with gr.Row():
996
- consistency_output5 = gr.Textbox(label="Response 5", lines=5, interactive=False, visible=False)
 
 
 
 
 
 
 
997
 
998
  with gr.Row():
999
- consistency_analysis = gr.Textbox(
1000
- label="Analysis Notes",
1001
- placeholder="Note patterns you observe: What's consistent? What varies? Any contradictions?",
1002
- lines=4
1003
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1004
 
1005
  def update_consistency_visibility(num_runs):
1006
  """Show/hide output boxes based on number of runs"""
@@ -1029,18 +1061,10 @@ with gr.Blocks() as demo:
1029
  outputs=[ratings_summary]
1030
  )
1031
 
1032
- consistency_runs.change(
1033
- update_consistency_visibility,
1034
- inputs=[consistency_runs],
1035
- outputs=[consistency_output1, consistency_output2, consistency_output3,
1036
- consistency_output4, consistency_output5]
1037
- )
1038
-
1039
- generate_consistency_btn.click(
1040
- test_consistency,
1041
- inputs=[consistency_prompt, consistency_runs, consistency_temp],
1042
- outputs=[consistency_output1, consistency_output2, consistency_output3,
1043
- consistency_output4, consistency_output5]
1044
  )
1045
 
1046
  demo.launch()
 
309
  tokenize_btn = gr.Button("Tokenize Text")
310
 
311
  with gr.Row():
312
+ token_display = gr.Textbox(label="Tokens", lines=3, interactive=False)
313
+ token_count = gr.Number(label="Token Count", interactive=False)
314
 
315
  with gr.Row():
316
+ token_info = gr.Textbox(label="Tokenization Info", lines=2, interactive=False)
317
 
318
  with gr.Tab("Context & Predictions"):
319
  gr.Markdown("### Next-word predictions and context understanding")
 
330
  predict_btn = gr.Button("Get Next Word Predictions")
331
 
332
  with gr.Row():
333
+ predictions_output = gr.Textbox(label="Most Likely Next Words", lines=5, interactive=False)
334
 
335
  with gr.Row():
336
  context_window_info = gr.Textbox(
337
  label="Context Window Status",
338
+ value="Click 'Get Predictions' to see token usage",
339
+ interactive=False
340
  )
341
 
342
  with gr.Tab("Attention Network"):
 
850
 
851
  with gr.Tabs() as week9_tabs:
852
 
853
+ with gr.Tab("Human Evaluation"):
854
  gr.Markdown("""
855
  ### Generate Multiple Versions for Comparison
856
  Create three versions of a response with different temperature settings.
 
862
  label="Enter your prompt",
863
  placeholder="e.g., Summarise the main benefits of cloud computing for small businesses",
864
  lines=3,
865
+ value="Write three different, creative metaphors to explain the concept of a neural network to a child."
866
  )
867
 
868
  with gr.Row():
 
901
  eval_output1 = gr.Textbox(
902
  label="Output 1",
903
  lines=6,
904
+ # interactive=False
905
  )
906
  gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
907
  with gr.Row():
 
915
  gr.Markdown("**Version 2** (Temp: 0.7)")
916
  eval_output2 = gr.Textbox(
917
  label="Output 2",
918
+ lines=6,
919
+ # interactive=False
920
  )
921
  gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
922
  with gr.Row():
 
930
  gr.Markdown("**Version 3** (Temp: 1.0)")
931
  eval_output3 = gr.Textbox(
932
  label="Output 3",
933
+ lines=6,
934
+ # interactive=False
935
  )
936
  gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
937
  with gr.Row():
 
947
  with gr.Row():
948
  ratings_summary = gr.Textbox(
949
  label="Ratings Summary",
950
+ lines=6,
951
+ # interactive=False
952
  )
953
+ with gr.Tab("Automatic Evaluation"):
 
954
  gr.Markdown("""
955
+ ### Generate a Response and Compare to Your Reference Answer
956
+
957
+ This demonstrates how automatic metrics like BLEU and word overlap work in practice.
958
+ You'll provide a "reference answer" (what a good response should say), then see how
959
+ the model's response compares using automatic metrics.
960
  """)
961
 
962
  with gr.Row():
963
+ metric_prompt = gr.Textbox(
964
+ label="Enter your prompt (question or task)",
965
+ placeholder="e.g., What are the main benefits of using a relational database?",
966
  lines=3,
967
+ value="What are the three main principles of user-centered design?"
968
  )
969
 
970
  with gr.Row():
971
+ metric_reference = gr.Textbox(
972
+ label="Enter your reference answer (what a good answer should include)",
973
+ placeholder="Write what you consider a good/correct answer to your prompt...",
974
+ lines=5,
975
+ value="The three main principles of user-centered design are: 1) Focus on users and their needs throughout the design process, 2) Involve users early and often through testing and feedback, and 3) Iterate designs based on user feedback to continuously improve the experience."
 
976
  )
977
+
978
+ with gr.Row():
979
+ metric_temp = gr.Slider(
980
  minimum=0.1,
981
  maximum=1.0,
982
  value=0.7,
 
985
  )
986
 
987
  with gr.Row():
988
+ generate_metric_btn = gr.Button("Generate Model Response & Calculate Metrics", variant="primary")
989
 
990
+ gr.Markdown("### Model Response")
 
991
 
992
  with gr.Row():
993
+ metric_generated = gr.Textbox(
994
+ label="Generated Answer (model's response)",
995
+ lines=6,
996
+ # interactive=False
997
+ )
998
 
999
+ gr.Markdown("### Evaluation Metrics")
 
 
1000
 
1001
  with gr.Row():
1002
+ with gr.Column():
1003
+ metric_overlap_display = gr.Textbox(
1004
+ label="Word Overlap",
1005
+ lines=1,
1006
+ # interactive=False
1007
+ )
1008
+ with gr.Column():
1009
+ gr.Markdown("**Quick Summary:** This shows the % of reference words that appear in the generated response")
1010
 
1011
  with gr.Row():
1012
+ metric_report = gr.Textbox(
1013
+ label="Detailed Metrics Report",
1014
+ lines=18,
1015
+ # interactive=False
1016
  )
1017
+
1018
+ gr.Markdown("""
1019
+ ### Understanding the Metrics
1020
+
1021
+ **Word Overlap:** What % of words from your reference appear in the generated response?
1022
+ - Shows which words matched, which were missing, which were added
1023
+ - High overlap = similar vocabulary used
1024
+
1025
+ **BLEU Score:** Modified word overlap that penalises very short responses
1026
+ - Used commonly for translation and summarisation
1027
+ - Ranges roughly 0-100 (higher = more overlap)
1028
+
1029
+ **Important Limitations:**
1030
+ - These metrics only measure word overlap, NOT meaning or quality
1031
+ - A response with low overlap might still be correct (using synonyms)
1032
+ - A response with high overlap might still be wrong (same words, wrong meaning)
1033
+ - Always use human judgment alongside automatic metrics!
1034
+ """)
1035
+
1036
 
1037
  def update_consistency_visibility(num_runs):
1038
  """Show/hide output boxes based on number of runs"""
 
1061
  outputs=[ratings_summary]
1062
  )
1063
 
1064
+ generate_metric_btn.click(
1065
+ generate_and_compare,
1066
+ inputs=[metric_prompt, metric_reference, metric_temp],
1067
+ outputs=[metric_generated, metric_report, metric_overlap_display]
 
 
 
 
 
 
 
 
1068
  )
1069
 
1070
  demo.launch()