Spaces:

courtneyf2
/

IS41720_ZoneB

Running

App Files Community

courtneyf2 committed on 15 days ago

Commit

d614182

verified ·

1 Parent(s): 9deb76c

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -52

app.py CHANGED Viewed

@@ -309,11 +309,11 @@ with gr.Blocks() as demo:
                         tokenize_btn = gr.Button("Tokenize Text")
                     with gr.Row():
-                        token_display = gr.Textbox(label="Tokens", lines=3)
-                        token_count = gr.Number(label="Token Count")
                     with gr.Row():
-                        token_info = gr.Textbox(label="Tokenization Info", lines=2)
                 with gr.Tab("Context & Predictions"):
                     gr.Markdown("### Next-word predictions and context understanding")
@@ -330,12 +330,13 @@ with gr.Blocks() as demo:
                         predict_btn = gr.Button("Get Next Word Predictions")
                     with gr.Row():
-                        predictions_output = gr.Textbox(label="Most Likely Next Words", lines=5)
                     with gr.Row():
                         context_window_info = gr.Textbox(
                             label="Context Window Status",
-                            value="Click 'Get Predictions' to see token usage"
                         )
                 with gr.Tab("Attention Network"):
@@ -849,7 +850,7 @@ with gr.Blocks() as demo:
             with gr.Tabs() as week9_tabs:
-                with gr.Tab("Generate for Evaluation"):
                     gr.Markdown("""
                     ### Generate Multiple Versions for Comparison
                     Create three versions of a response with different temperature settings.
@@ -861,7 +862,7 @@ with gr.Blocks() as demo:
                             label="Enter your prompt",
                             placeholder="e.g., Summarise the main benefits of cloud computing for small businesses",
                             lines=3,
-                            value="Explain the main differences between SQL and NoSQL databases."
                         )
                     with gr.Row():
@@ -900,7 +901,7 @@ with gr.Blocks() as demo:
                             eval_output1 = gr.Textbox(
                                 label="Output 1",
                                 lines=6,
-                                interactive=False
                             )
                             gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
                             with gr.Row():
@@ -914,7 +915,8 @@ with gr.Blocks() as demo:
                             gr.Markdown("**Version 2** (Temp: 0.7)")
                             eval_output2 = gr.Textbox(
                                 label="Output 2",
-                                lines=6
                             )
                             gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
                             with gr.Row():
@@ -928,7 +930,8 @@ with gr.Blocks() as demo:
                             gr.Markdown("**Version 3** (Temp: 1.0)")
                             eval_output3 = gr.Textbox(
                                 label="Output 3",
-                                lines=6
                             )
                             gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
                             with gr.Row():
@@ -944,33 +947,36 @@ with gr.Blocks() as demo:
                     with gr.Row():
                         ratings_summary = gr.Textbox(
                             label="Ratings Summary",
-                            lines=6
                         )
-                with gr.Tab("Consistency Testing"):
                     gr.Markdown("""
-                    ### Test Response Consistency
-                    Run the same prompt multiple times to see how consistent the model's outputs are.
-                    Look for patterns in what stays the same vs what changes.
                     """)
                     with gr.Row():
-                        consistency_prompt = gr.Textbox(
-                            label="Enter your prompt",
-                            placeholder="e.g., What are the key principles of user interface design?",
                             lines=3,
-                            value="What are the three most important considerations when choosing a database system?"
                         )
                     with gr.Row():
-                        consistency_runs = gr.Slider(
-                            minimum=3,
-                            maximum=5,
-                            value=3,
-                            step=1,
-                            label="Number of times to run (3-5)"
                         )
-                        consistency_temp = gr.Slider(
                             minimum=0.1,
                             maximum=1.0,
                             value=0.7,
@@ -979,28 +985,54 @@ with gr.Blocks() as demo:
                         )
                     with gr.Row():
-                        generate_consistency_btn = gr.Button("Test Consistency", variant="primary")
-                    gr.Markdown("### Compare the Responses")
-                    gr.Markdown("Look for: What information appears in all responses? What varies? Any contradictions?")
                     with gr.Row():
-                        consistency_output1 = gr.Textbox(label="Response 1", lines=5)
-                        consistency_output2 = gr.Textbox(label="Response 2", lines=5)
-                    with gr.Row():
-                        consistency_output3 = gr.Textbox(label="Response 3", lines=5)
-                        consistency_output4 = gr.Textbox(label="Response 4", lines=5, visible=False)
                     with gr.Row():
-                        consistency_output5 = gr.Textbox(label="Response 5", lines=5, interactive=False, visible=False)
                     with gr.Row():
-                        consistency_analysis = gr.Textbox(
-                            label="Analysis Notes",
-                            placeholder="Note patterns you observe: What's consistent? What varies? Any contradictions?",
-                            lines=4
                         )
             def update_consistency_visibility(num_runs):
                 """Show/hide output boxes based on number of runs"""
@@ -1029,18 +1061,10 @@ with gr.Blocks() as demo:
                 outputs=[ratings_summary]
             )
-            consistency_runs.change(
-                update_consistency_visibility,
-                inputs=[consistency_runs],
-                outputs=[consistency_output1, consistency_output2, consistency_output3,
-                        consistency_output4, consistency_output5]
-            )
-            generate_consistency_btn.click(
-                test_consistency,
-                inputs=[consistency_prompt, consistency_runs, consistency_temp],
-                outputs=[consistency_output1, consistency_output2, consistency_output3,
-                        consistency_output4, consistency_output5]
             )
     demo.launch()

                         tokenize_btn = gr.Button("Tokenize Text")
                     with gr.Row():
+                        token_display = gr.Textbox(label="Tokens", lines=3, interactive=False)
+                        token_count = gr.Number(label="Token Count", interactive=False)
                     with gr.Row():
+                        token_info = gr.Textbox(label="Tokenization Info", lines=2, interactive=False)
                 with gr.Tab("Context & Predictions"):
                     gr.Markdown("### Next-word predictions and context understanding")
                         predict_btn = gr.Button("Get Next Word Predictions")
                     with gr.Row():
+                        predictions_output = gr.Textbox(label="Most Likely Next Words", lines=5, interactive=False)
                     with gr.Row():
                         context_window_info = gr.Textbox(
                             label="Context Window Status",
+                            value="Click 'Get Predictions' to see token usage",
+                            interactive=False
                         )
                 with gr.Tab("Attention Network"):
             with gr.Tabs() as week9_tabs:
+                with gr.Tab("Human Evaluation"):
                     gr.Markdown("""
                     ### Generate Multiple Versions for Comparison
                     Create three versions of a response with different temperature settings.
                             label="Enter your prompt",
                             placeholder="e.g., Summarise the main benefits of cloud computing for small businesses",
                             lines=3,
+                            value="Write three different, creative metaphors to explain the concept of a neural network to a child."
                         )
                     with gr.Row():
                             eval_output1 = gr.Textbox(
                                 label="Output 1",
                                 lines=6,
+                                # interactive=False
                             )
                             gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
                             with gr.Row():
                             gr.Markdown("**Version 2** (Temp: 0.7)")
                             eval_output2 = gr.Textbox(
                                 label="Output 2",
+                                lines=6,
+                                # interactive=False
                             )
                             gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
                             with gr.Row():
                             gr.Markdown("**Version 3** (Temp: 1.0)")
                             eval_output3 = gr.Textbox(
                                 label="Output 3",
+                                lines=6,
+                                # interactive=False
                             )
                             gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
                             with gr.Row():
                     with gr.Row():
                         ratings_summary = gr.Textbox(
                             label="Ratings Summary",
+                            lines=6,
+                            # interactive=False
                         )
+                with gr.Tab("Automatic Evaluation"):
                     gr.Markdown("""
+                    ### Generate a Response and Compare to Your Reference Answer
+                    This demonstrates how automatic metrics like BLEU and word overlap work in practice.
+                    You'll provide a "reference answer" (what a good response should say), then see how
+                    the model's response compares using automatic metrics.
                     """)
                     with gr.Row():
+                        metric_prompt = gr.Textbox(
+                            label="Enter your prompt (question or task)",
+                            placeholder="e.g., What are the main benefits of using a relational database?",
                             lines=3,
+                            value="What are the three main principles of user-centered design?"
                         )
                     with gr.Row():
+                        metric_reference = gr.Textbox(
+                            label="Enter your reference answer (what a good answer should include)",
+                            placeholder="Write what you consider a good/correct answer to your prompt...",
+                            lines=5,
+                            value="The three main principles of user-centered design are: 1) Focus on users and their needs throughout the design process, 2) Involve users early and often through testing and feedback, and 3) Iterate designs based on user feedback to continuously improve the experience."
                         )
+                    with gr.Row():
+                        metric_temp = gr.Slider(
                             minimum=0.1,
                             maximum=1.0,
                             value=0.7,
                         )
                     with gr.Row():
+                        generate_metric_btn = gr.Button("Generate Model Response & Calculate Metrics", variant="primary")
+                    gr.Markdown("### Model Response")
                     with gr.Row():
+                        metric_generated = gr.Textbox(
+                            label="Generated Answer (model's response)",
+                            lines=6,
+                            # interactive=False
+                        )
+                    gr.Markdown("### Evaluation Metrics")
                     with gr.Row():
+                        with gr.Column():
+                            metric_overlap_display = gr.Textbox(
+                                label="Word Overlap",
+                                lines=1,
+                                # interactive=False
+                            )
+                        with gr.Column():
+                            gr.Markdown("**Quick Summary:** This shows the % of reference words that appear in the generated response")
                     with gr.Row():
+                        metric_report = gr.Textbox(
+                            label="Detailed Metrics Report",
+                            lines=18,
+                            # interactive=False
                         )
+                    gr.Markdown("""
+                    ### Understanding the Metrics
+                    **Word Overlap:** What % of words from your reference appear in the generated response?
+                    - Shows which words matched, which were missing, which were added
+                    - High overlap = similar vocabulary used
+                    **BLEU Score:** Modified word overlap that penalises very short responses
+                    - Used commonly for translation and summarisation
+                    - Ranges roughly 0-100 (higher = more overlap)
+                    **Important Limitations:**
+                    - These metrics only measure word overlap, NOT meaning or quality
+                    - A response with low overlap might still be correct (using synonyms)
+                    - A response with high overlap might still be wrong (same words, wrong meaning)
+                    - Always use human judgment alongside automatic metrics!
+                    """)
             def update_consistency_visibility(num_runs):
                 """Show/hide output boxes based on number of runs"""
                 outputs=[ratings_summary]
             )
+            generate_metric_btn.click(
+                generate_and_compare,
+                inputs=[metric_prompt, metric_reference, metric_temp],
+                outputs=[metric_generated, metric_report, metric_overlap_display]
             )
     demo.launch()