Update app.py
app.py CHANGED
@@ -309,11 +309,11 @@ with gr.Blocks() as demo:
             tokenize_btn = gr.Button("Tokenize Text")
 
             with gr.Row():
-                token_display = gr.Textbox(label="Tokens", lines=3)
-                token_count = gr.Number(label="Token Count")
+                token_display = gr.Textbox(label="Tokens", lines=3, interactive=False)
+                token_count = gr.Number(label="Token Count", interactive=False)
 
             with gr.Row():
-                token_info = gr.Textbox(label="Tokenization Info", lines=2)
+                token_info = gr.Textbox(label="Tokenization Info", lines=2, interactive=False)
 
         with gr.Tab("Context & Predictions"):
             gr.Markdown("### Next-word predictions and context understanding")
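The hunk above only makes the tokenizer outputs read-only; the callback that fills them sits outside this diff. A minimal sketch of a compatible callback, assuming a GPT-2 tokenizer from transformers purely for illustration (the Space's actual tokenizer is not shown here):

```python
# Illustrative only: app.py's real tokenizer and callback are not part of this diff.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed model, for demonstration

def tokenize_text(text: str):
    tokens = tokenizer.tokenize(text)          # subword pieces shown in token_display
    ids = tokenizer.encode(text)               # integer ids; their count fills token_count
    info = f"{len(text)} characters -> {len(ids)} tokens"
    return " | ".join(tokens), len(ids), info  # token_display, token_count, token_info
```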
@@ -330,12 +330,13 @@ with gr.Blocks() as demo:
             predict_btn = gr.Button("Get Next Word Predictions")
 
             with gr.Row():
-                predictions_output = gr.Textbox(label="Most Likely Next Words", lines=5)
+                predictions_output = gr.Textbox(label="Most Likely Next Words", lines=5, interactive=False)
 
             with gr.Row():
                 context_window_info = gr.Textbox(
                     label="Context Window Status",
-                    value="Click 'Get Predictions' to see token usage"
+                    value="Click 'Get Predictions' to see token usage",
+                    interactive=False
                 )
 
         with gr.Tab("Attention Network"):
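Likewise, the predict callback behind predictions_output and context_window_info is not part of this hunk. A sketch of how top-k next-word probabilities and context-window usage can be computed with a causal LM, assuming GPT-2 and its 1024-token window only for the example:

```python
# Illustrative only: the Space's real model and context window are not shown in this diff.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
CONTEXT_WINDOW = 1024  # GPT-2's limit; assumed here

def predict_next_words(prompt: str, top_k: int = 5):
    ids = tokenizer(prompt, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(ids).logits[0, -1]      # scores for the next token position
    probs = torch.softmax(logits, dim=-1)
    top = torch.topk(probs, top_k)
    lines = [
        f"{tokenizer.decode(int(i)).strip()!r}: {float(p):.1%}"
        for p, i in zip(top.values, top.indices)
    ]
    status = f"{ids.shape[1]} of {CONTEXT_WINDOW} context tokens used"
    return "\n".join(lines), status            # predictions_output, context_window_info
```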
@@ -849,7 +850,7 @@ with gr.Blocks() as demo:
 
         with gr.Tabs() as week9_tabs:
 
-            with gr.Tab("
+            with gr.Tab("Human Evaluation"):
                 gr.Markdown("""
                 ### Generate Multiple Versions for Comparison
                 Create three versions of a response with different temperature settings.
@@ -861,7 +862,7 @@ with gr.Blocks() as demo:
                     label="Enter your prompt",
                     placeholder="e.g., Summarise the main benefits of cloud computing for small businesses",
                     lines=3,
-                    value="
+                    value="Write three different, creative metaphors to explain the concept of a neural network to a child."
                 )
 
                 with gr.Row():
@@ -900,7 +901,7 @@ with gr.Blocks() as demo:
                     eval_output1 = gr.Textbox(
                         label="Output 1",
                         lines=6,
-                        interactive=False
+                        # interactive=False
                     )
                     gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
                     with gr.Row():
@@ -914,7 +915,8 @@ with gr.Blocks() as demo:
                     gr.Markdown("**Version 2** (Temp: 0.7)")
                     eval_output2 = gr.Textbox(
                         label="Output 2",
-                        lines=6
+                        lines=6,
+                        # interactive=False
                     )
                     gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
                     with gr.Row():
@@ -928,7 +930,8 @@ with gr.Blocks() as demo:
                     gr.Markdown("**Version 3** (Temp: 1.0)")
                     eval_output3 = gr.Textbox(
                         label="Output 3",
-                        lines=6
+                        lines=6,
+                        # interactive=False
                     )
                     gr.Markdown("**Rate this output (1=Poor, 5=Excellent):**")
                     with gr.Row():
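The three hunks above label the comparison outputs Temp 0.3, 0.7 and 1.0; the generation code itself is outside this diff. A sketch of how three such versions could be sampled, reusing the tokenizer and model loaded in the previous sketch:

```python
# Illustrative only; app.py's actual generation backend is not shown in this diff.
def generate_three_versions(prompt: str, max_new_tokens: int = 120):
    ids = tokenizer(prompt, return_tensors="pt").input_ids
    versions = []
    for temp in (0.3, 0.7, 1.0):                   # matches the Version 1/2/3 labels
        out = model.generate(
            ids,
            do_sample=True,                        # sampling, so temperature has an effect
            temperature=temp,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,   # GPT-2 has no dedicated pad token
        )
        versions.append(tokenizer.decode(out[0, ids.shape[1]:], skip_special_tokens=True))
    return versions                                # eval_output1, eval_output2, eval_output3
```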
@@ -944,33 +947,36 @@ with gr.Blocks() as demo:
                 with gr.Row():
                     ratings_summary = gr.Textbox(
                         label="Ratings Summary",
-                        lines=6
+                        lines=6,
+                        # interactive=False
                     )
-
-            with gr.Tab("Consistency Testing"):
+            with gr.Tab("Automatic Evaluation"):
                 gr.Markdown("""
-                ###
-
-
+                ### Generate a Response and Compare to Your Reference Answer
+
+                This demonstrates how automatic metrics like BLEU and word overlap work in practice.
+                You'll provide a "reference answer" (what a good response should say), then see how
+                the model's response compares using automatic metrics.
                 """)
 
                 with gr.Row():
-
-                    label="Enter your prompt",
-                    placeholder="e.g., What are the
+                    metric_prompt = gr.Textbox(
+                        label="Enter your prompt (question or task)",
+                        placeholder="e.g., What are the main benefits of using a relational database?",
                         lines=3,
-                    value="What are the three
+                        value="What are the three main principles of user-centered design?"
                     )
 
                 with gr.Row():
-
-
-
-
-
-                    label="Number of times to run (3-5)"
+                    metric_reference = gr.Textbox(
+                        label="Enter your reference answer (what a good answer should include)",
+                        placeholder="Write what you consider a good/correct answer to your prompt...",
+                        lines=5,
+                        value="The three main principles of user-centered design are: 1) Focus on users and their needs throughout the design process, 2) Involve users early and often through testing and feedback, and 3) Iterate designs based on user feedback to continuously improve the experience."
                     )
-
+
+                with gr.Row():
+                    metric_temp = gr.Slider(
                         minimum=0.1,
                         maximum=1.0,
                         value=0.7,
@@ -979,28 +985,54 @@ with gr.Blocks() as demo:
                     )
 
                 with gr.Row():
-
+                    generate_metric_btn = gr.Button("Generate Model Response & Calculate Metrics", variant="primary")
 
-                gr.Markdown("###
-                gr.Markdown("Look for: What information appears in all responses? What varies? Any contradictions?")
+                gr.Markdown("### Model Response")
 
                 with gr.Row():
-
-
+                    metric_generated = gr.Textbox(
+                        label="Generated Answer (model's response)",
+                        lines=6,
+                        # interactive=False
+                    )
 
-
-                    consistency_output3 = gr.Textbox(label="Response 3", lines=5)
-                    consistency_output4 = gr.Textbox(label="Response 4", lines=5, visible=False)
+                gr.Markdown("### Evaluation Metrics")
 
                 with gr.Row():
-
+                    with gr.Column():
+                        metric_overlap_display = gr.Textbox(
+                            label="Word Overlap",
+                            lines=1,
+                            # interactive=False
+                        )
+                    with gr.Column():
+                        gr.Markdown("**Quick Summary:** This shows the % of reference words that appear in the generated response")
 
                 with gr.Row():
-
-                    label="
-
-
+                    metric_report = gr.Textbox(
+                        label="Detailed Metrics Report",
+                        lines=18,
+                        # interactive=False
                     )
+
+                gr.Markdown("""
+                ### Understanding the Metrics
+
+                **Word Overlap:** What % of words from your reference appear in the generated response?
+                - Shows which words matched, which were missing, which were added
+                - High overlap = similar vocabulary used
+
+                **BLEU Score:** Modified word overlap that penalises very short responses
+                - Used commonly for translation and summarisation
+                - Ranges roughly 0-100 (higher = more overlap)
+
+                **Important Limitations:**
+                - These metrics only measure word overlap, NOT meaning or quality
+                - A response with low overlap might still be correct (using synonyms)
+                - A response with high overlap might still be wrong (same words, wrong meaning)
+                - Always use human judgment alongside automatic metrics!
+                """)
+
 
                 def update_consistency_visibility(num_runs):
                     """Show/hide output boxes based on number of runs"""
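The "Understanding the Metrics" text added above describes word overlap and a brevity-penalised, BLEU-style score. Hand-rolled versions of both, matching that description (the metric code actually used by app.py is not part of this diff):

```python
# Illustrative implementations matching the description in the new tab text.
import math
from collections import Counter

def word_overlap(reference: str, generated: str) -> float:
    """Percentage of unique reference words that also appear in the generated text."""
    ref_words = set(reference.lower().split())
    gen_words = set(generated.lower().split())
    return 100.0 * len(ref_words & gen_words) / len(ref_words) if ref_words else 0.0

def simple_bleu(reference: str, generated: str) -> float:
    """Clipped unigram precision with a brevity penalty, scaled to 0-100."""
    ref, gen = reference.lower().split(), generated.lower().split()
    if not gen:
        return 0.0
    ref_counts = Counter(ref)
    clipped = sum(min(count, ref_counts[word]) for word, count in Counter(gen).items())
    precision = clipped / len(gen)
    brevity_penalty = 1.0 if len(gen) >= len(ref) else math.exp(1 - len(ref) / len(gen))
    return 100.0 * brevity_penalty * precision
```

As the limitations list says, a paraphrased but correct answer can score low and a wrong answer that reuses the reference vocabulary can score high, which is why the tab keeps human judgment in the loop.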
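The unchanged context at the end of this hunk still references update_consistency_visibility, a helper for the removed Consistency Testing tab whose body is not shown. The usual Gradio pattern for that kind of toggle is a list of visibility updates, sketched here assuming five Response boxes as in the removed outputs:

```python
import gradio as gr

def update_consistency_visibility(num_runs):
    """Show/hide output boxes based on number of runs"""
    # One gr.update per Response textbox; boxes beyond num_runs are hidden.
    return [gr.update(visible=(i < int(num_runs))) for i in range(5)]
```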
@@ -1029,18 +1061,10 @@ with gr.Blocks() as demo:
         outputs=[ratings_summary]
     )
 
-
-
-        inputs=[
-        outputs=[
-                 consistency_output4, consistency_output5]
-    )
-
-    generate_consistency_btn.click(
-        test_consistency,
-        inputs=[consistency_prompt, consistency_runs, consistency_temp],
-        outputs=[consistency_output1, consistency_output2, consistency_output3,
-                 consistency_output4, consistency_output5]
+    generate_metric_btn.click(
+        generate_and_compare,
+        inputs=[metric_prompt, metric_reference, metric_temp],
+        outputs=[metric_generated, metric_report, metric_overlap_display]
     )
 
 demo.launch()
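The new click handler expects a generate_and_compare callback taking (prompt, reference, temperature) and returning the generated answer, the detailed report, and the overlap summary; its definition is outside this diff. A stand-in with that shape, reusing the model, tokenizer and metric helpers from the earlier sketches:

```python
# Illustrative stand-in, not the definition used in app.py.
def generate_and_compare(prompt: str, reference: str, temperature: float):
    ids = tokenizer(prompt, return_tensors="pt").input_ids
    out = model.generate(ids, do_sample=True, temperature=temperature,
                         max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)
    generated = tokenizer.decode(out[0, ids.shape[1]:], skip_special_tokens=True)

    overlap = word_overlap(reference, generated)
    report = (
        f"Word overlap: {overlap:.1f}% of reference words appear in the answer\n"
        f"Simple BLEU: {simple_bleu(reference, generated):.1f} / 100\n"
        "Note: both scores measure surface overlap, not correctness."
    )
    # metric_generated, metric_report, metric_overlap_display
    return generated, report, f"{overlap:.1f}%"
```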