Spaces:

alfraser
/

llm-arch

Runtime error

App Files Files Community

alfraser committed on Feb 1

Commit

c319c31

•

1 Parent(s): 963fb4a

Added runner for pricing fact checks to assess the level of fact embedding in the latest model

Browse files

Files changed (2) hide show

pages/030_Test_Runner.py +114 -33
src/datatypes.py +6 -0

pages/030_Test_Runner.py CHANGED Viewed

@@ -1,45 +1,126 @@
 import streamlit as st
 from src.architectures import *
 from src.common import generate_group_tag
 from src.testing import TestGenerator
 from src.st_helpers import st_setup
 if Architecture.architectures is None:
     Architecture.load_architectures()
 if st_setup('LLM Arch'):
-    summary = st.container()
-    with summary:
-        st.write("# Test Runner")
-        st.write("## Run a new test")
-        st.write("### Comment:")
-        comment = st.text_input("Optional comment for the test")
-        st.write("### Architectures to include:")
-        selected_archs = st.multiselect(label="Architectures", options=[a.name for a in Architecture.architectures])
-        st.write("### Number of questions to ask:")
-        q_count = st.slider(label="Number of questions", min_value=1, max_value=TestGenerator.question_count(), step=1)
-        st.write("### Tag:")
-        tag = generate_group_tag()
-        st.write(f'Test will be tagged as "{tag}" - record this for easy searching later')
-        total_tests = len(selected_archs) * q_count
-        st.write("### Run:")
-        st.write(f"**{total_tests}** total tests will be run")
-        if st.button("**Run**", disabled=(total_tests==0)):
-            progress = st.progress(0.0, text="Running tests...")
-            questions = TestGenerator.get_random_questions(q_count)
-            num_complete = 0
-            for arch_name in selected_archs:
-                architecture = Architecture.get_architecture(arch_name)
-                for q in questions:
-                    architecture(ArchitectureRequest(q), trace_tags=[tag, "TestRunner"], trace_comment=comment)
-                    num_complete += 1
-                    if num_complete == total_tests:
-                        progress.empty()
-                    else:
-                        progress.progress(num_complete/total_tests, f"Run {num_complete} of {total_tests} tests...")

+import regex as re
 import streamlit as st
+from pandas import DataFrame
+from random import choices
 from src.architectures import *
 from src.common import generate_group_tag
+from src.datatypes import *
 from src.testing import TestGenerator
 from src.st_helpers import st_setup
+# Componentise different test options
+def display_custom_test():
+    """Render the custom test form and run the selected tests on demand.
+
+    Shows inputs for an optional comment, the architectures to include and
+    the number of questions to ask, plus a freshly generated group tag.
+    When the Run button is pressed, ``q_count`` questions are fetched from
+    ``TestGenerator`` and each one is sent to every selected architecture,
+    traced with the group tag and "TestRunner", while a progress bar tracks
+    completion.
+    """
+    st.write("## Run a new custom test")
+    st.write("### Comment:")
+    comment = st.text_input("Optional comment for the test", key="custom_test_comment")
+    st.write("### Architectures to include:")
+    selected_archs = st.multiselect(label="Architectures", options=[a.name for a in Architecture.architectures], key="custom_test_archs")
+    st.write("### Number of questions to ask:")
+    # Explicit key, like every other widget in this form: the pricing fact test
+    # renders a slider with the same label, and two unkeyed widgets with
+    # identical parameters are rejected by Streamlit as duplicates.
+    q_count = st.slider(label="Number of questions", min_value=1, max_value=TestGenerator.question_count(), step=1, key="custom_test_q_count")
+    st.write("### Tag:")
+    tag = generate_group_tag()
+    st.write(f'Test will be tagged as "{tag}" - record this for easy searching later')
+    total_tests = len(selected_archs) * q_count
+    st.write("### Run:")
+    st.write(f"**{total_tests}** total tests will be run")
+    # The button is disabled until at least one architecture is selected.
+    if st.button("**Run**", disabled=(total_tests == 0), key="custom_test_button"):
+        progress = st.progress(0.0, text="Running tests...")
+        questions = TestGenerator.get_random_questions(q_count)
+        num_complete = 0
+        for arch_name in selected_archs:
+            architecture = Architecture.get_architecture(arch_name)
+            for q in questions:
+                architecture(ArchitectureRequest(q), trace_tags=[tag, "TestRunner"], trace_comment=comment)
+                num_complete += 1
+                if num_complete == total_tests:
+                    # Last test done: remove the progress bar from the page.
+                    progress.empty()
+                else:
+                    progress.progress(num_complete / total_tests, f"Run {num_complete} of {total_tests} tests...")
+def display_pricing_fact_test():
+    """Render the pricing fact test form and run the test on demand.
+
+    One "How much is the ...?" question is built per product, paired with the
+    product's price. When the Run button is pressed, ``q_count`` distinct
+    questions are put to every selected architecture, the first dollar amount
+    in each response is compared with the expected price, and a table of
+    correct / incorrect counts per architecture is displayed.
+    """
+    # Imported locally because the module only imports ``choices`` from ``random``.
+    from random import sample
+
+    def get_question_price_pairs():
+        """Return a list of (question, price) tuples, one per product."""
+        DataLoader.load_data()
+        pairs = []
+        for p in Product.all.values():
+            price = p.price
+            product_name = p.name
+            category_name = p.category.lower_singular_name
+            if category_name == "tv":
+                # Keep the acronym upper-cased in the question text.
+                category_name = "TV"
+            question = f'How much is the {product_name} {category_name}?'
+            pairs.append((question, price))
+        return pairs
+
+    def get_price_from_response(response: str) -> float:
+        """Return the first ``$x,xxx.xx`` amount in *response* as a float.
+
+        Returns the sentinel ``-0.1`` (and prints a note) when the response
+        contains no such amount.
+        """
+        # Raw string: ``\$`` and ``\d`` are regex escapes, not string escapes.
+        prices = re.findall(r'\$[,\d]+\.\d\d', response)
+        if not prices:
+            print(f"Found no price in response '{response}'")
+            return -0.1
+        # Drop the leading "$" and the thousands separators before converting.
+        return float(prices[0][1:].replace(',', ''))
+
+    st.write("## Run a pricing fact test")
+    st.write("### Comment:")
+    comment = st.text_input("Optional comment for the test", key="pricing_test_comment")
+    st.write("### Architectures to include:")
+    selected_archs = st.multiselect(label="Architectures", options=[a.name for a in Architecture.architectures], key="pricing_test_archs")
+    question_price_pairs = get_question_price_pairs()
+    st.write("### Number of questions to ask:")
+    # Explicit key, like every other widget in this form: the custom test form
+    # renders a slider with the same label, and two unkeyed widgets with
+    # identical parameters are rejected by Streamlit as duplicates.
+    q_count = st.slider(label="Number of questions", min_value=1, max_value=len(question_price_pairs), step=1, key="pricing_test_q_count")
+    st.write("### Tag:")
+    tag = generate_group_tag()
+    st.write(f'Test will be tagged as "{tag}" - record this for easy searching later')
+    total_tests = len(selected_archs) * q_count
+    st.write("### Run:")
+    st.write(f"**{total_tests}** total tests will be run")
+    # The button is disabled until at least one architecture is selected.
+    if st.button("**Run**", disabled=(total_tests == 0), key="pricing_test_button"):
+        progress = st.progress(0.0, text="Running tests...")
+        # Sample without replacement so no product question is asked twice;
+        # the slider caps q_count at the number of available pairs.
+        questions = sample(question_price_pairs, k=q_count)
+        num_complete = 0
+        answer_stats = {}
+        for arch_name in selected_archs:
+            answer_stats[arch_name] = [0, 0]  # [Correct, Incorrect] only used locally here
+            architecture = Architecture.get_architecture(arch_name)
+            for question, price in questions:
+                request = ArchitectureRequest(question)
+                architecture(request, trace_tags=[tag, "TestRunner"], trace_comment=comment)
+                if price == get_price_from_response(request.response):
+                    answer_stats[arch_name][0] += 1
+                else:
+                    answer_stats[arch_name][1] += 1
+                num_complete += 1
+                if num_complete == total_tests:
+                    # Last test done: remove the progress bar from the page.
+                    progress.empty()
+                else:
+                    progress.progress(num_complete / total_tests, f"Run {num_complete} of {total_tests} tests...")
+        table_data = []
+        for arch_name in selected_archs:
+            correct, incorrect = answer_stats[arch_name]
+            total = correct + incorrect
+            percent_correct = round(correct / total * 100, 1)
+            table_data.append([arch_name, correct, incorrect, total, f'{percent_correct:.1f}%'])
+        df = DataFrame(table_data, columns=['Architecture', 'Correct', 'Incorrect', 'Total', '% Correct'])
+        # Index the frame by an empty-string column so the table shows no row numbers.
+        st.table(df.assign(hack='').set_index('hack'))
 if Architecture.architectures is None:
     Architecture.load_architectures()
 if st_setup('LLM Arch'):
+    # Page body: a title, then one expander per kind of test run.
+    st.write("# Test Runner")
+    with st.expander("Pricing Fact Tests"):
+        display_pricing_fact_test()
+    with st.expander("Custom Tests"):
+        display_custom_test()

src/datatypes.py CHANGED Viewed

@@ -106,6 +106,12 @@ class Category:
             return self.name[:-1]  # Clip the s
         return self.name
 class Feature:
     all = {}

             return self.name[:-1]  # Clip the s
         return self.name
+    @property
+    def lower_singular_name(self):
+        """Return the name lower-cased, with a single trailing "s" removed.
+
+        Uses ``str.endswith`` rather than indexing the last character so an
+        empty name yields "" instead of raising ``IndexError``.
+        """
+        if self.name.endswith("s"):
+            return self.name[:-1].lower()  # Clip the s
+        return self.name.lower()
 class Feature:
     all = {}