Spaces:
Runtime error
Runtime error
Modified test runner to dispatch requests in parallel to make use of the fact that there is a lot of wait time for the LLM. Defaulting to 16 threads.
Browse files- pages/030_Test_Runner.py +21 -30
- src/testing.py +58 -2
pages/030_Test_Runner.py
CHANGED
@@ -6,7 +6,7 @@ from random import choices
|
|
6 |
from src.architectures import *
|
7 |
from src.common import generate_group_tag
|
8 |
from src.datatypes import *
|
9 |
-
from src.testing import TestGenerator
|
10 |
from src.st_helpers import st_setup
|
11 |
|
12 |
|
@@ -30,18 +30,10 @@ def display_custom_test():
|
|
30 |
st.write("### Run:")
|
31 |
st.write(f"**{total_tests}** total tests will be run")
|
32 |
if st.button("**Run**", disabled=(total_tests == 0), key="custom_test_button"):
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
architecture = Architecture.get_architecture(arch_name)
|
38 |
-
for q in questions:
|
39 |
-
architecture(ArchitectureRequest(q), trace_tags=[tag, "TestRunner"], trace_comment=comment)
|
40 |
-
num_complete += 1
|
41 |
-
if num_complete == total_tests:
|
42 |
-
progress.empty()
|
43 |
-
else:
|
44 |
-
progress.progress(num_complete / total_tests, f"Run {num_complete} of {total_tests} tests...")
|
45 |
|
46 |
|
47 |
def display_pricing_fact_test():
|
@@ -83,25 +75,24 @@ def display_pricing_fact_test():
|
|
83 |
st.write("### Run:")
|
84 |
st.write(f"**{total_tests}** total tests will be run")
|
85 |
if st.button("**Run**", disabled=(total_tests == 0), key="pricing_test_button"):
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
answer_stats = {}
|
90 |
for arch_name in selected_archs:
|
91 |
-
answer_stats[arch_name] = [0, 0] # [
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
|
|
|
|
|
|
98 |
else:
|
99 |
-
answer_stats[
|
100 |
-
|
101 |
-
if num_complete == total_tests:
|
102 |
-
progress.empty()
|
103 |
-
else:
|
104 |
-
progress.progress(num_complete / total_tests, f"Run {num_complete} of {total_tests} tests...")
|
105 |
table_data = []
|
106 |
for arch_name in selected_archs:
|
107 |
correct = answer_stats[arch_name][0]
|
@@ -110,7 +101,7 @@ def display_pricing_fact_test():
|
|
110 |
percent_correct = round(correct / total * 100, 1)
|
111 |
table_data.append([arch_name, correct, incorrect, total, f'{percent_correct:.1f}%'])
|
112 |
df = DataFrame(table_data, columns=['Architecture', 'Correct', 'Incorrect', 'Total', '% Correct'])
|
113 |
-
st.table(df.assign(
|
114 |
|
115 |
|
116 |
if Architecture.architectures is None:
|
|
|
6 |
from src.architectures import *
|
7 |
from src.common import generate_group_tag
|
8 |
from src.datatypes import *
|
9 |
+
from src.testing import TestGenerator, batch_test
|
10 |
from src.st_helpers import st_setup
|
11 |
|
12 |
|
|
|
30 |
st.write("### Run:")
|
31 |
st.write(f"**{total_tests}** total tests will be run")
|
32 |
if st.button("**Run**", disabled=(total_tests == 0), key="custom_test_button"):
|
33 |
+
with st.spinner():
|
34 |
+
questions = TestGenerator.get_random_questions(q_count)
|
35 |
+
batch_test(questions=questions, architectures=selected_archs,
|
36 |
+
trace_tags=[tag, "TestRunner"], trace_comment=comment)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
|
39 |
def display_pricing_fact_test():
|
|
|
75 |
st.write("### Run:")
|
76 |
st.write(f"**{total_tests}** total tests will be run")
|
77 |
if st.button("**Run**", disabled=(total_tests == 0), key="pricing_test_button"):
|
78 |
+
question_price_pairs = choices(question_price_pairs, k=q_count)
|
79 |
+
question_price_dict = {qpp[0]: qpp[1] for qpp in question_price_pairs}
|
80 |
+
questions = list(question_price_dict.keys())
|
81 |
answer_stats = {}
|
82 |
for arch_name in selected_archs:
|
83 |
+
answer_stats[arch_name] = [0, 0] # [correct, incorrect]
|
84 |
+
|
85 |
+
with st.spinner():
|
86 |
+
results: List[Tuple[str, str, str]] = batch_test(questions=questions, architectures=selected_archs,
|
87 |
+
trace_tags=[tag, "TestRunner"], trace_comment=comment)
|
88 |
+
for arch, query, response in results:
|
89 |
+
target_price = question_price_dict[query]
|
90 |
+
answer_price = get_price_from_response(response)
|
91 |
+
if target_price == answer_price:
|
92 |
+
answer_stats[arch][0] += 1
|
93 |
else:
|
94 |
+
answer_stats[arch][1] += 1
|
95 |
+
|
|
|
|
|
|
|
|
|
96 |
table_data = []
|
97 |
for arch_name in selected_archs:
|
98 |
correct = answer_stats[arch_name][0]
|
|
|
101 |
percent_correct = round(correct / total * 100, 1)
|
102 |
table_data.append([arch_name, correct, incorrect, total, f'{percent_correct:.1f}%'])
|
103 |
df = DataFrame(table_data, columns=['Architecture', 'Correct', 'Incorrect', 'Total', '% Correct'])
|
104 |
+
st.table(df.assign(no_index='').set_index('no_index'))
|
105 |
|
106 |
|
107 |
if Architecture.architectures is None:
|
src/testing.py
CHANGED
@@ -7,13 +7,69 @@ import sqlite3
|
|
7 |
import sys
|
8 |
|
9 |
from huggingface_hub import Repository
|
|
|
10 |
from random import choices
|
11 |
-
from
|
|
|
12 |
|
13 |
-
from src.architectures import Architecture
|
14 |
from src.common import data_dir
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
class TestGenerator:
|
18 |
"""
|
19 |
Wrapper class to hold testing questions and serve up examples
|
|
|
7 |
import sys
|
8 |
|
9 |
from huggingface_hub import Repository
|
10 |
+
from queue import Queue
|
11 |
from random import choices
|
12 |
+
from threading import Thread
|
13 |
+
from typing import Dict, List, Optional, Tuple
|
14 |
|
15 |
+
from src.architectures import Architecture, ArchitectureRequest
|
16 |
from src.common import data_dir
|
17 |
|
18 |
|
19 |
+
class ArchitectureTestWorker(Thread):
    """
    Worker thread that drains a shared queue of (architecture_name, request)
    pairs, running each request through the named architecture. A
    (None, None) item is the shutdown sentinel: the worker marks it done
    and exits its loop.
    """

    def __init__(self, work_queue: Queue, worker_name: str, trace_tags: List[str], trace_comment: str):
        super().__init__()
        self.work_queue = work_queue        # shared queue of (arch_name, request) work items
        self.worker_name = worker_name      # label used in progress logging
        self.trace_tags = trace_tags        # tags forwarded to every architecture call
        self.trace_comment = trace_comment  # comment forwarded to every architecture call

    def run(self):
        """Consume work items until the shutdown sentinel arrives."""
        while True:
            arch, request = self.work_queue.get()
            try:
                if arch is None:
                    # Shutdown sentinel: leave the loop (task_done still runs in finally).
                    break
                print(f'{self.worker_name} running "{request.request}" through {arch}')
                resolved = Architecture.get_architecture(arch)
                resolved(request, trace_tags=self.trace_tags, trace_comment=self.trace_comment)
            finally:
                # Always account for the item so queue.join() can complete,
                # even if the architecture call raises.
                self.work_queue.task_done()
|
40 |
+
|
41 |
+
|
42 |
+
def batch_test(questions: List[str], architectures: List[str], trace_comment: str = "",
               trace_tags: Optional[List[str]] = None, num_workers: int = 16) -> List[Tuple[str, str, str]]:
    """
    Create a worker pool and dispatch every question to every architecture in
    parallel, returning the answer per (architecture, question) pair.

    :param questions: The questions to run
    :param architectures: The names of the architectures to run them through
    :param trace_comment: Comment attached to the trace of every request
    :param trace_tags: Tags attached to the trace of every request
    :param num_workers: The number of worker threads to start
    :return: A list of tuples of (arch_name, question, answer)
    """
    # Avoid the shared-mutable-default pitfall: build a fresh list per call.
    if trace_tags is None:
        trace_tags = []

    work_queue: Queue = Queue()

    # Keep a handle on every request so the responses can be collected after join().
    # NOTE(review): duplicate questions for the same architecture collapse onto a
    # single key here, so the result can hold fewer entries than
    # len(questions) * len(architectures) — confirm callers de-duplicate if needed.
    question_record: Dict[Tuple[str, str], ArchitectureRequest] = {}
    for q in questions:
        for a in architectures:
            request = ArchitectureRequest(q)
            question_record[(a, q)] = request
            work_queue.put((a, request))

    for i in range(num_workers):
        worker = ArchitectureTestWorker(work_queue=work_queue, worker_name=f'Worker {i+1}',
                                        trace_tags=trace_tags, trace_comment=trace_comment)
        worker.daemon = True  # don't block interpreter exit on a stuck worker
        worker.start()
        work_queue.put((None, None))  # one shutdown sentinel per worker

    # Block until every queued item (requests and sentinels) is marked done.
    work_queue.join()

    # Repackage and return just the list of (arch_name, question, answer)
    return [(k[0], k[1], v.response) for k, v in question_record.items()]
|
71 |
+
|
72 |
+
|
73 |
class TestGenerator:
|
74 |
"""
|
75 |
Wrapper class to hold testing questions and serve up examples
|