| """ | |
| Test Runner - Logika przeprowadzania testów A/B (wersja Streamlit) | |
| """ | |
| import time | |
| import json | |
| import pandas as pd | |
| from datetime import datetime | |
| from pathlib import Path | |
| from io import BytesIO | |
| from docx import Document | |
| from docx.shared import Pt, RGBColor, Inches | |
| from docx.enum.text import WD_PARAGRAPH_ALIGNMENT | |


class TestRunner:
    """Manages running prompt A/B tests"""

    def __init__(self, api_handler):
        """
        Args:
            api_handler: APIHandler instance; it is expected to expose
                generate_response(prompt, model, temperature, max_tokens)
                and to return the reply as a string (API errors come back
                as strings starting with "ERROR")
        """
        self.api_handler = api_handler
        self.responses = []
        self.is_running = False
        self.should_cancel = False

    def run_test(self, prompt_a, prompt_b, num_responses, model, temperature,
                 max_tokens, progress_callback=None, log_callback=None):
        """
        Runs an A/B test

        Args:
            prompt_a: Content of prompt A (string)
            prompt_b: Content of prompt B (string)
            num_responses: Number of responses to generate per prompt
            model: OpenAI model
            temperature: Temperature
            max_tokens: Max tokens
            progress_callback: Optional function used to update a progress bar
            log_callback: Optional logging function

        Returns:
            list: List of response dictionaries
        """
        self.responses = []
        self.is_running = True
        self.should_cancel = False

        total_iterations = num_responses * 2
        current = 0

        # Generate responses for prompt A
        if log_callback:
            log_callback("🔄 Generowanie odpowiedzi dla PROMPTU A...")

        for i in range(num_responses):
            if self.should_cancel:
                if log_callback:
                    log_callback("⚠️ Test anulowany przez użytkownika")
                self.is_running = False
                return []

            current += 1
            if progress_callback:
                progress_callback(current, total_iterations)

            response = self.api_handler.generate_response(
                prompt_a, model, temperature, max_tokens
            )

            self.responses.append({
                'Option': 'A',
                'Response_ID': i + 1,
                'Response': response,
                'Score': None
            })

            if log_callback:
                if response.startswith("ERROR"):
                    log_callback(f" A-{i+1}/{num_responses}... ❌ {response}")
                else:
                    log_callback(f" A-{i+1}/{num_responses}... ✅ ({len(response)} znaków)")

            time.sleep(0.5)  # Short pause between requests

        # Generate responses for prompt B
        if log_callback:
            log_callback("\n🔄 Generowanie odpowiedzi dla PROMPTU B...")

        for i in range(num_responses):
            if self.should_cancel:
                if log_callback:
                    log_callback("⚠️ Test anulowany przez użytkownika")
                self.is_running = False
                return []

            current += 1
            if progress_callback:
                progress_callback(current, total_iterations)

            response = self.api_handler.generate_response(
                prompt_b, model, temperature, max_tokens
            )

            self.responses.append({
                'Option': 'B',
                'Response_ID': i + 1,
                'Response': response,
                'Score': None
            })

            if log_callback:
                if response.startswith("ERROR"):
                    log_callback(f" B-{i+1}/{num_responses}... ❌ {response}")
                else:
                    log_callback(f" B-{i+1}/{num_responses}... ✅ ({len(response)} znaków)")

            time.sleep(0.5)

        if log_callback:
            log_callback(f"\n✅ GENEROWANIE ZAKOŃCZONE - wygenerowano {len(self.responses)} odpowiedzi")

        self.is_running = False
        return self.responses
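
    # Example (illustrative sketch, not part of this module): how run_test might
    # be wired to a Streamlit page. `st`, the APIHandler import path and the
    # widget/variable names below are assumptions about the calling app.
    #
    #     import streamlit as st
    #     from api_handler import APIHandler  # hypothetical import path
    #
    #     runner = TestRunner(APIHandler())
    #     progress_bar = st.progress(0)
    #     log_area = st.empty()
    #     log_lines = []
    #
    #     def on_progress(current, total):
    #         progress_bar.progress(current / total)
    #
    #     def on_log(message):
    #         log_lines.append(message)
    #         log_area.text("\n".join(log_lines))
    #
    #     prompt_a, prompt_b = "Wersja A promptu...", "Wersja B promptu..."
    #     responses = runner.run_test(
    #         prompt_a, prompt_b, num_responses=3,
    #         model="gpt-4o-mini", temperature=0.7, max_tokens=500,
    #         progress_callback=on_progress, log_callback=on_log,
    #     )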

    def calculate_results(self, responses_with_scores):
        """
        Calculates the test results from the scores

        Args:
            responses_with_scores: List of responses with the scores filled in

        Returns:
            dict: Results in the format
                {'A': {'count': X, 'score': Y, 'min': ..., 'max': ...}, 'B': {...}}
        """
        results = {}

        for option in ['A', 'B']:
            option_responses = [r for r in responses_with_scores if r['Option'] == option]
            scores = [r['Score'] for r in option_responses if r['Score'] is not None]

            if scores:
                avg_score = sum(scores) / len(scores)
                results[option] = {
                    'count': len(scores),
                    'score': round(avg_score, 2),
                    'min': min(scores),
                    'max': max(scores)
                }
            else:
                # No scores for this option - fall back to zeros so that the
                # export methods, which read results['A'] and results['B']
                # unconditionally, do not raise a KeyError
                results[option] = {'count': 0, 'score': 0.0, 'min': 0, 'max': 0}

        return results
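
    # Worked example of the shapes calculate_results expects and returns
    # (hypothetical scores, shown for documentation only):
    #
    #     runner.calculate_results([
    #         {'Option': 'A', 'Response_ID': 1, 'Response': '...', 'Score': 4},
    #         {'Option': 'A', 'Response_ID': 2, 'Response': '...', 'Score': 5},
    #         {'Option': 'B', 'Response_ID': 1, 'Response': '...', 'Score': 3},
    #     ])
    #     # -> {'A': {'count': 2, 'score': 4.5, 'min': 4, 'max': 5},
    #     #     'B': {'count': 1, 'score': 3.0, 'min': 3, 'max': 3}}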

    def export_to_csv(self, responses_with_scores, results, settings):
        """
        Exports the results to CSV (returns BytesIO for a Streamlit download)

        Args:
            responses_with_scores: List of responses with scores
            results: Test results
            settings: Test settings

        Returns:
            BytesIO: CSV buffer ready for download
        """
        # Prepare the data to be written
        df = pd.DataFrame(responses_with_scores)

        # Add metadata as leading comment lines
        metadata = [
            f"# Test A/B Prompt - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"# Model: {settings.get('model', 'N/A')}",
            f"# Temperature: {settings.get('temperature', 'N/A')}",
            f"# Max Tokens: {settings.get('max_tokens', 'N/A')}",
            f"# Top P: {settings.get('top_p', 'N/A')}",
            f"# Num Responses: {settings.get('num_responses', 'N/A')}",
            "#",
            "# WYNIKI:",
            f"# Option A - Count: {results['A']['count']}, Score: {results['A']['score']}",
            f"# Option B - Count: {results['B']['count']}, Score: {results['B']['score']}",
            "#"
        ]

        # Write the metadata and the CSV body to a buffer; the CSV is rendered
        # to a string first so that only UTF-8 bytes ever go into the BytesIO
        buffer = BytesIO()
        buffer.write(("\n".join(metadata) + "\n").encode('utf-8'))
        buffer.write(df.to_csv(index=False).encode('utf-8'))
        buffer.seek(0)

        return buffer

    def cancel_test(self):
        """Cancels the running test"""
        self.should_cancel = True

    def export_to_excel(self, responses_with_scores, results, settings):
        """
        Exports the results to Excel (returns BytesIO for a Streamlit download)

        Args:
            responses_with_scores: List of responses with scores
            results: Test results
            settings: Test settings

        Returns:
            BytesIO: Excel buffer ready for download
        """
        buffer = BytesIO()

        with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
            # Sheet 1: summary
            summary_data = {
                'Parametr': [
                    'Data testu',
                    'Model',
                    'Temperature',
                    'Max Tokens',
                    'Liczba odpowiedzi',
                    '',
                    'Option A - Średnia ocena',
                    'Option A - Liczba',
                    'Option A - Min',
                    'Option A - Max',
                    '',
                    'Option B - Średnia ocena',
                    'Option B - Liczba',
                    'Option B - Min',
                    'Option B - Max'
                ],
                'Wartość': [
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    settings.get('model', 'N/A'),
                    settings.get('temperature', 'N/A'),
                    settings.get('max_tokens', 'N/A'),
                    settings.get('num_responses', 'N/A'),
                    '',
                    results['A']['score'],
                    results['A']['count'],
                    results['A']['min'],
                    results['A']['max'],
                    '',
                    results['B']['score'],
                    results['B']['count'],
                    results['B']['min'],
                    results['B']['max']
                ]
            }
            df_summary = pd.DataFrame(summary_data)
            df_summary.to_excel(writer, sheet_name='Podsumowanie', index=False)

            # Sheet 2: all responses
            df_responses = pd.DataFrame(responses_with_scores)
            df_responses.to_excel(writer, sheet_name='Odpowiedzi', index=False)

        buffer.seek(0)
        return buffer

    def export_to_json(self, responses_with_scores, results, settings):
        """
        Exports the results to JSON (returns BytesIO for a Streamlit download)

        Args:
            responses_with_scores: List of responses with scores
            results: Test results
            settings: Test settings

        Returns:
            BytesIO: JSON buffer ready for download
        """
        data = {
            'metadata': {
                'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'model': settings.get('model', 'N/A'),
                'temperature': settings.get('temperature', 'N/A'),
                'max_tokens': settings.get('max_tokens', 'N/A'),
                'num_responses': settings.get('num_responses', 'N/A')
            },
            'results': results,
            'responses': responses_with_scores
        }

        buffer = BytesIO()
        json_str = json.dumps(data, ensure_ascii=False, indent=2)
        buffer.write(json_str.encode('utf-8'))
        buffer.seek(0)

        return buffer
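
    # Shape of the exported JSON, for reference (values are illustrative):
    #
    #     {
    #       "metadata": {"timestamp": "...", "model": "...", "temperature": 0.7,
    #                    "max_tokens": 500, "num_responses": 3},
    #       "results": {"A": {"count": 3, "score": 4.33, "min": 4, "max": 5},
    #                   "B": {"count": 3, "score": 3.67, "min": 3, "max": 4}},
    #       "responses": [{"Option": "A", "Response_ID": 1, "Response": "...", "Score": 4}, ...]
    #     }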

    def export_to_txt(self, responses_with_scores, results, settings):
        """
        Exports the results to TXT (returns BytesIO for a Streamlit download)

        Args:
            responses_with_scores: List of responses with scores
            results: Test results
            settings: Test settings

        Returns:
            BytesIO: TXT buffer ready for download
        """
        buffer = BytesIO()

        # Header
        lines = [
            "=" * 80,
            "WYNIKI TESTU A/B PROMPTÓW",
            "=" * 80,
            "",
            f"Data testu: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Model: {settings.get('model', 'N/A')}",
            f"Temperature: {settings.get('temperature', 'N/A')}",
            f"Max Tokens: {settings.get('max_tokens', 'N/A')}",
            f"Liczba odpowiedzi: {settings.get('num_responses', 'N/A')}",
            "",
            "=" * 80,
            "PODSUMOWANIE WYNIKÓW",
            "=" * 80,
            "",
            "Option A:",
            f" Średnia ocena: {results['A']['score']}",
            f" Liczba: {results['A']['count']}",
            f" Min: {results['A']['min']}",
            f" Max: {results['A']['max']}",
            "",
            "Option B:",
            f" Średnia ocena: {results['B']['score']}",
            f" Liczba: {results['B']['count']}",
            f" Min: {results['B']['min']}",
            f" Max: {results['B']['max']}",
            "",
            "=" * 80,
            "WSZYSTKIE ODPOWIEDZI",
            "=" * 80,
            ""
        ]

        # Responses
        for resp in responses_with_scores:
            lines.extend([
                f"\nOption: {resp['Option']}-{resp['Response_ID']}",
                f"Ocena: {resp['Score']}",
                "-" * 80,
                f"{resp['Response']}",
                "-" * 80
            ])

        text = "\n".join(lines)
        buffer.write(text.encode('utf-8'))
        buffer.seek(0)

        return buffer

    def export_to_markdown(self, responses_with_scores, results, settings):
        """
        Exports the results to Markdown (returns BytesIO for a Streamlit download)

        Args:
            responses_with_scores: List of responses with scores
            results: Test results
            settings: Test settings

        Returns:
            BytesIO: Markdown buffer ready for download
        """
        buffer = BytesIO()

        lines = [
            "# Wyniki Testu A/B Promptów",
            "",
            "## Metadata",
            "",
            f"- **Data testu**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"- **Model**: {settings.get('model', 'N/A')}",
            f"- **Temperature**: {settings.get('temperature', 'N/A')}",
            f"- **Max Tokens**: {settings.get('max_tokens', 'N/A')}",
            f"- **Liczba odpowiedzi**: {settings.get('num_responses', 'N/A')}",
            "",
            "## Podsumowanie Wyników",
            "",
            "| Option | Średnia Ocena | Liczba | Min | Max |",
            "|--------|---------------|--------|-----|-----|",
            f"| A | {results['A']['score']:.2f} | {results['A']['count']} | {results['A']['min']} | {results['A']['max']} |",
            f"| B | {results['B']['score']:.2f} | {results['B']['count']} | {results['B']['min']} | {results['B']['max']} |",
            ""
        ]

        # Winner
        if results['A']['score'] > results['B']['score']:
            diff = results['A']['score'] - results['B']['score']
            lines.append(f"### 🏆 Zwycięzca: Prompt A (przewaga: +{diff:.2f})")
        elif results['B']['score'] > results['A']['score']:
            diff = results['B']['score'] - results['A']['score']
            lines.append(f"### 🏆 Zwycięzca: Prompt B (przewaga: +{diff:.2f})")
        else:
            lines.append("### 🤝 Remis")

        lines.extend([
            "",
            "## Wszystkie Odpowiedzi",
            ""
        ])

        # Responses
        for resp in responses_with_scores:
            lines.extend([
                f"### Option {resp['Option']}-{resp['Response_ID']} (Ocena: {resp['Score']})",
                "",
                "```",
                resp['Response'],
                "```",
                ""
            ])

        text = "\n".join(lines)
        buffer.write(text.encode('utf-8'))
        buffer.seek(0)

        return buffer

    def export_to_word(self, responses_with_scores, results, settings):
        """
        Exports the results to Word (returns BytesIO for a Streamlit download)

        Args:
            responses_with_scores: List of responses with scores
            results: Test results
            settings: Test settings

        Returns:
            BytesIO: Word buffer ready for download
        """
        doc = Document()

        # Title
        title = doc.add_heading('Wyniki Testu A/B Promptów', 0)
        title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

        # Metadata
        doc.add_heading('Metadata', level=1)
        metadata_items = [
            f"Data testu: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Model: {settings.get('model', 'N/A')}",
            f"Temperature: {settings.get('temperature', 'N/A')}",
            f"Max Tokens: {settings.get('max_tokens', 'N/A')}",
            f"Liczba odpowiedzi: {settings.get('num_responses', 'N/A')}"
        ]
        for item in metadata_items:
            doc.add_paragraph(item, style='List Bullet')

        # Results summary
        doc.add_heading('Podsumowanie Wyników', level=1)

        # Table: header row plus one row per option
        table = doc.add_table(rows=3, cols=5)
        table.style = 'Light Grid Accent 1'

        # Header
        headers = ['Option', 'Średnia Ocena', 'Liczba', 'Min', 'Max']
        for i, header in enumerate(headers):
            table.rows[0].cells[i].text = header

        # Option A
        table.rows[1].cells[0].text = 'A'
        table.rows[1].cells[1].text = f"{results['A']['score']:.2f}"
        table.rows[1].cells[2].text = str(results['A']['count'])
        table.rows[1].cells[3].text = str(results['A']['min'])
        table.rows[1].cells[4].text = str(results['A']['max'])

        # Option B
        table.rows[2].cells[0].text = 'B'
        table.rows[2].cells[1].text = f"{results['B']['score']:.2f}"
        table.rows[2].cells[2].text = str(results['B']['count'])
        table.rows[2].cells[3].text = str(results['B']['min'])
        table.rows[2].cells[4].text = str(results['B']['max'])

        # Winner
        doc.add_paragraph()
        if results['A']['score'] > results['B']['score']:
            diff = results['A']['score'] - results['B']['score']
            winner_para = doc.add_paragraph()
            winner_run = winner_para.add_run(f"🏆 Zwycięzca: Prompt A (przewaga: +{diff:.2f})")
            winner_run.bold = True
            winner_run.font.size = Pt(14)
        elif results['B']['score'] > results['A']['score']:
            diff = results['B']['score'] - results['A']['score']
            winner_para = doc.add_paragraph()
            winner_run = winner_para.add_run(f"🏆 Zwycięzca: Prompt B (przewaga: +{diff:.2f})")
            winner_run.bold = True
            winner_run.font.size = Pt(14)
        else:
            winner_para = doc.add_paragraph()
            winner_run = winner_para.add_run("🤝 Remis")
            winner_run.bold = True
            winner_run.font.size = Pt(14)

        # All responses
        doc.add_page_break()
        doc.add_heading('Wszystkie Odpowiedzi', level=1)

        for resp in responses_with_scores:
            doc.add_heading(f"Option {resp['Option']}-{resp['Response_ID']} (Ocena: {resp['Score']})", level=2)
            doc.add_paragraph(resp['Response'])
            doc.add_paragraph()

        # Save to buffer
        buffer = BytesIO()
        doc.save(buffer)
        buffer.seek(0)

        return buffer
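

# Example (illustrative sketch): handing one of the export buffers to Streamlit's
# st.download_button. The `st` import, the `runner` / `responses` / `results` /
# `settings` variables and the file names below are assumptions about the calling
# app; the MIME types are the standard ones for these formats.
#
#     import streamlit as st
#
#     csv_buffer = runner.export_to_csv(responses, results, settings)
#     st.download_button(
#         "Pobierz CSV", data=csv_buffer,
#         file_name="ab_test_results.csv", mime="text/csv",
#     )
#
#     docx_buffer = runner.export_to_word(responses, results, settings)
#     st.download_button(
#         "Pobierz DOCX", data=docx_buffer,
#         file_name="ab_test_results.docx",
#         mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
#     )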