Spaces:
Running
Running
Upload 12 files
Browse files
- app.py +173 -0
- dynamic_cheatsheet/.DS_Store +0 -0
- dynamic_cheatsheet/__init__.py +0 -0
- dynamic_cheatsheet/language_model.py +298 -0
- dynamic_cheatsheet/utils/__init__.py +0 -0
- dynamic_cheatsheet/utils/evaluation.py +293 -0
- dynamic_cheatsheet/utils/execute_code.py +101 -0
- dynamic_cheatsheet/utils/extractor.py +115 -0
- dynamic_cheatsheet/utils/sonnet_eval.py +511 -0
- prompts/curator_prompt_for_dc_cumulative.txt +149 -0
- prompts/generator_prompt.txt +81 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
# Make sure SAMBANOVA_BASE_URL is present in the environment for litellm.
# The value is normally consumed at runtime when the API call is made, but
# exporting it this early in app.py (before dynamic_cheatsheet.language_model
# is imported) is a safeguard in case anything consults it at import time.
SAMBANOVA_DEFINED_BASE_URL = "https://api.sambanova.ai/v1"
try:
    # A value that is already exported always wins; only warn when it differs
    # from the URL this app expects.
    if os.environ["SAMBANOVA_BASE_URL"] != SAMBANOVA_DEFINED_BASE_URL:
        print(f"Warning: SAMBANOVA_BASE_URL environment variable is already set to {os.environ['SAMBANOVA_BASE_URL']}, but app expects {SAMBANOVA_DEFINED_BASE_URL}. Using the existing one.")
except KeyError:
    # Nothing exported yet: install the default and say so.
    os.environ["SAMBANOVA_BASE_URL"] = SAMBANOVA_DEFINED_BASE_URL
    print(f"SAMBANOVA_BASE_URL environment variable set to: {SAMBANOVA_DEFINED_BASE_URL}")
|
12 |
+
|
13 |
+
import gradio as gr
|
14 |
+
import sys
|
15 |
+
|
16 |
+
# Add the project root to the Python path to allow importing dynamic_cheatsheet
|
17 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
|
18 |
+
|
19 |
+
from dynamic_cheatsheet.language_model import LanguageModel
|
20 |
+
|
21 |
+
# --- Configuration ---
SAMBANOVA_API_KEY = os.environ.get("SAMBANOVA_API_KEY")
# SAMBANOVA_BASE_URL is exported to the environment near the top of this file
# (from SAMBANOVA_DEFINED_BASE_URL) when it is not already present.
SAMBANOVA_MODEL_NAME = "samba/DeepSeek-R1-Distill-Llama-70B"  # "samba/" prefix marks a SambaNova model

# Prompt template locations; relative paths are resolved against the current
# working directory.
GENERATOR_PROMPT_PATH = "prompts/generator_prompt.txt"
CURATOR_PROMPT_PATH = "prompts/curator_prompt_for_dc_cumulative.txt"

GENERATOR_PROMPT = ""
CURATOR_PROMPT = ""

try:
    # Decode explicitly as UTF-8 so the templates read the same regardless of
    # the platform's default locale encoding.
    with open(GENERATOR_PROMPT_PATH, "r", encoding="utf-8") as f:
        GENERATOR_PROMPT = f.read()
    with open(CURATOR_PROMPT_PATH, "r", encoding="utf-8") as f:
        CURATOR_PROMPT = f.read()
except FileNotFoundError:
    # If either file is missing, BOTH prompts fall back to the minimal
    # built-in templates so generator and curator stay a matching pair.
    print(f"Error: Prompt files not found at {GENERATOR_PROMPT_PATH} or {CURATOR_PROMPT_PATH}. Please ensure they exist.")
    GENERATOR_PROMPT = "You are a helpful assistant. Given a question and a cheatsheet, provide an answer. Cheatsheet: [[CHEATSHEET]] Question: [[QUESTION]] FINAL ANSWER: <answer></answer>"
    CURATOR_PROMPT = "You are a helpful assistant. Given a question, a model answer, and a previous cheatsheet, update the cheatsheet. Previous Cheatsheet: [[PREVIOUS_CHEATSHEET]] Question: [[QUESTION]] Model Answer: [[MODEL_ANSWER]] NEW CHEATSHEET: <cheatsheet></cheatsheet>"

# --- Global variable for cheatsheet ---
# Most recently generated cheatsheet, shared between the two UI callbacks.
current_cheatsheet_cache = "(empty)"
|
44 |
+
|
45 |
+
def initialize_model():
    """Build a ``LanguageModel`` for the configured SambaNova model.

    Returns:
        A ``LanguageModel`` created with ``SAMBANOVA_MODEL_NAME``.

    Raises:
        gr.Error: If ``SAMBANOVA_API_KEY`` was not found in the environment,
            so the problem is surfaced in the Gradio UI.
    """
    if SAMBANOVA_API_KEY:
        # LanguageModel handles the "samba/" prefix itself and reads the API
        # key / base URL from the environment.
        return LanguageModel(model_name=SAMBANOVA_MODEL_NAME)
    raise gr.Error("SAMBANOVA_API_KEY environment variable not set. Please set it in your Hugging Face Space secrets or local environment.")
|
53 |
+
|
54 |
+
def generate_cheatsheet_func(training_data_text, progress=gr.Progress(track_tqdm=True)):
    """Build a cumulative cheatsheet from newline-separated training examples.

    Each non-blank line is fed to the model in turn, together with the
    cheatsheet produced so far. The final cheatsheet is stored in the
    module-level ``current_cheatsheet_cache`` and returned for display.

    Args:
        training_data_text: Raw textbox content, one example per line.
        progress: Gradio progress tracker used to report per-example progress.

    Returns:
        The generated cheatsheet text, or a notice if the input was empty
        (in which case the cache is reset to ``"(empty)"``).
    """
    global current_cheatsheet_cache

    if not training_data_text.strip():
        current_cheatsheet_cache = "(empty)"
        return "Training data is empty. Cheatsheet reset to (empty)."

    model = initialize_model()

    # Keep only lines that still have content after trimming.
    training_examples = []
    for raw_line in training_data_text.split("\n"):
        trimmed = raw_line.strip()
        if trimmed:
            training_examples.append(trimmed)
    total_examples = len(training_examples)

    cheatsheet_content = "(empty)"

    progress(0, desc="Initializing Cheatsheet Generation")
    tracked_examples = progress.tqdm(training_examples, desc="Generating Cheatsheet")
    for position, example_input in enumerate(tracked_examples, start=1):
        print(f"Processing training example {position}/{total_examples}: {example_input[:50]}...")
        try:
            results_dict = model.advanced_generate(
                approach_name="DynamicCheatsheet_Cumulative",
                input_txt=example_input,
                cheatsheet=cheatsheet_content,
                generator_template=GENERATOR_PROMPT,
                cheatsheet_template=CURATOR_PROMPT,
                temperature=0.1,
                max_tokens=1024,
            )
            # Fall back to the current cheatsheet if the key is absent.
            cheatsheet_content = results_dict.get("final_cheatsheet", cheatsheet_content)
        except Exception as e:
            # Best effort: keep the cheatsheet built so far, report the
            # failure in the console and the UI, and move on.
            print(f"Error processing example '{example_input[:50]}...': {e}")
            gr.Warning(f"Error on example '{example_input[:30]}...': {e}. Skipping this example.")

    current_cheatsheet_cache = cheatsheet_content
    return current_cheatsheet_cache
|
87 |
+
|
88 |
+
def get_answers_func(user_query):
    """Answer a query twice: once with the cached cheatsheet, once without.

    Args:
        user_query: The question typed into the UI.

    Returns:
        A 2-tuple ``(answer_with_cheatsheet, answer_without_cheatsheet)``.
        Each element is the extracted answer, or an error message if that
        inference failed. A blank or missing query yields
        ``("Query is empty.", "Query is empty.")`` without contacting the model.

    Raises:
        gr.Error: Propagated from ``initialize_model`` when the API key is
            missing.
    """
    if not user_query or not user_query.strip():
        return "Query is empty.", "Query is empty."

    model = initialize_model()

    def run_inference(label, approach_name, cheatsheet):
        """Run one generation and return its answer or an error string."""
        try:
            results = model.advanced_generate(
                approach_name=approach_name,
                input_txt=user_query,
                cheatsheet=cheatsheet,
                generator_template=GENERATOR_PROMPT,
                cheatsheet_template=CURATOR_PROMPT,
                temperature=0.1,
                max_tokens=512,
            )
            return results.get("final_answer", "Error: Could not extract answer.")
        except Exception as e:
            # Keep the two inferences independent: a failure in one must not
            # prevent the other from being shown.
            print(f"Error ({label}): {e}")
            return f"Error during inference {label}: {e}"

    # Inference WITH cheatsheet
    print(f"Querying WITH cheatsheet ({current_cheatsheet_cache[:50]}...)")
    answer_with_cheatsheet = run_inference(
        "with cheatsheet", "DynamicCheatsheet_Cumulative", current_cheatsheet_cache
    )

    # Inference WITHOUT cheatsheet
    # The "default" approach fills [[CHEATSHEET]] with "(empty)", i.e. it
    # builds the same generator prompt as the cumulative approach with an
    # empty cheatsheet, but it does not issue the extra curator call whose
    # result would be thrown away here.
    print("Querying WITHOUT cheatsheet...")
    answer_without_cheatsheet = run_inference(
        "without cheatsheet", "default", "(empty)"
    )

    return answer_with_cheatsheet, answer_without_cheatsheet
|
132 |
+
|
133 |
+
# --- Gradio Interface ---
# Two tabs: the first builds the cheatsheet from training examples, the second
# compares answers produced with and without that cheatsheet.
with gr.Blocks(title="Task Caching Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Task Caching Demo")
    gr.Markdown(
        "Demonstrates the effect of using a dynamically generated cheatsheet "
        "(Task Caching) on model inference. Uses SambaNova API via `litellm`."
    )

    with gr.Tabs():
        with gr.TabItem("1. Generate Cheatsheet (Task Caching)"):
            gr.Markdown(
                "Paste your training data below, one example per line. "
                "This data will be used to build a cumulative cheatsheet. "
                "The process may take some time depending on the number of examples."
            )
            training_data_input = gr.Textbox(
                label="Training Data",
                lines=10,
                placeholder="Example 1: What is the capital of France?\nExample 2: Solve 2+2.",
            )
            generate_cheatsheet_button = gr.Button(
                "Generate Cheatsheet (Task Caching)",
                variant="primary",
            )
            cheatsheet_output = gr.Textbox(
                label="Generated Cheatsheet",
                lines=15,
                interactive=False,
                show_label=True,
            )
            # Clicking runs the (potentially slow) cumulative generation and
            # shows the resulting cheatsheet in the read-only box.
            generate_cheatsheet_button.click(
                fn=generate_cheatsheet_func,
                inputs=training_data_input,
                outputs=cheatsheet_output,
                show_progress="full",
            )

        with gr.TabItem("2. Test Inference"):
            gr.Markdown(
                "Enter your query below. The model will attempt to answer it twice: "
                "once using the generated cheatsheet (if any), and once without it."
            )
            query_input = gr.Textbox(
                label="Your Query",
                lines=3,
                placeholder="e.g., What is the solution to 5 6 6 8 in the Game of 24?",
            )
            get_answers_button = gr.Button("Get Answers", variant="primary")

            # Side-by-side answers for easy comparison.
            with gr.Row():
                answer_with_cheatsheet_output = gr.Textbox(
                    label="Answer WITH Cheatsheet",
                    lines=10,
                    interactive=False,
                    show_label=True,
                )
                answer_without_cheatsheet_output = gr.Textbox(
                    label="Answer WITHOUT Cheatsheet",
                    lines=10,
                    interactive=False,
                    show_label=True,
                )

            get_answers_button.click(
                fn=get_answers_func,
                inputs=query_input,
                outputs=[answer_with_cheatsheet_output, answer_without_cheatsheet_output],
            )

    gr.Markdown(
        "**Important:** Ensure `SAMBANOVA_API_KEY` is set as a secret in your Hugging Face Space "
        "or as an environment variable if running locally. "
        "`SAMBANOVA_BASE_URL` is set to `https://api.sambanova.ai/v1` by default if not found in environment."
    )

if __name__ == "__main__":
    # Warn on the console before launching; the UI callbacks raise their own
    # error if the key is still missing when they run.
    if not SAMBANOVA_API_KEY:
        print(
            "Warning: SAMBANOVA_API_KEY is not set. The application will likely fail to contact the SambaNova API.\n"
            "Please set the SAMBANOVA_API_KEY environment variable."
        )
    demo.launch()
|
173 |
+
|
dynamic_cheatsheet/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
dynamic_cheatsheet/__init__.py
ADDED
File without changes
|
dynamic_cheatsheet/language_model.py
ADDED
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import tiktoken
|
3 |
+
from typing import List, Tuple
|
4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
+
from .utils.execute_code import extract_and_run_python_code
|
6 |
+
from .utils.extractor import extract_answer, extract_cheatsheet
|
7 |
+
from litellm import completion
|
8 |
+
from functools import partial
|
9 |
+
import os # Added for SAMBANOVA env vars
|
10 |
+
|
11 |
+
class LanguageModel:
    """Wrapper around ``litellm.completion`` with cheatsheet-based generation.

    The instance stores a pre-configured completion callable in ``self.client``
    and a GPT-4o tokenizer in ``self.gpt4Tokenizer`` (used only for counting
    tokens).
    """

    def __init__(self,
        model_name: str,
    ) -> None:
        """
        LanguageModel class to interact with different language models.

        Arguments:
            model_name : str : The name of the language model to use. Names
                starting with "samba/" are routed to SambaNova's
                OpenAI-compatible endpoint.

        Raises:
            ValueError : If a SambaNova model is requested without
                SAMBANOVA_API_KEY being set, or if the fallback
                initialization fails.
        """

        self.model_name = model_name

        # Models that are passed to litellm unchanged.
        known_model_list = [
            "openai/gpt-4o-mini", "openai/gpt-4o-mini-2024-07-18",
            "openai/gpt-4o", "openai/gpt-4o-2024-08-06", "openai/gpt-4o-2024-11-20",
            "openai/gpt-3.5-turbo",
            "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo",
            "meta-llama/Llama-3.3-70B-Instruct-Turbo",
            "openai/o3-mini", "openai/o3-mini-2025-01-31",
            "openai/o1", "openai/o1-2024-12-17",
            "anthropic/claude-3-5-sonnet-latest", "anthropic/claude-3-5-sonnet-20241022",
            "anthropic/claude-3-5-haiku-latest", "anthropic/claude-3-5-haiku-20241022",
            "anthropic/claude-3-7-sonnet-latest", "anthropic/claude-3-7-sonnet-20250219",
            "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
            "together_ai/deepseek-ai/DeepSeek-R1",
            "together_ai/deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
            "together_ai/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
            "together_ai/Qwen/Qwen2.5-Coder-32B-Instruct",
            "together_ai/Qwen/QwQ-32B",
            "together_ai/Qwen/Qwen2-72B-Instruct",
            "together_ai/Qwen/Qwen2.5-7B-Instruct-Turbo",
            "together_ai/Qwen/Qwen2.5-72B-Instruct-Turbo",
            "gemini/gemini-2.0-flash",
            "ollama/llama3:70b",
        ]

        # Load the client for the model based on the model name
        if self.model_name.startswith("samba/"):
            samba_api_key = os.environ.get("SAMBANOVA_API_KEY")
            samba_base_url = os.environ.get("SAMBANOVA_BASE_URL", "https://api.sambanova.ai/v1")  # Default if not set
            if not samba_api_key:
                raise ValueError("SAMBANOVA_API_KEY environment variable not set for SambaNova model.")
            # litellm does not recognize "samba/" as a provider prefix, so
            # SambaNova is addressed as a custom OpenAI-compatible endpoint:
            # the request carries the bare model id (prefix stripped) while
            # api_key, api_base and custom_llm_provider="openai" select the
            # endpoint.
            actual_model_name = self._strip_samba_prefix(self.model_name)
            self.client = partial(completion,
                model=actual_model_name,
                api_key=samba_api_key,
                api_base=samba_base_url,
                custom_llm_provider="openai"
            )
            print(f"Initialized SambaNova model '{actual_model_name}' via custom OpenAI provider settings with api_base: {samba_base_url}")
        elif self.model_name in known_model_list:
            self.client = partial(completion, model=self.model_name)
        else:
            print(f"Warning: Model '{self.model_name}' not in explicit list and does not start with recognized prefixes. Attempting to initialize with litellm directly.")
            try:
                self.client = partial(completion, model=self.model_name)
                print(f"Successfully initialized model '{self.model_name}' via litellm fallback.")
            except Exception as e:
                raise ValueError(f"Model '{self.model_name}' is not in the known list, does not start with recognized prefixes, and could not be initialized by litellm directly: {e}") from e
        self.gpt4Tokenizer = tiktoken.encoding_for_model("gpt-4o")

    @staticmethod
    def _strip_samba_prefix(model_name: str) -> str:
        """Return ``model_name`` without a leading "samba/" prefix, if any."""
        prefix = "samba/"
        if model_name.startswith(prefix):
            return model_name[len(prefix):]
        return model_name

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the text (using the GPT-4o tokenizer).
        """
        tokens = self.gpt4Tokenizer.encode(text)
        return len(tokens)

    def generate(self,
        history: List[dict],
        temperature: float = 0.1,
        max_tokens: int = 2048,
        current_depth: int = 1,
        max_depth_num_rounds: int = 3,
        allow_code_execution: bool = True,
        code_execution_flag: str = "EXECUTE CODE!",
        final_output: str = ""
    ) -> str:
        """
        Generate a response from the language model.

        When code execution is allowed and the reply contains
        ``code_execution_flag`` directly after a closed code fence, the code
        is run, its result is appended to the transcript, and the model is
        asked to continue (recursively, until ``max_depth_num_rounds`` is
        exceeded).

        Arguments:
            history : list of chat messages ({"role", "content"} dicts). It is
                not modified.
            temperature, max_tokens : forwarded to the completion call.
            current_depth : 1-based index of the current round.
            max_depth_num_rounds : last round after which no follow-up call
                is made.
            allow_code_execution : whether flagged code may be executed.
            code_execution_flag : marker the model emits to request execution.
            final_output : transcript accumulated by earlier rounds.

        Returns:
            The accumulated transcript, stripped of surrounding whitespace.

        Raises:
            ValueError : If the history is empty.
        """
        if not history:
            raise ValueError("History must contain at least one message.")

        # self.client already carries the model (and, for SambaNova, the API
        # key, base URL and provider) via functools.partial.
        response = self.client(
            messages=history,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        # Treat a missing message body (content=None) as empty text instead
        # of crashing on the string operations below.
        output = response.choices[0].message.content or ""

        pre_code_execution_flag = output.split(code_execution_flag)[0].strip()
        if allow_code_execution and code_execution_flag in output and pre_code_execution_flag.endswith("```"):
            executed_code = extract_and_run_python_code(pre_code_execution_flag)
            executed_code = executed_code.strip()
            current_output = f"{pre_code_execution_flag}\n{code_execution_flag}\n\n{executed_code}"
            final_output = f"{final_output}\n\n{current_output}".strip()

            if current_depth > max_depth_num_rounds:
                return final_output

            warning_txt = ""
            if current_depth == max_depth_num_rounds:
                warning_txt = " (This is the last round. No more code execution will be allowed. Please present your final solution now.)"
            new_messages = [
                {"role": "assistant", "content": current_output},
                {"role": "user", "content": f"Proceed with any additional steps required and provide the completed solution. If everything is already complete, type FINAL ANSWER and submit it in the expected format. If you are stuck, please try alternative methods to solve the problem and provide the final solution.{warning_txt}"}
            ]
            # Build a new list rather than using "+=", which would extend the
            # caller's history in place.
            return self.generate(
                history=history + new_messages,
                temperature=temperature,
                max_tokens=max_tokens,
                current_depth=current_depth+1,
                max_depth_num_rounds=max_depth_num_rounds,
                allow_code_execution=allow_code_execution,
                code_execution_flag=code_execution_flag,
                final_output=final_output,
            )

        final_output = f"{final_output}\n\n{output}".strip()
        return final_output

    def _run_generator(self,
        generator_prompt: str,
        temperature: float,
        max_tokens: int,
        allow_code_execution: bool,
        code_execution_flag: str,
    ) -> Tuple[str, str]:
        """Send one generator prompt; return (full output, extracted answer)."""
        generator_output = self.generate(
            history=[{"role": "user", "content": generator_prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
            allow_code_execution=allow_code_execution,
            code_execution_flag=code_execution_flag,
        )
        return generator_output, extract_answer(generator_output)

    def advanced_generate(self,
        approach_name: str,
        input_txt: str,
        cheatsheet: str = None,
        generator_template: str = None,
        cheatsheet_template: str = None,
        temperature: float = 0.0,
        max_tokens: int = 2048,
        max_num_rounds: int = 1,
        allow_code_execution: bool = True,
        code_execution_flag: str = "EXECUTE CODE!",
        add_previous_answers_to_cheatsheet: bool = True,
        original_input_corpus: List[str] = None,
        original_input_embeddings: np.ndarray = None,
        generator_outputs_so_far: List[str] = None,
        retrieve_top_k: int = 3,
    ) -> dict:
        """
        Generate a response using one of the supported approaches.

        Approaches:
            "default" : single generator call with an "(empty)" cheatsheet.
            "DynamicCheatsheet_Cumulative" : per round, a generator call
                followed by a curator call that rewrites the cheatsheet.
            "FullHistoryAppending" : single generator call whose cheatsheet
                is the concatenation of previous inputs and outputs.

        ``original_input_embeddings`` and ``retrieve_top_k`` are accepted but
        not used by the approaches above.

        Returns:
            dict with keys "input_txt", "steps", "previous_answers",
            "final_answer", "final_cheatsheet" and "final_output".

        Raises:
            ValueError : If ``approach_name`` is unknown or a template /
                cheatsheet required by the chosen approach is missing.
        """

        if approach_name == "default":
            if generator_template is None:
                raise ValueError("Generator template must be provided for default approach.")
            generator_prompt = generator_template.replace("[[QUESTION]]", input_txt).replace("[[CHEATSHEET]]", "(empty)")
            generator_output, generator_answer = self._run_generator(
                generator_prompt, temperature, max_tokens, allow_code_execution, code_execution_flag
            )
            return {
                "input_txt": input_txt,
                "steps": [
                    {
                        "round": 0,
                        "generator_prompt": generator_prompt,
                        "generator_output": generator_output,
                        "generator_answer": generator_answer,
                        "current_cheatsheet": None,
                        "new_cheatsheet": None,
                    }
                ],
                "previous_answers": None,
                "final_answer": generator_answer,
                "final_output": generator_output,
                "final_cheatsheet": None,
            }

        elif approach_name == "DynamicCheatsheet_Cumulative":
            if cheatsheet is None:
                raise ValueError("Cheatsheet must be provided for DynamicCheatsheet_Cumulative approach.")
            if generator_template is None or cheatsheet_template is None:
                raise ValueError("Generator and Cheatsheet templates must be provided for DynamicCheatsheet_Cumulative approach.")

            steps = []
            previous_answers = []
            current_cheatsheet_in_round = cheatsheet  # Use a local var for the loop

            # At least one round always runs, even if max_num_rounds < 1.
            for round_num in range(max(1, max_num_rounds)):
                generator_cheatsheet_content = current_cheatsheet_in_round
                if round_num > 0 and add_previous_answers_to_cheatsheet and previous_answers:
                    previous_answers_txt = f"PREVIOUS ANSWERS:\n{'; '.join(previous_answers)}"
                    generator_cheatsheet_content = f"{generator_cheatsheet_content}\n\n{previous_answers_txt}"

                generator_prompt = generator_template.replace("[[QUESTION]]", input_txt).replace("[[CHEATSHEET]]", generator_cheatsheet_content)
                generator_output, generator_answer = self._run_generator(
                    generator_prompt, temperature, max_tokens, allow_code_execution, code_execution_flag
                )

                # The curator sees the generator's full output and the
                # cheatsheet as it was before this round.
                cheatsheet_prompt = cheatsheet_template.replace("[[QUESTION]]", input_txt).replace("[[MODEL_ANSWER]]", generator_output).replace("[[PREVIOUS_CHEATSHEET]]", current_cheatsheet_in_round)
                cheatsheet_history = [{"role": "user", "content": cheatsheet_prompt}]
                # The curator uses the same model instance as the generator.
                cheatsheet_model_output = self.generate(
                    history=cheatsheet_history,
                    temperature=temperature,
                    max_tokens=2*max_tokens,  # curator gets twice the generator budget
                    allow_code_execution=False,
                )
                new_cheatsheet = extract_cheatsheet(response=cheatsheet_model_output, old_cheatsheet=current_cheatsheet_in_round)

                steps.append({
                    "round": round_num,
                    "generator_prompt": generator_prompt,
                    "generator_output": generator_output,
                    "generator_answer": generator_answer,
                    "current_cheatsheet": current_cheatsheet_in_round,
                    "new_cheatsheet": new_cheatsheet,
                })
                current_cheatsheet_in_round = new_cheatsheet  # Update for next potential round
                if generator_answer:
                    previous_answers.append(f"Round {round_num+1}: {generator_answer}")

            return {
                "input_txt": input_txt,
                "steps": steps,
                "previous_answers": previous_answers,
                "final_answer": generator_answer,  # Answer from the last round
                "final_cheatsheet": current_cheatsheet_in_round,  # Cheatsheet from the last round
                "final_output": generator_output,  # Full output from the last generator call
            }

        elif approach_name == "FullHistoryAppending":
            if generator_template is None:
                raise ValueError("Generator template must be provided for FullHistoryAppending approach.")
            length_of_history = len(generator_outputs_so_far) if generator_outputs_so_far else 0
            curated_cheatsheet = "(empty)"
            if length_of_history > 0 and original_input_corpus and generator_outputs_so_far:
                curated_cheatsheet = "### PREVIOUS SOLUTIONS (START)\n\n"
                for i, (prev_input, prev_output) in enumerate(zip(original_input_corpus[:length_of_history], generator_outputs_so_far[:length_of_history])):
                    curated_cheatsheet += f"#### Previous Input #{i+1}:\n\n{prev_input}\n\n#### Model Solution to Previous Input #{i+1}:\n\n{prev_output}\n---\n---\n\n"
                curated_cheatsheet += "#### PREVIOUS SOLUTIONS (END)"

            generator_prompt = generator_template.replace("[[QUESTION]]", input_txt).replace("[[CHEATSHEET]]", curated_cheatsheet)
            generator_output, generator_answer = self._run_generator(
                generator_prompt, temperature, max_tokens, allow_code_execution, code_execution_flag
            )
            return {
                "input_txt": input_txt,
                "steps": [],
                "previous_answers": [],
                "final_answer": generator_answer,
                "final_cheatsheet": curated_cheatsheet,
                "final_output": generator_output,
            }
        else:
            raise ValueError(f"Unknown approach_name: {approach_name}")
|
298 |
+
|
dynamic_cheatsheet/utils/__init__.py
ADDED
File without changes
|
dynamic_cheatsheet/utils/evaluation.py
ADDED
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import re
|
3 |
+
import os
|
4 |
+
from typing import List
|
5 |
+
# from .sonnet_eval import sonnet_errors
|
6 |
+
from .execute_code import execute_code_with_timeout
|
7 |
+
|
8 |
+
|
9 |
+
def clean_output_for_arithmetic(output: str) -> str:
    """
    Clean the output for arithmetic problems.

    Keeps what follows the first "=" (up to the next "=", if any), then what
    follows each of the verbal markers below, applied in sequence.

    Args:
        output (str): The output to clean.

    Returns:
        str: The cleaned output.
    """
    if "=" in output:
        output = output.split("=")[1].strip()
    # " is equal to" must be tried before " is": splitting on " is" first
    # would leave "equal to ..." behind and the longer phrase could never match.
    for marker in (" is equal to", " is", " equals", " evaluates to"):
        if marker in output:
            output = output.split(marker)[1].strip()
    return output
|
30 |
+
|
31 |
+
|
32 |
+
def clean_output_for_GameOf24(output: str) -> str:
    """
    Clean the output for GameOf24 problems.

    Reduces a sentence such as "The answer is (6*4) = 24" to the bare
    expression: text after the first "=" is dropped, text before the word
    "is" is dropped, and text from "equals" / "evaluates to" onwards is
    dropped.
    """
    if "=" in output:
        output = output.split("=")[0].strip()
    # Match "is" as a whole word only; a plain substring test also fires
    # inside words such as "This" and would cut the expression away.
    parts = re.split(r"\bis\b", output)
    if len(parts) > 1:
        output = parts[1].strip()
    if "equals" in output:
        output = output.split("equals")[0].strip()
    if "evaluates to" in output:
        output = output.split("evaluates to")[0].strip()
    return output


def eval_for_GameOf24(input: str, output: str) -> bool:
    """
    Given an input and output, check if the output is correct and follows the rules of the game.

    Args:
        input (str): The puzzle numbers separated by whitespace.
        output (str): The model's answer (an expression, optionally wrapped
            in a short sentence).

    Returns:
        bool: True when the expression evaluates to 24 (within 1e-3) and uses
        exactly the input numbers; False otherwise, including when the
        expression cannot be evaluated.
    """
    clean_output = clean_output_for_GameOf24(output)
    for symbol, operator in (("x", "*"), ("×", "*"), ("÷", "/")):
        clean_output = clean_output.replace(symbol, operator)
    clean_output = clean_output.strip()

    # NOTE(security): the expression comes from model output and is passed to
    # eval(). Only digits, whitespace, arithmetic operators, parentheses and
    # decimal points are let through, so no names or calls can be evaluated.
    # Very large "**" chains are still possible and could be slow.
    if not re.fullmatch(r"[0-9\s+\-*/().]+", clean_output):
        return False

    try:
        # Get the value of the expression using eval
        value = eval(clean_output)
        if not (abs(value - 24) < 1e-3):
            return False
    except Exception:
        # Syntax errors, division by zero, etc. all count as a wrong answer.
        return False

    # Compare the multiset of numbers used with the numbers given. split()
    # without an argument tolerates repeated or trailing whitespace.
    input_digits = sorted(input.split())
    output_digits = sorted(re.sub(r"[+\-*/÷()]", " ", clean_output).split())
    return input_digits == output_digits
|
82 |
+
|
83 |
+
|
84 |
+
def remove_punctuation(output: str) -> str:
    """
    Strip commas, semicolons, colons, periods and double quotes from the output.
    """
    # One pass with a deletion table instead of chained replace() calls.
    deletion_table = str.maketrans("", "", ',;:."')
    return output.translate(deletion_table)
|
92 |
+
|
93 |
+
|
94 |
+
def convert_newline_to_space(output: str) -> str:
    """
    Return the output with every newline character turned into a single space.
    """
    return " ".join(output.split("\n"))
|
100 |
+
|
101 |
+
|
102 |
+
def eval_for_exact_matching_with_no_punctuation(
    output: str, target: str
) -> bool:
    """
    Tell whether the output equals the target once the output is normalised.

    The output has its punctuation removed and its newlines converted to spaces;
    the target is compared as given.
    """
    normalized = convert_newline_to_space(remove_punctuation(output))
    return target == normalized
|
113 |
+
|
114 |
+
|
115 |
+
def eval_for_softmatch(input: str, output: str, target: str) -> bool:
    """
    Tell whether the target occurs as a substring of the punctuation-free output.

    The `input` argument is accepted for signature parity with the other
    evaluators and is not used.
    """
    return target in remove_punctuation(output)
|
123 |
+
|
124 |
+
|
125 |
+
def eval_for_CheckmateInOne(input: str, output: str, target: str) -> bool:
    """
    Evaluate if the output contains the expected checkmate-in-one move.

    Args:
        input (str): The move list so far; the text before its last "." must end
            with the current move number (e.g. "... 3. Bc4 Nf6 4.").
        output (str): The model's answer.
        target (str): The expected move (e.g. "Qxf7#").

    Returns:
        bool: True if the target, or the target with the capture marker "x" in
        second position dropped, appears in the relevant part of the output.
        An empty output yields False.

    Raises:
        IndexError: If `input` contains no ".".
        ValueError: If the token before the last "." is not an integer.
    """
    output = output.strip()
    if not output:
        # Nothing to evaluate (previously crashed on output[-1]).
        return False

    # When the answer ends with the mate symbol, keep only the final token.
    if output[-1] == "#":
        output = output.split(" ")[-1].strip()

    # Based on the input, determine the number of the last move.
    move_idx = input.split(".")[-2].split(" ")[-1].strip()
    next_move_idx = str(int(move_idx) + 1)

    # If the answer runs on into the next move number, ignore everything from there.
    if next_move_idx in output:
        output = output.split(next_move_idx)[0].strip()

    if target in output:
        return True

    # Accept the move written without the capture marker, e.g. "Qf7#" for "Qxf7#".
    # The length guard prevents an IndexError on one-character targets.
    return len(target) > 1 and target[1] == "x" and (target[0] + target[2:]) in output
|
149 |
+
|
150 |
+
|
151 |
+
def eval_equation_balancer(input: str, output: str, target: str) -> bool:
    """
    Check that the output balances the equation described by the target.

    The left-hand side of the output must contain the same characters as the
    left-hand side of the target once "+", "-", "*", "/" and spaces are removed,
    and it must evaluate to the target's right-hand side (within 1e-6).
    The `input` argument is not used.
    """
    candidate_lhs = output.split("=")[0].strip()
    target_sides = target.split("=")
    expected_rhs = target_sides[1].strip()
    target_lhs = target_sides[0].strip()

    # Compare the operand skeletons: both sides with operators and spaces deleted.
    drop_operators = str.maketrans("", "", "+-*/ ")
    if candidate_lhs.translate(drop_operators).strip() != target_lhs.translate(drop_operators).strip():
        return False

    # The skeletons agree, so compare the numeric values.
    try:
        return abs(eval(candidate_lhs) - eval(expected_rhs)) < 1e-6
    except Exception:
        return False
|
171 |
+
|
172 |
+
|
173 |
+
def eval_for_multiple_choice(input_text: str, final_answer: str, target: str) -> bool:
    """
    Evaluates if the final answer matches the target using pattern matching.

    Args:
        input_text (str): The original question text including options
        final_answer (str): The model's answer
        target (str): The correct answer

    Returns:
        bool: True if answer is correct, False otherwise
    """
    # Handle empty or None inputs
    if not final_answer or not target:
        return False

    def clean_text(text: str) -> str:
        """Lower-case, strip and drop backticks and parentheses."""
        if not text:
            return ""
        return text.lower().strip().replace('`', '').replace('(', '').replace(')', '')

    def extract_option_text(question: str, option_letter: str) -> str:
        """Return the lower-cased text of the given option, or '' if it is not found."""
        lowered = (question or "").lower()
        paren_prefixes = (f'({option_letter})', f'{option_letter})')

        # Try different formats of options sections
        options_section = ""
        for marker in ('options:', 'choices:'):
            if marker in lowered:
                options_section = lowered.split(marker)[1].strip()
                break

        if not options_section:
            # Try to find options in the format (A) text, (B) text
            for line in lowered.split('\n'):
                if line.strip().startswith(paren_prefixes):
                    return line.split(')', 1)[1].strip()
            return ''

        # Process the options section if found
        for line in options_section.split('\n'):
            line = line.strip()
            if line.startswith(paren_prefixes):
                return line.split(')', 1)[1].strip()
            # Handle options like "A. text" format
            if line.startswith(f'{option_letter}.'):
                return line.split('.', 1)[1].strip()
        return ''

    # Full option match (A), (B), etc. (e.g., (A) == (A))
    if final_answer == target:
        return True

    # Clean and normalize inputs
    clean_answer = clean_text(final_answer)
    clean_target = clean_text(target)

    # Handle target formats: (A), A), A, etc. Parentheses are already removed by
    # clean_text, so only the bare letter (or a trailing a-d) remains to inspect.
    target_letter = ""
    if len(clean_target) == 1:
        target_letter = clean_target
    elif clean_target and clean_target[-1] in 'abcd':
        target_letter = clean_target[-1]

    # Without a target letter every check below would compare against the empty
    # string, which matches anything (e.g. "x".startswith("") is True).
    if not target_letter:
        return False

    # Handle answer formats like "A" or "A."
    if clean_answer in (target_letter, f"{target_letter}."):
        return True

    # Handle answer formats like "Option A", "Answer is A" or "Answer is A."
    if clean_answer.endswith((f" {target_letter}", f" {target_letter}.")):
        return True

    # Text content match - check if the target option text is in the answer
    target_text = extract_option_text(input_text, target_letter)
    if target_text and target_text in clean_answer:
        return True

    # Handle numerical answers (if target is a number and answer contains that number)
    if target_letter.isdigit() and target_letter in clean_answer:
        return True

    return False
|
267 |
+
|
268 |
+
|
269 |
+
def eval_for_pyton_programming_puzzles(input: str, output: str) -> bool:
    """
    Run a candidate solution against its `sat` checker and report whether it passes.

    The last ```python fenced block is used when one is present. If the output
    does not define `sat` itself, the puzzle text in `input` is prepended. The
    assembled program calls `solution()`, feeds the result to `sat` and prints
    it; the verdict is True when the captured text contains "True".
    """
    fence = "```python"
    if fence in output:
        fenced = output.split(fence)[-1].strip()
        output = fenced.split("```")[0].strip()

    harness = "answer = solution()\nprint(sat(answer))"
    if "def sat" not in output:
        # The checker comes from the puzzle statement.
        code = f"from typing import *\n{input}\n{output}\n{harness}"
    else:
        if "from typing" not in output:
            output = f"from typing import *\n{output}"
        code = f"{output}\n{harness}"

    code = code.replace("List[", "list[")
    eval_bool = execute_code_with_timeout(code, timeout=3)

    # Diagnostic dump for the case where the harness ran but `answer` was never bound.
    if "NameError: name 'answer' is not defined" in eval_bool:
        print(f"Eval bool: {eval_bool}")
        print(f"Code:\n{code}")
        print("*" * 100)

    return "True" in eval_bool
|
dynamic_cheatsheet/utils/execute_code.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This module provides functions to extract and execute Python code from a string.
|
3 |
+
|
4 |
+
The functions are:
|
5 |
+
* extract_and_run_python_code(txt: str) -> str: Extracts and executes Python code from a string.
|
6 |
+
* execute_code_with_timeout(code: str, timeout: int = 3) -> str: Executes Python code with a timeout and returns the output.
|
7 |
+
|
8 |
+
Additional functions can be added as needed.
|
9 |
+
"""
|
10 |
+
|
11 |
+
import os
|
12 |
+
import tempfile
|
13 |
+
from subprocess import Popen, PIPE, TimeoutExpired
|
14 |
+
|
15 |
+
def extract_and_run_python_code(txt: str) -> str:
    """
    Extract and execute Python code from a provided string.

    Wraps a bare trailing expression in print(...) so its value is shown,
    executes the code, and captures output or errors.

    Parameters:
        txt (str): Input string containing a possible Python code block.

    Returns:
        str: Execution result or error message wrapped in output formatting.
        None is returned when txt contains no ```python block.
    """
    import ast

    def extract_code(input_str: str) -> str:
        """Extract Python code block delimited by ```python and ```."""
        try:
            return input_str.split("```python", 1)[1].split("```", 1)[0].strip()
        except IndexError:
            raise ValueError("No valid Python code block found.")

    def parses(source: str, mode: str) -> bool:
        """Tell whether `source` is syntactically valid in the given ast mode."""
        try:
            ast.parse(source, mode=mode)
        except (SyntaxError, ValueError):
            return False
        return True

    def ensure_print_statement(code: str) -> str:
        """
        Wrap the last line in print(...) when it is a stand-alone expression.

        Lines that already print, are comments, are indented, or are statements
        (assignments, imports, ...) are left untouched; wrapping those would
        produce code that fails.
        """
        lines = code.splitlines()
        if not lines:
            raise ValueError("Python code block is empty.")
        last_line = lines[-1].rstrip()
        unchanged = "\n".join(lines)
        if last_line.startswith(("print(", "#", " ", "\t")):
            return unchanged
        if not parses(last_line, "eval"):
            return unchanged
        candidate = "\n".join(lines[:-1] + [f"print({last_line})"])
        # Guard against cases such as a trailing comment or a continued statement,
        # where the wrapped program would no longer be valid Python.
        return candidate if parses(candidate, "exec") else unchanged

    if "```python" not in txt:
        return None  # Return early if no Python code block is present

    try:
        # Extract and sanitize the code
        code_block = extract_code(txt)
        code_with_print = ensure_print_statement(code_block)

        # Execute the code and return output
        python_output = execute_code_with_timeout(code_with_print)
        return f"Output of the Python code above:\n```\n{python_output}\n```"

    except Exception as error:
        return f"PYTHON CODE OUTPUT:\n```\nError: {str(error)}\n```"
|
60 |
+
|
61 |
+
|
62 |
+
# Python code execution function with timeout
|
63 |
+
# TODO (msuzgun): Improve the security of this function by using a sandboxed environment
|
64 |
+
def execute_code_with_timeout(code: str, timeout: int = 3) -> str:
    """
    Execute Python code with a timeout and return the output.

    The code is written to a temporary .py file and run with `python3` in a
    child process. The temporary file is always removed afterwards.

    Parameters:
        code (str): Python code to execute.
        timeout (int): Timeout duration in seconds.

    Returns:
        str: The stripped stdout of the program. If stdout is empty, either
        "Error in execution: <stderr>" or a hint that nothing was printed.
        If the timeout expires, "Execution took too long, aborting...".
    """
    # Python source files are UTF-8 by default, so write the script as UTF-8
    # regardless of the locale encoding.
    with tempfile.NamedTemporaryFile(
        mode="w+t", suffix=".py", delete=False, encoding="utf-8"
    ) as temp_file:
        temp_file.write(code)
        script_path = temp_file.name

    try:
        # In case alias python=python3 is not set, use python3 instead of python
        process = Popen(["python3", script_path], stdout=PIPE, stderr=PIPE)
        try:
            stdout, stderr = process.communicate(timeout=timeout)
        except TimeoutExpired:
            process.kill()
            # Finish communication after kill() so the child is reaped and its
            # pipes are closed instead of lingering as a zombie.
            process.communicate()
            return "Execution took too long, aborting..."
    finally:
        os.remove(script_path)

    # errors="replace" keeps undecodable bytes from raising UnicodeDecodeError.
    captured_output = stdout.decode(errors="replace").strip()
    error_output = stderr.decode(errors="replace").strip()

    if captured_output:
        return captured_output
    if error_output:
        return f"Error in execution: {error_output}"
    return "(No output was generated. It is possible that you did not include a print statement in your code. If you want to see the output, please include a print statement.)"
|
dynamic_cheatsheet/utils/extractor.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file contains the functions to extract the final answer, cheatsheet and solution evaluation from model responses.
|
3 |
+
|
4 |
+
The functions are:
|
5 |
+
* extract_answer(response: str) -> str: Extracts the final answer from the model response.
|
6 |
+
* extract_cheatsheet(response: str, old_cheatsheet: str) -> str: Extracts the cheatsheet from the model response.
|
7 |
+
* extract_solution(response: str, header: str = "SOLUTION EVALUATION:", error_message : str = "No solution evaluation found") -> str: Extracts the solution evaluation from the model response.
|
8 |
+
|
9 |
+
Additional functions can be added as needed.
|
10 |
+
"""
|
11 |
+
|
12 |
+
def extract_answer(
    response: str,
) -> str:
    """
    Extracts the final answer from the model response.

    Two formats are recognised:
      * <answer> (content) </answer> - the content after the last opening tag;
      * "FINAL ANSWER" followed by a block fenced with ''' or ``` - the content
        of the first fenced block after the last "FINAL ANSWER" marker.

    Arguments:
        response : str : The response from the model.

    Returns:
        str : The extracted final answer (if not found, returns "No final answer found").
    """
    not_found = "No final answer found"

    if "<answer>" in response:
        # <answer> (content) </answer>
        return response.split("<answer>")[-1].split("</answer>")[0].strip()

    if "FINAL ANSWER" not in response:
        return not_found

    tail = response.split("FINAL ANSWER")[-1].strip()
    if tail.startswith(":"):
        tail = tail[1:].strip()

    # Use whichever fence ("'''" or "```") appears first after the marker.
    positions = {fence: tail.find(fence) for fence in ("'''", "```")}
    present = [fence for fence in positions if positions[fence] != -1]
    if not present:
        # An unfenced answer is treated as missing.
        return not_found
    fence = min(present, key=positions.get)
    answer = tail.split(fence)[1].strip()

    # Special case for P3-Test task: If the first line contains "python" then remove it
    answer_lines = answer.split("\n")
    if answer_lines[0].strip().lower() == "python":
        answer = "\n".join(answer_lines[1:]).strip()
    return answer
|
60 |
+
|
61 |
+
|
62 |
+
def extract_cheatsheet(
    response: str,
    old_cheatsheet: str,
) -> str:
    """
    Pull the cheatsheet out of a model response.

    Arguments:
        response : str : The response from the model.
        old_cheatsheet : str : Fallback returned when the response has no <cheatsheet> tag.

    Returns:
        str : The stripped text following the first <cheatsheet> tag, up to the
              closing </cheatsheet> tag when present; otherwise the old cheatsheet.
    """
    body = response.strip()
    if "<cheatsheet>" not in body:
        return old_cheatsheet

    # <cheatsheet> (content) </cheatsheet>
    after_open = body.split("<cheatsheet>")[1].strip()
    return after_open.split("</cheatsheet>")[0].strip()
|
87 |
+
|
88 |
+
|
89 |
+
def extract_solution(
    response: str,
    header: str = "SOLUTION EVALUATION:",
    error_message : str = "No solution evaluation found",
) -> str:
    """
    Extracts the solution evaluation from the model response.

    Arguments:
        response : str : The response from the model.
        header : str : The header to search for the solution evaluation.
        error_message : str : Currently unused; kept for interface compatibility.

    Returns:
        str : The text inside the first '''-quoted block after the header; if
              there is no such block, everything after the header (stripped);
              if the header is missing, the stripped response itself.
    """
    body = response.strip()
    try:
        after_header = body.split(header)[1]
    except Exception:
        # Header missing (IndexError) or unusable as a separator: fall back to
        # the whole response, as callers expect best-effort text here.
        return body

    quoted = after_header.split("'''")
    if len(quoted) > 1:
        return quoted[1].strip()
    return after_header.strip()
|
dynamic_cheatsheet/utils/sonnet_eval.py
ADDED
@@ -0,0 +1,511 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Defines function sonnet_errors(poem, target: str) -> Dict[str, Any]
|
3 |
+
which takes a target rhyme scheme (and optionally a list of required words) and returns a dict of errors
|
4 |
+
|
5 |
+
Returns an empty dictionary if there are no errors, so bool(sonnet_errors(poem, target)) is False if there are no
|
6 |
+
errors. It's a permissive check for sonnets errors, meaning that if it is unsure then it doesn't return an error.
|
7 |
+
|
8 |
+
Specifically,
|
9 |
+
|
10 |
+
* Check if it adheres to a given rhyming scheme
|
11 |
+
* Check if each line has 10-11 syllables; more precisely, that there is some pronunciation of each line with 10-11 syllables
|
12 |
+
|
13 |
+
This omits a few things like rhymes and iambic pentameter.
|
14 |
+
|
15 |
+
# Rhymes
|
16 |
+
|
17 |
+
For rhymes, we use python `pronouncing` library based on:
|
18 |
+
|
19 |
+
* CMU pronouncing dictionary http://www.speech.cs.cmu.edu/cgi-bin/cmudict
|
20 |
+
|
21 |
+
# Syllable counting
|
22 |
+
|
23 |
+
Given that there are multiple ways to pronounce many words (e.g. "caramel" can be pronounced with 2 or 3 syllables),
|
24 |
+
we adopt a "permissive" approach and consult multiple tools for syllable counting:
|
25 |
+
|
26 |
+
* pronouncing - a well-known pronunciation library based on CMU's pronouncing dictionary
|
27 |
+
* syllables - a Python library for syllable counting
|
28 |
+
* pyphen - a Python wrapper for the hyphenation library
|
29 |
+
"""
|
30 |
+
|
31 |
+
from typing import Set, Dict, Any
|
32 |
+
import re
|
33 |
+
import joblib
|
34 |
+
import pyphen
|
35 |
+
import syllables
|
36 |
+
import pronouncing
|
37 |
+
|
38 |
+
|
39 |
+
ALLOWED_SYLLABLES = {
|
40 |
+
10,
|
41 |
+
11,
|
42 |
+
} # about 3-4% of legit lines have 11 syllables, so we allow it, > 99% have 10 or 11
|
43 |
+
NUM_REQUIRED_WORDS = 3
|
44 |
+
|
45 |
+
memory = joblib.Memory(
|
46 |
+
".cache", verbose=0
|
47 |
+
) # use cache to speed up repeated rhyme/syllable calls
|
48 |
+
|
49 |
+
|
50 |
+
def sonnet_errors(poem: str, target: str, verbose=False) -> Dict[str, Any]:
    """
    Checks for sonnet errors with respect to target rhyme scheme (and optional required words)

    args:
        poem: the poem to check
        target: the rhyme scheme, e.g. "ABBA ABBA CDC DCD"
            optionally target can have a list of required words, like
            "ABBA ABBA CDC DCD, love train snail" each of these must be in the poem
        verbose: if True, print out more details

    returns:
        the dict produced by scheme_errors, extended with a "missing words" entry
        (required words not found in the poem) and a "syllable errors" entry
        (lines with no syllable count in ALLOWED_SYLLABLES) when applicable
    """
    # Split only on the first ", " so a target with further commas does not crash
    # the tuple unpacking; stray commas around the required words are dropped.
    scheme, separator, rest = target.partition(", ")
    required_words = []
    if separator:
        required_words = [w.strip(",") for w in rest.split()]
        required_words = [w for w in required_words if w]

    errors = scheme_errors(poem, scheme, verbose=verbose)
    assert isinstance(errors, dict)

    lowered_poem = poem.lower()
    missing_words = [w for w in required_words if w.lower() not in lowered_poem]
    if missing_words:
        errors["missing words"] = missing_words

    syllable_errors = []
    for line in split_poem(poem):
        variations = syllable_variations(line)
        # A line passes when at least one possible count is allowed.
        if not (variations & ALLOWED_SYLLABLES):
            syllable_errors.append((line, sorted(variations)))
    if syllable_errors:
        errors["syllable errors"] = syllable_errors

    return errors
|
83 |
+
|
84 |
+
|
85 |
+
def clean_word(text: str):
    """Lower-case the text and trim surrounding punctuation, quotes, brackets and spaces."""
    trimmed_chars = ",.!?;: \"'[]()/"
    lowered = text.lower()
    return lowered.strip(trimmed_chars)
|
87 |
+
|
88 |
+
|
89 |
+
def clean_line(line: str):
    """
    Normalise one line of a poem.

    A trailing single-letter rhyme marker such as (A) or (b), together with the
    whitespace around it, is removed; the result is stripped.
    """
    trailing_marker = re.compile(r"\s*\([A-Za-z]\)\s*$")
    without_marker = trailing_marker.sub("", line)
    return without_marker.strip()
|
96 |
+
|
97 |
+
|
98 |
+
def split_poem(poem: str, min_line_len=3):
    """Return the cleaned lines of the poem that are longer than min_line_len characters."""
    cleaned_lines = (clean_line(raw) for raw in poem.splitlines())
    return [line for line in cleaned_lines if len(line) > min_line_len]
|
101 |
+
|
102 |
+
|
103 |
+
@memory.cache
def slant_rhyming_parts(word: str):
    """
    Build the set of slant-rhyme signatures for a word.

    For each pronunciation of the word, every phoneme of its rhyming part is
    encoded as "R" if it contains "R", as itself if it is one of the
    single-letter consonants, and as "?" otherwise. Signatures made only of "?"
    are dropped; in the rest the "?" marks are removed, except that a signature
    which ended in "?" keeps one trailing "?".
    """
    consonants = set("BCDFGHJKLMNPQRSTVWXYZ")

    def encode(phoneme):
        if "R" in phoneme:
            return "R"
        return phoneme if phoneme in consonants else "?"

    signatures = set()
    for phones in pronouncing.phones_for_word(word):
        encoded = "".join(encode(p) for p in pronouncing.rhyming_part(phones).split())
        if all(mark == "?" for mark in encoded):
            continue
        trailing = "?" if encoded.endswith("?") else ""
        signatures.add(encoded.replace("?", "") + trailing)
    return signatures
|
116 |
+
|
117 |
+
|
118 |
+
@memory.cache
def get_rhymes(w):
    """Return the rhymes that `pronouncing` lists for the word, as a set."""
    rhyming_words = pronouncing.rhymes(w)
    return set(rhyming_words)
|
121 |
+
|
122 |
+
|
123 |
+
def scheme_errors(poem: str, scheme: str, verbose=False):
    """
    Find errors with respect to a given rhyming scheme.

    Returns a dict. If the number of poem lines differs from the number of
    letters in the scheme, it holds a single "line count" message. Otherwise it
    maps each offending line-ending word to its "reason" and to the internal
    (same rhyme group) and external (other groups) rhyme matches found for it.
    """
    lines = split_poem(poem)
    scheme = scheme.replace(" ", "")

    if len(lines) != len(scheme):
        return {
            "line count": f"Poem has {len(lines)} != {len(scheme)} lines in pattern {scheme}"
        }

    # Last word of every line; hyphens are treated as word separators.
    last_words = [clean_word(l.replace("-", " ").split()[-1]) for l in lines]

    dictionary = pronouncing.cmudict.dict()  # we ignore words not in dictionary

    # One group of line-ending words per scheme letter, in sorted letter order.
    groups = []
    for chars in sorted(set(scheme)):
        groups.append(
            [w for w, p in zip(last_words, scheme) if p == chars and w in dictionary]
        )

    slant_sets = {w: set(slant_rhyming_parts(w)) for g in groups for w in g}

    # scores[w] is a two-item list: matches inside w's group, then matches outside it.
    scores = {}

    if verbose:
        print(groups)

    for g in groups:
        internal_words = set(g)
        external_words = {w for h in groups if h is not g for w in h}
        if len(internal_words) == 1:
            continue  # don't check rhymes if only one word in the group is in dictionary
        for w in g:
            rhymes = get_rhymes(w)
            scores[w] = []
            for comparisons in [internal_words, external_words]:
                m = dict(rhymes=[], slant_rhymes=[])
                scores[w].append(m)
                for v in comparisons:
                    if v == w:
                        continue
                    if v in rhymes:
                        m["rhymes"].append(v)
                    elif slant_sets[v] & slant_sets[w]:
                        # No perfect rhyme, but the two words share a slant-rhyme signature.
                        m["slant_rhymes"].append(v)

    error_reasons = {}
    suspicious_reasons = {}

    for w in scores:
        internal, external = scores[w]

        if internal["rhymes"] or internal["slant_rhymes"]:
            pass  # ok if it rhymes (perfect or slant) with at least one other word in the group
        elif len(external["rhymes"]) >= 2:
            error_reasons[w] = "no internal rhymes, 2+ external perfect rhymes"
        elif external["rhymes"]:
            if len(external["slant_rhymes"]) >= 2:
                error_reasons[
                    w
                ] = "no internal rhymes, 1 external perfect rhyme, 2+ external slant rhymes"
            else:
                suspicious_reasons[
                    w
                ] = "no internal rhymes/slant rhymes, 1 external perfect rhymes"
        elif len(external["slant_rhymes"]) >= 3:
            error_reasons[
                w
            ] = "no internal rhymes/slant rhymes, 3+ external slant rhymes"
        if verbose:
            print(w, "internal:", internal, "external:", external)

    # Suspicious words only count as errors when errors and suspicions together reach three.
    if len(error_reasons) + len(suspicious_reasons) >= 3:
        error_reasons.update(suspicious_reasons)

    return {
        w: {
            "reason": error_reasons[w],
            "internal": scores[w][0],
            "external": scores[w][1],
        }
        for w in error_reasons
    }
|
206 |
+
|
207 |
+
|
208 |
+
def syllable_variations(text, verbose=False) -> Set[int]:
    """
    Compute every syllable total the text could have.

    The text is split on spaces and hyphens. Each cleaned word contributes any
    count between its smallest and largest known syllable count (so {2, 4} is
    treated as 2, 3 or 4), and the result is the set of all achievable sums.
    The `verbose` argument is not used.
    """
    totals = {0}
    for token in re.split("[ -]+", text):
        cleaned = clean_word(token)
        if cleaned:
            counts = word_syllables(cleaned)
            low, high = min(counts), max(counts)
            totals = {total + extra for total in totals for extra in range(low, high + 1)}
    return totals
|
224 |
+
|
225 |
+
|
226 |
+
@memory.cache
def word_syllables(word: str) -> Set[int]:
    """Return the possible syllable counts of an already-cleaned word (cached)."""
    # The cache is keyed on the argument, so callers must pass the cleaned form.
    assert word == clean_word(
        word
    ), "Word should be cleaned before hitting word_syllables cache"
    return SyllableCounters.count_word(word)
|
232 |
+
|
233 |
+
|
234 |
+
class SyllableCounters:
    """
    Simple class to count syllables in text.

    All members are static; the CMU dictionary and the pyphen hyphenator are
    created on first use and then reused.
    """

    # Lazily created shared resources (None until first requested).
    _cmu_dict = None
    _pyphen_counter = None

    @staticmethod
    def cmu_dict():
        """Return the CMU pronouncing dictionary, loading it on first use."""
        if SyllableCounters._cmu_dict is None:
            SyllableCounters._cmu_dict = pronouncing.cmudict.dict()
        return SyllableCounters._cmu_dict

    @staticmethod
    def cmu(word):
        """Return the set of syllable counts over all pronunciations `pronouncing` lists for the word."""
        # Declared static: the function takes no `self`, so without the decorator
        # it could only be called through the class, never through an instance.
        return {
            pronouncing.syllable_count(pro) for pro in pronouncing.phones_for_word(word)
        }

    @staticmethod
    def pyphen_counter():
        """Return the English pyphen hyphenator, creating it on first use."""
        if SyllableCounters._pyphen_counter is None:
            SyllableCounters._pyphen_counter = pyphen.Pyphen(lang="en")
        return SyllableCounters._pyphen_counter

    @staticmethod
    def count_word(word) -> Set[int]:
        """
        Return the set of syllable counts proposed for a word by the CMU
        pronunciations, pyphen hyphenation and `syllables.estimate`.

        An empty word yields {0}. A count of 0 is dropped when any other count
        is available.
        """
        if not word:
            return {0}

        cmu = SyllableCounters.cmu(word)

        # Number of hyphenation points plus one.
        pyph = SyllableCounters.pyphen_counter().inserted(word).count("-") + 1

        syll = syllables.estimate(word)

        ans = cmu | {pyph, syll}

        if 0 in ans and len(ans) > 1:
            ans.remove(0)

        return ans
|
276 |
+
|
277 |
+
|
278 |
+
TESTS = [
|
279 |
+
["In savannah where tall trees kiss the sky,", 10],
|
280 |
+
["A giraffe named Joe with love-stricken grace,", 10],
|
281 |
+
["Did find a turtle named Sarah nearby,", 10],
|
282 |
+
["Their eyes did meet, hearts raced in sweet embrace.", 10],
|
283 |
+
["Though nature's laws deemed their love quite absurd,", 10],
|
284 |
+
["Joe's neck would bend to whisper words of flame,", 10],
|
285 |
+
["And Sarah's shell would tremble at each word,", 10],
|
286 |
+
["In love's bizarre dance, they found no one to blame.", 11],
|
287 |
+
["Through sun and storm, they'd wander, hoof and claw,", 10],
|
288 |
+
["With love that no one ever could unravel,", 11],
|
289 |
+
["In each other's eyes, perfection they saw,", 10],
|
290 |
+
["A love so fierce, no distance could they travel.", 11],
|
291 |
+
["So let us learn from turtle and giraffe,", 10],
|
292 |
+
["That love's own shape can make the coldest laugh.", 10],
|
293 |
+
["In yonder sky where colours blend so high,", 10],
|
294 |
+
["A rainbow arcs, a bridge 'twixt earth and air.", 10],
|
295 |
+
["Its radiant hues draw every gazing eye,", 12],
|
296 |
+
["A painter's dream, a sight beyond compare.", 10],
|
297 |
+
["Yet in the world of man, delight so small,", 10],
|
298 |
+
["As gumball's sphere, with colours bright and clear.", 10],
|
299 |
+
["Such simple joy it brings to one and all,", 10],
|
300 |
+
["Its sweetness matched by colours we hold dear.", 10],
|
301 |
+
["Both nature's arc and candy sphere delight,", 10],
|
302 |
+
["The vast expanse and tiny bite unite,", 10],
|
303 |
+
["In tales of wonder, stories to be told.", 10],
|
304 |
+
["So let us cherish both the grand and small,", 10],
|
305 |
+
["For beauty’s found in rainbow and in gumball.", 11],
|
306 |
+
["When night's embrace hath shrouded all in black,", 10],
|
307 |
+
["A flashlight's beam doth pierce the dark so deep,", 10],
|
308 |
+
["From paths we've chosen, and vows we mean to keep.", 11],
|
309 |
+
["Thou art like that beam, true, clear, and bright,", 9],
|
310 |
+
["Cutting through the fog of my mind's own night,", 10],
|
311 |
+
["Yet oft I find, by folly or by chance,", 10],
|
312 |
+
["Distractions lead my wandering glance.", 9],
|
313 |
+
["But even as stars, obscured by fleeting cloud,", 11],
|
314 |
+
["Return to grace the heavens, proud and loud,", 10],
|
315 |
+
["So shall my focus, once by ails distraught,", 10],
|
316 |
+
["Return to thee, as ever it hath sought.", 10],
|
317 |
+
["For in this world of fleeting sight and sound,", 10],
|
318 |
+
]
|
319 |
+
|
320 |
+
|
321 |
+
def fixed_tests():
    """Run the built-in syllable-count regression checks.

    Two groups of checks are performed with ``syllable_variations``:

    1. Every ``(line, expected)`` pair in the module-level ``TESTS`` table
       must have ``expected`` among the variations computed for ``line``.
    2. A handful of syllabically ambiguous words must admit *each* of the
       listed syllable counts.

    Every mismatch is printed as it is found.

    Returns:
        A list of ``(text, expected_count, variations)`` tuples, one per
        mismatch; empty when everything passes.
    """
    failures = []

    # Group 1: whole verse lines with a hand-labelled syllable count.
    for line, expected in TESTS:
        variations = syllable_variations(line)
        if expected in variations:
            continue
        print(f"Line `{line}` has {expected} syllables which isn't in {variations}")
        failures.append((line, expected, variations))

    # Group 2: tests from
    # https://www.mentalfloss.com/article/53661/car-mel-or-car-mel-3-reasons-syllabically-ambiguous-words :
    ambiguous_words = [
        (
            "fire tire hour liar buyer flower drawer layer loyal royal file orange poem crayon",
            [1, 2],
        ),
        (
            "caramel mayonnaise family chocolate camera different separate favorite realtor",
            [2, 3],
        ),
        ("mischievous", [3, 4]),
    ]
    for text, accepted_counts in ambiguous_words:
        for w in text.split():
            variations = syllable_variations(w)
            for count in accepted_counts:
                if count in variations:
                    continue
                print(
                    f"{w} give syllable_variations {variations} but should include {count}"
                )
                failures.append((w, count, variations))

    return failures
|
350 |
+
|
351 |
+
|
352 |
+
def summarize_errors(errors, num_samples):
    """Print an aggregate failure report for a batch of checked sonnets.

    Args:
        errors: Mapping whose values are the error records of the poems that
            failed (poems without errors are not present). Each record is
            tested for the entries ``"line count"``, ``"missing words"`` and
            ``"syllable errors"``; any entry whose name contains no space is
            counted as a rhyme error (presumably the rhyme-group label --
            confirm against ``sonnet_errors``).
        num_samples: Total number of poems that were checked, including the
            ones that passed. Used as the denominator of every percentage.

    Returns:
        None. The report is written to stdout.
    """
    if not num_samples:
        # Every rate below divides by num_samples; an empty corpus used to
        # crash here with ZeroDivisionError instead of reporting anything.
        print("Sonnet failure rate: n/a (no samples to summarize)")
        return

    records = list(errors.values())

    # Evaluate the per-record predicates once; "both" reuses them.
    has_syllable_errors = ["syllable errors" in rec for rec in records]
    has_rhyme_errors = [any(" " not in key for key in rec) for rec in records]

    print(
        f"Sonnet failure rate: {len(errors)/num_samples:.1%} out of {num_samples:,}, breakdown:"
    )

    wnl = sum("line count" in rec for rec in records) / num_samples
    print(f"{wnl:.1%} wrong number of lines")

    mw = sum("missing words" in rec for rec in records) / num_samples
    print(f"{mw:.1%} missing words")

    bl = sum(has_syllable_errors) / num_samples
    print(f"{bl:.1%} poems with at least one line with wrong number of syllables")

    rhyme_errors = sum(has_rhyme_errors) / num_samples
    both = (
        sum(syl and rhy for syl, rhy in zip(has_syllable_errors, has_rhyme_errors))
        / num_samples
    )
    print(
        f"{rhyme_errors:.1%} poems with rhyme errors ({both:.1%} poems with both rhyme and syllable errors)"
    )
|
375 |
+
|
376 |
+
|
377 |
+
def corpus_check_scheme(corpus_filename, scheme):
    """Check every poem in a corpus file against a rhyme scheme and report.

    The file is read as text and split into poems on blank lines
    (``"\n\n"``). Each poem is passed to ``sonnet_errors``; for every poem
    that has errors, the verbose diagnostics, the scheme, the poem and its
    error record are printed. A summary is printed at the end via
    ``summarize_errors``.

    Args:
        corpus_filename: Path of the text file holding the poems.
        scheme: Rhyme scheme forwarded unchanged to ``sonnet_errors``.

    Returns:
        None. All output goes to stdout.
    """
    # The poems contain non-ASCII punctuation (e.g. typographic apostrophes),
    # so do not depend on the platform's default encoding.
    with open(corpus_filename, "r", encoding="utf-8") as f:
        chunks = f.read().split("\n\n")

    # Strip *before* filtering: a chunk made only of whitespace (e.g. caused
    # by extra trailing newlines) must not be counted as an empty poem.
    poems = [poem for poem in (chunk.strip() for chunk in chunks) if poem]

    errors = {}
    for p in poems:
        e = sonnet_errors(p, scheme)
        if not e:
            continue
        errors[p] = e
        print("*" * 50)
        # Second pass is only for its printed diagnostics.
        sonnet_errors(p, scheme, verbose=True)
        print("scheme", scheme)
        print(p)
        print()
        print(e)
        print("<" * 50)

    summarize_errors(errors, len(poems))
|
394 |
+
|
395 |
+
|
396 |
+
def test():
    """Smoke-test ``sonnet_errors`` on three well-known sonnets.

    Each sonnet must produce no errors under its own rhyme scheme, and the
    Browning sonnet must produce errors under a scheme it does not follow.
    Two further calls are diagnostic only: the first prints its result, the
    second (with rhyme labels appended to each line) discards it.
    """
    lazarus = """Not like the brazen giant of Greek fame,
With conquering limbs astride from land to land;
Here at our sea-washed, sunset gates shall stand
A mighty woman with a torch, whose flame
Is the imprisoned lightning, and her name
Mother of Exiles. From her beacon-hand
Glows world-wide welcome; her mild eyes command
The air-bridged harbor that twin cities frame.

"Keep, ancient lands, your storied pomp!" cries she
With silent lips. "Give me your tired, your poor,
Your huddled masses yearning to breathe free,
The wretched refuse of your teeming shore.
Send these, the homeless, tempest-tost to me,
I lift my lamp beside the golden door!"
"""

    browning = """How do I love thee? Let me count the ways.
I love thee to the depth and breadth and height
My soul can reach, when feeling out of sight
For the ends of being and ideal grace.
I love thee to the level of every day’s
Most quiet need, by sun and candle-light.
I love thee freely, as men strive for right.
I love thee purely, as they turn from praise.
I love thee with the passion put to use
In my old griefs, and with my childhood’s faith.
I love thee with a love I seemed to lose
With my lost saints. I love thee with the breath,
Smiles, tears, of all my life; and, if God choose,
I shall but love thee better after death."""

    shakespeare = """When, in disgrace with fortune and men’s eyes,
I all alone beweep my outcast state,
And trouble deaf heaven with my bootless cries,
And look upon myself, and curse my fate,
Wishing me like to one more rich in hope,
Featur’d like him, like him with friends possess’d,
Desiring this man’s art and that man’s scope,
With what I most enjoy contented least;
Yet in these thoughts myself almost despising,
Haply I think on thee, and then my state,
Like to the lark at break of day arising
From sullen earth, sings hymns at heaven’s gate;
For thy sweet love remember’d such wealth brings
That then I scorn to change my state with kings."""

    # The same Browning sonnet with its rhyme label appended to every line.
    browning_labelled = """How do I love thee? Let me count the ways (A)
I love thee to the depth and breadth and height (B)
My soul can reach, when feeling out of sight (B)
For the ends of being and ideal grace (A)
I love thee to the level of every day’s (A)
Most quiet need, by sun and candle-light (B)
I love thee freely, as men strive for right (B)
I love thee purely, as they turn from praise (A)
I love thee with the passion put to use (C)
In my old griefs, and with my childhood’s faith (D)
I love thee with a love I seemed to lose (C)
With my lost saints. I love thee with the breath (D)
Smiles, tears, of all my life; and, if God choose (C)
I shall but love thee better after death (D)."""

    # Each sonnet is clean under its own scheme.
    assert not sonnet_errors(lazarus, "ABBA ABBA CDCDCD")
    assert not sonnet_errors(browning, "abba abba cdcdcd")
    assert not sonnet_errors(shakespeare, "ABAB CDCD EFEF GG")

    # A Petrarchan sonnet checked against the Shakespearean scheme must fail.
    assert sonnet_errors(browning, "ABAB CDCD EFEF GG")

    # Diagnostic runs with the tercet-split spelling of the correct scheme
    # ("ABAB CDCD EFEF GG" would be the wrong one here).
    report = sonnet_errors(browning, "ABBA ABBA CDC DCD")

    print(report)

    report = sonnet_errors(browning_labelled, "ABBA ABBA CDC DCD")
|
prompts/curator_prompt_for_dc_cumulative.txt
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CHEATSHEET REFERENCE CURATOR
|
2 |
+
|
3 |
+
#### 1. Purpose and Goals
|
4 |
+
As the Cheatsheet Curator, you are tasked with creating a continuously evolving reference designed to help solve a wide variety of tasks, including algorithmic challenges, debugging, creative writing, and more. The cheatsheet's purpose is to consolidate verified solutions, reusable strategies, and critical insights into a single, well-structured resource.
|
5 |
+
|
6 |
+
- The cheatsheet should include quick, accurate, reliable, and practical solutions to a range of technical and creative challenges.
|
7 |
+
- After seeing each input, you should improve the content of the cheatsheet, synthesizing lessons, insights, tricks, and errors learned from past problems and adapting to new challenges.
|
8 |
+
|
9 |
+
---
|
10 |
+
|
11 |
+
#### 2. Core Responsibilities
|
12 |
+
As the Cheatsheet Curator, you should:
|
13 |
+
- Curate and preserve knowledge: Select and document only the most relevant, most useful, and most actionable solutions and strategies, while preserving old content of the cheatsheet.
|
14 |
+
- Maintain accuracy: Ensure that all entries in the cheatsheet are accurate, clear, and well-contextualized.
|
15 |
+
- Refine and update content: Continuously update and improve the content of the cheatsheet by incorporating new insights and solutions, removing repetitions or trivial information, and adding efficient solutions.
|
16 |
+
- Ensure practicality and comprehensiveness: Provide critical and informative examples, as well as efficient code snippets and actionable guidelines.
|
17 |
+
|
18 |
+
Before updating the cheatsheet, however, you should first assess the correctness of the provided solution and strategically incorporate code blocks, insights, and solutions into the new cheatsheet. Always aim to preserve and keep correct, useful, and illustrative solutions and strategies for future cheatsheets.
|
19 |
+
|
20 |
+
---
|
21 |
+
|
22 |
+
#### 3. Principles and Best Practices
|
23 |
+
1. Accuracy and Relevance:
|
24 |
+
- Only include solutions and strategies that have been tested and proven effective.
|
25 |
+
- Clearly state any assumptions, limitations, or dependencies (e.g., specific Python libraries or solution hacks).
|
26 |
+
- For computational problems, encourage Python usage for more accurate calculations.
|
27 |
+
|
28 |
+
2. Iterative Refinement:
|
29 |
+
- Continuously improve the cheatsheet by synthesizing both old and new solutions, refining explanations, and removing redundancies.
|
30 |
+
- Rather than deleting old content and writing new content each time, consider ways to maintain table content and synthesize information from multiple solutions.
|
31 |
+
- After solving a new problem, document any reusable codes, algorithms, strategies, edge cases, or optimization techniques.
|
32 |
+
|
33 |
+
3. Clarity and Usability:
|
34 |
+
- Write concise, actionable, well-structured entries.
|
35 |
+
- Focus on key insights or strategies that make solutions correct and effective.
|
36 |
+
|
37 |
+
4. Reusability:
|
38 |
+
- Provide clear solutions, pseudocodes, and meta strategies that are easily adaptable to different contexts.
|
39 |
+
- Avoid trivial content; focus on non-obvious, critical solution details and approaches.
|
40 |
+
- Make sure to add as many examples as you can in the cheatsheet.
|
41 |
+
- Any useful, efficient, generalizable, and illustrative solutions to the previous problems should be included in the cheatsheet.
|
42 |
+
|
43 |
+
---
|
44 |
+
|
45 |
+
#### 4. Cheatsheet Structure
|
46 |
+
The cheatsheet can be divided into the following sections:
|
47 |
+
|
48 |
+
1. Solutions, Implementation Patterns, and Code Snippets:
|
49 |
+
- Document reusable code snippets, algorithms, and solution templates.
|
50 |
+
- Include descriptions, annotated examples, and potential pitfalls, albeit succinctly.
|
51 |
+
|
52 |
+
2. [OPTIONAL] Edge Cases and Validation Traps:
|
53 |
+
- Catalog scenarios that commonly cause errors or unexpected behavior.
|
54 |
+
- Provide checks, validations, or alternative approaches to handle them.
|
55 |
+
|
56 |
+
3. General Meta-Reasoning Strategies:
|
57 |
+
- Describe high-level problem-solving frameworks and heuristics (e.g., use Python to solve heuristic problems; in bipartite graphs, max matching = min vertex cover, etc.)
|
58 |
+
- Provide concrete yet succinct step-by-step guides for tackling complex problems.
|
59 |
+
|
60 |
+
4. Implement a Usage Counter
|
61 |
+
- Each entry must include a usage count: Increase the count every time a strategy is successfully used in problem-solving.
|
62 |
+
- Use the count to prioritize frequently used solutions over rarely applied ones.
|
63 |
+
|
64 |
+
---
|
65 |
+
|
66 |
+
#### 5. Formatting Guidelines
|
67 |
+
Use the following structure for each memory item:
|
68 |
+
|
69 |
+
```
|
70 |
+
<memory_item>
|
71 |
+
<description>
|
72 |
+
[Briefly describe the problem context, purpose, and key aspects of the solution.] (Reference: Q1, Q2, Q6, etc.)
|
73 |
+
</description>
|
74 |
+
<example>
|
75 |
+
[Provide a well-documented code snippet, worked-out solution, or efficient strategy.]
|
76 |
+
</example>
|
77 |
+
</memory_item>
|
78 |
+
** Count: [Number of times this strategy has been used to solve a problem.]
|
79 |
+
|
80 |
+
|
81 |
+
<memory_item>
|
82 |
+
[...]
|
83 |
+
</memory_item>
|
84 |
+
|
85 |
+
[...]
|
86 |
+
|
87 |
+
<memory_item>
|
88 |
+
[...]
|
89 |
+
</memory_item>
|
90 |
+
|
91 |
+
```
|
92 |
+
|
93 |
+
- Tagging: Use references like `(Q14)` or `(Q22)` to link entries to their originating contexts.
|
94 |
+
- Grouping: Organize entries into logical sections and subsections.
|
95 |
+
- Prioritizing: incorporate efficient algorithmic solutions, tricks, and strategies into the cheatsheet.
|
96 |
+
- Diversity: Have as many useful and relevant memory items as possible to guide the model to tackle future questions.
|
97 |
+
|
98 |
+
N.B. Keep in mind that once the cheatsheet is updated, any previous content not directly included will be lost and cannot be retrieved. Therefore, make sure to explicitly copy any (or all) relevant information from the previous cheatsheet to the new cheatsheet!!!
|
99 |
+
|
100 |
+
---
|
101 |
+
|
102 |
+
#### 6. Cheatsheet Template
|
103 |
+
Use the following format for creating and updating the cheatsheet:
|
104 |
+
|
105 |
+
NEW CHEATSHEET:
|
106 |
+
```
|
107 |
+
<cheatsheet>
|
108 |
+
|
109 |
+
Version: [Version Number]
|
110 |
+
|
111 |
+
SOLUTIONS, IMPLEMENTATION PATTERNS, AND CODE SNIPPETS
|
112 |
+
<memory_item>
|
113 |
+
[...]
|
114 |
+
</memory_item>
|
115 |
+
|
116 |
+
<memory_item>
|
117 |
+
[...]
|
118 |
+
</memory_item>
|
119 |
+
|
120 |
+
GENERAL META-REASONING STRATEGIES
|
121 |
+
<memory_item>
|
122 |
+
[...]
|
123 |
+
</memory_item>
|
124 |
+
|
125 |
+
</cheatsheet>
|
126 |
+
```
|
127 |
+
|
128 |
+
N.B. Make sure that all information related to the cheatsheet is wrapped inside the <cheatsheet> block. The cheatsheet can be as long as circa 2000-2500 words.
|
129 |
+
|
130 |
+
-----
|
131 |
+
-----
|
132 |
+
|
133 |
+
## PREVIOUS CHEATSHEET
|
134 |
+
|
135 |
+
[[PREVIOUS_CHEATSHEET]]
|
136 |
+
|
137 |
+
-----
|
138 |
+
-----
|
139 |
+
|
140 |
+
## CURRENT INPUT
|
141 |
+
|
142 |
+
[[QUESTION]]
|
143 |
+
|
144 |
+
-----
|
145 |
+
-----
|
146 |
+
|
147 |
+
## MODEL ANSWER TO THE CURRENT INPUT
|
148 |
+
|
149 |
+
[[MODEL_ANSWER]]
|
prompts/generator_prompt.txt
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# GENERATOR (PROBLEM SOLVER)
|
2 |
+
|
3 |
+
Instruction: You are an expert problem-solving assistant tasked with analyzing and solving various questions using a combination of your expertise and provided reference materials. Each task will include:
|
4 |
+
1. A specific question or problem to solve
|
5 |
+
2. A cheatsheet containing relevant strategies, patterns, and examples from similar problems
|
6 |
+
|
7 |
+
---
|
8 |
+
|
9 |
+
## 1. ANALYSIS & STRATEGY
|
10 |
+
|
11 |
+
- Carefully analyze both the question and cheatsheet before starting
|
12 |
+
- Search for and identify any applicable patterns, strategies, or examples within the cheatsheet
|
13 |
+
- Create a structured approach to solving the problem at hand
|
14 |
+
- Review and document any limitations in the provided reference materials
|
15 |
+
|
16 |
+
## 2. SOLUTION DEVELOPMENT
|
17 |
+
|
18 |
+
- Present your solution using clear, logical steps that others can follow and review
|
19 |
+
- Explain your reasoning and methodology before presenting final conclusions
|
20 |
+
- Provide detailed explanations for each step of the process
|
21 |
+
- Check and verify all assumptions and intermediate calculations
|
22 |
+
|
23 |
+
## 3. PROGRAMMING TASKS
|
24 |
+
|
25 |
+
When coding is required:
|
26 |
+
- Write clean, efficient Python code
|
27 |
+
- Follow the strict code formatting and execution protocol (always use the Python code formatting block; furthermore, after the code block, always explicitly request execution by appending: "EXECUTE CODE!"):
|
28 |
+
```python
|
29 |
+
# Your code here
|
30 |
+
```
|
31 |
+
EXECUTE CODE!
|
32 |
+
|
33 |
+
- All required imports and dependencies should be clearly declared at the top of your code
|
34 |
+
- Include clear inline comments to explain any complex programming logic
|
35 |
+
- Perform result validation after executing your code
|
36 |
+
- Apply optimization techniques from the cheatsheet when applicable
|
37 |
+
- The code should be completely self-contained without external file dependencies--it should be ready to be executed right away
|
38 |
+
- Do not include any placeholders, system-specific paths, or hard-coded local paths
|
39 |
+
- Feel free to use standard and widely-used pip packages
|
40 |
+
- Opt for alternative methods if errors persist during execution
|
41 |
+
- Exclude local paths and engine-specific settings (e.g., avoid configurations like chess.engine.SimpleEngine.popen_uci("/usr/bin/stockfish"))
|
42 |
+
|
43 |
+
## 4. FINAL ANSWER FORMAT
|
44 |
+
|
45 |
+
ALWAYS present your final answer in the following format:
|
46 |
+
|
47 |
+
FINAL ANSWER:
|
48 |
+
<answer>
|
49 |
+
(final answer)
|
50 |
+
</answer>
|
51 |
+
|
52 |
+
N.B. Make sure that the final answer is properly wrapped inside the <answer> block.
|
53 |
+
|
54 |
+
* For multiple-choice questions: Only provide the letter choice (e.g., (A))
|
55 |
+
* For numerical answers: Only provide the final number (e.g., 42)
|
56 |
+
* For other types of answers, including free-response answers: Provide the complete final answer
|
57 |
+
|
58 |
+
Example:
|
59 |
+
Q: What is the meaning of life?
|
60 |
+
A: [...]
|
61 |
+
FINAL ANSWER:
|
62 |
+
<answer>
|
63 |
+
42
|
64 |
+
</answer>
|
65 |
+
|
66 |
+
-----
|
67 |
+
|
68 |
+
CHEATSHEET:
|
69 |
+
'''
|
70 |
+
[[CHEATSHEET]]
|
71 |
+
'''
|
72 |
+
|
73 |
+
-----
|
74 |
+
-----
|
75 |
+
|
76 |
+
Now it is time to solve the following question.
|
77 |
+
|
78 |
+
CURRENT INPUT:
|
79 |
+
'''
|
80 |
+
[[QUESTION]]
|
81 |
+
'''
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
litellm
|
3 |
+
numpy
|
4 |
+
scikit-learn
|
5 |
+
tiktoken
|
6 |
+
# openai # litellm handles openai compatible endpoints, direct openai sdk might not be needed by the app itself
|
7 |
+
|