Update gen_api_answer.py

gen_api_answer.py  +105 -104  CHANGED
@@ -3,88 +3,22 @@ import anthropic
 from together import Together
 import json
 import re
-import random
+import os
+import requests
 
 # Initialize clients
 anthropic_client = anthropic.Anthropic()
 openai_client = OpenAI()
 together_client = Together()
-
-
-
-
-
-GENERATION_PROMPT = """Please generate a random human message and an AI response in the format of a QA dataset. The human input should not be a one-word answer question like "What is the capital of France?". The AI response generated should be at least a few sentences long."""
-
-RESPONSE_GENERATION_SYSTEM_PROMPT = "You are an assistant that generates random responses to human messages for testing purposes. Generate bad responses (with a mix of correct and incorrect information) 60% of the time and good responses 40% of the time. Do not say which type of response you are generating, just generate the response."
-
-def get_random_human_ai_pair():
-    # Select system prompt with specified probabilities
-    system_prompt = random.choices(
-        [GOOD_SYSTEM_PROMPT, BAD_SYSTEM_PROMPT, AMBIGUOUS_SYSTEM_PROMPT],
-        weights=[0.2, 0.2, 0.6]  # 20% good, 20% bad, 60% ambiguous
-    )[0]
-
-    # Log which type of response is being generated
-    prompt_type = {
-        GOOD_SYSTEM_PROMPT: "good",
-        BAD_SYSTEM_PROMPT: "bad",
-        AMBIGUOUS_SYSTEM_PROMPT: "ambiguous"
-    }[system_prompt]
-    print(f"Generating {prompt_type} response")
-
-    # Randomly choose between GPT-3.5 and Claude with 65%/35% weights
-    model_choice = random.choices([
-        ("gpt-3.5-turbo", get_openai_response),
-        ("claude-3-5-haiku-latest", get_anthropic_response)
-    ], weights=[0.5, 0.5])[0]
-    model_name, api_func = model_choice
-
-    # Generate response using selected model
-    response = api_func(
-        model_name=model_name,
-        prompt=GENERATION_PROMPT,
-        system_prompt=system_prompt,
-        max_tokens=500,
-        temperature=1
-    )
-
-    # Define default messages outside the try block
-    default_human = "How do muscles grow?"
-    default_ai = """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis."""
-
-    # Parse the response to get the human input and AI response
-    try:
-        # First try to parse the entire response as JSON
-        try:
-            # Clean the response by replacing newlines with spaces
-            cleaned_response = response.replace('\n', ' ').replace('\r', '')
-            data = json.loads(cleaned_response)
-        except json.JSONDecodeError:
-            # If that fails, try to find JSON within the response
-            json_match = re.search(r"{.*}", response, re.DOTALL)
-            if json_match:
-                cleaned_match = json_match.group(0).replace('\n', ' ').replace('\r', '')
-                data = json.loads(cleaned_match)
-            else:
-                raise json.JSONDecodeError("No valid JSON found", response, 0)
-
-        # Extract messages with fallbacks
-        human_message = data.get("human", default_human)
-        ai_message = data.get("ai", default_ai)
-
-        # Debug logging
-        print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...'")
-
-    except Exception as e:
-        print(f"Failed to parse response: {str(e)}\n {response}")
-        human_message = default_human
-        ai_message = default_ai
-
-    return human_message, ai_message
+hf_api_key = os.getenv("HF_API_KEY")
+huggingface_client = OpenAI(
+    base_url="https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud/v1/",
+    api_key=hf_api_key
+)
 
 JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
 
+ALTERNATIVE_JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction."""
 
 def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
     """Get response from OpenAI API"""
@@ -102,7 +36,6 @@ def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, m
     except Exception as e:
         return f"Error with OpenAI model {model_name}: {str(e)}"
 
-
 def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
     """Get response from Anthropic API"""
     try:
@@ -117,7 +50,6 @@ def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT
     except Exception as e:
         return f"Error with Anthropic model {model_name}: {str(e)}"
 
-
 def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
     """Get response from Together API"""
     try:
@@ -135,8 +67,40 @@ def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT,
     except Exception as e:
         return f"Error with Together model {model_name}: {str(e)}"
 
-
-def get_model_response(model_name, model_info, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
+def get_hf_response(model_name, prompt, max_tokens=500):
+    """Get response from Hugging Face model"""
+    try:
+        headers = {
+            "Accept": "application/json",
+            "Authorization": f"Bearer {hf_api_key}",
+            "Content-Type": "application/json"
+        }
+
+        payload = {
+            "inputs": prompt,
+            "parameters": {
+                "max_new_tokens": max_tokens,
+                "return_full_text": False
+            }
+        }
+
+        response = requests.post(
+            "https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
+            headers=headers,
+            json=payload
+        )
+        return response.json()[0]["generated_text"]
+    except Exception as e:
+        return f"Error with Hugging Face model {model_name}: {str(e)}"
+
+def get_model_response(
+    model_name,
+    model_info,
+    prompt,
+    use_alternative_prompt=False,
+    max_tokens=500,
+    temperature=0
+):
     """Get response from appropriate API based on model organization"""
     if not model_info:
         return "Model not found or unsupported."
@@ -144,18 +108,33 @@ def get_model_response(model_name, model_info, prompt, system_prompt=JUDGE_SYSTE
     api_model = model_info["api_model"]
     organization = model_info["organization"]
 
+    # Select the appropriate system prompt
+    if use_alternative_prompt:
+        system_prompt = ALTERNATIVE_JUDGE_SYSTEM_PROMPT
+    else:
+        system_prompt = JUDGE_SYSTEM_PROMPT
+
     try:
         if organization == "OpenAI":
-            return get_openai_response(api_model, prompt, system_prompt, max_tokens, temperature)
+            return get_openai_response(
+                api_model, prompt, system_prompt, max_tokens, temperature
+            )
         elif organization == "Anthropic":
-            return get_anthropic_response(api_model, prompt, system_prompt, max_tokens, temperature)
+            return get_anthropic_response(
+                api_model, prompt, system_prompt, max_tokens, temperature
+            )
+        elif organization == "Prometheus":
+            return get_hf_response(
+                api_model, prompt, max_tokens
+            )
         else:
             # All other organizations use Together API
-            return get_together_response(api_model, prompt, system_prompt, max_tokens, temperature)
+            return get_together_response(
+                api_model, prompt, system_prompt, max_tokens, temperature
+            )
     except Exception as e:
         return f"Error with {organization} model {model_name}: {str(e)}"
 
-
 def parse_model_response(response):
     try:
         # Debug print
@@ -179,27 +158,49 @@ def parse_model_response(response):
         print(f"Failed to parse response: {str(e)}")
         return "Error", f"Failed to parse response: {response}"
 
-def
-    """Generate AI response using GPT-3.5-turbo"""
-    if not human_msg.strip():
-        return "", False
-
+def alternative_parse_model_response(output):
     try:
-        response
-
-
-
-
-
-        )
-
-        if
-
-
-
-
-
-
-
+        print(f"Raw model response: {output}")
+
+        # Remove "Feedback:" prefix if present (case insensitive)
+        output = re.sub(r'^feedback:\s*', '', output.strip(), flags=re.IGNORECASE)
+
+        # First, try to match the pattern "... [RESULT] X"
+        pattern = r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?"
+        match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
+        if match:
+            feedback = match.group(1).strip()
+            score = int(match.group(2))
+            return str(score), feedback
+
+        # If no match, try to match "... Score: X"
+        pattern = r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?"
+        match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
+        if match:
+            feedback = match.group(1).strip()
+            score = int(match.group(2))
+            return str(score), feedback
+
+        # Pattern to handle [Score X] at the end
+        pattern = r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$"
+        match = re.search(pattern, output, re.DOTALL)
+        if match:
+            feedback = match.group(1).strip()
+            score = int(match.group(2))
+            return str(score), feedback
+
+        # Final fallback attempt
+        pattern = r"[\(\[]?(\d+)[\)\]]?\s*\]?$"
+        match = re.search(pattern, output)
+        if match:
+            score = int(match.group(1))
+            feedback = output[:match.start()].rstrip()
+            # Remove any trailing brackets from feedback
+            feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
+            return str(score), feedback
+
+        return "Error", f"Failed to parse response: {output}"
+
     except Exception as e:
-
+        print(f"Failed to parse response: {str(e)}")
+        return "Error", f"Exception during parsing: {str(e)}"