kaikaidai committed on
Commit
6e812c0
1 Parent(s): e098d1e

Update gen_api_answer.py

Browse files
Files changed (1) hide show
  1. gen_api_answer.py +105 -104
gen_api_answer.py CHANGED
@@ -3,88 +3,22 @@ import anthropic
3
  from together import Together
4
  import json
5
  import re
6
- import random
 
7
 
8
  # Initialize clients
9
  anthropic_client = anthropic.Anthropic()
10
  openai_client = OpenAI()
11
  together_client = Together()
12
-
13
- GOOD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The AI response generated should be roughly a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
14
- BAD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The AI response generated should be roughly a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
15
- AMBIGUOUS_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The AI response generated should be roughly a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
16
-
17
- GENERATION_PROMPT = """Please generate a random human message and an AI response in the format of a QA dataset. The human input should not be a one-word answer question like "What is the capital of France?". The AI response generated should be at least a few sentences long."""
18
-
19
- RESPONSE_GENERATION_SYSTEM_PROMPT = "You are an assistant that generates random responses to human messages for testing purposes. Generate bad responses (with a mix of correct and incorrect information) 60% of the time and good responses 40% of the time. Do not say which type of response you are generating, just generate the response."
20
-
21
def get_random_human_ai_pair():
    """Generate a random (human message, AI response) pair for testing.

    Samples a system prompt (20% good / 20% bad / 60% ambiguous) and a
    generation model (50/50 between GPT-3.5 and Claude), then parses the
    model output as JSON. Falls back to a hard-coded default pair when the
    output cannot be parsed.

    Returns:
        tuple[str, str]: (human_message, ai_message)
    """
    # Select system prompt with specified probabilities
    system_prompt = random.choices(
        [GOOD_SYSTEM_PROMPT, BAD_SYSTEM_PROMPT, AMBIGUOUS_SYSTEM_PROMPT],
        weights=[0.2, 0.2, 0.6]  # 20% good, 20% bad, 60% ambiguous
    )[0]

    # Log which type of response is being generated
    prompt_type = {
        GOOD_SYSTEM_PROMPT: "good",
        BAD_SYSTEM_PROMPT: "bad",
        AMBIGUOUS_SYSTEM_PROMPT: "ambiguous"
    }[system_prompt]
    print(f"Generating {prompt_type} response")

    # Randomly choose between GPT-3.5 and Claude with equal 50%/50% weights
    # (previous comment claimed 65%/35%, which did not match the code).
    model_choice = random.choices([
        ("gpt-3.5-turbo", get_openai_response),
        ("claude-3-5-haiku-latest", get_anthropic_response)
    ], weights=[0.5, 0.5])[0]
    model_name, api_func = model_choice

    # Generate response using selected model
    response = api_func(
        model_name=model_name,
        prompt=GENERATION_PROMPT,
        system_prompt=system_prompt,
        max_tokens=500,
        temperature=1
    )

    # Define default messages outside the try block so the except path can use them
    default_human = "How do muscles grow?"
    default_ai = """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis."""

    # Parse the response to get the human input and AI response
    try:
        # First try to parse the entire response as JSON
        try:
            # Clean the response by replacing newlines with spaces
            cleaned_response = response.replace('\n', ' ').replace('\r', '')
            data = json.loads(cleaned_response)
        except json.JSONDecodeError:
            # If that fails, try to find JSON within the response
            json_match = re.search(r"{.*}", response, re.DOTALL)
            if json_match:
                cleaned_match = json_match.group(0).replace('\n', ' ').replace('\r', '')
                data = json.loads(cleaned_match)
            else:
                raise json.JSONDecodeError("No valid JSON found", response, 0)

        # Extract messages with fallbacks
        human_message = data.get("human", default_human)
        ai_message = data.get("ai", default_ai)

        # Debug logging
        print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...'")

    except Exception as e:
        print(f"Failed to parse response: {str(e)}\n {response}")
        human_message = default_human
        ai_message = default_ai

    return human_message, ai_message
85
 
86
  JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
87
 
 
88
 
89
  def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
90
  """Get response from OpenAI API"""
@@ -102,7 +36,6 @@ def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, m
102
  except Exception as e:
103
  return f"Error with OpenAI model {model_name}: {str(e)}"
104
 
105
-
106
  def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
107
  """Get response from Anthropic API"""
108
  try:
@@ -117,7 +50,6 @@ def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT
117
  except Exception as e:
118
  return f"Error with Anthropic model {model_name}: {str(e)}"
119
 
120
-
121
  def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
122
  """Get response from Together API"""
123
  try:
@@ -135,8 +67,40 @@ def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT,
135
  except Exception as e:
136
  return f"Error with Together model {model_name}: {str(e)}"
137
 
138
-
139
- def get_model_response(model_name, model_info, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  """Get response from appropriate API based on model organization"""
141
  if not model_info:
142
  return "Model not found or unsupported."
@@ -144,18 +108,33 @@ def get_model_response(model_name, model_info, prompt, system_prompt=JUDGE_SYSTE
144
  api_model = model_info["api_model"]
145
  organization = model_info["organization"]
146
 
 
 
 
 
 
 
147
  try:
148
  if organization == "OpenAI":
149
- return get_openai_response(api_model, prompt, system_prompt, max_tokens, temperature)
 
 
150
  elif organization == "Anthropic":
151
- return get_anthropic_response(api_model, prompt, system_prompt, max_tokens, temperature)
 
 
 
 
 
 
152
  else:
153
  # All other organizations use Together API
154
- return get_together_response(api_model, prompt, system_prompt, max_tokens, temperature)
 
 
155
  except Exception as e:
156
  return f"Error with {organization} model {model_name}: {str(e)}"
157
 
158
-
159
  def parse_model_response(response):
160
  try:
161
  # Debug print
@@ -179,27 +158,49 @@ def parse_model_response(response):
179
  print(f"Failed to parse response: {str(e)}")
180
  return "Error", f"Failed to parse response: {response}"
181
 
182
def generate_ai_response(human_msg):
    """Generate AI response using GPT-3.5-turbo"""
    # A blank or whitespace-only message yields no response at all.
    if not human_msg.strip():
        return "", False

    try:
        reply = get_openai_response(
            "gpt-3.5-turbo",
            human_msg,
            system_prompt=RESPONSE_GENERATION_SYSTEM_PROMPT,
            max_tokens=1000,
            temperature=1
        )
        # The model may wrap its answer in JSON; if so, unwrap the "content"
        # field, otherwise keep the raw string as-is.
        if isinstance(reply, str):
            try:
                reply = json.loads(reply).get("content", reply)
            except json.JSONDecodeError:
                pass
        # Second element is the button interactive state expected by the caller.
        return reply, False
    except Exception as e:
        return f"Error generating response: {str(e)}", False
 
 
3
  from together import Together
4
  import json
5
  import re
6
+ import os
7
+ import requests
8
 
9
  # Initialize clients
10
  anthropic_client = anthropic.Anthropic()
11
  openai_client = OpenAI()
12
  together_client = Together()
13
+ hf_api_key = os.getenv("HF_API_KEY")
14
+ huggingface_client = OpenAI(
15
+ base_url="https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud/v1/",
16
+ api_key=hf_api_key
17
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
20
 
21
+ ALTERNATIVE_JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction."""
22
 
23
  def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
24
  """Get response from OpenAI API"""
 
36
  except Exception as e:
37
  return f"Error with OpenAI model {model_name}: {str(e)}"
38
 
 
39
  def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
40
  """Get response from Anthropic API"""
41
  try:
 
50
  except Exception as e:
51
  return f"Error with Anthropic model {model_name}: {str(e)}"
52
 
 
53
  def get_together_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
54
  """Get response from Together API"""
55
  try:
 
67
  except Exception as e:
68
  return f"Error with Together model {model_name}: {str(e)}"
69
 
70
def get_hf_response(model_name, prompt, max_tokens=500):
    """Get response from Hugging Face model.

    Args:
        model_name: Display name, used only in the returned error message.
        prompt: Text sent as the endpoint's ``inputs`` field.
        max_tokens: Cap on generated tokens (``max_new_tokens``).

    Returns:
        The generated text, or an error string on any failure.
    """
    try:
        headers = {
            "Accept": "application/json",
            "Authorization": f"Bearer {hf_api_key}",
            "Content-Type": "application/json"
        }

        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": max_tokens,
                "return_full_text": False
            }
        }

        response = requests.post(
            "https://otb7jglxy6r37af6.us-east-1.aws.endpoints.huggingface.cloud",
            headers=headers,
            json=payload,
            # Without a timeout requests can block forever on a stuck endpoint.
            timeout=60
        )
        # Surface HTTP errors (401/503/...) as a clear message instead of
        # failing later with a confusing KeyError on the JSON body.
        response.raise_for_status()
        return response.json()[0]["generated_text"]
    except Exception as e:
        return f"Error with Hugging Face model {model_name}: {str(e)}"
95
+
96
def get_model_response(
    model_name,
    model_info,
    prompt,
    use_alternative_prompt=False,
    max_tokens=500,
    temperature=0
):
    """Get response from appropriate API based on model organization"""
    # Guard: unknown/unsupported model.
    if not model_info:
        return "Model not found or unsupported."

    api_model = model_info["api_model"]
    organization = model_info["organization"]

    # Pick the judge system prompt variant requested by the caller.
    system_prompt = (
        ALTERNATIVE_JUDGE_SYSTEM_PROMPT
        if use_alternative_prompt
        else JUDGE_SYSTEM_PROMPT
    )

    try:
        if organization == "OpenAI":
            return get_openai_response(
                api_model, prompt, system_prompt, max_tokens, temperature
            )
        if organization == "Anthropic":
            return get_anthropic_response(
                api_model, prompt, system_prompt, max_tokens, temperature
            )
        if organization == "Prometheus":
            # Hugging Face endpoint takes neither a system prompt nor temperature.
            return get_hf_response(api_model, prompt, max_tokens)
        # All other organizations use Together API
        return get_together_response(
            api_model, prompt, system_prompt, max_tokens, temperature
        )
    except Exception as e:
        return f"Error with {organization} model {model_name}: {str(e)}"
137
 
 
138
  def parse_model_response(response):
139
  try:
140
  # Debug print
 
158
  print(f"Failed to parse response: {str(e)}")
159
  return "Error", f"Failed to parse response: {response}"
160
 
161
def alternative_parse_model_response(output):
    """Extract (score, feedback) from a free-form judge response.

    Tries several trailing-score layouts in order of specificity:
    "... [RESULT] X", "... Score: X" / "... Result: X", "... [Score X]",
    and finally a bare number at the end of the text. Returns
    (score_as_str, feedback) on success or ("Error", message) otherwise.
    """
    try:
        print(f"Raw model response: {output}")

        # Strip an optional leading "Feedback:" label (case insensitive).
        output = re.sub(r'^feedback:\s*', '', output.strip(), flags=re.IGNORECASE)

        # Feedback-then-score layouts, most specific first.
        layouts = (
            (r"(.*?)\s*\[RESULT\]\s*[\(\[]?(\d+)[\)\]]?", re.DOTALL | re.IGNORECASE),
            (r"(.*?)\s*(?:Score|Result)\s*:\s*[\(\[]?(\d+)[\)\]]?", re.DOTALL | re.IGNORECASE),
            (r"(.*?)\s*\[(?:Score|Result)\s*[\(\[]?(\d+)[\)\]]?\]$", re.DOTALL),
        )
        for pattern, flags in layouts:
            found = re.search(pattern, output, flags)
            if found:
                return str(int(found.group(2))), found.group(1).strip()

        # Final fallback: a bare number at the very end of the text.
        found = re.search(r"[\(\[]?(\d+)[\)\]]?\s*\]?$", output)
        if found:
            feedback = output[:found.start()].rstrip()
            # Drop any dangling open bracket left at the end of the feedback.
            feedback = re.sub(r'\s*\[[^\]]*$', '', feedback).strip()
            return str(int(found.group(1))), feedback

        return "Error", f"Failed to parse response: {output}"

    except Exception as e:
        print(f"Failed to parse response: {str(e)}")
        return "Error", f"Exception during parsing: {str(e)}"