Update llmeval.py
Reworks Observation_LLM_Evaluator to evaluate each metric with its own system prompt (dispatched via match/case) and to collect the per-metric responses before writing them back with de.Update.

llmeval.py  +74 -13  CHANGED
@@ -194,32 +194,93 @@ class LLM_as_Evaluator():
 
     def Observation_LLM_Evaluator(self,promptversion):
 
-        metrics=["biological_context_alignment","contextual_relevance_alignment","
+        metrics=["biological_context_alignment","contextual_relevance_alignment","response_specificity","unit_coherence"]
 
         data_to_evaluate=de.GetData(promptversion)
-        import time
 
-        for metric in metrics:
-
-
-
-            {"role":"user","content":f"""
-            Prompt :{data_to_evaluate["prompt"]}
-            Context :{data_to_evaluate["context"]}
-            Agent's Response : {data_to_evaluate["response"]}
-            """}
-            ]
-            evaluation_response=self.___engine_core(messages=messages)
 
+        evaluation_responses=[]
+
+        for metric in metrics:
+
+            match metric:
+                case "biological_context_alignment":
+
+                    messages =[
+
+                        {"role":"system","content":SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT},
+                        {"role":"user","content":f"""
+                        Prompt :{data_to_evaluate["prompt"]}
+                        Context :{data_to_evaluate["context"]}
+                        Agent's Response : {data_to_evaluate["response"]}
+                        """}
+                    ]
+                    evaluation_response=self.___engine_core(messages=messages)
+                    evaluation_responses.append({"bio_context_alignment":evaluation_response})
+
+                case "contextual_relevance_alignment":
+
+                    messages =[
+
+                        {"role":"system","content":SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT},
+                        {"role":"user","content":f"""
+                        Prompt :{data_to_evaluate["prompt"]}
+                        Context :{data_to_evaluate["context"]}
+                        Agent's Response : {data_to_evaluate["response"]}
+                        """}
+                    ]
+                    evaluation_response=self.___engine_core(messages=messages)
+                    evaluation_responses.append({"contextual_relevance_alignment":evaluation_response})
+
+
+                case "response_specificity":
+
+                    messages =[
+
+                        {"role":"system","content":SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY},
+                        {"role":"user","content":f"""
+                        Prompt :{data_to_evaluate["prompt"]}
+                        Context :{data_to_evaluate["context"]}
+                        Agent's Response : {data_to_evaluate["response"]}
+                        """}
+                    ]
+                    evaluation_response=self.___engine_core(messages=messages)
+                    evaluation_responses.append({"response_specificity":evaluation_response})
+
+
+                case "unit_coherence":
+
+                    messages =[
+
+                        {"role":"system","content":SYSTEM_PROMPT_FOR_TRIAD_COHERENCE},
+                        {"role":"user","content":f"""
+                        Prompt :{data_to_evaluate["prompt"]}
+                        Context :{data_to_evaluate["context"]}
+                        Agent's Response : {data_to_evaluate["response"]}
+                        """}
+                    ]
+                    evaluation_response=self.___engine_core(messages=messages)
+                    evaluation_responses.append({"unit_coherence":evaluation_response})
+
+
         data={
 
+
             "promptversion":promptversion,
             "biological_context_alignment":"",
             "contextual_relevance_alignment":"",
             "unit_coherence":"",
             "response_specificity":""
         }
+
+        for resp in evaluation_responses:
+
+            data["biological_context_alignment"]=resp["biological_context_alignment"]
+            data["contextual_relevance_alignment"]=resp["contextual_relevance_alignment"]
+            data["unit_coherence"]=resp["unit_coherence"]
+            data["response_specificity"]=resp["response_specificity"]
+
+
         de.Update(data=data)
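
Reviewer note (not part of the commit): the merge loop at the end of the hunk reads all four metric keys out of every entry of evaluation_responses, but each appended dict carries exactly one key, and the first case appends under "bio_context_alignment" rather than "biological_context_alignment". As committed, the first iteration raises KeyError. A minimal sketch of the intended aggregation, assuming each appended dict holds one metric's result under its canonical key:

# Sketch only: merge each single-key result dict into the data record.
# Assumes the first case is changed to append under
# "biological_context_alignment" so the key matches the data schema.
for resp in evaluation_responses:
    data.update(resp)  # copies {metric_name: evaluation_response} into data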
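A possible follow-up refactor (sketch, not in this commit): the four case arms differ only in the system-prompt constant and the key used for the result, so a table-driven loop built from names already present in the diff (the four SYSTEM_* constants, self.___engine_core, data_to_evaluate) would remove the duplicated message construction:

# Hypothetical replacement for the match/case dispatch. The user message is
# identical across metrics, so it is built once; only the system prompt varies.
system_prompts = {
    "biological_context_alignment": SYSTEM_FOR_BIO_CONTEXT_ALIGNMENT,
    "contextual_relevance_alignment": SYSTEM_FOR_CONTEXTUAL_RELEVANCE_ALIGNMENT,
    "response_specificity": SYSTEM_PROMPT_FOR_RESPONSE_SPECIFICITY,
    "unit_coherence": SYSTEM_PROMPT_FOR_TRIAD_COHERENCE,
}
user_message = f"""
Prompt :{data_to_evaluate["prompt"]}
Context :{data_to_evaluate["context"]}
Agent's Response : {data_to_evaluate["response"]}
"""
evaluation_responses = []
for metric, system_prompt in system_prompts.items():
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    # Store each verdict under the metric's canonical key so the later
    # merge into `data` can key on the metric name directly.
    evaluation_responses.append({metric: self.___engine_core(messages=messages)})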