Commit 8e8067f (1 parent: 3d8833c)
IliaLarchenko committed: Fixed tests and candidate simulation
Files changed:
- pytest.ini (+5, -0)
- tests/candidate.py (+21, -13)
- tests/test_e2e.py (+9, -6)
pytest.ini (ADDED)

```diff
@@ -0,0 +1,5 @@
+[pytest]
+log_cli = true
+log_cli_level = INFO
+log_cli_format = %(asctime)s [%(levelname)s] %(message)s
+log_cli_date_format = %Y-%m-%d %H:%M:%S
```
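With `log_cli = true`, pytest shows log records live in the terminal while the tests run, at the level and in the format configured above, instead of only collecting them for the failure report. A minimal sketch of a test whose `logging.info` output becomes visible under this configuration (the module and test name are illustrative, not part of the commit):

```python
# Hypothetical test module, e.g. tests/test_logging_demo.py.
# With the pytest.ini above, the INFO record below is printed live
# in the pytest output using the configured timestamp/level/message format.
import logging


def test_info_record_is_shown_live() -> None:
    logging.info("visible during the run thanks to log_cli = true")
    assert True
```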
tests/candidate.py (CHANGED)

```diff
@@ -87,40 +87,48 @@ def complete_interview(
     previous_code = ""
 
     if max_messages is None:
-        max_messages = …
+        max_messages = 25 if mode == "normal" else 5
 
     for _ in range(max_messages):
+        code = ""
         if mode == "empty":
-            …
+            candidate_message = ""
         elif mode == "gibberish":
-            …
+            candidate_message = "".join(random.choices(string.ascii_letters + string.digits, k=50))
         elif mode == "repeat":
-            …
+            candidate_message = chat_display[-1][1]
         else:
             response = client.chat.completions.create(
                 model=model, messages=messages_candidate, temperature=1, response_format={"type": "json_object"}, stream=False
             )
             try:
                 response_json = json.loads(response.choices[0].message.content)
-                …
+                candidate_message = response_json.get("message", "")
+                code = response_json.get("code_and_notes", "")
+                finished = response_json.get("finished", False)
+                question = response_json.get("question", False)
+
+                if finished and not question and not code:
+                    break
             except:
                 continue
 
-        candidate_message …
-        …
-        if not candidate_message and mode != "empty":
-            print("No message in response")
+        if not candidate_message and not code and mode != "empty":
+            print("No message or code in response")
             continue
 
-        …
-        …
-        …
+        if candidate_message:
+            messages_candidate.append({"role": "assistant", "content": candidate_message})
+            interview_data["transcript"].append(f"CANDIDATE MESSAGE: {candidate_message}")
+        if code:
+            interview_data["transcript"].append(f"CANDIDATE CODE AND NOTES: {code}")
+            messages_candidate.append({"role": "assistant", "content": code})
 
         chat_display.append([candidate_message, None])
 
         send_time = time.time()
         for messages_interviewer, chat_display, previous_code, _ in send_request(
-            …
+            code, previous_code, messages_interviewer, chat_display, llm, tts=None, silent=True
         ):
             pass
 
```
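For orientation, here is a minimal, self-contained sketch of the mode handling the updated loop now implements: the three edge-case modes synthesize a candidate reply locally, and only the normal mode calls the LLM. The helper name `fake_candidate_message` and the sample history are illustrative, not part of the repository:

```python
import random
import string
from typing import List, Optional


def fake_candidate_message(mode: str, chat_display: List[List[Optional[str]]]) -> str:
    """Illustrative helper mirroring the non-LLM test modes in complete_interview."""
    if mode == "empty":
        # Simulate a candidate who sends nothing.
        return ""
    if mode == "gibberish":
        # 50 random alphanumeric characters, as in the committed test code.
        return "".join(random.choices(string.ascii_letters + string.digits, k=50))
    if mode == "repeat":
        # Echo the interviewer's last message back verbatim.
        return chat_display[-1][1] or ""
    raise ValueError(f"unknown mode: {mode}")


if __name__ == "__main__":
    history = [["hi", "Please describe your approach to the problem."]]
    for m in ("empty", "gibberish", "repeat"):
        print(m, "->", repr(fake_candidate_message(m, history)))
```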
tests/test_e2e.py (CHANGED)

```diff
@@ -2,6 +2,7 @@ from tests.candidate import complete_interview
 from tests.grader import grade
 from concurrent.futures import ThreadPoolExecutor
 import random
+import logging
 from typing import List
 
 
@@ -14,7 +15,9 @@ def complete_and_grade_interview(interview_type: str, mode: str = "normal", min_
     :return: Overall score of the interview.
     """
     file_path, _ = complete_interview(interview_type, "test", model="gpt-4o-mini", mode=mode)
-    feedback = grade(file_path, model="gpt-…
+    feedback = grade(file_path, model="gpt-4o")
+
+    logging.info(f"Interview type: {interview_type}, mode: {mode}, score: {feedback['overall_score']}")
     assert feedback["overall_score"] > min_score
     return feedback["overall_score"]
 
@@ -31,13 +34,13 @@ def test_complete_interview() -> None:
         futures = [executor.submit(complete_and_grade_interview, it) for it in interview_types]
 
         # Test edge cases: empty, gibberish, repeat for one random interview type each
-        …
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="…
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="…
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="repeat", min_score=0.0))
+        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="empty"))
+        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="gibberish"))
+        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="repeat"))
 
         for future in futures:
             score = future.result()
             scores.append(score)
 
-    …
+    logging.info(f"Average score: {sum(scores) / len(scores)}")
+    assert sum(scores) / len(scores) > 0.7
```
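The end-to-end test fans the interviews out with a ThreadPoolExecutor, logs each score, and then asserts an average-score floor on top of the per-interview threshold. A minimal, self-contained sketch of that pattern, with a stubbed `run_and_score` standing in for the real interview-plus-grading call (the stub, its task names, and its fixed score are illustrative only):

```python
import logging
from concurrent.futures import ThreadPoolExecutor

logging.basicConfig(level=logging.INFO)


def run_and_score(task: str, min_score: float = 0.7) -> float:
    # Stub for complete_and_grade_interview: a real run would grade a transcript.
    score = 0.9  # placeholder value; the real test reads this from the grader feedback
    logging.info(f"task: {task}, score: {score}")
    assert score > min_score
    return score


def test_all_tasks() -> None:
    tasks = ["coding", "ml_design", "system_design"]  # illustrative task names
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(run_and_score, t) for t in tasks]
        scores = [future.result() for future in futures]

    logging.info(f"Average score: {sum(scores) / len(scores)}")
    assert sum(scores) / len(scores) > 0.7
```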