IliaLarchenko committed
Commit 8e8067f
Parent: 3d8833c

Fixed tests and candidate simulation

Files changed (3):
  1. pytest.ini (+5, -0)
  2. tests/candidate.py (+21, -13)
  3. tests/test_e2e.py (+9, -6)
pytest.ini ADDED
@@ -0,0 +1,5 @@
+[pytest]
+log_cli = true
+log_cli_level = INFO
+log_cli_format = %(asctime)s [%(levelname)s] %(message)s
+log_cli_date_format = %Y-%m-%d %H:%M:%S
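The new pytest.ini turns on pytest's live logging: records at INFO and above are streamed to the terminal during the run, in the timestamped format configured above. This is what makes the logging.info calls added to tests/test_e2e.py below visible while the suite executes. A minimal sketch of a test that surfaces through this config (the test name and message are illustrative):

```python
import logging


def test_live_logging() -> None:
    # With the pytest.ini above, this record is printed while the test runs,
    # formatted roughly as: 2024-01-01 12:00:00 [INFO] candidate simulation started
    logging.info("candidate simulation started")
```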
tests/candidate.py CHANGED
@@ -87,40 +87,48 @@ def complete_interview(
     previous_code = ""
 
     if max_messages is None:
-        max_messages = 30 if mode == "normal" else 5
+        max_messages = 25 if mode == "normal" else 5
 
     for _ in range(max_messages):
+        code = ""
         if mode == "empty":
-            response_content = ""
+            candidate_message = ""
         elif mode == "gibberish":
-            response_content = "".join(random.choices(string.ascii_letters + string.digits, k=50))
+            candidate_message = "".join(random.choices(string.ascii_letters + string.digits, k=50))
         elif mode == "repeat":
-            response_content = chat_display[-1][1]
+            candidate_message = chat_display[-1][1]
         else:
             response = client.chat.completions.create(
                 model=model, messages=messages_candidate, temperature=1, response_format={"type": "json_object"}, stream=False
             )
             try:
                 response_json = json.loads(response.choices[0].message.content)
-                response_content = response_json.get("message", "")
+                candidate_message = response_json.get("message", "")
+                code = response_json.get("code_and_notes", "")
+                finished = response_json.get("finished", False)
+                question = response_json.get("question", False)
+
+                if finished and not question and not code:
+                    break
             except:
                 continue
 
-        candidate_message = response_content
-
-        if not candidate_message and mode != "empty":
-            print("No message in response")
+        if not candidate_message and not code and mode != "empty":
+            print("No message or code in response")
             continue
 
-        messages_candidate.append({"role": "assistant", "content": candidate_message})
-
-        interview_data["transcript"].append(f"CANDIDATE MESSAGE: {candidate_message}")
+        if candidate_message:
+            messages_candidate.append({"role": "assistant", "content": candidate_message})
+            interview_data["transcript"].append(f"CANDIDATE MESSAGE: {candidate_message}")
+        if code:
+            interview_data["transcript"].append(f"CANDIDATE CODE AND NOTES: {code}")
+            messages_candidate.append({"role": "assistant", "content": code})
 
         chat_display.append([candidate_message, None])
 
         send_time = time.time()
         for messages_interviewer, chat_display, previous_code, _ in send_request(
-            candidate_message, previous_code, messages_interviewer, chat_display, llm, tts=None, silent=True
+            code, previous_code, messages_interviewer, chat_display, llm, tts=None, silent=True
         ):
             pass
 
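For reference, the rewritten branch expects the simulated candidate to reply with a JSON object carrying four fields, reconstructed here from the .get() calls above. A minimal sketch of parsing such a reply (the field names come from the diff; the example values are invented):

```python
import json

# Illustrative reply from the simulated candidate model.
raw = '{"message": "I would use a hash map here.", "code_and_notes": "counts = {}", "finished": false, "question": false}'

response_json = json.loads(raw)
candidate_message = response_json.get("message", "")
code = response_json.get("code_and_notes", "")
finished = response_json.get("finished", False)
question = response_json.get("question", False)

# New stopping rule: end the interview only when the candidate reports it is
# finished and has no pending question or code to submit.
if finished and not question and not code:
    print("interview finished")
```

Note that send_request now receives code as its first argument instead of candidate_message, so the candidate's code and notes are what get forwarded to the interviewer.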
tests/test_e2e.py CHANGED
@@ -2,6 +2,7 @@ from tests.candidate import complete_interview
 from tests.grader import grade
 from concurrent.futures import ThreadPoolExecutor
 import random
+import logging
 from typing import List
 
 
@@ -14,7 +15,9 @@ def complete_and_grade_interview(interview_type: str, mode: str = "normal", min_
     :return: Overall score of the interview.
     """
     file_path, _ = complete_interview(interview_type, "test", model="gpt-4o-mini", mode=mode)
-    feedback = grade(file_path, model="gpt-4-turbo")
+    feedback = grade(file_path, model="gpt-4o")
+
+    logging.info(f"Interview type: {interview_type}, mode: {mode}, score: {feedback['overall_score']}")
     assert feedback["overall_score"] > min_score
     return feedback["overall_score"]
 
@@ -31,13 +34,13 @@ def test_complete_interview() -> None:
         futures = [executor.submit(complete_and_grade_interview, it) for it in interview_types]
 
         # Test edge cases: empty, gibberish, repeat for one random interview type each
-        # The test are placeholders for not, I will increase thresholds later
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="empty", min_score=0.0))
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="gibberish", min_score=0.0))
-        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="repeat", min_score=0.0))
+        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="empty"))
+        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="gibberish"))
+        futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode="repeat"))
 
         for future in futures:
             score = future.result()
             scores.append(score)
 
-    assert sum(scores) / len(scores) > 0.6
+    logging.info(f"Average score: {sum(scores) / len(scores)}")
+    assert sum(scores) / len(scores) > 0.7
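Removing the explicit min_score=0.0 means the three edge-case runs now rely on complete_and_grade_interview's default threshold, and the overall average must clear 0.7 instead of 0.6. A self-contained sketch of the test's fan-out-and-average pattern with the graded interview stubbed out (the stub's scores, the default min_score, and the interview types are assumptions for illustration):

```python
from concurrent.futures import ThreadPoolExecutor
import logging
import random


def complete_and_grade_interview(interview_type: str, mode: str = "normal", min_score: float = 0.4) -> float:
    # Stub standing in for the real interview-plus-grading round trip.
    score = {"normal": 0.9, "empty": 0.6, "gibberish": 0.6, "repeat": 0.6}[mode]
    assert score > min_score
    return score


def test_complete_interview_sketch() -> None:
    interview_types = ["coding", "ml_design", "system_design"]  # assumed types
    scores = []
    with ThreadPoolExecutor(max_workers=6) as executor:
        futures = [executor.submit(complete_and_grade_interview, it) for it in interview_types]
        # One edge-case run per mode, each against a randomly chosen type.
        for mode in ("empty", "gibberish", "repeat"):
            futures.append(executor.submit(complete_and_grade_interview, random.choice(interview_types), mode=mode))
        for future in futures:
            scores.append(future.result())

    logging.info(f"Average score: {sum(scores) / len(scores)}")
    assert sum(scores) / len(scores) > 0.7
```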