Upload 6 files
adding required file
- Readme.md +49 -0
- requirements.txt +3 -0
- src/data_preparation.py +5 -0
- src/evaluation.py +25 -0
- src/model.py +81 -0
- training_data.jsonl +0 -0
Readme.md
ADDED
@@ -0,0 +1,49 @@
# SelectRight

## Overview
This project ranks candidates for a role by comparing their resumes and interview transcripts using a language model.

## Folder Structure
```
MLE_Trial_Task/
├── data/
│   └── candidates.csv (optional, can be uploaded via the app)
├── core_services/
│   └── bot9_ai/
│       └── modules/
│           └── LLM/
│               └── OpenAi.py
├── src/
│   ├── __init__.py
│   ├── data_preparation.py
│   ├── model.py
│   ├── evaluation.py
│   ├── bias_analysis.py
│   └── report_generation.py
├── app.py
├── requirements.txt
└── README.md
```

## Setup
1. Clone the repository.
2. Install the required dependencies:
   ```bash
   pip install -r requirements.txt
   ```
3. Run the Streamlit app:
   ```bash
   streamlit run app.py
   ```

## Files
- `data/candidates.csv`: The dataset file (optional, can be uploaded via the app).
- `core_services/bot9_ai/modules/LLM/OpenAi.py`: Contains the `OpenAi` class.
- `src/data_preparation.py`: Script for loading the dataset.
- `src/model.py`: Script for defining the model.
- `src/evaluation.py`: Script for evaluating the model.
- `src/bias_analysis.py`: Script for analyzing biases.
- `src/report_generation.py`: Script for generating the report.
- `app.py`: Streamlit app script (not among the files in this commit; a hedged sketch follows this README).
- `requirements.txt`: List of dependencies.
- `README.md`: Project overview and setup instructions.
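`app.py` is referenced throughout the README but is not among the six files in this commit. Purely as an assumption about how the pieces above might be wired together (the widget labels and default model name are placeholders; bias analysis and report generation are omitted), a minimal Streamlit sketch could look like:

```python
# app.py -- hypothetical sketch, not part of this commit
import streamlit as st

from src.data_preparation import load_data
from src.evaluation import evaluate_model
from src.model import initialize_openai

st.title("SelectRight")

api_key = st.text_input("OpenAI API key", type="password")
uploaded = st.file_uploader("Upload candidates.csv", type="csv")
model_name = st.text_input("Model", value="gpt-4o-2024-08-06")

if api_key and uploaded and st.button("Evaluate"):
    initialize_openai(api_key)
    data = load_data(uploaded)                     # file-like object from the uploader
    accuracy = evaluate_model(data, model_name)    # pairwise accuracy over the dataset
    st.metric("Pairwise accuracy", f"{accuracy:.2%}")
```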
requirements.txt
ADDED
@@ -0,0 +1,3 @@
```
pandas
openai
streamlit
```
src/data_preparation.py
ADDED
@@ -0,0 +1,5 @@
```python
import pandas as pd

def load_data(file):
    # Accepts either a local path or a file-like object (e.g. a Streamlit upload).
    data = pd.read_csv(file)
    return data
```
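`load_data` is a thin wrapper around `pandas.read_csv`. The rest of the code expects the resulting DataFrame to contain the pairwise-comparison columns referenced in `src/evaluation.py` and `src/model.py`; a hypothetical sanity check (the local path is an assumption, since the app can also supply an uploaded file object):

```python
from src.data_preparation import load_data

# Column names taken from src/evaluation.py and src/model.py.
expected_columns = {
    "role", "winnerId",
    "candidateAId", "candidateBId",
    "candidateAResume", "candidateBResume",
    "candidateATranscript", "candidateBTranscript",
}

data = load_data("data/candidates.csv")  # hypothetical path
missing = expected_columns - set(data.columns)
assert not missing, f"dataset is missing columns: {missing}"
```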
src/evaluation.py
ADDED
@@ -0,0 +1,25 @@
```python
from src.model import compare_candidates

def evaluate_model(data, model):
    correct_predictions = 0

    for index, row in data.iterrows():
        # Include each candidate's ID so that compare_candidates, which is asked
        # to return a candidate ID, actually sees the IDs in its prompt.
        candidateA = {
            'id': row['candidateAId'],
            'resume': row['candidateAResume'],
            'transcript': row['candidateATranscript']
        }
        candidateB = {
            'id': row['candidateBId'],
            'resume': row['candidateBResume'],
            'transcript': row['candidateBTranscript']
        }
        role = row['role']

        # compare_candidates (src/model.py) returns the predicted winner's candidate ID.
        prediction = compare_candidates(candidateA, candidateB, role, model)

        # Count the prediction as correct when the ground-truth winner's ID
        # appears in the model's answer.
        if prediction and row['winnerId'] in prediction:
            correct_predictions += 1

    accuracy = correct_predictions / len(data)
    return accuracy
```
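A minimal way to run this evaluation from a script, assuming a local `data/candidates.csv` and an API key set via `initialize_openai` (both assumptions; neither a runner script nor the dataset path is fixed by this upload):

```python
from src.data_preparation import load_data
from src.evaluation import evaluate_model
from src.model import initialize_openai

initialize_openai("sk-...")  # replace with a real API key

data = load_data("data/candidates.csv")               # hypothetical path
accuracy = evaluate_model(data, "gpt-4o-2024-08-06")  # base or fine-tuned model name
print(f"Pairwise accuracy: {accuracy:.2%}")
```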
src/model.py
ADDED
@@ -0,0 +1,81 @@
```python
import openai
import json

def initialize_openai(api_key):
    # Configure the module-level OpenAI client.
    openai.api_key = api_key

def prepare_training_data(training_data):
    training_prompts = []
    for index, row in training_data.iterrows():
        job_description = row['role']
        candidateA_resume = row['candidateAResume']
        candidateB_resume = row['candidateBResume']
        candidateA_transcript = row['candidateATranscript']
        candidateB_transcript = row['candidateBTranscript']
        winner_id = row['winnerId']

        prompt = f"Job Description:\n{job_description}\n\nCandidate A Resume:\n{candidateA_resume}\n\nCandidate B Resume:\n{candidateB_resume}\n\nCandidate A Transcript:\n{candidateA_transcript}\n\nCandidate B Transcript:\n{candidateB_transcript}\n\nPreferred Candidate:"
        completion = f"{winner_id}"

        # Fine-tuning chat models such as gpt-4o requires chat-formatted examples
        # ({"messages": [...]}); the legacy prompt/completion format is only accepted
        # for completion models, so each pair is written as a user/assistant exchange.
        training_prompts.append({
            "messages": [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": completion}
            ]
        })

    with open("training_data.jsonl", "w") as f:
        for item in training_prompts:
            f.write(json.dumps(item) + "\n")

def upload_training_data(file_path):
    with open(file_path, "rb") as f:
        response = openai.files.create(
            file=f,
            purpose='fine-tune'
        )
    print("response-upload--->", response)
    return response.id

def create_fine_tuning_job(file_id):
    response = openai.fine_tuning.jobs.create(
        training_file=file_id,
        model="gpt-4o-2024-08-06",
    )
    print("response-create--->", response)
    # Note: fine-tuning runs asynchronously, so fine_tuned_model is None until the
    # job succeeds; poll openai.fine_tuning.jobs.retrieve(response.id) to obtain
    # the final model name (see the sketch after this file).
    return response.fine_tuned_model

def fine_tune_model(training_data):
    # Prepare training data
    prepare_training_data(training_data)

    # Upload training data
    file_id = upload_training_data("training_data.jsonl")

    # Create fine-tuning job
    fine_tuned_model = create_fine_tuning_job(file_id)

    return fine_tuned_model

def extract_keywords(resume, job_description, model):
    prompt = f"Extract key skills and qualifications from the following resume based on the job description:\n\nJob Description:\n{job_description}\n\nResume:\n{resume}\n\nKey Skills and Qualifications:"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    response = openai.chat.completions.create(model=model, messages=messages, max_tokens=100)
    return response.choices[0].message.content

def rate_skills(transcript, job_description, model):
    prompt = f"Rate the skills of the candidate based on the following interview transcript and job description:\n\nJob Description:\n{job_description}\n\nInterview Transcript:\n{transcript}\n\nSkill Ratings:"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    response = openai.chat.completions.create(model=model, messages=messages, max_tokens=100)
    return response.choices[0].message.content

def compare_candidates(candidateA, candidateB, job_description, model):
    prompt = f"Based on the following details, return the candidate_id of the candidate which is the best fit for the role:\n\nJob Description:\n{job_description}\n\nCandidate A:\n{candidateA}\n\nCandidate B:\n{candidateB}\n\nPreferred Candidate:, ONLY RETURN THE CANDIDATE ID which would be of format '8ab47434-09a9-44e6-8c77-f9fd20c57765'"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
        # Trailing system message nudging the model to answer with just the ID.
        {"role": "system", "content": "<candidate_id>"}
    ]
    response = openai.chat.completions.create(model=model, messages=messages, max_tokens=100)
    return response.choices[0].message.content
```
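One caveat flagged in the comments above: a fine-tuning job runs asynchronously, so `response.fine_tuned_model` is still `None` right after `openai.fine_tuning.jobs.create(...)` returns. A hedged sketch of a helper that waits for the job and returns the resulting model name (the helper name and polling interval are arbitrary; it needs the job id from the create response rather than the current return value):

```python
import time
import openai

def wait_for_fine_tune(job_id, poll_seconds=30):
    """Poll an OpenAI fine-tuning job until it finishes; return the fine-tuned model name."""
    while True:
        job = openai.fine_tuning.jobs.retrieve(job_id)
        if job.status == "succeeded":
            return job.fine_tuned_model
        if job.status in ("failed", "cancelled"):
            raise RuntimeError(f"fine-tuning job {job_id} ended with status {job.status}")
        time.sleep(poll_seconds)

# Hypothetical usage, capturing the job id from the create() call:
# job = openai.fine_tuning.jobs.create(training_file=file_id, model="gpt-4o-2024-08-06")
# model_name = wait_for_fine_tune(job.id)
```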
training_data.jsonl
ADDED
The diff for this file is too large to render.
See raw diff