# FilipinosRich's picture
# Ran black
# 5f111f9
import boto3
import os
import json
import pandas as pd
from urllib.parse import urlparse
import random
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain, SequentialChain
llm = ChatOpenAI(temperature=0.0, openai_api_key=os.environ["OPENAI"])
def generate_skills() -> list:
    """Ask the LLM for ten skills needed for a Data Scientist role.

    Returns:
        The skills parsed from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        KeyError: If the reply is a JSON object without a "skills" entry.
    """
    template_generate_skills = """
Can you generate me a list of skills you would need to be successfully employed in a Data Scientist role?
Return 10 skills as a JSON list.
"""

    prompt_generate_skills = ChatPromptTemplate.from_template(
        template=template_generate_skills
    )

    role_skills = LLMChain(
        llm=llm, prompt=prompt_generate_skills, output_key="role_skills"
    )

    generate_skills_chain = SequentialChain(
        chains=[role_skills],
        input_variables=[],
        output_variables=["role_skills"],
        verbose=False,
    )

    result = generate_skills_chain({})
    payload = json.loads(result["role_skills"])

    # The prompt asks for a JSON list, so a bare list is a valid reply and is
    # returned as-is. Any other shape is handled as before: it is expected to
    # be an object wrapping the list under the "skills" key.
    if isinstance(payload, list):
        return payload
    return payload["skills"]
def generate_resume(skills: list) -> str:
    """Generate the resume of a data scientist who has the given skills.

    Args:
        skills: Skills that are interpolated into the prompt and should
            appear in the resume's "skills" section.

    Returns:
        The resume text produced by the model.
    """
    template_generate_resume = """
Given the following list of skills as an array delimited by three backticks, generate a resume of a data scientist with 3 years of experience.
Make sure to include a section "skills" in the resume.
```
{skills}
```
"""

    prompt_generate_resume = ChatPromptTemplate.from_template(
        template=template_generate_resume
    )

    resume = LLMChain(llm=llm, prompt=prompt_generate_resume, output_key="resume")

    generate_resume_chain = SequentialChain(
        chains=[resume],
        input_variables=["skills"],
        output_variables=["resume"],
        verbose=False,
    )

    result = generate_resume_chain({"skills": skills})

    # Calling the chain yields a mapping keyed by variable name, not the text
    # itself. Return only the "resume" entry so the declared ``str`` contract
    # holds and the rest of the mapping is not passed on to callers that embed
    # the resume in a follow-up prompt.
    return result["resume"]
def retrieve_skills(resume: str) -> list:
    """Extract the skills mentioned in a resume by asking the LLM.

    Args:
        resume: Resume text that is interpolated into the prompt.

    Returns:
        The parsed JSON reply. A JSON list is returned as-is; a JSON object
        that wraps the list under a "skills" key is unwrapped to that list.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    template_retrieve_skills = """
Given the following resume delimited by three backticks, retrieve the skills this data scientist possesses.
Return them as a JSON list.
```
{resume}
```
"""

    prompt_retrieve_skills = ChatPromptTemplate.from_template(
        template=template_retrieve_skills
    )

    skills = LLMChain(llm=llm, prompt=prompt_retrieve_skills, output_key="skills")

    retrieve_skills_chain = SequentialChain(
        chains=[skills],
        input_variables=["resume"],
        output_variables=["skills"],
        verbose=False,
    )

    result = retrieve_skills_chain({"resume": resume})
    payload = json.loads(result["skills"])

    # A reply shaped like {"skills": [...]} would otherwise be returned as a
    # dict, and callers iterating it would see its keys instead of the skills.
    if isinstance(payload, dict) and "skills" in payload:
        return payload["skills"]
    return payload
def get_score(true_values: list, predicted_values: list) -> float:
    """Return the fraction of ``true_values`` found in ``predicted_values``.

    The matched values are printed before the score is returned.

    Args:
        true_values: The expected values.
        predicted_values: The values to check against ``true_values``.

    Returns:
        Number of distinct predicted values that occur in ``true_values``
        divided by ``len(true_values)``; ``0.0`` when ``true_values`` is
        empty.
    """
    intersection_list = []
    for value in predicted_values:
        # Count each match once so repeated predictions cannot push the score
        # above 1.0. List membership is used (rather than a set) so that
        # unhashable values are still supported.
        if value in true_values and value not in intersection_list:
            intersection_list.append(value)
    print(intersection_list)

    # Nothing to recall: avoid dividing by zero.
    if not true_values:
        return 0.0
    return len(intersection_list) / len(true_values)
def main() -> None:
    """Run one generate -> resume -> retrieve -> score round and print it.

    Three skills are sampled at random from the generated skills, passed to
    ``generate_resume``, and the resulting value is passed to
    ``retrieve_skills``. The sampled skills, the retrieved skills and the
    score are then printed, in that order.
    """
    candidate_skills = generate_skills()
    expected = random.sample(candidate_skills, 3)
    generated = generate_resume(expected)
    extracted = retrieve_skills(generated)
    recall = get_score(expected, extracted)

    for item in (expected, extracted, recall):
        print(item)


if __name__ == "__main__":
    main()
# def get_resumes() -> str:
# s3 = boto3.client(
# 's3',
# region_name='eu-west-1'
# )
# resumes = s3.get_object(Bucket='ausy-datalake-drift-nonprod', Key='resume-matcher/raw/resume-dataset.csv')
# resumes_list = resumes['Body'].read().decode('utf-8').splitlines()
# resumes_list = resumes['Body'].read().decode('utf-8').splitlines()
# resumes_list = str(resumes_list).replace('. ', '.\n')
# resumes_list = str(resumes_list).replace('•', '\n - ')
# resumes_list = [s.replace('. ', '.\n') for s in resumes_list]
# resumes_list = [s.replace('•', '\n - ') for s in resumes_list]
# resume_string =''.join(resumes_list)
# s3_uri = urlparse("s3://ausy-datalake-drift-nonprod/resume-matcher/raw/resume-dataset.csv", allow_fragments=False).geturl()
# resumes_list = pd.read_csv(s3_uri, header=None, encoding='utf-8')[0].tolist()
# return resumes_list
# def get_skills(resumes: str) -> list:
# template_resumes_get_skills = """
# Given the following string, delimited by <RESUMES> and </RESUMES> which contains resumes which are not properly formatted, categorize the resumes based on domain.
# For each domain list the skills of the resumes that are part of that domain.
# Create a JSON object where the keys are the domains and the values are a list containing the skills.
# Return that JSON object only.
# <RESUMES>
# {resumes}
# </RESUMES>
# """
# prompt_vacancy_get_skills = ChatPromptTemplate.from_template(template=template_resumes_get_skills)
# resume_skills = LLMChain(llm=llm, prompt=prompt_vacancy_get_skills, output_key="resume_skills")
# get_skills_resumes_chain = SequentialChain(
# chains=[resume_skills],
# input_variables=["resumes"],
# output_variables=["resume_skills"],
# verbose=False
# )
# result = get_skills_resumes_chain({"resumes": resumes})
# # print(result)
# resume_skills = json.loads(result['resume_skills'])
# print(resume_skills)
# if __name__ == "__main__":
# resumes = get_resumes()
# print(resumes)
# for x in resumes:
# get_skills(x)