"""Round-trip sanity check for LLM-based skill extraction.

The script asks the chat model for a list of Data Scientist skills, samples
three of them, has the model write a resume around that sample, asks the model
to extract the skills back out of the resume, and finally scores how many of
the sampled skills were recovered.
"""

import json
import os
import random
from urllib.parse import urlparse

import boto3
import pandas as pd
from langchain.chains import LLMChain, SequentialChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# NOTE: boto3, pandas and urlparse are only referenced by the commented-out
# S3 experiment at the bottom of the visible file; they are kept on purpose.

# Shared chat model used by every chain below. The API key is read from the
# "OPENAI" environment variable at import time (KeyError if it is not set).
llm = ChatOpenAI(temperature=0.0, openai_api_key=os.environ["OPENAI"])


def _run_prompt(template: str, output_key: str, inputs: dict) -> str:
    """Run one prompt through the shared model and return its text output.

    Args:
        template: Prompt template; placeholders must match the keys of
            ``inputs``.
        output_key: Key under which the chain stores the model's reply.
        inputs: Values for the template placeholders (may be empty).

    Returns:
        The value stored under ``output_key`` in the chain result.
    """
    prompt = ChatPromptTemplate.from_template(template=template)
    step = LLMChain(llm=llm, prompt=prompt, output_key=output_key)
    chain = SequentialChain(
        chains=[step],
        input_variables=list(inputs),
        output_variables=[output_key],
        verbose=False,
    )
    # Calling the chain yields a dict; only the requested output is returned
    # so callers never see the surrounding bookkeeping.
    return chain(inputs)[output_key]


def _parse_skill_list(raw: str) -> list:
    """Parse a model reply that should contain a JSON list of skills.

    The prompts ask for "a JSON list", but the reply may be either a bare
    list or an object wrapping the list under a "skills" key, so both shapes
    are accepted.

    Raises:
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or does not contain a list.
    """
    parsed = json.loads(raw)
    if isinstance(parsed, dict) and "skills" in parsed:
        parsed = parsed["skills"]
    if not isinstance(parsed, list):
        raise ValueError(f"expected a JSON list of skills, got: {raw!r}")
    return parsed


def generate_skills() -> list:
    """Ask the model for skills needed in a Data Scientist role.

    Returns:
        The skills parsed from the model's JSON reply.

    Raises:
        ValueError: If the reply cannot be parsed into a list.
    """
    template_generate_skills = """
    Can you generate me a list of skills you would need to be successfully employed in a Data Scientist role?
    Return 10 skills as a JSON list.
    """
    reply = _run_prompt(template_generate_skills, "role_skills", {})
    return _parse_skill_list(reply)


def generate_resume(skills: list) -> str:
    """Ask the model to write a data scientist resume featuring ``skills``.

    Args:
        skills: Skills to embed in the resume; they are interpolated into the
            prompt as-is.

    Returns:
        The generated resume text (only the model's reply, not the whole
        chain result).
    """
    template_generate_resume = """
    Given the following list of skills as an array delimited by three backticks, generate a resume of a data scientist with 3 years of experience.
    Make sure to include a section "skills" in the resume.
    ```
    {skills}
    ```
    """
    return _run_prompt(template_generate_resume, "resume", {"skills": skills})


def retrieve_skills(resume: str) -> list:
    """Ask the model to extract the skills listed in ``resume``.

    Args:
        resume: Resume text to analyse.

    Returns:
        The skills parsed from the model's JSON reply.

    Raises:
        ValueError: If the reply cannot be parsed into a list.
    """
    template_retrieve_skills = """
    Given the following resume delimited by three backticks, retrieve the skills this data scientist possesses.
    Return them as a JSON list.
    ```
    {resume}
    ```
    """
    reply = _run_prompt(template_retrieve_skills, "skills", {"resume": resume})
    return _parse_skill_list(reply)


def get_score(true_values: list, predicted_values: list) -> float:
    """Return the fraction of ``true_values`` found in ``predicted_values``.

    Matching is exact (``==``). Each matching value is counted once, so
    repeated predictions cannot push the score above 1.0. The matched values
    are printed for inspection.

    Args:
        true_values: The expected values.
        predicted_values: The values to check against ``true_values``.

    Returns:
        Number of distinct matches divided by ``len(true_values)``, or 0.0
        when ``true_values`` is empty.
    """
    if not true_values:
        # Nothing to recover; avoid dividing by zero.
        return 0.0

    intersection_list = []
    for value in predicted_values:
        # List membership (not a set) so unhashable values still work.
        if value in true_values and value not in intersection_list:
            intersection_list.append(value)

    print(intersection_list)
    return len(intersection_list) / len(true_values)


if __name__ == "__main__":
    role_skills = generate_skills()
    # Sample three skills; random.sample raises ValueError if fewer exist.
    random_skills = random.sample(role_skills, 3)
    resume = generate_resume(random_skills)
    skills = retrieve_skills(resume)
    score = get_score(random_skills, skills)

    print(random_skills)
    print(skills)
    print(score)


# ---------------------------------------------------------------------------
# Disabled experiment: categorise resumes from an S3 dataset by domain.
# Kept for reference; nothing below is executed.
# ---------------------------------------------------------------------------
# def get_resumes() -> str:
#     s3 = boto3.client(
#         's3',
#         region_name='eu-west-1'
#     )
#     resumes = s3.get_object(Bucket='ausy-datalake-drift-nonprod', Key='resume-matcher/raw/resume-dataset.csv')
#     resumes_list = resumes['Body'].read().decode('utf-8').splitlines()
#     resumes_list = resumes['Body'].read().decode('utf-8').splitlines()
#     resumes_list = str(resumes_list).replace('. ', '.\n')
#     resumes_list = str(resumes_list).replace('•', '\n - ')
#     resumes_list = [s.replace('. ', '.\n') for s in resumes_list]
#     resumes_list = [s.replace('•', '\n - ') for s in resumes_list]
#     resume_string =''.join(resumes_list)
#     s3_uri = urlparse("s3://ausy-datalake-drift-nonprod/resume-matcher/raw/resume-dataset.csv", allow_fragments=False).geturl()
#     resumes_list = pd.read_csv(s3_uri, header=None, encoding='utf-8')[0].tolist()
#     return resumes_list

# def get_skills(resumes: str) -> list:
#     template_resumes_get_skills = """
#         Given the following string, delimited by and which contains resumes which are not properly formatted, categorize the resumes based on domain.
#         For each domain list the skills of the resumes that are part of that domain.
#         Create a JSON object where they keys are the domains and the values are a list containing the skills.
#         Return that JSON object only.
#
#         {resumes}
#
#     """
#     prompt_vacancy_get_skills = ChatPromptTemplate.from_template(template=template_resumes_get_skills)
#     resume_skills = LLMChain(llm=llm, prompt=prompt_vacancy_get_skills, output_key="resume_skills")
#     get_skills_resumes_chain = SequentialChain(
#         chains=[resume_skills],
#         input_variables=["resumes"],
#         output_variables=["resume_skills"],
#         verbose=False
#     )
#     result = get_skills_resumes_chain({"resumes": resumes})
#     # print(result)
#     resume_skills = json.loads(result['resume_skills'])
#     print(resume_skills)

# if __name__ == "__main__":
#     resumes = get_resumes()
#     print(resumes)
#     for x in resumes:
#         get_skills(x)