FilipinosRich committed on
Commit
e481d34
·
1 Parent(s): c54f404

First test for 1 data scientist role with a random selection of 3 skills

Browse files
Files changed (3) hide show
  1. Pipfile +2 -0
  2. Pipfile.lock +22 -7
  3. test.py +132 -33
Pipfile CHANGED
@@ -9,6 +9,8 @@ requests = "*"
9
  openai = "*"
10
  langchain = "*"
11
  boto3 = "*"
 
 
12
 
13
  [dev-packages]
14
 
 
9
  openai = "*"
10
  langchain = "*"
11
  boto3 = "*"
12
+ utils = "*"
13
+ s3fs = "*"
14
 
15
  [dev-packages]
16
 
Pipfile.lock CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "_meta": {
3
  "hash": {
4
- "sha256": "773a20e359549c3df8e208ef5e12bb82c398272ee42c4ce709f4783f5989b6b6"
5
  },
6
  "pipfile-spec": 6,
7
  "requires": {
@@ -159,19 +159,19 @@
159
  },
160
  "boto3": {
161
  "hashes": [
162
- "sha256:01f078047eb4d238c6b9c6cc623f2af33b4ae67980c5326691e35cb5493ff6c7",
163
- "sha256:4cc0c6005be910e52077227e670930ab55a41ba86cdb6d1c052571d08cd4d32c"
164
  ],
165
  "index": "pypi",
166
- "version": "==1.28.9"
167
  },
168
  "botocore": {
169
  "hashes": [
170
- "sha256:bd849d3ac95f1781385ed831d753a04a3ec870a59d6598175aaedd71dc2baf5f",
171
- "sha256:e56ccd3536a90094ea5b176b5dd33bfe4f049efdf71af468ea1661bd424c787d"
172
  ],
173
  "markers": "python_version >= '3.7'",
174
- "version": "==1.31.9"
175
  },
176
  "certifi": {
177
  "hashes": [
@@ -1360,6 +1360,14 @@
1360
  "markers": "python_version >= '3.8'",
1361
  "version": "==0.9.2"
1362
  },
 
 
 
 
 
 
 
 
1363
  "s3transfer": {
1364
  "hashes": [
1365
  "sha256:3c0da2d074bf35d6870ef157158641178a4204a6e689e82546083e31e0311346",
@@ -1510,6 +1518,13 @@
1510
  "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
1511
  "version": "==1.26.16"
1512
  },
 
 
 
 
 
 
 
1513
  "uvicorn": {
1514
  "hashes": [
1515
  "sha256:1d55d46b83ee4ce82b4e82f621f2050adb3eb7b5481c13f9af1744951cae2f1f",
 
1
  {
2
  "_meta": {
3
  "hash": {
4
+ "sha256": "baada24f686970aa797e81ab57198eff4adaa42274331ba31f06e107dacc04af"
5
  },
6
  "pipfile-spec": 6,
7
  "requires": {
 
159
  },
160
  "boto3": {
161
  "hashes": [
162
+ "sha256:67001b3f512cbe2e00e352c65fb443b504e5e388fee39d73bcc42da1ae87d9e3",
163
+ "sha256:cb8af03f553f1c7db7137bc897785baeeaa97b8fde483eb1cdb1f1ef3cec9cb7"
164
  ],
165
  "index": "pypi",
166
+ "version": "==1.28.10"
167
  },
168
  "botocore": {
169
  "hashes": [
170
+ "sha256:736a9412f405d6985570c4a87b533c2396dd8d4042d8c7a0ca14e73d4f1bcf9d",
171
+ "sha256:a3bfd3627a490faedf37d79373d6957936d7720888ca85466e0471cb921e4557"
172
  ],
173
  "markers": "python_version >= '3.7'",
174
+ "version": "==1.31.10"
175
  },
176
  "certifi": {
177
  "hashes": [
 
1360
  "markers": "python_version >= '3.8'",
1361
  "version": "==0.9.2"
1362
  },
1363
+ "s3fs": {
1364
+ "hashes": [
1365
+ "sha256:2ca5de8dc18ad7ad350c0bd01aef0406aa5d0fff78a561f0f710f9d9858abdd0",
1366
+ "sha256:91c1dfb45e5217bd441a7a560946fe865ced6225ff7eb0fb459fe6e601a95ed3"
1367
+ ],
1368
+ "index": "pypi",
1369
+ "version": "==0.4.2"
1370
+ },
1371
  "s3transfer": {
1372
  "hashes": [
1373
  "sha256:3c0da2d074bf35d6870ef157158641178a4204a6e689e82546083e31e0311346",
 
1518
  "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
1519
  "version": "==1.26.16"
1520
  },
1521
+ "utils": {
1522
+ "hashes": [
1523
+ "sha256:ff04549b95deb2fd7a82ddaea077e2f41209079e2482df231c358770b27b72a7"
1524
+ ],
1525
+ "index": "pypi",
1526
+ "version": "==1.0.1"
1527
+ },
1528
  "uvicorn": {
1529
  "hashes": [
1530
  "sha256:1d55d46b83ee4ce82b4e82f621f2050adb3eb7b5481c13f9af1744951cae2f1f",
test.py CHANGED
@@ -1,6 +1,9 @@
1
  import boto3
2
  import os
3
  import json
 
 
 
4
 
5
  from langchain.chat_models import ChatOpenAI
6
  from langchain.prompts import ChatPromptTemplate
@@ -8,53 +11,149 @@ from langchain.chains import LLMChain, SequentialChain
8
 
9
  llm = ChatOpenAI(temperature=0.0, openai_api_key=os.environ["OPENAI"])
10
 
11
- def get_resume_string() -> str:
12
 
13
- s3 = boto3.client(
14
- 's3',
15
- region_name='eu-west-1'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  )
17
 
18
- resumes = s3.get_object(Bucket='ausy-datalake-drift-nonprod', Key='resume-matcher/raw/resume-dataset.csv')
 
 
19
 
20
- resumes_list = resumes['Body'].read().decode('utf-8').splitlines()
21
- resumes_list = [s.replace('. ', '.\n') for s in resumes_list]
22
- resumes_list = [s.replace('•', '\n - ') for s in resumes_list]
23
- # resume_string =''.join(resumes_list)
24
 
25
- return resumes_list
 
 
26
 
27
- def get_skills(resumes: str) -> list:
 
 
 
28
 
29
- template_resumes_get_skills = """
30
- Given the following string, delimited by <RESUMES> and </RESUMES> which contains resumes which are not properly formatted, categorize the resumes based on domain.
31
- For each domain list the skills of the resumes that are part of that domain.
32
-
33
- Create a JSON object where they keys are the domains and the values are a list containing the skills.
 
 
 
 
34
 
35
- Return that JSON object only.
36
 
37
- <RESUMES>
38
- {resumes}
39
- </RESUMES>
 
 
 
 
 
 
 
 
40
  """
41
 
42
- prompt_vacancy_get_skills = ChatPromptTemplate.from_template(template=template_resumes_get_skills)
43
- resume_skills = LLMChain(llm=llm, prompt=prompt_vacancy_get_skills, output_key="resume_skills")
44
 
45
- get_skills_resumes_chain = SequentialChain(
46
- chains=[resume_skills],
47
- input_variables=["resumes"],
48
- output_variables=["resume_skills"],
49
  verbose=False
50
  )
51
 
52
- result = get_skills_resumes_chain({"resumes": resumes})
53
- # print(result)
54
- resume_skills = json.loads(result['resume_skills'])
55
- print(resume_skills)
 
 
 
 
 
56
 
57
  if __name__ == "__main__":
58
- resumes = get_resume_string()
59
- for x in resumes:
60
- get_skills(x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import boto3
2
  import os
3
  import json
4
+ import pandas as pd
5
+ from urllib.parse import urlparse
6
+ import random
7
 
8
  from langchain.chat_models import ChatOpenAI
9
  from langchain.prompts import ChatPromptTemplate
 
11
 
12
  llm = ChatOpenAI(temperature=0.0, openai_api_key=os.environ["OPENAI"])
13
 
 
14
 
15
+
16
+
17
def generate_skills() -> list:
    """Ask the chat model for the skills needed in a Data Scientist role.

    The prompt requests ten skills as a JSON list. The reply is decoded with
    ``json.loads``; a bare JSON array is accepted, as is a JSON object that
    wraps the array under a ``"skills"`` key.

    Returns:
        The list of skills decoded from the model's reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        ValueError: If the decoded reply is neither a list nor an object
            holding a list under ``"skills"``.
    """
    template_generate_skills = """
    Can you generate me a list of skills you would need to be successfully employed in a Data Scientist role?
    Return 10 skills as a JSON list.
    """

    prompt_generate_skills = ChatPromptTemplate.from_template(template=template_generate_skills)
    role_skills = LLMChain(llm=llm, prompt=prompt_generate_skills, output_key="role_skills")

    generate_skills_chain = SequentialChain(
        chains=[role_skills],
        input_variables=[],
        output_variables=["role_skills"],
        verbose=False
    )

    result = generate_skills_chain({})
    parsed = json.loads(result["role_skills"])

    # The prompt asks for a JSON list, so a bare array must work; the
    # {"skills": [...]} wrapper is still unwrapped when the model adds one.
    if isinstance(parsed, dict):
        parsed = parsed.get("skills")
    if not isinstance(parsed, list):
        raise ValueError(
            f"Expected a JSON list of skills, got: {result['role_skills']!r}"
        )
    return parsed
37
 
38
def generate_resume(skills: list) -> str:
    """Generate the resume of a data scientist who has the given skills.

    Args:
        skills: Skills to be inserted into the prompt; the model is asked to
            write a resume (3 years of experience) with a "skills" section.

    Returns:
        The resume text produced by the model.
    """
    template_generate_resume = """
    Given the following list of skills as an array delimited by three backticks, generate a resume of a data scientist with 3 years of experience.
    Make sure to include a section "skills" in the resume.

    ```
    {skills}
    ```
    """

    prompt_generate_resume = ChatPromptTemplate.from_template(template=template_generate_resume)
    resume = LLMChain(llm=llm, prompt=prompt_generate_resume, output_key="resume")

    generate_resume_chain = SequentialChain(
        chains=[resume],
        input_variables=["skills"],
        output_variables=["resume"],
        verbose=False
    )

    result = generate_resume_chain({"skills": skills})

    # Calling the chain yields a mapping that holds the inputs as well as the
    # outputs. Returning it whole would hand the original skill list to
    # whoever consumes the "resume", so only the generated text is returned.
    return result["resume"]
62
+
63
def retrieve_skills(resume: str) -> list:
    """Extract the skills listed in a resume by asking the chat model.

    Args:
        resume: Resume text to be inserted into the prompt.

    Returns:
        The list of skills decoded from the model's JSON reply. A bare JSON
        array is accepted, as is an object wrapping it under ``"skills"``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        ValueError: If the decoded reply is neither a list nor an object
            holding a list under ``"skills"``.
    """
    template_retrieve_skills = """
    Given the following resume delimited by three backticks, retrieve the skills this data scientist possesses.
    Return them as a JSON list.

    ```
    {resume}
    ```
    """

    prompt_retrieve_skills = ChatPromptTemplate.from_template(template=template_retrieve_skills)
    skills = LLMChain(llm=llm, prompt=prompt_retrieve_skills, output_key="skills")

    retrieve_skills_chain = SequentialChain(
        chains=[skills],
        input_variables=["resume"],
        output_variables=["skills"],
        verbose=False
    )

    result = retrieve_skills_chain({"resume": resume})
    parsed = json.loads(result["skills"])

    # Tolerate a {"skills": [...]} wrapper around the requested JSON list.
    if isinstance(parsed, dict):
        parsed = parsed.get("skills")
    if not isinstance(parsed, list):
        raise ValueError(
            f"Expected a JSON list of skills, got: {result['skills']!r}"
        )
    return parsed
88
+
89
def get_score(true_values: list, predicted_values: list) -> float:
    """Return the fraction of ``true_values`` that occur in ``predicted_values``.

    Matching is by equality (``in``), so it is case- and whitespace-sensitive.
    The matched values are printed before the score is returned.

    Args:
        true_values: The expected values.
        predicted_values: The values to check against the expected ones.

    Returns:
        A score between 0.0 and 1.0; 0.0 when ``true_values`` is empty.
    """
    # Nothing to recover: avoid dividing by zero.
    if not true_values:
        return 0.0

    # Iterate over the expected values so that repeated predictions cannot
    # be counted more than once and inflate the score above 1.0.
    matched = [value for value in true_values if value in predicted_values]
    print(matched)
    return len(matched) / len(true_values)
93
 
94
if __name__ == "__main__":
    # End-to-end check: generate role skills, build a resume from a random
    # subset of them, extract the skills back and score the overlap.
    role_skills = generate_skills()
    if not role_skills:
        raise SystemExit("No skills were generated; cannot build a test resume.")

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample at the number of skills the model returned.
    random_skills = random.sample(role_skills, min(3, len(role_skills)))
    resume = generate_resume(random_skills)
    skills = retrieve_skills(resume)
    score = get_score(random_skills, skills)
    print(random_skills)
    print(skills)
    print(score)
103
+
104
+ # def get_resumes() -> str:
105
+
106
+ # s3 = boto3.client(
107
+ # 's3',
108
+ # region_name='eu-west-1'
109
+ # )
110
+
111
+ # resumes = s3.get_object(Bucket='ausy-datalake-drift-nonprod', Key='resume-matcher/raw/resume-dataset.csv')
112
+
113
+ # resumes_list = resumes['Body'].read().decode('utf-8').splitlines()
114
+ # resumes_list = resumes['Body'].read().decode('utf-8').splitlines()
115
+ # resumes_list = str(resumes_list).replace('. ', '.\n')
116
+ # resumes_list = str(resumes_list).replace('•', '\n - ')
117
+ # resumes_list = [s.replace('. ', '.\n') for s in resumes_list]
118
+ # resumes_list = [s.replace('•', '\n - ') for s in resumes_list]
119
+ # resume_string =''.join(resumes_list)
120
+ # s3_uri = urlparse("s3://ausy-datalake-drift-nonprod/resume-matcher/raw/resume-dataset.csv", allow_fragments=False).geturl()
121
+ # resumes_list = pd.read_csv(s3_uri, header=None, encoding='utf-8')[0].tolist()
122
+
123
+ # return resumes_list
124
+
125
+ # def get_skills(resumes: str) -> list:
126
+
127
+ # template_resumes_get_skills = """
128
+ # Given the following string, delimited by <RESUMES> and </RESUMES> which contains resumes which are not properly formatted, categorize the resumes based on domain.
129
+ # For each domain list the skills of the resumes that are part of that domain.
130
+
131
+ # Create a JSON object where the keys are the domains and the values are a list containing the skills.
132
+
133
+ # Return that JSON object only.
134
+
135
+ # <RESUMES>
136
+ # {resumes}
137
+ # </RESUMES>
138
+ # """
139
+
140
+ # prompt_vacancy_get_skills = ChatPromptTemplate.from_template(template=template_resumes_get_skills)
141
+ # resume_skills = LLMChain(llm=llm, prompt=prompt_vacancy_get_skills, output_key="resume_skills")
142
+
143
+ # get_skills_resumes_chain = SequentialChain(
144
+ # chains=[resume_skills],
145
+ # input_variables=["resumes"],
146
+ # output_variables=["resume_skills"],
147
+ # verbose=False
148
+ # )
149
+
150
+ # result = get_skills_resumes_chain({"resumes": resumes})
151
+ # # print(result)
152
+ # resume_skills = json.loads(result['resume_skills'])
153
+ # print(resume_skills)
154
+
155
+ # if __name__ == "__main__":
156
+ # resumes = get_resumes()
157
+ # print(resumes)
158
+ # for x in resumes:
159
+ # get_skills(x)