celise88 committed on
Commit
fc51d61
1 Parent(s): 6edb646

add ONET webscrape functionality for importance ratings

Browse files
Files changed (3) hide show
  1. main.py +2 -0
  2. requirements.txt +3 -1
  3. scrape_onet.py +147 -0
main.py CHANGED
@@ -14,6 +14,7 @@ from fastapi.responses import HTMLResponse
14
  import pandas as pd
15
  import time
16
  from uuid import uuid1
 
17
  from localStoragePy import localStoragePy
18
  localStorage = localStoragePy('pathfinder', 'text')
19
 
@@ -23,6 +24,7 @@ from user_utils import Hash
23
 
24
  # APP SETUP
25
  app = FastAPI()
 
26
  app.mount("/static", StaticFiles(directory='static'), name="static")
27
  templates = Jinja2Templates(directory="templates/")
28
 
 
14
  import pandas as pd
15
  import time
16
  from uuid import uuid1
17
+ from mangum import Mangum
18
  from localStoragePy import localStoragePy
19
  localStorage = localStoragePy('pathfinder', 'text')
20
 
 
24
 
25
  # APP SETUP
26
  app = FastAPI()
27
+ handler = Mangum(app)
28
  app.mount("/static", StaticFiles(directory='static'), name="static")
29
  templates = Jinja2Templates(directory="templates/")
30
 
requirements.txt CHANGED
@@ -19,4 +19,6 @@ accelerate==0.16.0
19
  plotly-express==0.4.1
20
  bcrypt==4.0.1
21
  passlib==1.7.4
22
- localStoragePy==0.2.3
 
 
 
19
  plotly-express==0.4.1
20
  bcrypt==4.0.1
21
  passlib==1.7.4
22
+ localStoragePy==0.2.3
23
+ sentence-transformers==2.2.2
24
+ mangum==0.17.0
scrape_onet.py CHANGED
@@ -36,6 +36,153 @@ def get_onet_tasks(onetCode):
36
  tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
37
  return tasks
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def get_job_postings(onetCode, state):
40
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
41
  url = "https://www.onetonline.org/link/localjobs/" + onetCode + "?st=" + state
 
36
  tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
37
  return tasks
38
 
39
+ def get_onet_ratings(onetCode):
40
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
41
+
42
+ activities_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=wa&n_wa=0&s_wa=IM&c_wa=0"
43
+ context_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=cx&n_cx=0&c_cx=0&s_cx=n"
44
+
45
+ response = requests.get(activities_url, headers=headers)
46
+ soup = BeautifulSoup(response.text, 'html.parser')
47
+ tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
48
+ tasks = clean(tasks)
49
+ tasks = tasks.split('show all show top 10')[1]
50
+ tasks = tasks.split('back to top')[0]
51
+ tasks = remove_new_line(tasks).replace("related occupations", " ").replace("importance work activity", " ")
52
+ tasks = tasks.split(". ")
53
+ split_data = [item.split(" -- ")[0] for item in tasks]
54
+ num_desc = []
55
+ for i in range(len(tasks)):
56
+ temp = [','.join(item) for item in split_data][i].split(',')
57
+ num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(' ) ', '')])
58
+ df = pd.DataFrame(num_desc, columns = ['Importance', 'Work Characteristic'])
59
+ df = df[df['Importance'] != '']
60
+
61
+ response = requests.get(context_url, headers=headers)
62
+ soup = BeautifulSoup(response.text, 'html.parser')
63
+ tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
64
+ tasks = clean(tasks)
65
+ tasks = tasks.split('show all show top 10')[1]
66
+ tasks = tasks.split('back to top')[0]
67
+ tasks = remove_new_line(tasks).replace("related occupations", " ").replace("importance work activity", " ")
68
+ tasks = tasks.split("? ")
69
+ split_data = [item.split(" -- ")[0] for item in tasks]
70
+ num_desc = []
71
+ for i in range(len(tasks)):
72
+ temp = [','.join(item) for item in split_data][i].split(',')
73
+ num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
74
+ df2 = pd.DataFrame(num_desc, columns = ['Importance', 'Work Characteristic'])
75
+ df2 = df2[df2['Importance'] != '']
76
+
77
+ job_df = pd.concat([df, df2], axis = 0)
78
+
79
+ skills_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=sk&n_sk=0&s_sk=IM&c_sk=0"
80
+ knowledge_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=kn&n_kn=0&s_kn=IM&c_kn=0"
81
+ abilities_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ab&n_ab=0&s_ab=IM&c_ab=0"
82
+ interests_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=in&c_in=0"
83
+ values_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=wv&c_wv=0"
84
+ style_url = "https://www.onetonline.org/link/result/" + onetCode + "?c=ws&n_ws=0&c_ws=0"
85
+
86
+ response = requests.get(skills_url, headers=headers)
87
+ soup = BeautifulSoup(response.text, 'html.parser')
88
+ tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
89
+ tasks = clean(tasks)
90
+ tasks = tasks.split('show all show top 10')[1]
91
+ tasks = tasks.split('back to top')[0]
92
+ tasks = remove_new_line(tasks).replace("related occupations", " ").replace(")importance skill", " ")
93
+ tasks = tasks.split(". ")
94
+ split_data = [item.split(" -- ")[0] for item in tasks]
95
+ num_desc = []
96
+ for i in range(len(tasks)):
97
+ temp = [','.join(item) for item in split_data][i].split(',')
98
+ num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
99
+ df3 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
100
+ df3 = df3[df3['Importance'] != '']
101
+
102
+ response = requests.get(knowledge_url, headers=headers)
103
+ soup = BeautifulSoup(response.text, 'html.parser')
104
+ tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
105
+ tasks = clean(tasks)
106
+ tasks = tasks.split('show all show top 10')[1]
107
+ tasks = tasks.split('back to top')[0]
108
+ tasks = remove_new_line(tasks).replace("related occupations", " ").replace(")importance knowledge", " ")
109
+ tasks = tasks.split(". ")
110
+ split_data = [item.split(" -- ")[0] for item in tasks]
111
+ num_desc = []
112
+ for i in range(len(tasks)):
113
+ temp = [','.join(item) for item in split_data][i].split(',')
114
+ num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
115
+ df4 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
116
+ df4 = df4[df4['Importance'] != '']
117
+
118
+ response = requests.get(abilities_url, headers=headers)
119
+ soup = BeautifulSoup(response.text, 'html.parser')
120
+ tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
121
+ tasks = clean(tasks)
122
+ tasks = tasks.split('show all show top 10')[1]
123
+ tasks = tasks.split('back to top')[0]
124
+ tasks = remove_new_line(tasks).replace("related occupations", " ").replace(")importance ability", " ")
125
+ tasks = tasks.split(". ")
126
+ split_data = [item.split(" -- ")[0] for item in tasks]
127
+ num_desc = []
128
+ for i in range(len(tasks)):
129
+ temp = [','.join(item) for item in split_data][i].split(',')
130
+ num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
131
+ df5 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
132
+ df5 = df5[df5['Importance'] != '']
133
+
134
+ response = requests.get(interests_url, headers=headers)
135
+ soup = BeautifulSoup(response.text, 'html.parser')
136
+ tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
137
+ tasks = clean(tasks)
138
+ tasks = tasks.split("occupational interest interest")[1]#.replace('bright outlook', '').replace('updated 2023', '')
139
+ tasks = tasks.split('back to top')[0]
140
+ tasks = remove_new_line(tasks).replace("related occupations", " ").replace(")importance interest", " ")
141
+ tasks = tasks.split(". ")
142
+ split_data = [item.split(" -- ")[0] for item in tasks]
143
+ num_desc = []
144
+ for i in range(len(tasks)):
145
+ temp = [','.join(item) for item in split_data][i].split(',')
146
+ num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
147
+ df6 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
148
+ df6 = df6[df6['Importance'] != '']
149
+
150
+ response = requests.get(values_url, headers=headers)
151
+ soup = BeautifulSoup(response.text, 'html.parser')
152
+ tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
153
+ tasks = clean(tasks)
154
+ tasks = tasks.split('extent work value')[1]
155
+ tasks = tasks.split('back to top')[0]
156
+ tasks = remove_new_line(tasks).replace("related occupations", " ").replace(")importance value", " ")
157
+ tasks = tasks.split(". ")
158
+ split_data = [item.split(" -- ")[0] for item in tasks]
159
+ num_desc = []
160
+ for i in range(len(tasks)):
161
+ temp = [','.join(item) for item in split_data][i].split(',')
162
+ num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
163
+ df7 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
164
+ df7 = df7[df7['Importance'] != '']
165
+
166
+ response = requests.get(style_url, headers=headers)
167
+ soup = BeautifulSoup(response.text, 'html.parser')
168
+ tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
169
+ tasks = clean(tasks)
170
+ tasks = tasks.split('show all show top 10')[1]
171
+ tasks = tasks.split('back to top')[0]
172
+ tasks = remove_new_line(tasks).replace("related occupations", " ").replace(")importance style", " ")
173
+ tasks = tasks.split(". ")
174
+ split_data = [item.split(" -- ")[0] for item in tasks]
175
+ num_desc = []
176
+ for i in range(len(tasks)):
177
+ temp = [','.join(item) for item in split_data][i].split(',')
178
+ num_desc.append([''.join([c for c in temp if c in '0123456789']), ''.join([c for c in temp if c not in '0123456789']).replace(')context work context', '')])
179
+ df8 = pd.DataFrame(num_desc, columns = ['Importance', 'Candidate Characteristic'])
180
+ df8 = df8[df8['Importance'] != '']
181
+
182
+ cand_df = pd.concat([df3, df4, df5, df6, df7, df8], axis = 0)
183
+
184
+ return [job_df, cand_df]
185
+
186
  def get_job_postings(onetCode, state):
187
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
188
  url = "https://www.onetonline.org/link/localjobs/" + onetCode + "?st=" + state