Spaces:
Runtime error
Runtime error
Add O*NET web-scrape functionality for importance ratings
Browse files- main.py +2 -0
- requirements.txt +3 -1
- scrape_onet.py +147 -0
main.py
CHANGED
@@ -14,6 +14,7 @@ from fastapi.responses import HTMLResponse
|
|
14 |
import pandas as pd
|
15 |
import time
|
16 |
from uuid import uuid1
|
|
|
17 |
from localStoragePy import localStoragePy
|
18 |
localStorage = localStoragePy('pathfinder', 'text')
|
19 |
|
@@ -23,6 +24,7 @@ from user_utils import Hash
|
|
23 |
|
24 |
# APP SETUP
|
25 |
app = FastAPI()
|
|
|
26 |
app.mount("/static", StaticFiles(directory='static'), name="static")
|
27 |
templates = Jinja2Templates(directory="templates/")
|
28 |
|
|
|
14 |
import pandas as pd
|
15 |
import time
|
16 |
from uuid import uuid1
|
17 |
+
from mangum import Mangum
|
18 |
from localStoragePy import localStoragePy
|
19 |
localStorage = localStoragePy('pathfinder', 'text')
|
20 |
|
|
|
24 |
|
25 |
# APP SETUP
|
26 |
app = FastAPI()
|
27 |
+
handler = Mangum(app)
|
28 |
app.mount("/static", StaticFiles(directory='static'), name="static")
|
29 |
templates = Jinja2Templates(directory="templates/")
|
30 |
|
requirements.txt
CHANGED
@@ -19,4 +19,6 @@ accelerate==0.16.0
|
|
19 |
plotly-express==0.4.1
|
20 |
bcrypt==4.0.1
|
21 |
passlib==1.7.4
|
22 |
-
localStoragePy==0.2.3
|
|
|
|
|
|
19 |
plotly-express==0.4.1
|
20 |
bcrypt==4.0.1
|
21 |
passlib==1.7.4
|
22 |
+
localStoragePy==0.2.3
|
23 |
+
sentence-transformers==2.2.2
|
24 |
+
mangum==0.17.0
|
scrape_onet.py
CHANGED
@@ -36,6 +36,153 @@ def get_onet_tasks(onetCode):
|
|
36 |
tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
|
37 |
return tasks
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
def get_job_postings(onetCode, state):
|
40 |
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
41 |
url = "https://www.onetonline.org/link/localjobs/" + onetCode + "?st=" + state
|
|
|
36 |
tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
|
37 |
return tasks
|
38 |
|
39 |
+
def get_onet_ratings(onetCode):
    """Scrape O*NET Online importance ratings for an occupation.

    Fetches eight report sections for the given O*NET-SOC code — work
    activities and work context (job-side), plus skills, knowledge,
    abilities, interests, work values and work styles (candidate-side) —
    and parses each into a DataFrame of (Importance, description) rows.

    Parameters:
        onetCode: O*NET-SOC occupation code string (e.g. "15-1252.00").

    Returns:
        [job_df, cand_df] — two DataFrames:
        job_df  columns: 'Importance', 'Work Characteristic'
        cand_df columns: 'Importance', 'Candidate Characteristic'
    """
    # Desktop UA: onetonline.org serves different/blocked content to default clients.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
    base = "https://www.onetonline.org/link/result/" + onetCode

    def _scrape_section(url, start_marker, label, delim, tail, col_name):
        """Fetch one O*NET report section and parse it into a two-column DataFrame.

        url          -- section report URL
        start_marker -- text marking the start of the data table
        label        -- repeated column-header text to strip from the body
        delim        -- sentence delimiter separating rows ('. ' or '? ')
        tail         -- leftover trailing text to strip from each description
        col_name     -- name for the description column
        """
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # 'reportsubdesc' is injected as the get_text separator so it can be
        # normalized to a single space afterwards.
        text = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
        text = clean(text)
        text = text.split(start_marker)[1]        # drop page header/nav
        text = text.split('back to top')[0]       # drop page footer
        text = remove_new_line(text).replace("related occupations", " ").replace(label, " ")
        # Each row looks like "<importance> <description> -- <extra>"; keep the
        # part before " -- ", then separate digits (importance) from the rest.
        rows = [item.split(" -- ")[0] for item in text.split(delim)]
        num_desc = []
        for row in rows:
            importance = ''.join(c for c in row if c in '0123456789')
            # NOTE: commas are dropped from descriptions (preserves the
            # original join/split parsing behavior); 'tail' strips section
            # residue left over from the page chrome.
            desc = ''.join(c for c in row if c not in '0123456789,').replace(tail, '')
            num_desc.append([importance, desc])
        df = pd.DataFrame(num_desc, columns=['Importance', col_name])
        return df[df['Importance'] != '']          # keep only rows with a numeric rating

    # Job-side sections: (url, start marker, label, row delimiter, tail).
    # Work-context rows end in '?' rather than '.', hence the different delimiter.
    job_sections = [
        (base + "?c=wa&n_wa=0&s_wa=IM&c_wa=0", 'show all show top 10', "importance work activity", ". ", ' ) '),
        (base + "?c=cx&n_cx=0&c_cx=0&s_cx=n", 'show all show top 10', "importance work activity", "? ", ')context work context'),
    ]
    job_df = pd.concat(
        [_scrape_section(url, start, label, delim, tail, 'Work Characteristic')
         for url, start, label, delim, tail in job_sections],
        axis=0)

    # Candidate-side sections; interests and values pages use different
    # table-start markers than the 'show all show top 10' default.
    cand_sections = [
        (base + "?c=sk&n_sk=0&s_sk=IM&c_sk=0", 'show all show top 10', ")importance skill"),
        (base + "?c=kn&n_kn=0&s_kn=IM&c_kn=0", 'show all show top 10', ")importance knowledge"),
        (base + "?c=ab&n_ab=0&s_ab=IM&c_ab=0", 'show all show top 10', ")importance ability"),
        (base + "?c=in&c_in=0", "occupational interest interest", ")importance interest"),
        (base + "?c=wv&c_wv=0", 'extent work value', ")importance value"),
        (base + "?c=ws&n_ws=0&c_ws=0", 'show all show top 10', ")importance style"),
    ]
    cand_df = pd.concat(
        [_scrape_section(url, start, label, ". ", ')context work context', 'Candidate Characteristic')
         for url, start, label in cand_sections],
        axis=0)

    return [job_df, cand_df]
|
185 |
+
|
186 |
def get_job_postings(onetCode, state):
|
187 |
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
|
188 |
url = "https://www.onetonline.org/link/localjobs/" + onetCode + "?st=" + state
|