celise88 committed on
Commit
9b3b1bc
1 Parent(s): e1f9362

organize functions and add async

Files changed (3)
  1. main.py +10 -133
  2. match_utils.py +107 -0
  3. scrape_onet.py +36 -0
main.py CHANGED
@@ -12,36 +12,8 @@ from fastapi.templating import Jinja2Templates
  from fastapi.staticfiles import StaticFiles
  from fastapi.responses import HTMLResponse
  import pandas as pd
- import requests
- from bs4 import BeautifulSoup
- from cleantext import clean
- from docx import Document
- import os
- import ssl
- import cohere
- from cohere import CohereError
- import string
- import numpy as np
- from numpy.linalg import norm
- from nltk.tokenize import SpaceTokenizer
- import nltk
- from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
- from dotenv import load_dotenv
-
- # LOAD ENVIRONMENT VARIABLES
- load_dotenv()
-
- # SSL CERTIFICATE FIX
- try:
-     _create_unverified_https_context = ssl._create_unverified_context
- except AttributeError:
-     pass
- else:
-     ssl._create_default_https_context = _create_unverified_https_context
-
- # DOWNLOAD NLTK DATA IF NOT ALREADY DOWNLOADED
- if os.path.isdir('nltk_data')==False:
-     nltk.download('stopwords', quiet=True)
+ from scrape_onet import get_onet_code, get_onet_description, get_onet_tasks
+ from match_utils import get_resume, get_simresults, skillNER

  # APP SETUP
  app = FastAPI()
@@ -50,57 +22,6 @@ templates = Jinja2Templates(directory="templates/")

  # LOAD DATA
  onet = pd.read_csv('static/ONET_JobTitles.csv')
- simdat = pd.read_csv('static/cohere_embeddings.csv')
-
- # LOAD FINE-TUNED MODEL
- # (see https://huggingface.co/celise88/distilbert-base-uncased-finetuned-binary-classifier)
- model = AutoModelForSequenceClassification.from_pretrained('static/model_shards', low_cpu_mem_usage=True)
- tokenizer = AutoTokenizer.from_pretrained('static/tokenizer_shards', low_cpu_mem_usage=True)
- classifier = pipeline('text-classification', model = model, tokenizer = tokenizer)
-
- # UTILITY FUNCTIONS
- def clean_my_text(text):
-     clean_text = ' '.join(text.splitlines())
-     clean_text = clean_text.replace('-', " ").replace("/"," ")
-     clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))
-     return clean_text
-
- def remove_new_line(value):
-     return ''.join(value.splitlines())
-
- def coSkillEmbed(text):
-     try:
-         co = cohere.Client(os.getenv("COHERE_TOKEN"))
-         response = co.embed(
-             model='large',
-             texts=[text])
-         return response.embeddings
-     except CohereError as e:
-         return e
-
- def skillNER(resume):
-     resume = clean_my_text(resume)
-     stops = set(nltk.corpus.stopwords.words('english'))
-     stops = stops.union({'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
-         'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include'
-         'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
-         'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams'})
-     resume = [word for word in SpaceTokenizer().tokenize(resume) if word not in stops]
-     resume = [word for word in resume if ")" not in word]
-     resume = [word for word in resume if "(" not in word]
-
-     labels = []
-     for i in range(len(resume)):
-         classification = classifier(resume[i])[0]['label']
-         if classification == 'LABEL_1':
-             labels.append("Skill")
-         else:
-             labels.append("Not Skill")
-     labels_dict = dict(zip(resume, labels))
-     return labels_dict
-
- def cosine(A, B):
-     return np.dot(A,B)/(norm(A)*norm(B))

  ### JOB INFORMATION CENTER ###
  # GET
@@ -114,26 +35,9 @@ def render_job_list(request: Request):
  def render_job_info(request: Request, jobtitle: str = Form(enum=[x for x in onet['JobTitle']])):
      joblist = onet['JobTitle']
      if jobtitle:
-         # SCRAPE ONET TO GET JOB DESCRIPTION, TASKS, ETC.
-         onetCode = onet.loc[onet['JobTitle'] == jobtitle, 'onetCode']
-         onetCode = onetCode.reindex().tolist()[0]
-         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
-         url = "https://www.onetonline.org/link/summary/" + onetCode
-         response = requests.get(url, headers=headers)
-         soup = BeautifulSoup(response.text, 'html.parser')
-         jobdescription = soup.p.get_text()
-
-         url = "https://www.onetonline.org/link/result/" + onetCode + "?c=tk&n_tk=0&s_tk=IM&c_tk=0"
-         response = requests.get(url, headers=headers)
-         soup = BeautifulSoup(response.text, 'html.parser')
-         tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
-         tasks = clean(tasks)
-         tasks = tasks.split('show all show top 10')[1]
-         tasks = tasks.split('occupations related to multiple tasks')[0]
-         tasks = remove_new_line(tasks).replace("related occupations", " ").replace("core", " - ").replace(" )importance category task", "").replace(" find ", "")
-         tasks = tasks.split(". ")
-         tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
-
+         onetCode = get_onet_code(jobtitle)
+         jobdescription = get_onet_description(onetCode)
+         tasks = get_onet_tasks(onetCode)
          return templates.TemplateResponse('job_list.html', context={
              'request': request,
              'joblist': joblist,
@@ -154,35 +58,8 @@ def match_page(request: Request):

  # POST
  @app.post('/find-my-match/', response_class=HTMLResponse)
- async def get_resume(request: Request, resume: UploadFile = File(...)):
-
-     # READ AND PERFORM BASIC CLEANING ON RESUME
-     path = f"static/{resume.filename}"
-     with open(path, 'wb') as buffer:
-         buffer.write(resume.file.read())
-     file = Document(path)
-     text = []
-     for para in file.paragraphs:
-         text.append(para.text)
-     resume = "\n".join(text)
-
-     # GET RESUME EMBEDDINGS AND JOB SIMILARITY SCORES
-     embeds = coSkillEmbed(resume)
-     simResults = []
-     for i in range(len(simdat)):
-         simResults.append(cosine(np.array(embeds), np.array(simdat.iloc[i,1:])))
-     simResults = pd.DataFrame(simResults)
-     simResults['JobTitle'] = simdat['Title']
-     simResults = simResults.iloc[:,[1,0]]
-     simResults.columns = ['JobTitle', 'Similarity']
-     simResults = simResults.sort_values(by = "Similarity", ascending = False)
-     simResults = simResults.iloc[:13,:]
-     simResults = simResults.iloc[1:,:]
-     simResults.reset_index(drop=True, inplace=True)
-     for x in range(len(simResults)):
-         simResults.iloc[x,1] = "{:0.2f}".format(simResults.iloc[x,1])
-
-     # EXTRACT SKILLS FROM RESUME
-     skills = skillNER(resume)
-
-     return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults})
+ async def match_page(request: Request, resume: UploadFile = File(...)):
+     resume = get_resume(resume)
+     simResults = await get_simresults(resume)
+     skills = await skillNER(resume)
+     return templates.TemplateResponse('find_my_match.html', context={'request': request, 'resume': resume, 'skills': skills, 'simResults': simResults})
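
For a quick sanity check of the reorganized endpoint, the new handler can be exercised with a plain HTTP client once the app is running. A minimal sketch, assuming the app is served locally with uvicorn on port 8000 and that sample_resume.docx exists; the port and filename are illustrative, not part of the commit:

# Sketch: post a .docx resume to the refactored /find-my-match/ route.
import requests

with open("sample_resume.docx", "rb") as f:  # assumed local test file
    response = requests.post(
        "http://localhost:8000/find-my-match/",
        files={"resume": ("sample_resume.docx", f)},  # field name matches the UploadFile parameter
    )
print(response.status_code)   # 200 on success
print(response.text[:500])    # start of the rendered find_my_match.html page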
match_utils.py ADDED
@@ -0,0 +1,107 @@
+ from cleantext import clean
+ import string
+ from nltk.tokenize import SpaceTokenizer
+ import nltk
+ import cohere
+ from cohere import CohereError
+ import os
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+ from docx import Document
+ import pandas as pd
+ import numpy as np
+ from numpy.linalg import norm
+ import ssl
+ from dotenv import load_dotenv
+ import concurrent.futures
+
+ # SSL CERTIFICATE FIX
+ try:
+     _create_unverified_https_context = ssl._create_unverified_context
+ except AttributeError:
+     pass
+ else:
+     ssl._create_default_https_context = _create_unverified_https_context
+
+ # DOWNLOAD NLTK DATA IF NOT ALREADY DOWNLOADED
+ if os.path.isdir('nltk_data')==False:
+     nltk.download('stopwords', quiet=True)
+
+ # LOAD ENVIRONMENT VARIABLES
+ load_dotenv()
+
+ # LOAD COHERE EMBEDDINGS:
+ simdat = pd.read_csv('static/cohere_embeddings.csv')
+
+ # LOAD FINE-TUNED MODEL
+ # (see https://huggingface.co/celise88/distilbert-base-uncased-finetuned-binary-classifier)
+ model = AutoModelForSequenceClassification.from_pretrained('static/model_shards', low_cpu_mem_usage=True)
+ tokenizer = AutoTokenizer.from_pretrained('static/tokenizer_shards', low_cpu_mem_usage=True)
+ classifier = pipeline('text-classification', model = model, tokenizer = tokenizer)
+
+ # UTILITY FUNCTIONS
+ def get_resume(resume):
+     path = f"static/{resume.filename}"
+     with open(path, 'wb') as buffer:
+         buffer.write(resume.file.read())
+     file = Document(path)
+     text = []
+     for para in file.paragraphs:
+         text.append(para.text)
+     resume = "\n".join(text)
+     return resume
+
+ def coSkillEmbed(text):
+     try:
+         co = cohere.Client(os.getenv("COHERE_TOKEN"))
+         response = co.embed(
+             model='large',
+             texts=[text])
+         return response.embeddings
+     except CohereError as e:
+         return e
+
+ async def get_simresults(resume):
+     def cosine(A, B):
+         return np.dot(A,B)/(norm(A)*norm(B))
+     embeds = coSkillEmbed(resume)
+     simResults = []
+     for i in range(len(simdat)):
+         simResults.append(cosine(np.array(embeds), np.array(simdat.iloc[i,1:])))
+     simResults = pd.DataFrame(simResults)
+     simResults['JobTitle'] = simdat['Title']
+     simResults = simResults.iloc[:,[1,0]]
+     simResults.columns = ['JobTitle', 'Similarity']
+     simResults = simResults.sort_values(by = "Similarity", ascending = False)
+     simResults = simResults.iloc[:13,:]
+     simResults = simResults.iloc[1:,:]
+     simResults.reset_index(drop=True, inplace=True)
+     for x in range(len(simResults)):
+         simResults.iloc[x,1] = "{:0.2f}".format(simResults.iloc[x,1])
+     return simResults
+
+ async def skillNER(resume):
+     def clean_my_text(text):
+         clean_text = ' '.join(text.splitlines())
+         clean_text = clean_text.replace('-', " ").replace("/"," ")
+         clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))
+         return clean_text
+
+     resume = clean_my_text(resume)
+     stops = set(nltk.corpus.stopwords.words('english'))
+     stops = stops.union({'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
+         'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include'
+         'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
+         'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams'})
+     resume = [word for word in SpaceTokenizer().tokenize(resume) if word not in stops]
+     resume = [word for word in resume if ")" not in word]
+     resume = [word for word in resume if "(" not in word]
+
+     labels = []
+     for i in range(len(resume)):
+         classification = classifier(resume[i])[0]['label']
+         if classification == 'LABEL_1':
+             labels.append("Skill")
+         else:
+             labels.append("Not Skill")
+     labels_dict = dict(zip(resume, labels))
+     return labels_dict
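
The new get_simresults and skillNER coroutines still run the embedding and classification work synchronously, and concurrent.futures is imported above for thread-pool use. A minimal sketch of how the blocking classifier call could be moved off the event loop; the helper name and pool size are assumptions, not part of this commit:

# Sketch: run the blocking transformers pipeline in a thread pool so an
# async FastAPI handler stays responsive while words are classified.
import asyncio
import concurrent.futures

pool = concurrent.futures.ThreadPoolExecutor(max_workers=2)  # size is illustrative

async def classify_words(words):  # hypothetical helper, not in the commit
    loop = asyncio.get_running_loop()
    # classifier(list_of_strings) returns one {'label', 'score'} dict per input
    results = await loop.run_in_executor(pool, classifier, words)
    return {w: ("Skill" if r["label"] == "LABEL_1" else "Not Skill")
            for w, r in zip(words, results)}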
scrape_onet.py ADDED
@@ -0,0 +1,36 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from cleantext import clean
+ import pandas as pd
+
+ onet = pd.read_csv('static/ONET_JobTitles.csv')
+ headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
+
+ def remove_new_line(value):
+     return ''.join(value.splitlines())
+
+ def get_onet_code(jobtitle):
+     onetCode = onet.loc[onet['JobTitle'] == jobtitle, 'onetCode']
+     onetCode = onetCode.reindex().tolist()[0]
+     return onetCode
+
+ def get_onet_description(onetCode):
+     url = "https://www.onetonline.org/link/summary/" + onetCode
+     response = requests.get(url, headers=headers)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     jobdescription = soup.p.get_text()
+     return jobdescription
+
+ def get_onet_tasks(onetCode):
+     headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
+     url = "https://www.onetonline.org/link/result/" + onetCode + "?c=tk&n_tk=0&s_tk=IM&c_tk=0"
+     response = requests.get(url, headers=headers)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     tasks = str(soup.get_text('reportsubdesc')).replace("reportsubdesc", " ").replace("ImportanceCategoryTask ", "")
+     tasks = clean(tasks)
+     tasks = tasks.split('show all show top 10')[1]
+     tasks = tasks.split('occupations related to multiple tasks')[0]
+     tasks = remove_new_line(tasks).replace("related occupations", " ").replace("core", " - ").replace(" )importance category task", "").replace(" find ", "")
+     tasks = tasks.split(". ")
+     tasks = [''.join(map(lambda c: '' if c in '0123456789-' else c, task)) for task in tasks]
+     return tasks
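
For reference, the three scraping helpers compose as follows. A minimal usage sketch, assuming network access to onetonline.org and that the chosen title appears in static/ONET_JobTitles.csv; the example title is illustrative:

# Sketch: look up a title's O*NET code, then fetch its description and tasks.
from scrape_onet import get_onet_code, get_onet_description, get_onet_tasks

code = get_onet_code("Registered Nurses")    # title assumed to exist in the CSV
print(get_onet_description(code))            # first paragraph of the summary page
for task in get_onet_tasks(code)[:5]:        # a few of the parsed task strings
    print("-", task)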