import pandas as pd
import numpy as np
import torch
import zipfile
import os
from transformers import BertTokenizer, BertForSequenceClassification
import contractions
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
# Load pre-trained BERT model and tokenizer
# def load_model():
# model_name = "./bert_fine_tuned/bert_fine_tuned"
# tokenizer = BertTokenizer.from_pretrained('./bert_fine_tuned/bert_tokens')
# model = BertForSequenceClassification.from_pretrained(model_name)
# return model, tokenizer
def load_model():
    model_name = "azrai99/bert-skills-extraction"
    model = BertForSequenceClassification.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    return model, tokenizer
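# Example usage (a minimal sketch; downloading the Hub checkpoint requires network
# access, and the label mapping is whatever the published model defines):
#
#   model, tokenizer = load_model()
#   model.eval()  # inference only; no gradient updates are needed here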
def clean(desc):
    # Expand contractions (e.g. "we're" -> "we are") and strip stray punctuation
    desc = contractions.fix(desc)
    desc = re.sub(r"[!@.$'():]", "", desc)
    return desc
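# Quick illustration of clean(); the output shown is approximate, since the exact
# expansion depends on the contractions package version:
#
#   clean("We're looking for: Python, SQL.")
#   # -> "We are looking for Python, SQL"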
def extract_POS(tagged):
    # pattern 1: optional determiner, any adjectives, then one or more nouns
    grammar1 = ('''Noun Phrases: {<DT>?<JJ>*<NN>+}''')
    chunkParser = nltk.RegexpParser(grammar1)
    tree1 = chunkParser.parse(tagged)

    # typical noun phrase pattern; chunks are concatenated later
    g1_chunks = []
    for subtree in tree1.subtrees(filter=lambda t: t.label() == 'Noun Phrases'):
        g1_chunks.append(subtree)
    # pattern 2: optional preposition followed by adjectives or nouns
    grammar2 = ('''NP2: {<IN>?<JJ|NN>*} ''')
    chunkParser = nltk.RegexpParser(grammar2)
    tree2 = chunkParser.parse(tagged)

    # variation of a noun phrase pattern, kept for later analyses
    g2_chunks = []
    for subtree in tree2.subtrees(filter=lambda t: t.label() == 'NP2'):
        g2_chunks.append(subtree)
    # pattern 3: a verb followed by any number of nouns (verb-noun pattern)
    grammar3 = (''' VS: {<VB|VBD|VBG|VBN|VBP|VBZ><NN|NNS>*}''')
    chunkParser = nltk.RegexpParser(grammar3)
    tree3 = chunkParser.parse(tagged)

    # verb-noun pattern; chunks are concatenated later
    g3_chunks = []
    for subtree in tree3.subtrees(filter=lambda t: t.label() == 'VS'):
        g3_chunks.append(subtree)
    # pattern 4: any number of singular or plural nouns, followed by a comma,
    # followed by the same noun pattern again (noun, noun, noun)
    grammar4 = ('''Commas: {<NN|NNS>*<,><NN|NNS>*<,><NN|NNS>*} ''')
    chunkParser = nltk.RegexpParser(grammar4)
    tree4 = chunkParser.parse(tagged)

    # common pattern for listing skills; chunks are concatenated later
    g4_chunks = []
    for subtree in tree4.subtrees(filter=lambda t: t.label() == 'Commas'):
        g4_chunks.append(subtree)

    return g1_chunks, g2_chunks, g3_chunks, g4_chunks
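# Sketch of what extract_POS() returns for a small tagged input (the POS tags and
# the chunk grammars above are approximations, so exact chunks may differ):
#
#   tagged = tokenize_and_tag("strong python skills, sql, tableau")
#   g1, g2, g3, g4 = extract_POS(tagged)
#   # each g* is a list of nltk.Tree chunks, e.g. Tree('Noun Phrases', [('python', 'NN'), ...])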
def tokenize_and_tag(desc):
    tokens = nltk.word_tokenize(desc.lower())
    filtered_tokens = [w for w in tokens if w not in stop_words]
    tagged = nltk.pos_tag(filtered_tokens)
    return tagged
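# For example (stopwords such as "with" and "and" are dropped; the tags are whatever
# nltk's perceptron tagger assigns, so treat them as illustrative only):
#
#   tokenize_and_tag("Experience with Python and SQL")
#   # -> [('experience', 'NN'), ('python', 'NN'), ('sql', 'NN')]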
def training_set(chunks):
    '''Creates a DataFrame of phrases that is easily parsed from the chunk data.'''
    df = pd.DataFrame(chunks)
    df.fillna('X', inplace=True)

    train = []
    for row in df.values:
        phrase = ''
        for tup in row:
            # needs a space at the end for separation
            phrase += tup[0] + ' '
        # could use padding tags, but the encoder will provide them during
        # tokenizing/embedding; 'X' stands in for padding for now
        train.append(phrase.replace('X', '').strip())

    df['phrase'] = train
    return df.phrase
def strip_commas(df):
    '''Create a new Series of individual n-grams from comma-separated phrases.'''
    grams = []
    for sen in df:
        for word in sen.split(','):
            grams.append(word)
    return pd.Series(grams)
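# For example:
#
#   strip_commas(pd.Series(["python, sql, tableau"]))
#   # -> Series(["python", " sql", " tableau"])  (leading spaces are stripped later)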
def generate_phrases(desc):
    tagged = tokenize_and_tag(desc)
    g1_chunks, g2_chunks, g3_chunks, g4_chunks = extract_POS(tagged)

    # comma-separated skill lists are split into individual phrases first
    separated_chunks4 = strip_commas(training_set(g4_chunks))

    phrases = pd.concat([training_set(g1_chunks),
                         training_set(g2_chunks),
                         training_set(g3_chunks),
                         separated_chunks4],
                        ignore_index=True)
    return phrases
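# Putting the chunking pipeline together (the output is a pandas Series of candidate
# phrases; its exact contents depend on the tagger and the grammars above):
#
#   generate_phrases("Looking for a data analyst with python, sql, tableau")
#   # -> Series like ["data analyst", "python", "sql", "tableau", ...]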
def get_predictions(desc, model, tokenizer, threshold=0.6, return_probabilities=False):
    # Clean the description and generate candidate phrases
    desc = clean(desc)
    phrases = generate_phrases(desc).tolist()
    phrases = [phrase.strip() for phrase in phrases]
    # print(phrases)  # uncomment to inspect the candidate phrases

    # Tokenize and prepare phrases for the model
    inputs = tokenizer(phrases, return_tensors="pt", truncation=True, padding=True)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)

    # Get predicted classes based on the threshold
    predictions = (probs[:, 1] > threshold).to(torch.int32)

    # Collect the phrases classified as skills
    out = pd.DataFrame({'Phrase': phrases,
                        'Class': predictions.tolist(),
                        'Probability': probs[:, 1].tolist()})
    skills = out.loc[out['Class'] == 1]

    if return_probabilities:
        # Return predicted skills and their probabilities as parallel lists
        return skills['Phrase'].tolist(), skills['Probability'].tolist()
    # Return predicted skills as a de-duplicated list
    return skills['Phrase'].unique().tolist()
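# Example call (a sketch; the sample description and the 0.6 threshold are only
# illustrative, and the returned phrases depend entirely on the model's predictions):
#
#   model, tokenizer = load_model()
#   get_predictions("We're hiring a data engineer with Python and Airflow experience",
#                   model, tokenizer, threshold=0.6)
#   # -> e.g. ["python", "airflow experience"]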
def get_predictions_excel(filename):
    """Reads a CSV file; the description column must be titled 'Job Description'."""
    df = pd.read_csv(filename)
    model, tokenizer = load_model()
    df['Extracted skills'] = df['Job Description'].apply(
        lambda x: get_predictions(x, model, tokenizer))
    df.to_csv('extracted.csv', index=False)
    return df
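# Minimal command-line entry point (an assumption, not part of the original module:
# "sample_jobs.csv" is a placeholder path, and the CSV must contain a
# "Job Description" column, as get_predictions_excel expects).
if __name__ == "__main__":
    get_predictions_excel("sample_jobs.csv")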