|
|
|
|
|
|
|
|
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import warnings |
|
warnings.filterwarnings('ignore') |
|
|
|
import nltk |
|
|
|
from nltk.tokenize import word_tokenize |
|
from nltk.stem import WordNetLemmatizer |
|
|
|
from bs4 import BeautifulSoup |
|
import re |
|
from nltk.corpus import stopwords |
|
from wordcloud import WordCloud |
|
|
|
from nltk import word_tokenize |
|
from nltk.util import ngrams |
|
|
|
import PyPDF2 |
|
import base64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Streamlit page configuration and static headers ---------------------
st.set_page_config(

page_title = 'Resume enhancement by extracting keywords using NLP ',

page_icon = 'π',

layout = 'wide'

)

# Page title and subtitle shown at the top of the app.
st.title(" π Resume enhancement by extracting keywords π ")

st.subheader("π’ using NLP π’")

# NOTE: bare string literals are rendered to the page by Streamlit "magic".
"""β
** Downloading Models and Basic Setup **"""

# Fetch the NLTK resources used below (tokenizers, wordnet, stopwords).
# NOTE(review): these downloads run on every Streamlit rerun — consider
# caching or a one-time setup step.
nltk.download("popular")

nltk.download('stopwords')

# Lemmatizer instance (not referenced elsewhere in this view of the file).
lemmatizer = WordNetLemmatizer()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Reference data -------------------------------------------------------
# Per-category resume skills prepared offline.
df = pd.read_csv('Resume_skills.csv')

# Drop the leftover positional index column written by to_csv().
df = df.drop(columns=['Unnamed: 0'])

# LinkedIn skill vocabulary: one skill per line; keep the first
# comma-separated field, lower-cased.  Stored as a set so every
# `... in skills` membership test below is O(1) (the original list
# made each test O(n)), and `with` closes the handle that the original
# left open.
with open('linkedin_skill', 'r') as skills_file:
    skills = {line.strip().split(',')[0].lower() for line in skills_file}
|
|
|
|
|
def sentence_maker(unique_words):
    """Join an iterable of words into one space-separated string.

    Each word is stripped of surrounding whitespace and followed by a
    single space, so a non-empty result carries a trailing space
    (downstream word-cloud generation tolerates this).

    Parameters
    ----------
    unique_words : iterable of str

    Returns
    -------
    str
    """
    # str.join is linear overall; the original `+=` loop is quadratic
    # in the worst case (and its `''.join(i.strip())` was a no-op).
    return ''.join(word.strip() + ' ' for word in unique_words)
|
|
|
|
|
|
|
|
|
|
|
def extract_skills(input_text):
    """Find known skills in each tokenized document.

    Parameters
    ----------
    input_text : iterable of list[str]
        One word-token list per document.

    Returns
    -------
    list[set[str]]
        For each document, the set of unigrams plus contiguous 2-/3-grams
        whose lower-cased form appears in the global ``skills`` vocabulary.
    """
    # Local set gives O(1) membership regardless of how `skills` is stored.
    skill_vocab = set(skills)
    res = []
    for tokens in input_text:
        # Candidate multi-word skills: every contiguous 2- and 3-gram.
        bigrams_trigrams = map(' '.join, nltk.everygrams(tokens, 2, 3))

        found_skills = {tok for tok in tokens if tok.lower() in skill_vocab}
        found_skills.update(ng for ng in bigrams_trigrams
                            if ng.lower() in skill_vocab)
        res.append(found_skills)
    # Debug `print(res)` removed — it only spammed the server console.
    return res
|
|
|
def clean_sentences(df, col_name):
    """Tokenize and clean every row of ``df[col_name]``.

    Per row: strip HTML markup, keep alphabetic characters only,
    lower-case, word-tokenize, and drop English stop words.

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str

    Returns
    -------
    list[list[str]]
        One cleaned token list per row.
    """
    # Hoisted out of the loop: the stop-word set is identical for every
    # row, and building it repeatedly was pure overhead.
    stops = set(stopwords.words("english"))
    reviews = []
    for sent in df[col_name]:
        # Explicit parser avoids bs4's "no parser was explicitly
        # specified" warning.
        text = BeautifulSoup(sent, "html.parser").get_text()
        text = re.sub("[^a-zA-Z]", " ", text)
        words = word_tokenize(text.lower())
        reviews.append([w for w in words if w not in stops])
    return reviews
|
|
|
def clean_sentences2(df, col_name):
    """Tokenize every row of ``df[col_name]`` WITHOUT stop-word removal.

    Same pipeline as :func:`clean_sentences` minus the stop-word filter:
    strip HTML, keep letters only, lower-case, word-tokenize.

    Returns
    -------
    list[list[str]]
        One token list per row.
    """
    reviews = []
    for sent in df[col_name]:
        # Explicit parser silences bs4's "no parser specified" warning.
        text = BeautifulSoup(sent, "html.parser").get_text()
        text = re.sub("[^a-zA-Z]", " ", text)
        reviews.append(word_tokenize(text.lower()))
    return reviews
|
|
|
def extract_keywords(res):
    """Flatten an iterable of iterables into one de-duplicated set.

    Parameters
    ----------
    res : iterable of iterable of str
        Typically the per-document skill sets from ``extract_skills``.

    Returns
    -------
    set[str]
        Union of all inner elements.
    """
    # Set comprehension replaces the original hand-rolled nested loop.
    return {keyword for group in res for keyword in group}
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_sentences3(text):
    """Clean a single raw text blob into stop-word-free tokens.

    Pipeline: strip HTML, keep letters only, lower-case, word-tokenize,
    drop English stop words.

    Parameters
    ----------
    text : str

    Returns
    -------
    list[list[str]]
        A ONE-element list wrapping the cleaned token list — kept for
        interface compatibility with the other ``clean_sentences*``
        helpers (callers index ``result[0]``).
    """
    # Explicit parser avoids bs4's "no parser specified" warning.
    review_text = BeautifulSoup(text, "html.parser").get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = word_tokenize(review_text.lower())
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    return [meaningful_words]
|
|
|
def decode_txt1(file_name):
    """Read a text file and return its cleaned token lists.

    Delegates cleaning to :func:`clean_sentences3`, so the result is a
    one-element list wrapping the token list.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.

    Returns
    -------
    list[list[str]]
    """
    # `with` guarantees the handle is closed even if reading or the
    # downstream cleaning raises (the original closed it manually).
    with open(file_name, "r") as fh:
        full_text = fh.read()
    return clean_sentences3(full_text)
|
|
|
|
|
|
|
|
|
|
|
def decode_pdf(filename):
    """Extract all text from a PDF and return cleaned token lists.

    The extracted text is buffered through 'Sample.txt' on disk (kept
    from the original design), then cleaned via :func:`decode_txt1`.

    Parameters
    ----------
    filename : str
        Path of the PDF to read.

    Returns
    -------
    list[list[str]]
        One-element list wrapping the cleaned token list.

    NOTE(review): this uses the legacy PyPDF2 1.x API (PdfFileReader /
    numPages / getPage / extractText); PyPDF2 3.x removed those names —
    confirm the pinned PyPDF2 version before upgrading.
    """
    # `with` blocks close both handles even if extraction fails part-way
    # (the original leaked them on any exception).
    with open(filename, 'rb') as pdf_file, open('Sample.txt', 'w') as out:
        reader = PyPDF2.PdfFileReader(pdf_file)
        for page_no in range(reader.numPages):
            out.write(reader.getPage(page_no).extractText())

    return decode_txt1('Sample.txt')
|
|
|
|
|
|
|
|
|
def extract_skills2(input_text):
    """Find known skills in a single tokenized document.

    Parameters
    ----------
    input_text : list[str]
        Word tokens of one document (e.g. ``resume_text[0]``).

    Returns
    -------
    list[str]
        Matching unigrams followed by matching 2-/3-grams.  May contain
        duplicates, mirroring the original append-based behaviour;
        callers de-duplicate with ``set(...)``.
    """
    # Local set gives O(1) membership regardless of how `skills` is stored.
    skill_vocab = set(skills)

    # Single-word matches.
    found_skills = [tok for tok in input_text if tok.lower() in skill_vocab]

    # BUG FIX: the original computed everygrams per *token*, i.e. over the
    # characters of each word, so multi-word skills could never match.
    # Build 2-/3-grams over the whole token sequence instead, matching
    # what extract_skills() does.
    bigrams_trigrams = map(' '.join, nltk.everygrams(input_text, 2, 3))
    found_skills.extend(ng for ng in bigrams_trigrams
                        if ng.lower() in skill_vocab)
    return found_skills
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- PDF upload -----------------------------------------------------------
uploaded_file = st.file_uploader('Choose your .pdf file', type="pdf")

if uploaded_file is not None:
    # Persist the upload to disk so decode_pdf() can reopen it by path.
    # The original base64-encode-then-decode round-trip was a byte-level
    # no-op and has been removed; the raw bytes are written directly.
    # (The explicit f.close() inside the `with` was redundant too.)
    with open("input.pdf", "wb") as f:
        f.write(uploaded_file.read())

    # Cleaned token lists extracted from the uploaded resume.
    resume_text = decode_pdf("input.pdf")
|
|
|
|
|
|
|
|
|
|
|
# Job categories offered to the user.  The mixed UPPER-CASE / Title Case
# labels mirror the raw 'Category' values in the two source datasets —
# do not normalise them or the df filter below stops matching.
list_of_cats = [ 'Testing', 'HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE','BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE','BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE','CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT','CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION','Data Science', 'Advocate', 'Arts', 'Web Designing','Mechanical Engineer', 'Sales', 'Health and fitness','Civil Engineer', 'Java Developer', 'Business Analyst','SAP Developer', 'Automation Testing', 'Electrical Engineering','Operations Manager', 'Python Developer', 'DevOps Engineer','Network Security Engineer', 'PMO', 'Database', 'Hadoop','ETL Developer', 'DotNet Developer', 'Blockchain']

# Target category chosen by the user (defaults to the first entry).
cat = st.selectbox("Select your desired Category",list_of_cats, index = 0)

# Debug print — goes to the server console, not the web page.
print('You selected:', cat)

# Reference resumes for the chosen category only.
sub_df=df[df['Category']==cat]

# Flatten that category's skill strings into one blob for the word cloud.
sentences1=sentence_maker(sub_df['Resume_skills'])
|
|
|
"""β
**Extracting Data from PDF **""" |
|
|
|
resume_text2=extract_skills2(resume_text[0]) |
|
resume_keywords=set(resume_text2) |
|
|
|
print(resume_keywords) |
|
|
|
|
|
wc = WordCloud(width = 500, height = 500,include_numbers=True,collocations=True, background_color ='white',min_font_size = 10).generate(sentence_maker(resume_keywords)) |
|
plt.figure(figsize=(10,10)) |
|
plt.imshow(wc, interpolation='bilinear') |
|
plt.axis("off") |
|
plt.title(' existing Keywords') |
|
plt.show() |
|
|
|
|
|
|
|
"""β
Generating ***Similarity Score*** with existing skillset""" |
|
|
|
from cdifflib import CSequenceMatcher |
|
|
|
def get_similarity_score(s1, s2):
    """Return the sequence-similarity of *s1* and *s2* as a percentage
    string (e.g. ``'42.857%'``), using difflib's ratio metric."""
    ratio = CSequenceMatcher(None, s1, s2).ratio()
    return str(round(ratio * 100, 3)) + '%'
|
|
|
|
|
|
|
# Word cloud over the selected category's reference skills; its
# frequency dict (wc_r.words_) supplies the top keywords for the
# matching step below.
wc_r = WordCloud(width = 500, height = 500,max_words=200,include_numbers=True,collocations=True,

background_color ='white',min_font_size = 10).generate(sentences1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""β
**Getting the matching score with database**""" |
|
|
|
sub_unique_words=list(wc_r.words_.keys()) |
|
resume_keywords=list(resume_keywords) |
|
|
|
bigram = list(map(' '.join,ngrams(sub_unique_words, 1))) |
|
|
|
sub_keywords=set() |
|
for bg in bigram: |
|
if bg in skills: |
|
|
|
sub_keywords.add(bg) |
|
tokens = nltk.word_tokenize(sentence_maker(sub_unique_words)) |
|
for i in tokens: |
|
sub_keywords.add(i) |
|
|
|
def preprocess(words):
    """Normalize an iterable of strings into a flat set of lower-case
    alphabetic word tokens.

    Per item: strip HTML, replace non-letters with spaces, lower-case,
    word-tokenize; all resulting tokens are pooled into one set.

    Parameters
    ----------
    words : iterable of str

    Returns
    -------
    set[str]
    """
    # NOTE: the original rebound the *parameter* name `words` inside the
    # loop that iterated it — harmless in CPython but a shadowing hazard;
    # distinct local names avoid it.
    res = set()
    for item in words:
        # Explicit parser silences bs4's "no parser specified" warning.
        text = BeautifulSoup(item, "html.parser").get_text()
        text = re.sub("[^a-zA-Z]", " ", text)
        res.update(word_tokenize(text.lower()))
    return res
|
|
|
|
|
with st.spinner():
    # Normalize both keyword pools into flat lower-case token lists.
    sub_unique_words_match = list(preprocess(sub_unique_words))
    resume_keywords = list(preprocess(resume_keywords))

    # A set makes each "already in the resume" membership test O(1);
    # the original scanned the resume_keywords *list* once per word,
    # i.e. O(n*m) overall.
    resume_token_set = set(resume_keywords)

    # Keywords the category's reference resumes use but this resume lacks
    # (tokens for the score; vocabulary-backed skills for the word cloud).
    predicted_keywords_match = [w for w in sub_unique_words_match
                                if w not in resume_token_set]
    pred_keywords = [w for w in sub_keywords if w not in resume_token_set]
    # Debug `print(pred_keywords)` removed — console noise only.
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): mid-file stdlib import, kept next to its single use.
from collections import Counter

# pred_keywords comes from a set, so every frequency here is 1 — the
# cloud sizes words uniformly.
word_could_dict=Counter(pred_keywords)

wc = WordCloud(width = 500, height = 500,include_numbers=True,collocations=True,

background_color ='white',min_font_size = 10).generate_from_frequencies(word_could_dict)

plt.figure(figsize=(10,10))

plt.imshow(wc, interpolation='bilinear')

plt.axis("off")

plt.title(' predicted keywords')

# Saved to disk so st.image() can display it in the output section.
wc.to_file('prediction.jpg')
|
|
|
st.markdown("# Output") |
|
|
|
col1, col2, col3, col4 = st.columns(4) |
|
|
|
with col2: |
|
st.markdown("### Predicted Keywords WordCloud") |
|
|
|
st.image('prediction.jpg') |
|
|
|
|
|
|
|
|
|
|
|
|
|
# How similar the resume's keywords already are to the category's
# reference keywords.
existing_score = get_similarity_score(sub_unique_words_match,resume_keywords)

# How much of the reference set the *suggested* (missing) keywords cover.
predicted_result_score = get_similarity_score(predicted_keywords_match,sub_unique_words_match)

with col1:
    st.markdown('### Existing Keywords :' )

    st.metric( label = 'Score', value = existing_score)

with col3:
    # Spacer column between the word cloud and the predicted score.
    st.markdown(" ")

with col4:
    st.markdown('### Predicted Keywords :' )

    st.metric( label = 'Score', value = predicted_result_score)