Spaces:
Sleeping
Sleeping
File size: 4,771 Bytes
0a510f9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import PyPDF2
from docx import Document
from pptx import Presentation
from nlp import get_average_similarity_scores
import numpy as np
import plotly.graph_objects as go
import os
import tempfile
import shutil
# Langchain document loaders
from langchain.document_loaders import PyPDFLoader #for pdf files
from langchain.document_loaders import TextLoader #for text files
from langchain.document_loaders import Docx2txtLoader #for docx files
from langchain.document_loaders import UnstructuredPowerPointLoader #for pptx files
from constants import StreamlitException
from PyPDF2.errors import PdfReadError
from zipfile import BadZipFile
def load_file(st, uploaded_file):
    """Save an uploaded document to disk and extract its raw text.

    Args:
        st: Not used by this function; kept so existing callers keep working.
        uploaded_file: The output of ``st.sidebar.file_uploader``. Its
            ``type`` (MIME type), ``name``, ``read()`` and ``seek()`` are used.

    Returns:
        On success, a ``(resume_text_raw, lang_loader)`` tuple: the text
        extracted from the upload and a langchain loader constructed on the
        copy saved under ``downloaded_files/``.
        On failure, a ``StreamlitException`` instance is *returned* (not
        raised) for an unsupported MIME type or for content that fails with
        ``PdfReadError`` / ``BadZipFile`` while being parsed.
    """
    file_type = uploaded_file.type

    # Pick the extractor/loader pair first so an unsupported upload is
    # rejected before anything is written to disk.
    if file_type == "application/pdf":
        extract_text, loader_cls = extract_pdf_text, PyPDFLoader
    elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        extract_text, loader_cls = extract_word_text, Docx2txtLoader
    elif file_type in (
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ):
        extract_text, loader_cls = extract_ppt_text, UnstructuredPowerPointLoader
    else:
        return StreamlitException("**Error**: Invalid file format. Please upload a Word, PDF, or PowerPoint file.")

    os.makedirs('downloaded_files', exist_ok=True)

    # The file name is supplied by the client: keep only its last path
    # component (for both "/" and "\\" separators) so the copy cannot be
    # written outside of downloaded_files/.
    safe_name = os.path.basename(uploaded_file.name.replace('\\', '/'))
    download_path = os.path.join('downloaded_files', safe_name)

    # Write the upload straight to its destination instead of going through
    # a NamedTemporaryFile(delete=False) that was never cleaned up.
    # Rewind first so the whole content is saved whatever the current
    # stream position is.
    uploaded_file.seek(0)
    with open(download_path, 'wb') as saved_file:
        saved_file.write(uploaded_file.read())
    # read() left the stream at EOF; rewind again before parsing it.
    uploaded_file.seek(0)

    try:
        resume_text_raw = extract_text(uploaded_file)
        lang_loader = loader_cls(download_path)
    except (PdfReadError, BadZipFile):
        return StreamlitException("**Error**: Invalid file content. Please upload a valid Word, PDF, or PowerPoint file.")
    return resume_text_raw, lang_loader
# Function to extract text from a PDF file
def extract_pdf_text(file):
    """Return the text of every page of a PDF as one string.

    The extracted text of each page is split on newlines; every resulting
    line is stripped of surrounding whitespace and terminated with a period
    followed by a newline.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

    Returns:
        The concatenation of all processed lines, in page order.
    """
    reader = PyPDF2.PdfReader(file)
    pieces = []
    for pdf_page in reader.pages:
        for raw_line in pdf_page.extract_text().split('\n'):
            pieces.append(raw_line.strip() + ".\n")
    return "".join(pieces)
# Function to extract text from a Word file
def _docx_table_to_text(table):
    """Render one table: cells followed by a tab, rows followed by a newline."""
    return "".join(
        "".join(cell.text + '\t' for cell in row.cells) + '\n'
        for row in table.rows
    )


def extract_word_text(file):
    """Return the text of a Word document's paragraphs and tables.

    Paragraphs and tables are interleaved by position: the i-th table is
    emitted right after the i-th paragraph. This is an index-based pairing,
    not the true order of elements in the document. Each paragraph is
    terminated with a period and a newline.

    Tables left over once the paragraphs run out are appended at the end
    instead of being dropped, so a document with more tables than paragraphs
    keeps all of its table text.

    Args:
        file: Whatever ``Document`` accepts as its source.

    Returns:
        The extracted text as a single string.
    """
    doc = Document(file)
    remaining_tables = iter(doc.tables)
    chunks = []

    for paragraph in doc.paragraphs:
        chunks.append(paragraph.text + '.\n')
        table = next(remaining_tables, None)
        if table is not None:
            chunks.append(_docx_table_to_text(table))

    # Previously the loop stopped with the last paragraph, silently losing
    # any tables that had not been consumed yet.
    for table in remaining_tables:
        chunks.append(_docx_table_to_text(table))

    return "".join(chunks)
# Function to extract text from a PowerPoint file
def extract_ppt_text(file):
    """Return the text of every text-bearing shape in a PowerPoint file.

    Shapes are visited slide by slide, in the order the presentation exposes
    them; shapes without a text frame are skipped. The texts are separated
    by a newline so that the last word of one shape is no longer fused with
    the first word of the next one.

    Args:
        file: Whatever ``Presentation`` accepts as its source.

    Returns:
        The shape texts joined with newlines ("" when there is no text shape).
    """
    prs = Presentation(file)
    shape_texts = [
        shape.text_frame.text
        for slide in prs.slides
        for shape in slide.shapes
        if shape.has_text_frame
    ]
    return "\n".join(shape_texts)
# Function to plot the average similarity score for each job description phrase
def plot_similarity_scores(job_description_phrases, resume_phrases):
    """Build a horizontal bar chart of the ten best-scoring job phrases.

    The average similarity score of each job description phrase is obtained
    from ``get_average_similarity_scores``; the ten highest scores are drawn
    as horizontal bars, best first, labelled with their phrase.

    Args:
        job_description_phrases: Phrases taken from the job description.
        resume_phrases: Phrases taken from the resume.

    Returns:
        The plotly ``Figure`` holding the chart.
    """

    def _tick_label(phrase):
        # Long phrases are cut to 100 characters and marked with an
        # ellipsis; shorter ones are padded to a width of 75.
        if len(phrase) > 100:
            return phrase[:100].ljust(100) + '...'
        return phrase.ljust(75)

    avg_similarity_scores = get_average_similarity_scores(job_description_phrases, resume_phrases)

    # (phrase index, score) pairs, highest score first, top ten only.
    ranked = sorted(enumerate(avg_similarity_scores), key=lambda pair: pair[1], reverse=True)[:10]
    indices = [idx for idx, _ in ranked]
    top_scores = [score for _, score in ranked]
    y_pos = list(range(len(indices)))

    labels = [_tick_label(s) for s in np.array(job_description_phrases)[indices]]
    x_ticks = np.round(np.arange(0, 1.2, 0.2), 2)

    fig = go.Figure()
    fig.add_trace(go.Bar(y=y_pos, x=top_scores, orientation='h'))

    y_axis = {
        "tickmode": "array",
        "tickvals": y_pos,
        "ticktext": labels,
        "tickfont": {"size": 14},
        "autorange": "reversed",
        "side": "right",
        "automargin": True,
    }
    x_axis = {
        "tickmode": "array",
        "tickvals": x_ticks,
        "ticktext": x_ticks,
        "tickfont": {"size": 14},
        "range": [0, 1.05],
    }
    fig.update_layout(
        yaxis=y_axis,
        xaxis=x_axis,
        showlegend=False,
        margin={"t": 0},
    )
    fig.update_xaxes(title="Average Similarity Score", title_font={"size": 14})
    return fig