File size: 4,771 Bytes
0a510f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import PyPDF2
from docx import Document
from pptx import Presentation
from nlp import get_average_similarity_scores
import numpy as np
import plotly.graph_objects as go
import os
import tempfile
import shutil

# Langchain document loaders
from langchain.document_loaders import PyPDFLoader #for pdf files
from langchain.document_loaders import TextLoader  #for text files
from langchain.document_loaders import Docx2txtLoader  #for docx files
from langchain.document_loaders import UnstructuredPowerPointLoader  #for pptx files

from constants import StreamlitException
from PyPDF2.errors import PdfReadError
from zipfile import BadZipFile

def load_file(st, uploaded_file):
    """Save an uploaded document to disk and extract its raw text.

    The upload is written to ``downloaded_files/<name>`` so that a LangChain
    loader can be pointed at a real path, and the text is extracted from the
    in-memory upload with the ``extract_*`` helper matching its MIME type.

    Args:
        st: Not used by this function; kept so existing callers keep working.
        uploaded_file: The output of ``st.sidebar.file_uploader``. Its
            ``type``, ``name`` and ``read()`` members are used.

    Returns:
        A ``(resume_text_raw, lang_loader)`` tuple on success. On an
        unsupported MIME type or unreadable content a ``StreamlitException``
        instance is *returned* (not raised), so callers must check the
        result before unpacking it.
    """
    file_type = uploaded_file.type

    os.makedirs('downloaded_files', exist_ok=True)
    # Keep only the final path component so a crafted upload name cannot
    # point outside the download directory.
    download_path = os.path.join(
        'downloaded_files', os.path.basename(uploaded_file.name)
    )

    # Write straight to the destination: going through a
    # NamedTemporaryFile(delete=False) left an orphaned temp file behind on
    # every call.
    with open(download_path, 'wb') as saved_file:
        saved_file.write(uploaded_file.read())
    # Rewind so the extractors below start from the beginning of the stream.
    uploaded_file.seek(0)

    try:
        if file_type == "application/pdf":
            resume_text_raw = extract_pdf_text(uploaded_file)
            lang_loader = PyPDFLoader(download_path)
        elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            resume_text_raw = extract_word_text(uploaded_file)
            lang_loader = Docx2txtLoader(download_path)
        elif file_type in (
            "application/vnd.ms-powerpoint",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ):
            resume_text_raw = extract_ppt_text(uploaded_file)
            lang_loader = UnstructuredPowerPointLoader(download_path)
        else:
            return StreamlitException("**Error**: Invalid file format. Please upload a Word, PDF, or PowerPoint file.")
    except (PdfReadError, BadZipFile):
        return StreamlitException("**Error**: Invalid file content. Please upload a valid Word, PDF, or PowerPoint file.")

    return resume_text_raw, lang_loader


# Function to extract text from a PDF file
def extract_pdf_text(file):
    """Return the text of every page of a PDF as one string.

    Each page's extracted text is split on newlines; every resulting line is
    stripped of surrounding whitespace and emitted followed by ``".\\n"``.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

    Returns:
        The concatenation of all processed lines, in page order.
    """
    reader = PyPDF2.PdfReader(file)
    return "".join(
        raw_line.strip() + ".\n"
        for page in reader.pages
        for raw_line in page.extract_text().split('\n')
    )
    

# Function to extract text from a Word file
def extract_word_text(file):
    """Return the paragraph and table text of a Word document as one string.

    Paragraphs and tables are emitted alternately (first paragraph, first
    table, second paragraph, second table, ...). This pairing is positional
    only and does not reflect where each table sits in the document.
    Whichever sequence is longer continues on its own once the other runs
    out, so no paragraph or table is skipped.

    Each paragraph is followed by ``".\\n"``. Each table row is written as
    its cell texts, every cell followed by a tab, and the row is terminated
    by a newline.

    Args:
        file: Whatever ``docx.Document`` accepts as its source.

    Returns:
        The extracted text.
    """
    from itertools import zip_longest

    doc = Document(file)
    parts = []
    # zip_longest pads the shorter sequence with None. The earlier loop
    # stopped as soon as paragraphs ran out, silently dropping any tables
    # beyond the paragraph count.
    for paragraph, table in zip_longest(doc.paragraphs, doc.tables):
        if paragraph is not None:
            parts.append(paragraph.text + '.\n')
        if table is not None:
            for row in table.rows:
                parts.extend(cell.text + '\t' for cell in row.cells)
                parts.append('\n')
    return ''.join(parts)


# Function to extract text from a PowerPoint file
def extract_ppt_text(file):
    """Return the text of every text-bearing shape in a presentation.

    Slides and their shapes are visited in the order the presentation
    object yields them. Shapes without a text frame, and text frames whose
    text is empty, contribute nothing. Each remaining shape's text is
    followed by ``".\\n"`` so that text from adjacent shapes does not run
    together; this matches the line terminator used by the PDF and Word
    extractors in this module.

    Args:
        file: Whatever ``pptx.Presentation`` accepts as its source.

    Returns:
        The extracted text.
    """
    prs = Presentation(file)
    chunks = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            shape_text = shape.text_frame.text
            if shape_text:
                # Without a terminator, the last word of one shape fused
                # with the first word of the next.
                chunks.append(shape_text + ".\n")
    return "".join(chunks)

# Function to plot the average similarity score for each job description phrase
def plot_similarity_scores(job_description_phrases, resume_phrases):
    """Build a horizontal bar chart of the top-scoring job-description phrases.

    The average similarity scores come from
    ``get_average_similarity_scores``; the ten highest (fewer if there are
    fewer phrases) are plotted in descending order of score, one bar per
    phrase, with the phrase text used as the y-axis tick label.

    Args:
        job_description_phrases: Sequence of phrases; indexed by position to
            build the tick labels.
        resume_phrases: Passed through to ``get_average_similarity_scores``.

    Returns:
        The configured ``plotly.graph_objects.Figure``.
    """
    def _tick_label(phrase):
        # Phrases over 100 characters are cut at 100 and given an ellipsis;
        # shorter ones are left-justified in a 75-character field.
        if len(phrase) > 100:
            return phrase[:100].ljust(100) + '...'
        return phrase.ljust(75)

    averages = get_average_similarity_scores(job_description_phrases, resume_phrases)
    top_ten = sorted(enumerate(averages), key=lambda pair: pair[1], reverse=True)[:10]
    top_indices = [position for position, _ in top_ten]
    top_scores = [score for _, score in top_ten]

    # One integer slot per bar; the phrase text is attached as tick labels.
    bar_slots = list(range(len(top_indices)))
    phrase_labels = [
        _tick_label(phrase)
        for phrase in np.array(job_description_phrases)[top_indices]
    ]
    score_ticks = np.round(np.arange(0, 1.2, 0.2), 2)

    fig = go.Figure()
    fig.add_trace(go.Bar(y=bar_slots, x=top_scores, orientation='h'))

    y_axis = dict(
        tickmode="array",
        tickvals=bar_slots,
        ticktext=phrase_labels,
        tickfont=dict(size=14),
        autorange="reversed",
        side="right",
        automargin=True,
    )
    x_axis = dict(
        tickmode="array",
        tickvals=score_ticks,
        ticktext=score_ticks,
        tickfont=dict(size=14),
        range=[0, 1.05],
    )
    fig.update_layout(
        yaxis=y_axis,
        xaxis=x_axis,
        showlegend=False,
        margin=dict(t=0),
    )

    fig.update_xaxes(title="Average Similarity Score", title_font=dict(size=14))

    return fig