Spaces:

cm0805
/

ResuMate_NVIDIA

Sleeping

File size: 4,771 Bytes

0a510f9

import PyPDF2
from docx import Document
from pptx import Presentation
from nlp import get_average_similarity_scores
import numpy as np
import plotly.graph_objects as go
import os
import tempfile
import shutil

# Langchain document loaders
from langchain.document_loaders import PyPDFLoader #for pdf files
from langchain.document_loaders import TextLoader  #for text files
from langchain.document_loaders import Docx2txtLoader  #for docx files
from langchain.document_loaders import UnstructuredPowerPointLoader  #for pptx files

from constants import StreamlitException
from PyPDF2.errors import PdfReadError
from zipfile import BadZipFile

def load_file(st, uploaded_file):
    """Save an uploaded resume to disk and extract its raw text.

    The upload is written under ``downloaded_files/`` so that a path-based
    LangChain loader can be created for it, and its text is extracted with
    the ``extract_*_text`` helper matching the upload's MIME type.

    Args:
        st: Not used by this function; kept so existing callers keep working.
        uploaded_file: The output of ``st.sidebar.file_uploader``. Its
            ``type`` and ``name`` attributes and its ``read()``/``seek()``
            methods are used.

    Returns:
        A ``(resume_text_raw, lang_loader)`` tuple on success. For an
        unsupported MIME type, or when the content cannot be parsed
        (``PdfReadError`` / ``BadZipFile``), a ``StreamlitException``
        instance is *returned* rather than raised.
    """
    file_type = uploaded_file.type
    os.makedirs('downloaded_files', exist_ok=True)
    # Keep only the final path component so a crafted upload name cannot
    # point outside the download directory.
    safe_name = os.path.basename(uploaded_file.name)
    download_path = os.path.join('downloaded_files', safe_name)
    # Write straight to the destination. The previous implementation went
    # through a NamedTemporaryFile(delete=False) that was never removed, so
    # every upload leaked one file in the system temp directory.
    with open(download_path, 'wb') as saved_file:
        saved_file.write(uploaded_file.read())
    # read() left the stream at EOF; rewind so the extractors below see the
    # upload from its first byte.
    uploaded_file.seek(0)
    try:
        if file_type == "application/pdf":
            resume_text_raw = extract_pdf_text(uploaded_file)
            lang_loader = PyPDFLoader(download_path)
        elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            resume_text_raw = extract_word_text(uploaded_file)
            lang_loader = Docx2txtLoader(download_path)
        elif file_type == "application/vnd.ms-powerpoint" or file_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
            resume_text_raw = extract_ppt_text(uploaded_file)
            lang_loader = UnstructuredPowerPointLoader(download_path)
        else:
            return StreamlitException("**Error**: Invalid file format. Please upload a Word, PDF, or PowerPoint file.")
    except (PdfReadError, BadZipFile):
        return StreamlitException("**Error**: Invalid file content. Please upload a valid Word, PDF, or PowerPoint file.")

    return resume_text_raw, lang_loader


# Function to extract text from a PDF file
def extract_pdf_text(file):
    """Return the text of every page of a PDF, one stripped line per row.

    Each page's extracted text is split on newlines; every resulting line is
    stripped of surrounding whitespace and terminated with a period and a
    newline.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

    Returns:
        The concatenated text as a single string.
    """
    reader = PyPDF2.PdfReader(file)
    pieces = []
    for page in reader.pages:
        page_lines = page.extract_text().split('\n')
        pieces.extend(raw_line.strip() + ".\n" for raw_line in page_lines)
    return "".join(pieces)
    

# Function to extract text from a Word file
def extract_word_text(file):
    """Return the text of a Word document's paragraphs and tables.

    Paragraphs and tables are emitted alternately (first paragraph, first
    table, second paragraph, second table, ...). Each paragraph is followed
    by a period and a newline; each table cell by a tab and each table row
    by a newline. When the document has more tables than paragraphs, the
    remaining tables are appended after the last paragraph instead of being
    dropped.

    Args:
        file: Whatever ``docx.Document`` accepts as its source.

    Returns:
        The concatenated text as a single string.
    """
    doc = Document(file)
    parts = []
    table_iter = iter(doc.tables)
    for paragraph in doc.paragraphs:
        parts.append(paragraph.text + '.\n')
        # Pair each paragraph with the next unread table, if any is left.
        table = next(table_iter, None)
        if table is not None:
            parts.append(_docx_table_text(table))
    # Tables outnumbering the paragraphs used to be lost because the loop
    # stopped with the paragraphs; flush whatever is still unread.
    parts.extend(_docx_table_text(table) for table in table_iter)
    return ''.join(parts)


def _docx_table_text(table):
    """Render one table as tab-terminated cells, one row per line."""
    rendered_rows = []
    for row in table.rows:
        rendered_rows.append(''.join(cell.text + '\t' for cell in row.cells) + '\n')
    return ''.join(rendered_rows)


# Function to extract text from a PowerPoint file
def extract_ppt_text(file):
    """Return the text of every text-bearing shape in a presentation.

    Slides and their shapes are visited in order; shapes without a text
    frame are skipped. Each shape's text is terminated with a newline so
    that the last word of one shape does not fuse with the first word of
    the next.

    Args:
        file: Whatever ``pptx.Presentation`` accepts as its source.

    Returns:
        The concatenated text as a single string (empty when no shape has a
        text frame).
    """
    prs = Presentation(file)
    parts = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                # Newline-terminate each shape; plain concatenation used to
                # produce run-together words such as "TitleBody".
                parts.append(shape.text_frame.text + "\n")
    return "".join(parts)

# Function to plot the average similarity score for each job description phrase
def plot_similarity_scores(job_description_phrases, resume_phrases):
    """Build a horizontal bar chart of the best-scoring job description phrases.

    ``get_average_similarity_scores`` yields one score per job description
    phrase; the ten highest scores are plotted in descending order, each bar
    labelled with its (truncated or padded) phrase.

    Args:
        job_description_phrases: Phrases from the job description; used both
            for scoring and as bar labels.
        resume_phrases: Phrases from the resume, passed through to
            ``get_average_similarity_scores``.

    Returns:
        The configured ``plotly.graph_objects.Figure``.
    """
    def _tick_label(phrase):
        # Phrases over 100 characters are cut and given an ellipsis; shorter
        # ones are right-padded to 75 characters.
        if len(phrase) > 100:
            return phrase[:100] + '...'
        return phrase.ljust(75)

    scores = get_average_similarity_scores(job_description_phrases, resume_phrases)
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    top_ten = ranked[:10]
    # Must stay a list: it is used below as a NumPy fancy index.
    indices = [position for position, _ in top_ten]
    top_scores = [score for _, score in top_ten]

    bar_slots = list(range(len(indices)))
    selected_phrases = np.array(job_description_phrases)[indices]
    x_ticks = np.round(np.arange(0, 1.2, 0.2), 2)

    y_axis = {
        "tickmode": "array",
        "tickvals": bar_slots,
        "ticktext": [_tick_label(phrase) for phrase in selected_phrases],
        "tickfont": {"size": 14},
        # Reversed axis so slot 0 (the highest score) is drawn first.
        "autorange": "reversed",
        "side": "right",
        "automargin": True,
    }
    x_axis = {
        "tickmode": "array",
        "tickvals": x_ticks,
        "ticktext": x_ticks,
        "tickfont": {"size": 14},
        "range": [0, 1.05],
    }

    fig = go.Figure()
    fig.add_trace(go.Bar(x=top_scores, y=bar_slots, orientation='h'))
    fig.update_layout(
        yaxis=y_axis,
        xaxis=x_axis,
        showlegend=False,
        margin={"t": 0},
    )
    fig.update_xaxes(title="Average Similarity Score", title_font={"size": 14})

    return fig