LeoWalker committed on
Commit
318df0b
1 Parent(s): c7004fc

working resume parser with streamlit front end, tested OpenAI and Anthropic.

Browse files
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import io
3
+ import streamlit as st
4
+ from langchain.prompts import PromptTemplate
5
+ from langchain.output_parsers import PydanticOutputParser
6
+ from langchain_community.chat_models import ChatAnthropic
7
+ from langchain_openai import ChatOpenAI
8
+ from pydantic import ValidationError
9
+ from resume_template import Resume
10
+ from json import JSONDecodeError
11
+ import PyPDF2
12
+ import json
13
+
14
+ load_dotenv()
15
+
16
def pdf_to_string(file):
    """
    Convert a PDF file to a string.

    Parameters:
    file (io.BytesIO): A file-like object representing the PDF file.
        It is closed before this function returns, including when
        reading or text extraction raises.

    Returns:
    str: The text of every page, concatenated in page order with no
        separator inserted between pages.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        # Build the result with a single join instead of repeated `+=`.
        # `or ''` keeps the join safe if a page yields no text (None/empty).
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
    finally:
        # The original closed the caller's file on success; keep that
        # contract, but also release it when PyPDF2 fails on a bad PDF.
        file.close()
34
+
35
def extract_resume_fields(full_text, model):
    """
    Analyze a resume text and extract structured information using a specified language model.

    Parameters:
    full_text (str): The text content of the resume.
    model (str): Name of the language model to use. A name present in the
        module-level ``llm_dict`` reuses that pre-built client; any other
        name is passed to a newly created ``ChatOpenAI`` client.

    Returns:
    str: The parsed resume serialized as a JSON string. If the model's reply
        cannot be parsed into a ``Resume``, the raw reply text is returned
        instead so the caller can still display it.
    """
    # The Resume object is imported from the local resume_template file
    with open("prompts/resume_extraction.prompt", "r", encoding="utf-8") as f:
        template = f.read()

    parser = PydanticOutputParser(pydantic_object=Resume)

    prompt_template = PromptTemplate(
        template=template,
        input_variables=["resume"],
        partial_variables={"response_template": parser.get_format_instructions()},
    )

    # Invoke the language model and process the resume
    formatted_input = prompt_template.format_prompt(resume=full_text)

    # Only build a fallback client when the name is not registered; passing it
    # as the `.get()` default would construct a throwaway client on every call.
    llm = llm_dict.get(model)
    if llm is None:
        llm = ChatOpenAI(temperature=0, model=model)
    output = llm.invoke(formatted_input.to_string())

    try:
        parsed_output = parser.parse(output.content)

    except ValidationError as e:
        print(f"Validation error: {e}")
        print(output)
        return output.content

    except JSONDecodeError as e:
        print(f"JSONDecodeError error: {e}")
        print(output)
        return output.content

    except ValueError as e:
        # LangChain's parser re-raises JSON/validation failures as
        # OutputParserException, which derives from ValueError; without this
        # handler the raw-output fallback above is never reached.
        print(f"Output parsing error: {e}")
        print(output)
        return output.content

    json_output = parsed_output.json()
    print(json_output)
    return json_output
81
+
82
# ---- Streamlit page -------------------------------------------------------
st.title("Resume Parser")

# Registry of ready-made chat clients, keyed by model name. Insertion order is
# the order shown in the dropdown: OpenAI models first, then Anthropic models.
llm_dict = {
    name: ChatOpenAI(temperature=0, model=name)
    for name in ("gpt-4-1106-preview", "gpt-4", "gpt-3.5-turbo-1106")
}
llm_dict.update(
    (name, ChatAnthropic(model=name, max_tokens=20_000))
    for name in ("claude-2", "claude-instant-1")
)

# Dropdown for choosing which registered model handles the resume.
selected_model = st.selectbox("Select a model", list(llm_dict))

# PDF upload widget; yields None until the user provides a file.
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# The button is only rendered once a file is present (short-circuit `and`),
# and the pipeline runs only on the rerun triggered by clicking it.
if uploaded_file is not None and st.button("Convert PDF to Text"):
    # PDF -> plain text -> structured fields -> rendered JSON.
    resume_text = pdf_to_string(uploaded_file)
    parsed_resume = extract_resume_fields(resume_text, selected_model)
    st.json(parsed_resume)
prompts/resume_extraction.prompt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Given the task of analyzing the resume provided below, identify and organize the following information into structured data:
2
+
3
+ - Personal Details: Full name, contact information, and any professional summary or objective.
4
+ - Education: List of educational institutions attended, degrees obtained, fields of study, and graduation dates.
5
+ - Work Experience: For each position held, extract the company name, job title, duration of employment, a brief description of the role, and notable contributions or responsibilities.
6
+ - Projects: Details of significant projects worked on, including the project name, description, technologies used, and the role in the project.
7
+ - Skills: A compilation of technical and soft skills listed.
8
+ - Certifications: Any professional certifications, the certifying body, and the date of certification.
9
+ - Publications: Titles of publications, co-authors if applicable, and date of publication.
10
+ - Awards: Titles of any awards or honors received, awarding bodies, and date of receipt.
11
+
12
+ For fields not explicitly mentioned by the user, be sure to check for common sections such as volunteer experience, languages spoken, and hobbies or interests if they are professionally relevant.
13
+
14
+ Throughout each field, make sure that you retain as much detail as possible; for example, notable contributions should not be summarized but rather listed in full detail, without adding any new material that isn't in the document.
15
+
16
+ Use the JSON structure below to format your output. If any section does not apply or information is not available, it should be omitted from the JSON object. Ensure the output is formatted properly with the correct data types for each field (e.g., arrays, strings, objects).
17
+
18
+ ----
19
+
20
+ {response_template}
21
+
22
+ ----
23
+
24
+ {resume}
25
+
26
+ ----
27
+
28
+ Ensure that the output is formatted properly with the correct data types for each field.
requirements.txt CHANGED
@@ -5,4 +5,5 @@ PyPDF2
5
 
6
  openai
7
  anthropic
8
- langchain
 
 
5
 
6
  openai
7
  anthropic
8
+ langchain
9
+ langchain-community
resume_template.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field, ValidationError
2
+ from typing import List, Optional, Dict
3
+
4
+ # The following classes are for the resume template
5
+
6
# Contact channels listed on a resume. Every field is optional and defaults
# to None when absent.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class ContactInfo(BaseModel):
    email: Optional[str] = None
    phone: Optional[str] = None
    linkedin: Optional[str] = None
10
+
11
# Identity section of a resume. `full_name` and `contact_info` are required;
# the professional summary may be omitted.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class PersonalDetails(BaseModel):
    full_name: str
    contact_info: ContactInfo
    professional_summary: Optional[str] = None
15
+
16
# One education entry. All fields are optional free-text strings, including
# the graduation date, which is kept as text rather than a date type.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class Education(BaseModel):
    institution: Optional[str] = None
    degree: Optional[str] = None
    field_of_study: Optional[str] = None
    graduation_date: Optional[str] = None
21
+
22
# One position held. All fields are optional; `notable_contributions` is a
# list of strings (one per contribution) and defaults to None, not [].
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class WorkExperience(BaseModel):
    company: Optional[str] = None
    title: Optional[str] = None
    duration: Optional[str] = None
    description: Optional[str] = None
    notable_contributions: Optional[List[str]] = None
28
+
29
# One project entry. All fields are optional strings; note that
# `technologies` is a single string, not a list.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class Project(BaseModel):
    name: Optional[str] = None
    description: Optional[str] = None
    technologies: Optional[str] = None
    role: Optional[str] = None
34
+
35
# One professional certification. All fields are optional strings; the date
# is kept as free text.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class Certification(BaseModel):
    title: Optional[str] = None
    certifying_body: Optional[str] = None
    date: Optional[str] = None
39
+
40
# One publication. `co_authors` is a list of strings that defaults to an
# empty list (not None); title and date are optional strings.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class Publication(BaseModel):
    title: Optional[str] = None
    co_authors: List[str] = []
    date: Optional[str] = None
44
+
45
# One award or honor. All fields are optional strings; the date is kept as
# free text.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class Award(BaseModel):
    title: Optional[str] = None
    awarding_body: Optional[str] = None
    date: Optional[str] = None
49
+
50
# One volunteer engagement. All fields are optional strings.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class VolunteerExperience(BaseModel):
    organization: Optional[str] = None
    role: Optional[str] = None
    duration: Optional[str] = None
    description: Optional[str] = None
55
+
56
# Catch-all for less common resume sections. Each field accepts a list or
# None and defaults to an empty list.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class AdditionalSections(BaseModel):
    volunteer_experience: Optional[List[VolunteerExperience]] = []
    languages: Optional[List[str]] = []
    interests: Optional[List[str]] = []
60
+
61
# Top-level resume document. Only `personal_details` is required; every list
# section defaults to an empty list and `additional_sections` defaults to None.
# (Described in a comment rather than a docstring so the JSON schema generated
# from this model is unchanged.)
class Resume(BaseModel):
    personal_details: PersonalDetails

    # Repeating sections, one list item per entry found.
    education: List[Education] = []
    work_experience: List[WorkExperience] = []
    projects: List[Project] = []
    skills: List[str] = []
    certifications: List[Certification] = []
    publications: List[Publication] = []
    awards: List[Award] = []

    # Volunteer work, languages and interests, when present.
    additional_sections: Optional[AdditionalSections] = None