Spaces:

Prernas19
/

Cv_parser

Sleeping

App Files Files Community

Cv_parser / src /streamlit_app.py

Prernas19

Update src/streamlit_app.py

1ca78ca verified 2 months ago

raw

history blame contribute delete

8.73 kB

	import streamlit as st
	import google.generativeai as genai
	from PIL import Image
	import fitz # PyMuPDF
	from docx import Document
	import json
	from pathlib import Path
	from datetime import datetime
	import re
	import pytesseract
	import io

	def extract_text_from_pdf(pdf_file):
	    """Extract the plain text of every page of an uploaded PDF.

	    Args:
	        pdf_file: Binary file-like object exposing ``read()`` (for example
	            a Streamlit upload) that contains the PDF bytes.

	    Returns:
	        The text of all pages joined with newlines, or an empty string if
	        extraction fails. Failures are reported to the UI via ``st.error``
	        rather than raised.
	    """
	    try:
	        pdf_bytes = pdf_file.read()
	        # Open the document as a context manager so it is closed even when
	        # reading one of the pages raises.
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	            return "\n".join(page.get_text() for page in doc)
	    except Exception as e:
	        # Best-effort extraction: surface the problem to the user and let
	        # the caller carry on with an empty string.
	        st.error(f"Error in PDF extraction: {str(e)}")
	        return ""

	def extract_text_from_docx(docx_file):
	    """Return the newline-joined text of every paragraph in a DOCX upload.

	    Any failure while opening or reading the document is shown to the user
	    with ``st.error`` and an empty string is returned instead.
	    """
	    try:
	        paragraphs = Document(docx_file).paragraphs
	        return "\n".join([para.text for para in paragraphs])
	    except Exception as exc:
	        st.error(f"Error in DOCX extraction: {str(exc)}")
	        return ""

	def parse_date(date_str):
	    """Parse a resume-style date string into a ``datetime``.

	    Recognises open-ended markers such as "Present", a fixed list of
	    year/month formats, and, as a last resort, the first four-digit year
	    (1900-2099) found anywhere in the text.

	    Args:
	        date_str: Text such as "Jan 2020", "03/2019", "2018" or "Present".

	    Returns:
	        A ``datetime`` (the current time for open-ended markers, otherwise
	        the first day of the parsed month or year), or ``None`` when the
	        input is not a string or nothing date-like can be found.
	    """
	    if not isinstance(date_str, str):
	        return None

	    # Strip first so padded values like " Present " are still recognised.
	    date_str = date_str.strip()
	    if date_str.lower() in ('present', 'current', 'now', 'today', 'ongoing', 'till date'):
	        return datetime.now()

	    formats = (
	        '%Y', '%b %Y', '%B %Y', '%m/%Y', '%m-%Y',
	        '%Y/%m', '%Y-%m',
	    )
	    for fmt in formats:
	        try:
	            return datetime.strptime(date_str, fmt)
	        except ValueError:
	            continue

	    # Fallback for free-form text ("Summer 1998", "Sept 2020"): keep only
	    # the year. Both 19xx and 20xx are accepted so older jobs still count.
	    year_match = re.search(r'\b(?:19|20)\d{2}\b', date_str)
	    if year_match:
	        return datetime.strptime(year_match.group(), '%Y')

	    return None

	def calculate_experience(work_history):
	    """Sum the years of experience across the jobs in *work_history*.

	    Each job is expected to be a dict whose ``duration`` value is a range
	    such as "Jan 2020 - Present", "2018-2020" or "Mar 2019 to Jun 2021".
	    Both ends are parsed with this module's ``parse_date``. Entries that
	    are malformed, or that do not split into exactly two parseable dates,
	    are skipped. Overlapping jobs are simply added together.

	    Args:
	        work_history: Iterable of job dicts; ``None`` is treated as empty.

	    Returns:
	        Total experience in years, rounded to one decimal place.
	    """
	    total_experience = 0

	    for job in work_history or []:
	        # The history comes from model-generated JSON, so tolerate entries
	        # that are not dicts or whose duration is not text.
	        if not isinstance(job, dict):
	            continue
	        duration = job.get('duration', '')
	        if not duration or not isinstance(duration, str):
	            continue
	        duration = duration.strip()

	        # Prefer a separator surrounded by whitespace (hyphen, en/em dash
	        # or the word "to") so dashed dates such as "01-2020 - 03-2021"
	        # are not split in the middle of a date.
	        parts = re.split(r'\s+(?:[-\u2013\u2014]|to)\s+', duration, flags=re.IGNORECASE)
	        if len(parts) != 2:
	            # Compact ranges such as "2018-2020" have no spaces at all.
	            parts = re.split(r'\s*[-\u2013\u2014]\s*', duration)
	        if len(parts) != 2:
	            continue

	        start_date = parse_date(parts[0])
	        end_date = parse_date(parts[1])

	        if start_date and end_date:
	            years = (end_date.year - start_date.year) + \
	                (end_date.month - start_date.month) / 12
	            # Reversed ranges contribute nothing instead of subtracting.
	            total_experience += max(0, years)

	    return round(total_experience, 1)

	def parse_resume(file_uploaded, api_key):
	    """Extract structured information from an uploaded resume via Gemini.

	    The file's text is obtained according to its extension (PDF, DOCX/DOC,
	    or OCR for JPG/JPEG/PNG images) and sent to the model together with a
	    prompt that asks for a fixed JSON schema. The JSON object found in the
	    reply is decoded and a ``total_years_experience`` entry, computed by
	    ``calculate_experience``, is added to it.

	    Args:
	        file_uploaded: Uploaded file object with a ``name`` attribute and
	            file-like read access.
	        api_key: Gemini API key used to configure the client.

	    Returns:
	        The parsed result as a dict, or ``None`` when the format is
	        unsupported, no text could be extracted, or the model call or JSON
	        decoding fails. Every failure is reported through ``st.error``.
	    """
	    genai.configure(api_key=api_key)
	    model = genai.GenerativeModel('gemini-1.5-flash')

	    prompt = """Extract the following information from this resume:
	1. Summarize the following resume in 100 words, focusing on key skills, experience, and qualifications
	2. Full Name
	3. Email Address
	4. Phone Number
	5. Education History (including degree, institution, graduation year, and field of study)
	6. Companies worked at with positions and EXACT duration (e.g., "Jan 2020 - Present" or "2018-2020")
	7. Skills
	8. LinkedIn Profile URL
	Return the information in this JSON format:
	{
	"summary": "",
	"name": "",
	"email": "",
	"phone": "",
	"education": [
	{
	"degree": "",
	"institution": "",
	"year": "",
	"field": "",
	"gpa": ""
	}
	],
	"work_experience": [
	{
	"company": "",
	"position": "",
	"duration": ""
	}
	],
	"skills": [],
	"linkedin": ""
	}
	For skills include tools and technologies in output if present any in resume.
	For work experience durations, please specify exact dates in format: "MMM YYYY - MMM YYYY" or "YYYY - Present" , please return in one order either in ascending or descending.
	Only return the JSON object, nothing else. If any field is not found, leave it empty."""

	    try:
	        file_extension = Path(file_uploaded.name).suffix.lower()

	        if file_extension == '.pdf':
	            text_content = extract_text_from_pdf(file_uploaded)
	        elif file_extension in ['.docx', '.doc']:
	            text_content = extract_text_from_docx(file_uploaded)
	        elif file_extension in ['.jpg', '.jpeg', '.png']:
	            image = Image.open(file_uploaded)
	            text_content = pytesseract.image_to_string(image)
	        else:
	            st.error(f"Unsupported file format: {file_extension}")
	            return None

	        # The extractors return "" on failure, and OCR can yield only
	        # whitespace. Stop here rather than spend an API call on an empty
	        # resume and show a result that has no document behind it.
	        if not text_content or not text_content.strip():
	            st.error("No text could be extracted from the uploaded file.")
	            return None

	        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")

	        # The reply may wrap the JSON in extra text (e.g. code fences), so
	        # keep only the span from the first '{' to the last '}'.
	        response_text = response.text
	        json_start = response_text.find('{')
	        json_end = response_text.rfind('}') + 1
	        json_str = response_text[json_start:json_end]

	        result = json.loads(json_str)
	        total_exp = calculate_experience(result.get('work_experience', []))
	        result['total_years_experience'] = total_exp

	        return result
	    except json.JSONDecodeError as e:
	        st.error(f"Error parsing response: {str(e)}")
	        return None
	    except Exception as e:
	        # Top-level boundary for the UI: report anything else (network,
	        # OCR, blocked response, ...) instead of crashing the app.
	        st.error(f"Error processing resume: {str(e)}")
	        return None

	def format_education(edu):
	    """Format one education entry as a single display line.

	    Args:
	        edu: Dict that may contain ``degree``, ``field``, ``institution``,
	            ``year`` and ``gpa``. Missing or empty values are skipped.
	            Values may be strings or numbers.

	    Returns:
	        A string such as "B.Tech in CS from MIT (2020) - GPA: 3.9", or an
	        empty string when no field is present.
	    """
	    # (key, template) pairs in display order. str.format also converts
	    # non-string values (e.g. a numeric year) so the final join cannot fail.
	    templates = (
	        ('degree', '{}'),
	        ('field', 'in {}'),
	        ('institution', 'from {}'),
	        ('year', '({})'),
	    )
	    parts = [template.format(edu[key]) for key, template in templates if edu.get(key)]

	    gpa = edu.get('gpa')
	    # GPA may arrive as a JSON number; convert to text before the
	    # whitespace check instead of calling .strip() on a float.
	    if gpa and str(gpa).strip():
	        parts.append(f"- GPA: {gpa}")

	    return " ".join(parts)

	def main():
	    """Render the resume-parser page.

	    Reads the Gemini API key from ``st.secrets`` when present (otherwise
	    from a password input), accepts a resume upload, runs ``parse_resume``
	    on it and displays the extracted fields. Nothing is rendered below
	    the uploader until both a file and a key are available and parsing
	    has returned a result.
	    """
	    st.title("Resume Parser")
	    st.write("Upload a resume (PDF, DOCX, or Image) to extract information")

	    # Get API key from secrets or user input
	    if "GEMINI_API_KEY" in st.secrets:
	        api_key = st.secrets["GEMINI_API_KEY"]
	    else:
	        api_key = st.text_input("Enter Gemini API Key", type="password")

	    uploaded_file = st.file_uploader("Choose a resume file", type=["pdf", "docx", "doc", "jpg", "jpeg", "png"])

	    if not (uploaded_file and api_key):
	        return

	    with st.spinner('Analyzing resume...'):
	        result = parse_resume(uploaded_file, api_key)

	    if not result:
	        return

	    st.subheader("Extracted Information")

	    # The prompt asks the model to leave missing fields empty, so a plain
	    # dict default would never apply; `or` also covers "" and None.

	    # Display summary in a text area
	    st.text_area("Summary", result.get('summary') or 'Not found', height=100)

	    # Display personal information
	    col1, col2, col3 = st.columns(3)
	    with col1:
	        st.write("Name:", result.get('name') or 'Not found')
	    with col2:
	        st.write("Email:", result.get('email') or 'Not found')
	    with col3:
	        st.write("Phone:", result.get('phone') or 'Not found')

	    # Display total experience (in months when under one year)
	    total_exp = result.get('total_years_experience') or 0
	    exp_text = f"{total_exp:.1f} years" if total_exp >= 1 else f"{total_exp * 12:.0f} months"
	    st.write("Total Experience:", exp_text)

	    # Display education
	    st.subheader("Education")
	    if result.get('education'):
	        for edu in result['education']:
	            st.write(f"- {format_education(edu)}")
	    else:
	        st.write("No education information found")

	    # Display work experience
	    st.subheader("Work Experience")
	    if result.get('work_experience'):
	        for exp in result['work_experience']:
	            position = exp.get('position') or 'Role not found'
	            company = exp.get('company') or 'Company not found'
	            duration = f" ({exp['duration']})" if exp.get('duration') else ""
	            st.write(f"- {position} at {company}{duration}")
	    else:
	        st.write("No work experience found")

	    # Display Skills
	    st.subheader("Skills:")
	    if result.get('skills'):
	        for skill in result['skills']:
	            st.write(f"- {skill}")
	    else:
	        st.write("- No skills found")

	    # Display LinkedIn profile
	    st.write("LinkedIn Profile:", result.get('linkedin') or 'Not found')

	# Script entry point: build the page only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()