Spaces:
Running
Running
import groq | |
from pdfextractor import extract_text_from_pdf | |
from models import Profile, SocialMedia, Project, Skill, Education | |
from typing import List, Dict, Any, Optional | |
from langchain.output_parsers import PydanticOutputParser | |
from langchain.prompts import PromptTemplate | |
from langchain_groq import ChatGroq | |
import json | |
from config import get_settings | |
# Load the application settings at import time; the classes below read their
# LLM configuration (API key, model name, temperatures, token limit) and the
# DEBUG flag from this object.
settings = get_settings()
class ProfileExtractor:
    """
    Class for extracting profile information from resume text.

    The primary path sends a single LangChain prompt asking the LLM for the
    whole profile as JSON. If anything in that path raises, a fallback path
    queries the Groq API once per field and parses the plain-text answers.
    """

    # Sentinel the prompts ask the model to answer with when a field is absent.
    _NOT_AVAILABLE = "N/A"

    def __init__(self):
        """Read the LLM configuration from ``settings`` and build the client."""
        self.groq_api_key = settings.GROQ_API_KEY
        self.model_name = settings.MODEL_NAME
        self.temperature = settings.TEMPERATURE
        self.max_tokens = settings.MAX_TOKENS
        self.llm = self._initialize_llm()

    def _initialize_llm(self) -> ChatGroq:
        """Initialize the language model client"""
        return ChatGroq(
            groq_api_key=self.groq_api_key,
            model_name=self.model_name,
            temperature=self.temperature,
            max_tokens=self.max_tokens
        )

    @classmethod
    def _is_missing(cls, value) -> bool:
        """Return True when *value* is None, blank, or the "N/A" sentinel."""
        if not value:
            return True
        text = str(value).strip()
        return not text or text.upper() == cls._NOT_AVAILABLE

    @staticmethod
    def _slice_between(text: str, opener: str, closer: str) -> Optional[str]:
        """
        Return the substring from the first *opener* to the last *closer*
        (inclusive), or None when no such span exists.

        Used to cut a JSON object/array out of an LLM answer that may wrap it
        in extra prose.
        """
        start = text.find(opener)
        end = text.rfind(closer) + 1
        if start >= 0 and end > start:
            return text[start:end]
        return None

    def extract_profile(self, pdf_text: str) -> Profile:
        """
        Main method to extract profile information from PDF text

        Args:
            pdf_text: Text extracted from a resume PDF

        Returns:
            Profile object with extracted information
        """
        try:
            return self._extract_with_langchain(pdf_text)
        except Exception as e:
            # Deliberately broad: any failure of the structured path (LLM
            # error, malformed JSON, model validation) triggers the fallback.
            if settings.DEBUG:
                print(f"LangChain extraction failed: {e}")
            return self._extract_with_fallback(pdf_text)

    def _extract_with_langchain(self, pdf_text: str) -> Profile:
        """
        Extract profile with structured LangChain approach

        Raises:
            ValueError: if the LLM response contains no ``{...}`` span.
            Other exceptions from the LLM call, ``json.loads`` or
            ``Profile.model_validate`` propagate to the caller.
        """
        # Define the format instructions for the LLM
        format_instructions = """
        Extract the following information from the resume:
        1. Full name
        2. Professional title
        3. Email address
        4. Bio (a 50-100 word professional summary)
        5. Tagline (a short 5-10 word catchy phrase summarizing professional identity)
        6. Social media links (LinkedIn, GitHub, Instagram)
        7. Projects (with title, description, and tech stack)
        8. Skills
        9. Education history (with school, degree, field of study, start date and end date)
        Return the information in the following JSON format:
        {
            "name": "Full Name",
            "title": "Professional Title",
            "email": "email@example.com",
            "bio": "Professional biography...",
            "tagline": "Catchy professional tagline",
            "social": {
                "linkedin": "LinkedIn URL or null",
                "github": "GitHub URL or null",
                "instagram": "Instagram URL or null"
            },
            "projects": [
                {
                    "title": "Project Title",
                    "description": "Project Description",
                    "techStack": "Technologies used"
                }
            ],
            "skills": [
                {"name": "Skill 1"},
                {"name": "Skill 2"}
            ],
            "educations": [
                {
                    "school": "University Name",
                    "degree": "Degree Type (e.g., Bachelor's, Master's)",
                    "fieldOfStudy": "Major or Field",
                    "startDate": "Start Year",
                    "endDate": "End Year or Present"
                }
            ]
        }
        If any information is not available, use null for that field.
        """
        # Create the prompt template
        template = """
        You are a professional resume parser. Extract structured information from the following resume:
        {pdf_text}
        {format_instructions}
        """
        # format_instructions is supplied as a partial variable so that the
        # literal JSON braces it contains are not parsed as template fields.
        prompt = PromptTemplate(
            template=template,
            input_variables=["pdf_text"],
            partial_variables={"format_instructions": format_instructions}
        )
        # Get the structured information from the LLM
        chain = prompt | self.llm
        result = chain.invoke({"pdf_text": pdf_text})
        # Extract JSON from the response text (in case the LLM adds extra text)
        json_str = self._slice_between(result.content or "", "{", "}")
        if json_str is None:
            raise ValueError("No JSON found in the response")
        # Create a Profile object from the dictionary
        profile = Profile.model_validate(json.loads(json_str))
        # Check for missing information and try to extract it if necessary
        return self._fill_missing_information(profile, pdf_text)

    def _fill_text_field(self, profile: Profile, pdf_text: str, field: str,
                         instruction: str, error_label: str,
                         truncate: bool = False, is_valid=None) -> None:
        """
        Ask the LLM for one text field and store it on *profile* if it is
        currently missing. Failures are swallowed (best effort) and only
        reported when ``settings.DEBUG`` is set.

        Args:
            profile: Profile being completed (mutated in place).
            pdf_text: Full resume text.
            field: Attribute name on *profile* to fill.
            instruction: Prompt prefix; the resume text is appended to it.
            error_label: Prefix of the debug message printed on failure.
            truncate: Send only the first ``settings.CHUNK_SIZE`` characters.
            is_valid: Optional predicate the answer must satisfy to be stored.
        """
        if not self._is_missing(getattr(profile, field)):
            return
        try:
            context = pdf_text[:settings.CHUNK_SIZE] if truncate else pdf_text
            response = self.llm.invoke(instruction + context)
            answer = (response.content or "").strip()
            if self._is_missing(answer):
                return
            if is_valid is not None and not is_valid(answer):
                return
            setattr(profile, field, answer)
        except Exception as e:
            if settings.DEBUG:
                print(f"{error_label}: {e}")

    def _fill_educations(self, profile: Profile, pdf_text: str) -> None:
        """
        Ask the LLM for the education history as a JSON list and append the
        parsed entries to ``profile.educations`` when it is empty or None.
        Best effort: any failure is swallowed and reported only in DEBUG.
        """
        if profile.educations:
            return
        try:
            education_prompt = "Extract education history from this resume. For each education entry, provide the school name, degree type, field of study, start date, and end date. Format the response as a list of JSON objects."
            response = self.llm.invoke(education_prompt + "\n\n" + pdf_text)
            education_text = (response.content or "").strip()
            # Try to extract JSON from the response
            edu_json = self._slice_between(education_text, "[", "]")
            if edu_json is None:
                return
            entries = json.loads(edu_json)
            # The primary prompt tells the model to use null for unavailable
            # fields, so educations may be None here; appending to None would
            # raise and silently discard every parsed entry.
            if profile.educations is None:
                profile.educations = []
            for edu in entries:
                if not isinstance(edu, dict):
                    continue  # ignore stray scalars in the list
                # `or` (not a .get default) so an explicit JSON null also
                # falls back to the placeholder value.
                profile.educations.append(Education(
                    school=edu.get("school") or "Unknown",
                    degree=edu.get("degree") or "",
                    fieldOfStudy=edu.get("fieldOfStudy") or "",
                    startDate=edu.get("startDate") or "",
                    endDate=edu.get("endDate") or ""
                ))
        except Exception as e:
            if settings.DEBUG:
                print(f"Error extracting education: {e}")

    def _fill_missing_information(self, profile: Profile, pdf_text: str) -> Profile:
        """
        Attempts to fill in any missing information in the profile

        Each missing field (name, title, email, bio, educations) costs one
        extra LLM call. Name and title only look at the first
        ``settings.CHUNK_SIZE`` characters; the other fields use the whole text.

        Returns:
            The same *profile* instance, updated in place.
        """
        self._fill_text_field(
            profile, pdf_text, "name",
            "Extract only the full name from this resume text. Respond with just the name: ",
            "Error extracting name", truncate=True,
        )
        self._fill_text_field(
            profile, pdf_text, "title",
            "Extract only the professional title from this resume text. Respond with just the title: ",
            "Error extracting title", truncate=True,
        )
        self._fill_text_field(
            profile, pdf_text, "email",
            "Extract only the email address from this resume text. Respond with just the email: ",
            "Error extracting email",
            is_valid=lambda answer: "@" in answer,  # cheap sanity check
        )
        self._fill_text_field(
            profile, pdf_text, "bio",
            "Create a short professional biography (around 50-100 words) based on this resume. Focus on skills and experience: ",
            "Error creating bio",
        )
        self._fill_educations(profile, pdf_text)
        return profile

    def _extract_with_fallback(self, pdf_text: str) -> Profile:
        """
        Fallback method for profile extraction using direct API calls

        Issues one Groq chat completion per field and parses the plain-text
        answers. A failed or empty answer is treated the same as "N/A".
        """
        client = groq.Groq(api_key=self.groq_api_key)

        def get_llm_response(prompt: str) -> str:
            """Helper function to get a stripped response from the LLM ("" on failure)."""
            try:
                chat_completion = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model=self.model_name,
                    temperature=settings.FALLBACK_TEMPERATURE,
                    max_tokens=self.max_tokens
                )
                # content may be None; normalise so callers always get a str
                return (chat_completion.choices[0].message.content or "").strip()
            except Exception as e:
                if settings.DEBUG:
                    print(f"Error during LLM call: {e}")
                return ""  # Return empty string on failure

        def ask(prompt: str) -> Optional[str]:
            """Return the LLM answer, or None when it is empty or "N/A"."""
            answer = get_llm_response(prompt)
            return None if self._is_missing(answer) else answer

        # Extract basic information
        name = ask(f"Extract the full name from the following text. If no name is present, respond with 'N/A'. Only respond with the name: {pdf_text}")
        title = ask(f"Extract the professional title from the following text. If no title is present, respond with 'N/A'. Only respond with the title: {pdf_text}")
        email = ask(f"Extract the email address from the following text. If no email is present, respond with 'N/A'. Only respond with the email: {pdf_text}")
        bio = ask(f"Create a short professional biography (around 50-100 words) based on the following text. Focus on skills and experience. If no bio is possible, respond with 'N/A'. Provide only the biography itself: {pdf_text}")
        tagline = ask(f"Create a short and catchy tagline (around 5-10 words) that summarizes the person's professional identity from the following text. If no tagline is possible, respond with 'N/A'. Provide only the tagline: {pdf_text}")
        # Extract social media
        linkedin = ask(f"Extract the LinkedIn profile URL from the following text. If no LinkedIn URL is present, respond with 'N/A'. Only respond with the LinkedIn URL: {pdf_text}")
        github = ask(f"Extract the GitHub profile URL from the following text. If no GitHub URL is present, respond with 'N/A'. Only respond with the GitHub URL: {pdf_text}")
        instagram = ask(f"Extract the Instagram profile URL from the following text. If no Instagram URL is present, respond with 'N/A'. Only respond with the Instagram URL: {pdf_text}")
        # Extract projects and skills
        project_info = ask(f"Extract information about projects from the following text in this format Project Title: Project Description: Tech Stack:. If no projects are present, respond with 'N/A': {pdf_text}")
        skills_info = ask(f"Extract a list of skills from the following text, separated by commas. If no skills are present, respond with 'N/A'. Only respond with the skills: {pdf_text}")
        # Extract education
        education_info = ask(f"Extract education history from the following resume. For each education entry, provide the school name, degree type, field of study, start date, and end date. Format as 'School: Degree: Field: StartDate: EndDate' with each education on a new line. If no education is found, respond with 'N/A': {pdf_text}")

        # Process the extracted information
        social_media = SocialMedia(
            linkedin=linkedin,
            github=github,
            instagram=instagram
        )

        # Process projects: each usable line looks like
        # "<title>: <description> Tech Stack: <stack>".
        projects = []
        for line in (project_info or "").split("\n"):
            if ":" not in line:
                continue
            try:
                project_title, remainder = line.split(":", 1)
                # Unpacking raises ValueError when "Tech Stack:" is absent,
                # which skips the line via the handler below.
                project_description, tech_stack = remainder.split("Tech Stack:", 1)
                projects.append(Project(
                    title=project_title.strip(),
                    description=project_description.strip(),
                    techStack=tech_stack.strip()
                ))
            except ValueError as e:
                if settings.DEBUG:
                    print(f"Error parsing project: {line}. Error: {e}")

        # Process skills (comma-separated; blank entries are dropped)
        skills = [
            Skill(name=skill_name)
            for skill_name in (part.strip() for part in (skills_info or "").split(","))
            if skill_name
        ]

        # Process education: "School: Degree: Field: StartDate: EndDate"
        educations = []
        for line in (education_info or "").split("\n"):
            if ":" not in line:
                continue
            try:
                parts = line.split(":")
                if len(parts) >= 5:
                    educations.append(Education(
                        school=parts[0].strip(),
                        degree=parts[1].strip(),
                        fieldOfStudy=parts[2].strip(),
                        startDate=parts[3].strip(),
                        endDate=parts[4].strip()
                    ))
            except Exception as e:
                if settings.DEBUG:
                    print(f"Error parsing education: {line}. Error: {e}")

        # Create the profile object. The four core text fields keep the
        # "N/A" placeholder when nothing usable was returned.
        has_social = social_media.github or social_media.instagram or social_media.linkedin
        profile = Profile(
            name=name or self._NOT_AVAILABLE,
            title=title or self._NOT_AVAILABLE,
            email=email or self._NOT_AVAILABLE,
            bio=bio or self._NOT_AVAILABLE,
            tagline=tagline,
            social=social_media if has_social else None,
            chatbot=None,
            profileImg=None,
            heroImg=None,
            projects=projects,
            skills=skills,
            educations=educations
        )
        return profile
class GrammarCorrector:
    """Class for correcting grammar in text using LLM"""

    def __init__(self):
        """Read the Groq credentials and grammar-correction settings."""
        self.groq_api_key = settings.GROQ_API_KEY
        self.model_name = settings.MODEL_NAME
        self.temperature = settings.GRAMMAR_CORRECTION_TEMPERATURE

    def correct_grammar(self, text: str) -> str:
        """
        Corrects grammar in user input using Groq's LLM.

        Args:
            text: The text to correct

        Returns:
            The corrected text. The input is returned unchanged when it is
            empty, when the API call fails, or when the model returns no
            content.
        """
        # Nothing to correct: skip the API call entirely.
        if not text:
            return text
        client = groq.Groq(api_key=self.groq_api_key)
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": f"Correct any grammar, spelling, or punctuation errors in the following text, but keep the meaning exactly the same: '{text}'"
                    }
                ],
                model=self.model_name,
                temperature=self.temperature,
                max_tokens=settings.MAX_TOKENS
            )
            corrected = chat_completion.choices[0].message.content
        except Exception as e:
            if settings.DEBUG:
                print(f"Error during grammar correction: {e}")
            return text  # Return original text if correction fails
        # message.content can be None (or blank); never hand that back in
        # place of the user's text.
        if not corrected or not corrected.strip():
            return text
        return corrected
# Create module-level instances for easier imports.
# NOTE: these are built at import time; ProfileExtractor.__init__ constructs
# its ChatGroq client, so the settings it reads must be available on import.
profile_extractor = ProfileExtractor()
grammar_corrector = GrammarCorrector()
# Export functions for backward compatibility | |
def extract_profile_information(pdf_text: str) -> Profile:
    """
    Legacy entry point kept for backward compatibility.

    Delegates to the shared module-level ``profile_extractor`` and returns
    the Profile it produces for the given resume text.
    """
    extracted = profile_extractor.extract_profile(pdf_text)
    return extracted
def correct_grammar(text: str) -> str:
    """
    Legacy entry point kept for backward compatibility.

    Delegates to the shared module-level ``grammar_corrector`` and returns
    whatever its ``correct_grammar`` method returns for *text*.
    """
    corrected = grammar_corrector.correct_grammar(text)
    return corrected