Spaces:
Sleeping
Sleeping
| import groq | |
| from pdfextractor import extract_text_from_pdf | |
| from models import Profile, SocialMedia, Project, Skill, Education | |
| from typing import List, Dict, Any, Optional | |
| from langchain.output_parsers import PydanticOutputParser | |
| from langchain.prompts import PromptTemplate | |
| from langchain_groq import ChatGroq | |
| import json | |
| from config import get_settings | |
| settings = get_settings() | |
class ProfileExtractor:
    """
    Extracts structured profile information (name, title, email, bio,
    tagline, social links, projects, skills, education) from resume text
    using a Groq-hosted LLM.

    Two strategies are used:
      1. A structured LangChain prompt asking the model for one JSON
         document covering every field (`_extract_with_langchain`).
      2. A field-by-field fallback over the raw Groq client, used when the
         structured path raises for any reason (`_extract_with_fallback`).
    """

    def __init__(self):
        # All runtime configuration comes from the shared settings object.
        self.groq_api_key = settings.GROQ_API_KEY
        self.model_name = settings.MODEL_NAME
        self.temperature = settings.TEMPERATURE
        self.max_tokens = settings.MAX_TOKENS
        self.llm = self._initialize_llm()

    def _initialize_llm(self) -> ChatGroq:
        """Build the LangChain Groq chat client from the stored settings."""
        return ChatGroq(
            groq_api_key=self.groq_api_key,
            model_name=self.model_name,
            temperature=self.temperature,
            max_tokens=self.max_tokens
        )

    def extract_profile(self, pdf_text: str) -> Profile:
        """
        Main method to extract profile information from PDF text.

        Args:
            pdf_text: Text extracted from a resume PDF.

        Returns:
            Profile object with extracted information. Any failure of the
            structured extraction falls through to the field-by-field
            fallback, so this method itself raises only if the fallback does.
        """
        try:
            return self._extract_with_langchain(pdf_text)
        except Exception as e:
            if settings.DEBUG:
                print(f"LangChain extraction failed: {e}")
            return self._extract_with_fallback(pdf_text)

    def _extract_with_langchain(self, pdf_text: str) -> Profile:
        """Extract the whole profile in a single structured LLM call.

        Raises:
            ValueError: if the LLM response contains no JSON object.
            Exception: JSON decoding or Profile validation errors propagate
                to the caller (which falls back to `_extract_with_fallback`).
        """
        # Format instructions for the LLM. Braces below are inserted via a
        # partial variable, so PromptTemplate does not treat them as
        # placeholders.
        format_instructions = """
Extract the following information from the resume:
1. Full name
2. Professional title
3. Email address
4. Bio (a 50-100 word professional summary)
5. Tagline (a short 5-10 word catchy phrase summarizing professional identity)
6. Social media links (LinkedIn, GitHub, Instagram)
7. Projects (with title, description, and tech stack)
8. Skills
9. Education history (with school, degree, field of study, start date and end date)
Return the information in the following JSON format:
{
    "name": "Full Name",
    "title": "Professional Title",
    "email": "email@example.com",
    "bio": "Professional biography...",
    "tagline": "Catchy professional tagline",
    "social": {
        "linkedin": "LinkedIn URL or null",
        "github": "GitHub URL or null",
        "instagram": "Instagram URL or null"
    },
    "projects": [
        {
            "title": "Project Title",
            "description": "Project Description",
            "techStack": "Technologies used"
        }
    ],
    "skills": [
        {"name": "Skill 1"},
        {"name": "Skill 2"}
    ],
    "educations": [
        {
            "school": "University Name",
            "degree": "Degree Type (e.g., Bachelor's, Master's)",
            "fieldOfStudy": "Major or Field",
            "startDate": "Start Year",
            "endDate": "End Year or Present"
        }
    ]
}
If any information is not available, use null for that field.
"""
        template = """
You are a professional resume parser. Extract structured information from the following resume:
{pdf_text}
{format_instructions}
"""
        prompt = PromptTemplate(
            template=template,
            input_variables=["pdf_text"],
            partial_variables={"format_instructions": format_instructions}
        )

        # Pipe the prompt into the model (LCEL) and run it.
        chain = prompt | self.llm
        result = chain.invoke({"pdf_text": pdf_text})
        response_text = result.content

        # The model may wrap the JSON in extra prose; slice out the outermost
        # object literal before parsing.
        json_start = response_text.find('{')
        json_end = response_text.rfind('}') + 1
        if json_start < 0 or json_end <= json_start:
            raise ValueError("No JSON found in the response")

        profile_dict = json.loads(response_text[json_start:json_end])
        profile = Profile.model_validate(profile_dict)
        # Re-query individually for any fields the model left empty.
        return self._fill_missing_information(profile, pdf_text)

    def _ask_llm(self, prompt: str, error_label: str) -> Optional[str]:
        """Send one prompt to the LLM and return the stripped reply.

        Returns None when the call fails or the model answered 'N/A' /
        nothing, so callers can keep the field's previous value.
        """
        try:
            value = self.llm.invoke(prompt).content.strip()
            if value and value != "N/A":
                return value
        except Exception as e:
            if settings.DEBUG:
                print(f"Error {error_label}: {e}")
        return None

    def _fill_missing_information(self, profile: Profile, pdf_text: str) -> Profile:
        """
        Attempts to fill in any missing information in the profile by
        re-querying the LLM one field at a time.
        """
        # Name and title only need the start of the resume; truncate to keep
        # the prompt small.
        if not profile.name or profile.name == "N/A":
            name = self._ask_llm(
                "Extract only the full name from this resume text. Respond with just the name: "
                + pdf_text[:settings.CHUNK_SIZE],
                "extracting name")
            if name:
                profile.name = name

        if not profile.title or profile.title == "N/A":
            title = self._ask_llm(
                "Extract only the professional title from this resume text. Respond with just the title: "
                + pdf_text[:settings.CHUNK_SIZE],
                "extracting title")
            if title:
                profile.title = title

        if not profile.email or profile.email == "N/A":
            email = self._ask_llm(
                "Extract only the email address from this resume text. Respond with just the email: "
                + pdf_text,
                "extracting email")
            # Minimal sanity check so a prose reply is not stored as an email.
            if email and "@" in email:
                profile.email = email

        if not profile.bio or profile.bio == "N/A":
            bio = self._ask_llm(
                "Create a short professional biography (around 50-100 words) based on this resume. Focus on skills and experience: "
                + pdf_text,
                "creating bio")
            if bio:
                profile.bio = bio

        if not profile.educations:
            self._fill_missing_education(profile, pdf_text)

        return profile

    def _fill_missing_education(self, profile: Profile, pdf_text: str) -> None:
        """Ask the LLM for education history and append parsed entries in place."""
        try:
            education_prompt = "Extract education history from this resume. For each education entry, provide the school name, degree type, field of study, start date, and end date. Format the response as a list of JSON objects."
            education_text = self.llm.invoke(education_prompt + "\n\n" + pdf_text).content.strip()
            # Try to extract the JSON array from the response.
            json_start = education_text.find('[')
            json_end = education_text.rfind(']') + 1
            if json_start >= 0 and json_end > json_start:
                educations = json.loads(education_text[json_start:json_end])
                # Bug fix: `educations` on the profile may be None rather than
                # an empty list; ensure there is a list to append to.
                if profile.educations is None:
                    profile.educations = []
                for edu in educations:
                    profile.educations.append(Education(
                        school=edu.get("school", "Unknown"),
                        degree=edu.get("degree", ""),
                        fieldOfStudy=edu.get("fieldOfStudy", ""),
                        startDate=edu.get("startDate", ""),
                        endDate=edu.get("endDate", "")
                    ))
        except Exception as e:
            if settings.DEBUG:
                print(f"Error extracting education: {e}")

    def _extract_with_fallback(self, pdf_text: str) -> Profile:
        """Fallback method for profile extraction using direct API calls.

        Each field is requested with its own chat completion; a failed call
        yields an empty string which is normalized to "N/A"/None below.
        """
        client = groq.Groq(api_key=self.groq_api_key)

        def get_llm_response(prompt: str) -> str:
            """Helper function to get a response from the LLM."""
            try:
                chat_completion = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model=self.model_name,
                    temperature=settings.FALLBACK_TEMPERATURE,
                    max_tokens=settings.MAX_TOKENS
                )
                return chat_completion.choices[0].message.content
            except Exception as e:
                if settings.DEBUG:
                    print(f"Error during LLM call: {e}")
                return ""  # Return empty string on failure

        # Extract basic information
        name = get_llm_response(f"Extract the full name from the following text. If no name is present, respond with 'N/A'. Only respond with the name: {pdf_text}").strip()
        title = get_llm_response(f"Extract the professional title from the following text. If no title is present, respond with 'N/A'. Only respond with the title: {pdf_text}").strip()
        email = get_llm_response(f"Extract the email address from the following text. If no email is present, respond with 'N/A'. Only respond with the email: {pdf_text}").strip()
        bio = get_llm_response(f"Create a short professional biography (around 50-100 words) based on the following text. Focus on skills and experience. If no bio is possible, respond with 'N/A'. Provide only the biography itself: {pdf_text}").strip()
        tagline = get_llm_response(f"Create a short and catchy tagline (around 5-10 words) that summarizes the person's professional identity from the following text. If no tagline is possible, respond with 'N/A'. Provide only the tagline: {pdf_text}").strip()

        # Extract social media
        linkedin = get_llm_response(f"Extract the LinkedIn profile URL from the following text. If no LinkedIn URL is present, respond with 'N/A'. Only respond with the LinkedIn URL: {pdf_text}").strip()
        github = get_llm_response(f"Extract the GitHub profile URL from the following text. If no GitHub URL is present, respond with 'N/A'. Only respond with the GitHub URL: {pdf_text}").strip()
        instagram = get_llm_response(f"Extract the Instagram profile URL from the following text. If no Instagram URL is present, respond with 'N/A'. Only respond with the Instagram URL: {pdf_text}").strip()

        # Extract projects, skills and education
        project_info = get_llm_response(f"Extract information about projects from the following text in this format Project Title: Project Description: Tech Stack:. If no projects are present, respond with 'N/A': {pdf_text}").strip()
        skills_info = get_llm_response(f"Extract a list of skills from the following text, separated by commas. If no skills are present, respond with 'N/A'. Only respond with the skills: {pdf_text}").strip()
        education_info = get_llm_response(f"Extract education history from the following resume. For each education entry, provide the school name, degree type, field of study, start date, and end date. Format as 'School: Degree: Field: StartDate: EndDate' with each education on a new line. If no education is found, respond with 'N/A': {pdf_text}").strip()

        social_media = SocialMedia(
            linkedin=linkedin if linkedin != 'N/A' else None,
            github=github if github != 'N/A' else None,
            instagram=instagram if instagram != 'N/A' else None
        )

        return Profile(
            # Bug fix: an empty reply (failed LLM call) previously leaked ""
            # into these fields; normalize it to "N/A" like a missing value.
            name=name if name else "N/A",
            title=title if title else "N/A",
            email=email if email else "N/A",
            bio=bio if bio else "N/A",
            tagline=tagline if tagline and tagline != 'N/A' else None,
            social=social_media if (social_media.github or social_media.instagram or social_media.linkedin) else None,
            chatbot=None,
            profileImg=None,
            heroImg=None,
            projects=self._parse_fallback_projects(project_info),
            skills=self._parse_fallback_skills(skills_info),
            educations=self._parse_fallback_educations(education_info)
        )

    @staticmethod
    def _parse_fallback_projects(project_info: str) -> List[Project]:
        """Parse 'Title: Description Tech Stack: ...' lines into Project objects."""
        projects = []
        if project_info != "N/A":
            for line in project_info.split("\n"):
                if ":" not in line:
                    continue
                try:
                    project_title, rest = line.split(":", 1)
                    # Everything after "Tech Stack:" is the stack; the part
                    # before it (minus the title) is the description.
                    project_description, tech_stack = rest.split("Tech Stack:", 1)
                    projects.append(Project(
                        title=project_title.strip(),
                        description=project_description.strip(),
                        techStack=tech_stack.strip()
                    ))
                except ValueError as e:
                    if settings.DEBUG:
                        print(f"Error parsing project: {line}. Error: {e}")
        return projects

    @staticmethod
    def _parse_fallback_skills(skills_info: str) -> List[Skill]:
        """Parse a comma-separated skill list into Skill objects."""
        skills = []
        if skills_info != "N/A":
            for skill_name in (s.strip() for s in skills_info.split(",")):
                if skill_name:
                    skills.append(Skill(name=skill_name))
        return skills

    @staticmethod
    def _parse_fallback_educations(education_info: str) -> List[Education]:
        """Parse 'School: Degree: Field: StartDate: EndDate' lines into Education objects."""
        educations = []
        if education_info != "N/A":
            for line in education_info.split("\n"):
                if ":" not in line:
                    continue
                try:
                    parts = line.split(":")
                    # Lines with fewer than five fields are silently skipped,
                    # matching the lenient best-effort parsing elsewhere.
                    if len(parts) >= 5:
                        educations.append(Education(
                            school=parts[0].strip(),
                            degree=parts[1].strip(),
                            fieldOfStudy=parts[2].strip(),
                            startDate=parts[3].strip(),
                            endDate=parts[4].strip()
                        ))
                except Exception as e:
                    if settings.DEBUG:
                        print(f"Error parsing education: {line}. Error: {e}")
        return educations
class GrammarCorrector:
    """Corrects grammar, spelling, and punctuation in text via the Groq LLM."""

    def __init__(self):
        # Pull API credentials and tuning parameters from shared settings.
        self.groq_api_key = settings.GROQ_API_KEY
        self.model_name = settings.MODEL_NAME
        self.temperature = settings.GRAMMAR_CORRECTION_TEMPERATURE

    def correct_grammar(self, text: str) -> str:
        """
        Corrects grammar in user input using Groq's LLM.

        Args:
            text: The text to correct.

        Returns:
            The corrected text, or the original text unchanged when the
            input is empty or the API call fails.
        """
        # Guard clause: nothing to correct.
        if not text:
            return text

        client = groq.Groq(api_key=self.groq_api_key)
        instruction = (
            "Correct any grammar, spelling, or punctuation errors in the "
            f"following text, but keep the meaning exactly the same: '{text}'"
        )
        try:
            completion = client.chat.completions.create(
                messages=[{"role": "user", "content": instruction}],
                model=self.model_name,
                temperature=self.temperature,
                max_tokens=settings.MAX_TOKENS
            )
        except Exception as err:
            if settings.DEBUG:
                print(f"Error during grammar correction: {err}")
            return text  # Return original text if correction fails
        return completion.choices[0].message.content
# Create module-level instances for easier imports.
# NOTE(review): ProfileExtractor() constructs the ChatGroq client at import
# time — presumably misconfiguration (e.g. a missing GROQ_API_KEY) surfaces
# here rather than at first use; confirm against ChatGroq's init behavior.
profile_extractor = ProfileExtractor()
grammar_corrector = GrammarCorrector()
| # Export functions for backward compatibility | |
def extract_profile_information(pdf_text: str) -> Profile:
    """Backward-compatible wrapper delegating to the shared ProfileExtractor."""
    result = profile_extractor.extract_profile(pdf_text)
    return result
def correct_grammar(text: str) -> str:
    """Backward-compatible wrapper delegating to the shared GrammarCorrector."""
    corrected = grammar_corrector.correct_grammar(text)
    return corrected