# CV_Process / extraction.py
# bsiddhharth
# added try/except to groq_api_key
# 45fe6a4
import logging
from typing import Optional
from pydantic import BaseModel, Field
from langchain.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
import os
import tempfile
import streamlit as st
from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
# Logging setup: configure the root logger at DEBUG level with
# "timestamp - level - message" formatted records.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)
# Defining the CV structure using Pydantic for structured output
class cv(BaseModel):
    # Structured-output schema for a single candidate. Every field is optional
    # and defaults to None, so a value that cannot be found stays None.
    # The candidate's name.
    name: Optional[str] = Field(default=None, description="Name of candidate")
    # Skills as a list of strings.
    skills: Optional[list[str]] = Field(default=None, description="Skills of candidate")
    # Certifications as a list of strings.
    certifications: Optional[list[str]] = Field(default=None, description="Certificates of candidate")
    # Whole number of years of experience.
    years_of_exp: Optional[int] = Field(default=None, description="Years of experience")
# Defining the data structure that contains a list of CVs
class data(BaseModel):
    # Top-level container; extract_cv_data passes this class as the schema to
    # with_structured_output and returns its `candidates` list.
    # One `cv` entry per extracted candidate (required field, no default).
    candidates: list[cv]
def create_prompt_template() -> ChatPromptTemplate:
    """Create the prompt template for CV extraction.

    Returns:
        A ``ChatPromptTemplate`` with a system message describing the fields to
        extract and a human message holding the ``{text}`` placeholder.
    """
    logger.info("Creating the prompt template for CV extraction")
    # Adjacent string literals are concatenated verbatim, so each bullet needs
    # an explicit newline and each sentence a trailing space; without them the
    # instructions run together (e.g. "...text:- Name of the candidate- Skills").
    system_message = (
        "You are an expert extraction algorithm. Your job is to extract the following specific information from the given text:\n"
        "- Name of the candidate\n"
        "- Skills\n"
        "- Certifications (Look for terms such as 'Certified,' 'Certification,' 'Certificate')\n"
        "- years_of_exp (Extract only the number of years. If an approximation is given (e.g., '5+ years'), return the lower bound (e.g., '5').)\n"
        "If you cannot find the value for a specific attribute, return null for that attribute's value. "
        "The 'years of experience' can be mentioned in various formats (e.g., '5+ years', '5 years', 'since 2010'). "
        "Extract it accurately, even if it's mentioned in different contexts like a professional summary or work experience. "
        "If multiple jobs are listed, you can calculate the experience from the work history. "
        "Certifications are usually found under headers like 'Certifications,' 'Professional Certificates,' or similar. They might include phrases like 'AWS Certified Developer,' 'MongoDB Developer Associate,' etc."
    )
    return ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("human", "{text}"),
        ]
    )
def initialize_llm() -> ChatGroq:
    """Initialize the Groq chat model.

    The API key is read from the ``GROQ_API_KEY`` environment variable first
    and, when that is unset or empty, from Streamlit secrets.

    Returns:
        A ``ChatGroq`` instance for ``llama-3.3-70b-versatile`` at
        temperature 0.6.

    Raises:
        ValueError: If no non-empty key is found in either location.
    """
    logger.info("Initializing LLM")
    groq_api_key = os.getenv("GROQ_API_KEY")
    # `not` rather than `is None`: an env var that is set but empty should also
    # fall back to Streamlit secrets instead of failing immediately.
    if not groq_api_key:
        try:
            groq_api_key = st.secrets["GROQ_API_KEY"]
        except Exception:
            # Kept broad on purpose: the lookup can fail for a missing key or a
            # missing secrets file, and either way the key is unavailable.
            logger.debug("GROQ_API_KEY lookup in Streamlit secrets failed", exc_info=True)
            groq_api_key = None
    if not groq_api_key:
        logger.error("GROQ_API_KEY is not set")
        # Surface the problem in the UI before raising for the caller.
        st.error("GROQ_API_KEY is not set in the environment variables or Streamlit secrets.")
        raise ValueError("GROQ_API_KEY environment variable is missing.")
    return ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.3-70b-versatile", temperature=0.6)
def extract_cv_data(text: str) -> list[cv]:
    """Extract candidate data from CV text using the language model.

    Args:
        text: Raw text of one or more CVs.

    Returns:
        The list of ``cv`` entries produced by the model; an empty list when
        the model returns no structured result.

    Raises:
        ValueError: Propagated from ``initialize_llm`` when no API key is set.
    """
    logger.info("Extracting CV data from text")
    prompt = create_prompt_template()
    llm = initialize_llm()
    # Chain the prompt into the model, constraining the output to the `data` schema.
    runnable = prompt | llm.with_structured_output(schema=data)
    response = runnable.invoke({"text": text})
    # NOTE(review): the structured-output parser can apparently yield None when
    # the model emits no tool call; guard so that case does not surface as an
    # AttributeError on `.candidates` -- confirm against the LangChain docs.
    if response is None:
        logger.warning("Model returned no structured output; no candidates extracted")
        return []
    candidates = response.candidates
    logger.info("Extracted %s candidate(s) from the text", len(candidates))
    return candidates
def process_file(uploaded_files) -> str:
    """Process a single uploaded file and return its text.

    The upload is written to a temporary file (keeping its extension), loaded
    with ``PDFPlumberLoader`` for ``.pdf`` files or ``TextLoader`` otherwise,
    and the temporary file is always removed afterwards.

    Args:
        uploaded_files: One uploaded file object exposing ``.name`` and
            ``.getvalue()`` (despite the plural parameter name).

    Returns:
        The page contents of all loaded documents joined with single spaces.
    """
    logger.info("Processing file: %s", uploaded_files.name)
    suffix = os.path.splitext(uploaded_files.name)[1]
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = tmp_file.name
    try:
        # Write inside the try so a failed write still cleans up the temp file;
        # the `with` closes the handle before a loader reopens the path.
        with tmp_file:
            tmp_file.write(uploaded_files.getvalue())
        # Compare case-insensitively so "CV.PDF" is treated as a PDF too.
        if suffix.lower() == '.pdf':
            loader = PDFPlumberLoader(tmp_path)
            logger.info("Loaded PDF file: %s", tmp_path)
        else:
            loader = TextLoader(tmp_path)
            logger.info("Loaded text file: %s", tmp_path)
        documents = loader.load()
        text_content = " ".join(doc.page_content for doc in documents)
        logger.info("Extracted text from file: %s", uploaded_files.name)
        return text_content
    finally:
        logger.info("Deleting temporary file: %s", tmp_path)
        os.unlink(tmp_path)
def display_candidates_info(candidates_list: list[cv]):
    """Display the extracted candidates' information in a Streamlit table.

    Missing values are shown as the string ``'None'``. A ``years_of_exp`` of
    ``0`` is a real value and is displayed as ``0``.

    Args:
        candidates_list: Candidates to render, one table row each.
    """
    logger.info("Displaying information for %s candidate(s)", len(candidates_list))
    logger.debug("Candidate list: %s", candidates_list)
    # Named `rows` so the module-level `data` model is not shadowed.
    rows = [
        {
            "Name": candidate.name or 'None',
            "Skills": ", ".join(candidate.skills) if candidate.skills else 'None',
            "Certifications": ", ".join(candidate.certifications) if candidate.certifications else 'None',
            # Explicit None check: a plain truthiness test would hide 0 years.
            "Years of Experience": candidate.years_of_exp if candidate.years_of_exp is not None else 'None',
        }
        for candidate in candidates_list
    ]
    st.write("### Candidates Information")
    st.table(rows)
    logger.debug("Displayed candidates' information in table")
# print(candidates_list)
# Uncomment the snippet below to try the extraction flow end to end.
# Streamlit file uploader and extraction logic
# uploaded_files = st.file_uploader(" Upload the CV: ", type=['pdf', 'txt'],key="unique_cv_upload")
# if uploaded_files is not None:
# text = process_file(uploaded_files)
# # text = ep.text
# candidates_list = extract_cv_data(text)
# display_candidates_info(candidates_list)