"""Streamlit app: extract structured candidate data (name, skills,
certifications, years of experience) from uploaded CVs using a Groq LLM."""
import logging
import os
import tempfile
from pathlib import Path
from typing import Optional

import streamlit as st
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
# logging | |
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') | |
logger = logging.getLogger(__name__) | |
# Defining the CV structure using Pydantic for structured output | |
class cv(BaseModel): | |
name: Optional[str] = Field(default=None, description="Name of candidate") | |
skills: Optional[list[str]] = Field(default=None, description="Skills of candidate") | |
certifications: Optional[list[str]] = Field(default=None, description="Certificates of candidate") | |
years_of_exp: Optional[int] = Field(default=None, description="Years of experience") | |
# Defining the data structure that contains a list of CVs | |
class data(BaseModel): | |
candidates: list[cv] | |
def create_prompt_template() -> ChatPromptTemplate: | |
logger.info("Creating the prompt template for CV extraction") | |
"""Create the prompt template for CV extraction.""" | |
return ChatPromptTemplate.from_messages( | |
[ | |
("system", | |
"You are an expert extraction algorithm. Your job is to extract the following specific information from the given text:" | |
"- Name of the candidate" | |
"- Skills" | |
"- Certifications (Look for terms such as 'Certified,' 'Certification,' 'Certificate')" | |
"- years_of_exp (Extract only the number of years. If an approximation is given (e.g., '5+ years'), return the lower bound (e.g., '5').)" | |
"If you cannot find the value for a specific attribute, return null for that attribute's value." | |
"The 'years of experience' can be mentioned in various formats (e.g., '5+ years', '5 years', 'since 2010'). " | |
"Extract it accurately, even if it's mentioned in different contexts like a professional summary or work experience. " | |
"If multiple jobs are listed, you can calculate the experience from the work history." | |
"Certifications are usually found under headers like 'Certifications,' 'Professional Certificates,' or similar. They might include phrases like 'AWS Certified Developer,' 'MongoDB Developer Associate,' etc." | |
), | |
("human", "{text}") | |
] | |
) | |
def initialize_llm() -> ChatGroq: | |
logger.info("Initializing LLM") | |
"""Initialize the language model.""" | |
# os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY") | |
groq_api_key = os.getenv("GROQ_API_KEY") | |
if groq_api_key is None: | |
try: | |
groq_api_key = st.secrets["GROQ_API_KEY"] | |
except Exception as e: | |
st.error("GROQ_API_KEY is not set in the environment variables or Streamlit secrets.") | |
groq_api_key = None | |
# groq_api_key = st.secrets["GROQ_API_KEY"] | |
if not groq_api_key: | |
logger.error("GROQ_API_KEY is not set") | |
raise ValueError("GROQ_API_KEY environment variable is missing.") | |
return ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.3-70b-versatile", temperature=0.6) | |
def extract_cv_data(text: str) -> list[cv]: | |
logger.info("Extracting CV data from text") | |
"""Extract data from the text using the language model.""" | |
prompt = create_prompt_template() | |
llm = initialize_llm() | |
# creating a chain to extract structred ouput from the text using schema | |
runnable = prompt | llm.with_structured_output(schema=data) | |
response = runnable.invoke({"text": text}) | |
logger.info(f"Extracted {len(response.candidates)} candidate(s) from the text") | |
return response.candidates # returns the list of candidates | |
def process_file(uploaded_files) -> str: | |
logger.info(f"Processing file: {uploaded_files.name}") | |
"""Process the uploaded file and return the text.""" | |
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_files.name)[1]) as tmp_file: | |
tmp_file.write(uploaded_files.getvalue()) | |
tmp_path = tmp_file.name | |
try: | |
if tmp_path.endswith('.pdf'): | |
loader = PDFPlumberLoader(tmp_path) | |
logger.info(f"Loaded PDF file: {tmp_path}") | |
else: | |
loader = TextLoader(tmp_path) | |
logger.info(f"Loaded text file: {tmp_path}") | |
documents = loader.load() | |
# return " ".join([doc.page_content for doc in documents]) | |
text_content = " ".join([doc.page_content for doc in documents]) | |
logger.info(f"Extracted text from file: {uploaded_files.name}") | |
return text_content | |
finally: | |
logger.info(f"Deleting temporary file: {tmp_path}") | |
os.unlink(tmp_path) | |
def display_candidates_info(candidates_list: list[cv]): | |
logger.info(f"Displaying information for {len(candidates_list)} candidate(s)") | |
"""Display the extracted candidates' information in a table.""" | |
logger.debug(f"Candidate list: {candidates_list}") | |
data = [] | |
for candidate in candidates_list: | |
data.append({ | |
"Name": candidate.name, | |
"Skills": ", ".join(candidate.skills) if candidate.skills else 'None', | |
"Certifications": ", ".join(candidate.certifications) if candidate.certifications else 'None', | |
"Years of Experience": candidate.years_of_exp if candidate.years_of_exp else 'None' | |
}) | |
st.write("### Candidates Information") | |
st.table(data) | |
logger.debug("Displayed candidates' information in table") | |
# print(candidates_list) | |
# Try this to see the working of extraction | |
# Streamlit file uploader and extraction logic | |
# uploaded_files = st.file_uploader(" Upload the CV: ", type=['pdf', 'txt'],key="unique_cv_upload") | |
# if uploaded_files is not None: | |
# text = process_file(uploaded_files) | |
# # text = ep.text | |
# candidates_list = extract_cv_data(text) | |
# display_candidates_info(candidates_list) | |