Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.stem import PorterStemmer | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from PyPDF2 import PdfReader | |
| import os | |
| from io import BytesIO | |
| import pickle | |
| import pdfminer | |
| from pdfminer.high_level import extract_text | |
| import re | |
| import PyPDF2 | |
| import textract | |
| import tempfile | |
| from docx import Document | |
# Fetch the NLTK data that preprocess_text relies on at import time.
# NLTK 3.8.2+ loads the tokenizer model from "punkt_tab" while older releases
# use "punkt", so both are requested to keep word_tokenize working either way.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
def preprocess_text(text):
    """Normalise text for vectorisation.

    The text is lower-cased and tokenized, English stop words are dropped,
    and each remaining token is reduced to its Porter stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = [token for token in tokens if token not in english_stop_words]
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in kept_tokens)
def extract_text_from_pdf(pdf_content):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_content: The raw bytes of the PDF document.
    """
    reader = PdfReader(BytesIO(pdf_content))
    page_texts = [page.extract_text() for page in reader.pages]
    return ''.join(page_texts)
def extract_text_from_docx(docx_content):
    """Return the paragraph text of a DOCX file, joined by single spaces.

    Args:
        docx_content: The raw bytes of the DOCX document.
    """
    document = Document(BytesIO(docx_content))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return " ".join(paragraph_texts)
def extract_text_from_txt(txt_content):
    """Decode the content of a plain-text resume into a string.

    Plain text needs no external extractor, so the bytes are decoded
    directly as UTF-8; undecodable bytes are dropped, matching how
    rank_and_shortlist reads uploaded .txt files.

    Args:
        txt_content: The raw bytes of the file. A ``str`` is accepted and
            returned unchanged.

    Returns:
        The decoded text.
    """
    if isinstance(txt_content, str):
        return txt_content
    return txt_content.decode('utf-8', errors='ignore')
def extract_text_from_resume(file_path):
    """Extract the text of a resume stored on disk.

    The format is chosen from the file extension (case-insensitive).

    Args:
        file_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` file.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not pdf, docx or txt.
        OSError: If the file cannot be opened or read.
    """
    file_extension = os.path.splitext(file_path)[1].lstrip('.').lower()
    # Reject unknown formats before touching the file system.
    if file_extension not in ('pdf', 'docx', 'txt'):
        raise ValueError(f"Unsupported file format: {file_extension}")

    # The per-format extractors work on raw bytes, not on a path.
    with open(file_path, 'rb') as resume_file:
        content = resume_file.read()

    if file_extension == 'pdf':
        return extract_text_from_pdf(content)
    if file_extension == 'docx':
        return extract_text_from_docx(content)
    return extract_text_from_txt(content)
def clean_pdf_text(text):
    """Strip noise from extracted resume text.

    Removes URLs, the standalone tokens "RT" and "cc", hashtags, @-mentions,
    ASCII punctuation and non-ASCII characters, then collapses every run of
    whitespace into a single space. The result is not stripped, so it may
    start or end with one space.

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text.
    """
    text = re.sub(r'http\S+\s*', ' ', text)
    # Word boundaries keep "cc"/"RT" inside words ("accounting", "success") intact.
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text
def extract_candidate_name(text):
    """Guess a candidate name from resume text.

    Returns the first occurrence of two consecutive capitalised words,
    optionally preceded by "Mr.", "Ms." or "Mrs.". This is a heuristic: any
    two adjacent capitalised words will match.

    Args:
        text: The resume text to search.

    Returns:
        The matched name (including the title when present), or the string
        "Candidate Name Not Found" when nothing matches.
    """
    pattern = r'(?:Mr\.|Ms\.|Mrs\.)?\s?([A-Z][a-z]+)\s([A-Z][a-z]+)'
    match = re.search(pattern, text)
    if match:
        # The optional leading \s? can capture a space or newline before the name.
        return match.group(0).strip()
    return "Candidate Name Not Found"
def calculate_similarity(job_description, cvs, cv_file_names):
    """Rank CVs by TF-IDF cosine similarity to a job description.

    Args:
        job_description: The job description text.
        cvs: The CV texts, one string per CV.
        cv_file_names: The file names, in the same order as ``cvs``.

    Returns:
        A list of ``(file_name, score)`` tuples sorted by descending score.
        An empty list is returned when ``cvs`` is empty.
    """
    if not cvs:
        return []

    processed_job_desc = preprocess_text(job_description)
    processed_cvs = [preprocess_text(cv) for cv in cvs]

    # Row 0 of the matrix is the job description; rows 1.. are the CVs.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([processed_job_desc] + processed_cvs)

    # Compare only the job description against the CVs rather than
    # computing every CV-to-CV similarity as well.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    return sorted(
        zip(cv_file_names, similarity_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
def extract_email_phone(text):
    """Find e-mail addresses and phone numbers in text.

    Phone numbers are matched as 7 digits (3-4) or 10 digits (3-3-4), with an
    optional "-", "." or whitespace separator between the groups.

    Args:
        text: The text to search.

    Returns:
        A tuple ``(emails, phones)`` of two lists of matched strings, each in
        order of appearance.
    """
    # The top-level domain is letters only; a "|" inside a character class
    # would be a literal pipe, not an alternation.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    phone_pattern = r'\b(?:\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})\b'
    emails = re.findall(email_pattern, text)
    phones = re.findall(phone_pattern, text)
    return emails, phones
def rank_and_shortlist(job_description, cv_files, threshold=0.10):
    """Rank uploaded resumes against a job description and shortlist the best.

    Each file is read according to its extension (.pdf, .docx or .txt).
    Unsupported or unreadable files are reported with a Streamlit warning and
    skipped, so one bad upload does not abort the whole run.

    Args:
        job_description: The text the resumes are compared against.
        cv_files: Uploaded file objects exposing ``.name`` and ``.read()``.
        threshold: Minimum similarity score for a resume to be shortlisted.

    Returns:
        A tuple ``(ranked_cvs, shortlisted_cvs, contact_info_dict)``:
        ``ranked_cvs`` and ``shortlisted_cvs`` are lists of
        ``(file_name, score)`` sorted by descending score, and
        ``contact_info_dict`` maps each file name to a dict with ``'emails'``
        and ``'phones'`` lists. All three are empty when no resume could be
        processed.
    """
    cv_texts = []
    cv_file_names = []
    contact_info_dict = {}

    for cv_file in cv_files:
        file_extension = os.path.splitext(cv_file.name)[1].lower()
        try:
            if file_extension == '.pdf':
                cv_text = extract_text_from_pdf(cv_file.read())
            elif file_extension == '.docx':
                cv_text = extract_text_from_docx(cv_file.read())
            elif file_extension == '.txt':
                cv_text = cv_file.read().decode('utf-8', errors='ignore')
            else:
                st.warning(f"Unsupported file format: {file_extension}. Skipping file: {cv_file.name}")
                continue
            cleaned_text = clean_pdf_text(cv_text)
            # Contact details come from the raw text: cleaning strips "@" and
            # punctuation, which would destroy e-mail addresses and phone numbers.
            emails, phones = extract_email_phone(cv_text)
        except Exception as e:
            # Deliberately broad: any failure on one file is reported and skipped.
            st.warning(f"Error processing file '{cv_file.name}': {str(e)}")
            continue

        # Record the file only after every step succeeded, so the texts,
        # names and contact details can never get out of step.
        cv_texts.append(cleaned_text)
        cv_file_names.append(cv_file.name)
        contact_info_dict[cv_file.name] = {
            'emails': emails,
            'phones': phones,
        }

    if not cv_texts:
        st.error("No valid resumes found. Please upload resumes in supported formats (PDF, DOCX, or TXT).")
        # Same three-element shape as the normal return, so callers can unpack it.
        return [], [], {}

    ranked_cvs = list(calculate_similarity(job_description, cv_texts, cv_file_names))
    shortlisted_cvs = [(cv_name, score) for (cv_name, score) in ranked_cvs if score >= threshold]
    return ranked_cvs, shortlisted_cvs, contact_info_dict
def main():
    """Render the Streamlit UI: collect a job posting and resumes, show the ranking.

    When "Submit" is pressed with a job title, a job description and at least
    one uploaded file, the resumes are ranked against the title plus the
    description. The full ranking is shown, followed by the shortlisted
    candidates with any e-mail addresses and phone numbers found.
    """
    st.title("Resume Ranking App")
    st.write("Enter Job Title:")
    job_title = st.text_input("Job Title")
    st.write("Enter Job Description:")
    job_description = st.text_area("Job Description", height=200, key='job_description')
    st.write("Upload the Resumes:")
    cv_files = st.file_uploader("Choose files", accept_multiple_files=True, key='cv_files')

    if not st.button("Submit"):
        st.write("Please enter the job title, job description, and upload resumes to proceed.")
        return

    if not (job_title and job_description and cv_files):
        st.error("Please enter the job title, job description, and upload resumes to proceed.")
        return

    job_description_text = f"{job_title} {job_description}"
    results = rank_and_shortlist(job_description_text, cv_files)
    if not results[0]:
        # Nothing could be ranked; rank_and_shortlist has already shown the
        # error, so stop here instead of unpacking and rendering empty sections.
        return
    ranked_cvs, shortlisted_cvs, contact_info_dict = results

    st.markdown("### Ranking of Resumes:")
    for file_name, score in ranked_cvs:
        st.markdown(f"**File Name:** {file_name}, **Similarity Score:** {score:.2f}")

    st.markdown("### Shortlisted Candidates:")
    if not shortlisted_cvs:
        st.markdown("None")
        return

    for file_name, score in shortlisted_cvs:
        st.markdown(f"**File Name:** {file_name}, **Similarity Score:** {score:.2f}")
        contact_info = contact_info_dict.get(file_name, {})
        candidate_emails = contact_info.get('emails', [])
        candidate_phones = contact_info.get('phones', [])
        if candidate_emails:
            st.markdown(f"**Emails:** {', '.join(candidate_emails)}")
        if candidate_phones:
            st.markdown(f"**Phone Numbers:** {', '.join(candidate_phones)}")


if __name__ == "__main__":
    main()