| | import pandas as pd
|
| | import pickle
|
| | import re
|
| | from datasets import load_dataset
|
| | from sklearn.feature_extraction.text import TfidfVectorizer
|
| | from sklearn.multiclass import OneVsRestClassifier
|
| | from sklearn.neighbors import KNeighborsClassifier
|
| | from sklearn.preprocessing import LabelEncoder
|
| |
|
# Patterns used by clean_resume(). They are compiled once at import time
# because the cleaner is applied to every row of the dataset.
_URL_RE = re.compile(r'http\S+')
# Word boundaries keep ordinary words intact: without them "account" would
# become "a ount" and "REPORT" would become "REPO ".
_RT_CC_RE = re.compile(r'\b(?:RT|cc)\b')
_HASHTAG_RE = re.compile(r'#\S+')
_MENTION_RE = re.compile(r'@\S+')
_PUNCT_RE = re.compile(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_resume(txt):
    """Normalise raw resume text before vectorisation.

    The input is converted with ``str()`` and then, in order: URLs, the
    standalone tokens ``RT`` / ``cc``, hashtags and @-mentions are replaced by
    a space; ASCII punctuation and non-ASCII characters are replaced by a
    space; finally every run of whitespace is collapsed to a single space.

    Args:
        txt: The raw resume. Any object is accepted; it is stringified first.

    Returns:
        The cleaned text. The result is not stripped, so it may begin or end
        with a single space.
    """
    text = str(txt)
    # URLs and hashtags are matched without requiring trailing whitespace so
    # that one sitting at the very end of the text is removed as well.
    text = _URL_RE.sub(' ', text)
    text = _RT_CC_RE.sub(' ', text)
    text = _HASHTAG_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)
    text = _PUNCT_RE.sub(' ', text)
    text = _NON_ASCII_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text)


def _save_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file even if serialisation fails."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


def train_classifier():
    """Train the resume role classifier and write its artifacts to disk.

    Steps:
      1. Load the ``train`` split of ``AzharAli05/Resume-Screening-Dataset``.
      2. Clean the ``Resume`` column with :func:`clean_resume`.
      3. Build one "prototype" per ``Role`` by joining all cleaned resumes of
         that role with spaces, and save the mapping to ``prototypes.pkl``.
      4. Label-encode the roles, fit a 200-feature TF-IDF vectoriser and a
         one-vs-rest k-nearest-neighbours classifier.
      5. Save ``clf.pkl``, ``tfidf.pkl`` and ``encoder.pkl``.

    All files are written relative to the current working directory.

    Raises:
        SystemExit: With exit status 1 if the dataset cannot be loaded.
    """
    print("Loading AzharAli05/Resume-Screening-Dataset...")
    try:
        ds = load_dataset("AzharAli05/Resume-Screening-Dataset")
        df = pd.DataFrame(ds['train'])
        print(f"Loaded {len(df)} resumes.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # The ``exit()`` helper is injected by the ``site`` module for
        # interactive use and, called without arguments, reports success.
        # Raise SystemExit directly with a non-zero status instead.
        raise SystemExit(1) from e

    text_col = 'Resume'
    label_col = 'Role'

    # Rows without a resume or a role cannot be trained on; str() would
    # otherwise turn a missing resume into the literal document "None"/"nan".
    df = df.dropna(subset=[text_col, label_col])

    print("Cleaning data...")
    df['cleaned_resume'] = df[text_col].apply(clean_resume)

    print("Generating Master Profiles (Prototypes)...")
    prototypes = df.groupby(label_col)['cleaned_resume'].apply(' '.join).to_dict()
    _save_pickle(prototypes, 'prototypes.pkl')

    le = LabelEncoder()
    df['Category_ID'] = le.fit_transform(df[label_col])

    print("Vectorizing...")
    tfidf = TfidfVectorizer(stop_words='english', max_features=200)
    # fit_transform is equivalent to fit() followed by transform() on the
    # same data, without the second pass over the corpus.
    features = tfidf.fit_transform(df['cleaned_resume'])

    print("Training Classifier...")
    clf = OneVsRestClassifier(KNeighborsClassifier())
    clf.fit(features, df['Category_ID'])

    print("Saving models...")
    _save_pickle(clf, 'clf.pkl')
    _save_pickle(tfidf, 'tfidf.pkl')
    _save_pickle(le, 'encoder.pkl')
    print("SUCCESS: Classification models + Prototypes saved.")
|
| |
|
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train_classifier()