In [21]:
pip install transformers datasets huggingface_hub sentence-transformers



In [22]:
import re
import nltk
from nltk.corpus import stopwords
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW
import pandas as pd
from tqdm import tqdm

In [23]:
# Load your unlabeled dataset
resumes = pd.read_csv('/content/resumes6000.csv')

In [24]:
resumes.head(5)

Unnamed: 0,Resumes
0,Global Sales Administrator Biamp Systems Globa...
1,Python Developer - Sprint 8 years of experien...
2,IT Project Manager - Scrum Master of Digital ...
3,"UI Front End Developer UI <span class=""hl"">Fro..."
4,"IT Security Analyst Camp Hill, PA Work Experie..."


In [26]:
# Define the function for cleaning text
def clean_text(text):
    return re.sub(r"<span class=\"hl\">(.*?)</span>", r"\1", text)
# Apply the function to the entire column
resumes['Resumes'] = resumes['Resumes'].apply(clean_text)

In [27]:
 import nltk
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
# Function for cleaning and preprocessing the resume
def clean_resume(resume):
    if isinstance(resume, str):
        # Convert to lowercase
        resume = resume.lower()

        # Remove URLs, RT, cc, hashtags, mentions, non-ASCII characters, punctuation, and extra whitespace
        resume = re.sub('http\S+\s*|RT|cc|#\S+|@\S+|[^\x00-\x7f]|[^\w\s]', ' ', resume)
        resume = re.sub('\s+', ' ', resume).strip()

        # Tokenize the resume
        tokens = nltk.word_tokenize(resume)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token.lower() not in stop_words]

        # Join the tokens back into a sentence
        preprocessed_resume = ' '.join(tokens)

        return preprocessed_resume
    else:
        return ''
# Applying the cleaning function to a Datasets
resumes['Resumes']  = resumes['Resumes'].apply(lambda x: clean_resume(x))

In [43]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load the pre-trained model
mpnet = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(mpnet)
pretrained_model = AutoModelForMaskedLM.from_pretrained(mpnet)

# Assuming 'resumes' is a DataFrame with a column named 'Resumes'
texts = resumes['Resumes'].tolist()

# Tokenize and encode the unlabeled data
encodings = tokenizer(texts, padding=True, truncation = True, return_tensors='pt')

# Create a TensorDataset
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])

# Move the model to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pretrained_model.to(device)

# Initialize the optimizer
optimizer = AdamW(pretrained_model.parameters(), lr=2e-5)

batch_size = 8
epochs = 3
import math

# Experiment with different chunk sizes
chunk_sizes_to_try = [200]  # Can add more sizes later

for chunk_size in chunk_sizes_to_try:
    for epoch in range(epochs):
        tqdm_dataloader = tqdm(DataLoader(dataset, batch_size=batch_size, shuffle=True), desc=f'Epoch {epoch + 1}/{epochs}')

        pretrained_model.train()
        for batch in tqdm_dataloader:
            input_ids, attention_mask = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

            # Calculate number of chunks for current batch
            sequence_length = input_ids.size(1)  # Get actual sequence length
            num_chunks = math.ceil(sequence_length / chunk_size)

            for i in range(num_chunks):
                start_idx = i * chunk_size
                end_idx = min((i + 1) * chunk_size, sequence_length)  # Handle final chunk

                # Extract chunk data
                input_ids_chunk = input_ids[:, start_idx:end_idx]
                attention_mask_chunk = attention_mask[:, start_idx:end_idx]

                # Forward pass
                outputs = pretrained_model(
                    input_ids_chunk, attention_mask=attention_mask_chunk, labels=input_ids_chunk.reshape(-1)
                    )

                # Calculate loss
                loss = outputs.loss

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Update progress bar
                tqdm_dataloader.set_postfix({'Loss': loss.item(), 'Chunk Size': chunk_size})

Some weights of the model checkpoint at sentence-transformers/all-mpnet-base-v2 were not used when initializing MPNetForMaskedLM: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing MPNetForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetForMaskedLM were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be 

In [44]:
# Save the fine-tuned model
pretrained_model.save_pretrained('fine_tuned_mpnet')
tokenizer.save_pretrained('fine_tuned_mpnet')

('fine_tuned_mpnet/tokenizer_config.json',
 'fine_tuned_mpnet/special_tokens_map.json',
 'fine_tuned_mpnet/vocab.txt',
 'fine_tuned_mpnet/added_tokens.json',
 'fine_tuned_mpnet/tokenizer.json')