{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "H-2L-S6b4ukm", "outputId": "12789315-f584-4d98-afd4-2bd35d0453d9" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.35.2)\n", "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.15.0)\n", "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.19.4)\n", "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.0)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.1)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (10.0.1)\n", "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n", "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.7)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n", "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.15)\n", "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.5.0)\n", "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (2.1.0+cu121)\n", "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.16.0+cu121)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.2.2)\n", "Requirement already satisfied: scipy in 
/usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.11.4)\n", "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (3.8.1)\n", "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.1.99)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.0)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.11.17)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (1.12)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.2.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (3.1.2)\n", "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence-transformers) (2.1.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (8.1.7)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence-transformers) (1.3.2)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3.post1)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision->sentence-transformers) (9.4.0)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->sentence-transformers) (2.1.3)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->sentence-transformers) (1.3.0)\n" ] } ], "source": [ "pip install transformers datasets 
huggingface_hub sentence-transformers" ] }, { "cell_type": "code", "source": [ "import re\n", "import nltk\n", "from nltk.corpus import stopwords\n", "import torch\n", "from torch.utils.data import DataLoader, TensorDataset\n", "from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW\n", "import pandas as pd\n", "from tqdm import tqdm" ], "metadata": { "id": "Jk533_F14yV8" }, "execution_count": 22, "outputs": [] }, { "cell_type": "code", "source": [ "# Load your unlabeled dataset\n", "resumes = pd.read_csv('/content/resumes6000.csv')" ], "metadata": { "id": "IR-KIxHd5iyu" }, "execution_count": 23, "outputs": [] }, { "cell_type": "code", "source": [ "resumes.head(5)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "Y0sgNBwr5mzH", "outputId": "9728d843-eef7-4719-c9ed-418155127788" }, "execution_count": 24, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Resumes\n", "0 Global Sales Administrator Biamp Systems Globa...\n", "1 Python Developer - Sprint 8 years of experien...\n", "2 IT Project Manager - Scrum Master of Digital ...\n", "3 UI Front End Developer UI Fro...\n", "4 IT Security Analyst Camp Hill, PA Work Experie..." ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Resumes
0Global Sales Administrator Biamp Systems Globa...
1Python Developer - Sprint 8 years of experien...
2IT Project Manager - Scrum Master of Digital ...
3UI Front End Developer UI <span class=\"hl\">Fro...
4IT Security Analyst Camp Hill, PA Work Experie...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "source": [ "# Define the function for cleaning text\n", "def clean_text(text):\n", " return re.sub(r\"(.*?)\", r\"\\1\", text)\n", "# Apply the function to the entire column\n", "resumes['Resumes'] = resumes['Resumes'].apply(clean_text)" ], "metadata": { "id": "MrCrvWv65nAw" }, "execution_count": 26, "outputs": [] }, { "cell_type": "code", "source": [ " import nltk\n", " nltk.download('punkt')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aUdNZquW4yXo", "outputId": "254067bd-9b4e-4e98-b8a0-9c661e6955f3" }, "execution_count": 27, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 27 } ] }, { "cell_type": "code", "source": [ "import nltk\n", "nltk.download('stopwords')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "09C8uhGu51Vh", "outputId": "3cd7a9af-293f-4c3c-a073-92fe26c49bd5" }, "execution_count": 28, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 28 } ] }, { "cell_type": "code", "source": [ "# Function for cleaning and preprocessing the resume\n", "def clean_resume(resume):\n", " if isinstance(resume, str):\n", " # Convert to lowercase\n", " resume = resume.lower()\n", "\n", " # Remove URLs, RT, cc, hashtags, mentions, non-ASCII characters, punctuation, and extra whitespace\n", " resume = re.sub('http\\S+\\s*|RT|cc|#\\S+|@\\S+|[^\\x00-\\x7f]|[^\\w\\s]', ' ', resume)\n", " resume = re.sub('\\s+', ' ', resume).strip()\n", "\n", " # Tokenize the resume\n", " tokens = nltk.word_tokenize(resume)\n", "\n", " # Remove stopwords\n", " stop_words = set(stopwords.words('english'))\n", " tokens = [token for token in tokens if token.lower() not in stop_words]\n", "\n", " # Join the tokens back into a sentence\n", " preprocessed_resume = ' '.join(tokens)\n", "\n", " return preprocessed_resume\n", " else:\n", " return ''\n", "# Applying the cleaning function to a Datasets\n", "resumes['Resumes'] = resumes['Resumes'].apply(lambda x: clean_resume(x))" ], "metadata": { "id": "TWyPQ63w51kN" }, "execution_count": 30, "outputs": [] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW\n", "import torch\n", "from torch.utils.data import DataLoader, TensorDataset\n", "from tqdm import tqdm\n", "\n", "# Load the pre-trained model\n", "mpnet = \"sentence-transformers/all-mpnet-base-v2\"\n", "tokenizer = AutoTokenizer.from_pretrained(mpnet)\n", "pretrained_model = AutoModelForMaskedLM.from_pretrained(mpnet)\n", "\n", "# Assuming 'resumes' is a DataFrame with a column named 'Resumes'\n", "texts = resumes['Resumes'].tolist()\n", "\n", "# Tokenize and encode the unlabeled data\n", "encodings = tokenizer(texts, padding=True, truncation = True, return_tensors='pt')\n", "\n", "# Create a TensorDataset\n", "dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])\n", "\n", "# Move the model to the appropriate device (CPU or GPU)\n", "device = torch.device(\"cuda\" if 
torch.cuda.is_available() else \"cpu\")\n", "pretrained_model.to(device)\n", "\n", "# Initialize the optimizer\n", "optimizer = AdamW(pretrained_model.parameters(), lr=2e-5)\n", "\n", "batch_size = 8\n", "epochs = 3\n", "import math\n", "\n", "# Experiment with different chunk sizes\n", "chunk_sizes_to_try = [200] # Can add more sizes later\n", "\n", "for chunk_size in chunk_sizes_to_try:\n", " for epoch in range(epochs):\n", " tqdm_dataloader = tqdm(DataLoader(dataset, batch_size=batch_size, shuffle=True), desc=f'Epoch {epoch + 1}/{epochs}')\n", "\n", " pretrained_model.train()\n", " for batch in tqdm_dataloader:\n", " input_ids, attention_mask = batch\n", " input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)\n", "\n", " # Calculate number of chunks for current batch\n", " sequence_length = input_ids.size(1) # Get actual sequence length\n", " num_chunks = math.ceil(sequence_length / chunk_size)\n", "\n", " for i in range(num_chunks):\n", " start_idx = i * chunk_size\n", " end_idx = min((i + 1) * chunk_size, sequence_length) # Handle final chunk\n", "\n", " # Extract chunk data\n", " input_ids_chunk = input_ids[:, start_idx:end_idx]\n", " attention_mask_chunk = attention_mask[:, start_idx:end_idx]\n", "\n", " # Forward pass\n", " outputs = pretrained_model(\n", " input_ids_chunk, attention_mask=attention_mask_chunk, labels=input_ids_chunk.reshape(-1)\n", " )\n", "\n", " # Calculate loss\n", " loss = outputs.loss\n", "\n", " # Backward pass and optimization\n", " optimizer.zero_grad()\n", " loss.backward()\n", " optimizer.step()\n", "\n", " # Update progress bar\n", " tqdm_dataloader.set_postfix({'Loss': loss.item(), 'Chunk Size': chunk_size})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kypmxXhz4ybO", "outputId": "a142f965-498a-4f33-ffbb-028f88f27d51" }, "execution_count": 43, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "Some weights of the model checkpoint at sentence-transformers/all-mpnet-base-v2 were not used when initializing MPNetForMaskedLM: ['pooler.dense.weight', 'pooler.dense.bias']\n", "- This IS expected if you are initializing MPNetForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing MPNetForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of MPNetForMaskedLM were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "Epoch 1/3: 100%|██████████| 750/750 [11:46<00:00, 1.06it/s, Loss=0.057, Chunk Size=200]\n", "Epoch 2/3: 100%|██████████| 750/750 [11:47<00:00, 1.06it/s, Loss=0.0571, Chunk Size=200]\n", "Epoch 3/3: 100%|██████████| 750/750 [11:47<00:00, 1.06it/s, Loss=0.0464, Chunk Size=200]\n" ] } ] }, { "cell_type": "code", "source": [ "# Save the fine-tuned model\n", "pretrained_model.save_pretrained('fine_tuned_mpnet')\n", "tokenizer.save_pretrained('fine_tuned_mpnet')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "U-mZPfa8Sipl", "outputId": "fc93a178-aaf4-415b-f8e2-bba93a832052" }, "execution_count": 44, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('fine_tuned_mpnet/tokenizer_config.json',\n", " 'fine_tuned_mpnet/special_tokens_map.json',\n", " 'fine_tuned_mpnet/vocab.txt',\n", " 'fine_tuned_mpnet/added_tokens.json',\n", " 'fine_tuned_mpnet/tokenizer.json')" ] }, "metadata": {}, "execution_count": 44 } ] },
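{ "cell_type": "markdown", "source": [ "**Note on the loss above.** The training loop passes the unmasked `input_ids` as `labels`, so no tokens are ever masked and the loss largely rewards copying tokens the model can already see, which helps explain why the reported loss is so low. Standard masked-language-model domain adaptation instead masks a random fraction of tokens (typically 15%) and scores only those. The next cell is a minimal sketch of that approach using `DataCollatorForLanguageModeling`; it reuses `tokenizer`, `texts`, and `device` from the cells above, and the batch size, learning rate, and epoch count are assumptions carried over from the loop above." ], "metadata": {} }, { "cell_type": "code", "source": [ "# Sketch: MLM fine-tuning with random masking (assumes tokenizer, texts and device from the cells above)\n", "import torch\n", "from torch.utils.data import DataLoader\n", "from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling\n", "\n", "mlm_model = AutoModelForMaskedLM.from_pretrained('sentence-transformers/all-mpnet-base-v2').to(device)\n", "\n", "# The collator pads each batch, selects ~15% of tokens for prediction (most become the mask token)\n", "# and sets labels to -100 everywhere else, so padding and unmasked tokens are ignored by the loss\n", "collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)\n", "\n", "# Tokenize each resume individually; padding is handled per batch by the collator\n", "features = [tokenizer(t, truncation=True, max_length=512) for t in texts]\n", "loader = DataLoader(features, batch_size=8, shuffle=True, collate_fn=collator)\n", "\n", "optimizer = torch.optim.AdamW(mlm_model.parameters(), lr=2e-5)\n", "mlm_model.train()\n", "for epoch in range(3):\n", "    for batch in loader:\n", "        batch = {k: v.to(device) for k, v in batch.items()}\n", "        loss = mlm_model(**batch).loss\n", "        loss.backward()\n", "        optimizer.step()\n", "        optimizer.zero_grad()" ], "metadata": {}, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "Since the end goal is sentence embeddings, the saved encoder can be wrapped back into a `SentenceTransformer` for encoding resumes. The sketch below assumes the `fine_tuned_mpnet` directory written by `save_pretrained` above; the mean-pooling head mirrors the original `all-mpnet-base-v2` setup and is an assumption here, since `save_pretrained` does not store pooling configuration." ], "metadata": {} }, { "cell_type": "code", "source": [ "# Sketch: load the fine-tuned encoder as a SentenceTransformer and embed a few resumes\n", "from sentence_transformers import SentenceTransformer, models\n", "\n", "word_embedding_model = models.Transformer('fine_tuned_mpnet')\n", "pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='mean')\n", "embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])\n", "\n", "# Example usage: embed the first five cleaned resumes\n", "embeddings = embedder.encode(resumes['Resumes'].tolist()[:5], show_progress_bar=True)\n", "print(embeddings.shape)" ], "metadata": {}, "execution_count": null, "outputs": [] } ] }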