File size: 14,280 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install datasets\n",
    "# !huggingface-cli login"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from datasets import load_dataset\n",
    "# load_dataset(\"balochiml/balochi-language-data\", data_dir=\"data\", cache_dir=\"../data\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate the processed data without English characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4294"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "\n",
    "def get_txt_file_paths(directory):\n",
    "    txt_file_paths = []\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            if file.endswith(\".txt\"):\n",
    "                file_path = os.path.join(root, file)\n",
    "                txt_file_paths.append(file_path)\n",
    "    return txt_file_paths\n",
    "\n",
    "\n",
    "# Replace \"directory_path\" with the actual path of the directory you want to search\n",
    "directory_path = \"../data/raw_text\"\n",
    "txt_paths = get_txt_file_paths(directory_path)\n",
    "\n",
    "len(txt_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "\n",
    "def clean_text(file_path):\n",
    "    # Open the file and read it into memory\n",
    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
    "        text = file.read()\n",
    "\n",
    "    # Remove English-language characters and numbers\n",
    "    text = re.sub(r\"[a-zA-Z0-9]\", \"\", text)\n",
    "\n",
    "    # Remove any excess whitespace\n",
    "    text = re.sub(r\"[^\\S\\n]+\", \" \", text)\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "for path in txt_paths:\n",
    "    cleaned_text = clean_text(path)\n",
    "\n",
    "    # write the cleaned text to a new file with an incremented filename\n",
    "    # write the files all into the '../data/processed_text' directory\n",
    "    with open(\n",
    "        f'../data/processed_text/{path.split(\"/\")[-1]}', \"w\", encoding=\"utf-8\"\n",
    "    ) as file:\n",
    "        file.write(cleaned_text)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training a Tokenizer using 🤗 Tokenizers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import Tokenizer\n",
    "from tokenizers.models import BPE\n",
    "\n",
    "tokenizer = Tokenizer(BPE(unk_token=\"[UNK]\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers.pre_tokenizers import Whitespace\n",
    "\n",
    "tokenizer.pre_tokenizer = Whitespace()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers.trainers import BpeTrainer\n",
    "\n",
    "trainer = BpeTrainer(\n",
    "    min_frequency=2,\n",
    "    vocab_size=30000,\n",
    "    special_tokens=[\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"],\n",
    "    show_progress=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4294"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get a list of all the txt files in\n",
    "# '/Users/strickvl/balochi/balochi-tokenizer/data/processed_text'\n",
    "\n",
    "processed_files = get_txt_file_paths(\"../data/processed_text\")\n",
    "assert len(processed_files) == len(txt_paths)\n",
    "len(processed_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tokenizer.train(processed_files, trainer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tokenizers.models.BPE at 0x108eaa830>"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "30000"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.get_vocab_size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tokenizer.get_vocab()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save(\"../models/30k-balochi-tokenizer.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer.from_file(\"../models/30k-balochi-tokenizer.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'گوں ھر کس ءَ جنگ ء ُ مڑ بیت'"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_text = \"      آیک  جناورے اَت۔  لھتے گشیت آ سکیں کارزوالے ات کہ اگاں آزاتی دیگ بہ بیت، بازارءَ، لوگے ءَ، جاگاہ یے  ءَ،دپتر ء ُ کارگس یے  ءَ یا ھر ھما جاگاہ ءَ کہ شُت کنت مزنیں کارزوالی کنت۔گوں ھر کس ءَ جنگ ء ُ مڑ بیت۔گدء ُ پچاں  چنڈ چنڈ ء ُ راڑ راڑ کنت،کاگد ء ُ وانگیاں وارت ء ُ آدراہ کنت۔ورگی چیزاں اگاں وارت نکنت آھاں گٹ پاچیت ھراب کنت۔ایندگہ جناور چہ بندات ء َ ایشی ءِ کازوالیاں چہ وتا دیر دارگ ءِ کوشست کن اَنت۔ چیا کہ آ بازیں دگہ ھرابی ء ُ کارزوالی ھم کنت،پمیشکا کسانیں جناور  بالی مُرگ،کوہ پاچن،آسک ء ُ ایندگہ کسان کسانیں جناورچر آئی ءِ کارزوالیانی سوب ءَ آئی ءَ چہ سک باز شزار اَنت ۔\".replace(\n",
    "    \"\\xa0\", \"\"\n",
    ")\n",
    "sample_sentence = sample_text.split(\"۔\")[2]\n",
    "sample_sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['گوں', 'ھر', 'کس', 'ءَ', 'جنگ', 'ء', 'ُ', 'مڑ', 'بیت']"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.encode(sample_sentence).tokens"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training a custom tokenizer using Spacy and FastAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai.text.all import *\n",
    "files = get_text_files(\"../data/processed_text\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4294"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'*آمیتگءِ جُستءَمکن* لچّہ: *آمیتگءِ جُستءَمکن* آ میتگءَکہ من وتی شوکیں کسانی'"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "txt = files[0].open().read(); txt[:75]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(#146) ['*','آمیتگءِ','جُستءَمکن','*','لچّہ',':','*','آمیتگءِ','جُستءَمکن','*','آ','میتگءَکہ','من','وتی','شوکیں','کسانی','پیر','کُت','آ','میتگءِ','جسُتءَمکن','آ','میتگءِ','گیراں','مبو','بے','اوستیں','تاهیراں','مبو','آ'...]\n"
     ]
    }
   ],
   "source": [
    "spacy = WordTokenizer()\n",
    "toks = first(spacy([txt]))\n",
    "print(coll_repr(toks, 30))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(#147) ['xxbos','*','آمیتگءِ','جُستءَمکن','*','لچّہ',':','*','آمیتگءِ','جُستءَمکن','*','آ','میتگءَکہ','من','وتی','شوکیں','کسانی','پیر','کُت','آ','میتگءِ','جسُتءَمکن','آ','میتگءِ','گیراں','مبو','بے','اوستیں','تاهیراں','مبو','آ'...]\n"
     ]
    }
   ],
   "source": [
    "tkn = Tokenizer(spacy)\n",
    "print(coll_repr(tkn(txt), 31))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "txts = L(o.open().read() for o in files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "def subword(size: int):\n",
    "    sp = SubwordTokenizer(vocab_sz=size)\n",
    "    sp.setup(txts)\n",
    "    return \" \".join(first(sp([txt]))[:40])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'▁* آ می تگ ءِ ▁جُست ءَ م ک ن * ▁لچّہ : ▁* آ می تگ ءِ ▁جُست ءَ م ک ن * ▁آ ▁میتگ ءَ کہ ▁من ▁وتی ▁ش وکیں ▁کس انی ▁پیر ▁کُت ▁آ ▁میتگ ءِ ▁ج'"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subword(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'▁ * آ م ی ت گ ء ِ ▁ ج ُ س ت ء َ م ک ن * ▁ ل چ ّ ہ : ▁ * آ م ی ت گ ء ِ ▁ ج ُ س ت'"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subword(275)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(#147) ['xxbos','*','آمیتگءِ','جُستءَمکن','*','لچّہ',':','*','آمیتگءِ','جُستءَمکن'...]"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "toks200 = txts[:200].map(tkn)\n",
    "toks200[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"(#4096) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','ءَ','ءِ','ءُ','۔','کہ','،','انت','من','اے','نہ','وتی','بیت','”','ات','چہ','گوں','اَنت','اِنت','پہ','بہ','‘','یک','آئی','.','آ','منی','ھم',')','کنت','بلوچی','3','تو','بلے','ئے',':','کنگ','(','بوتگ','آں','کن','؟'...]\""
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num = Numericalize()\n",
    "num.setup(toks200)\n",
    "coll_repr(num.vocab,50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TensorText([ 156, 2340,    0,  156,  563,   43,  156, 2340,    0,  156,   33,\n",
       "               0,   16,   19, 1490,  831,  457,  102,   33, 1031])"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nums = num(toks)[:20]; nums"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'* آمیتگءِ xxunk * لچّہ : * آمیتگءِ xxunk * آ xxunk من وتی شوکیں کسانی پیر کُت آ میتگءِ'"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "' '.join(num.vocab[o] for o in nums)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "balochi",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}