{"cells":[{"cell_type":"markdown","source":["# **Language Translation**"],"metadata":{"id":"bHXiNN5kZzxh"}},{"cell_type":"markdown","source":["This notebook translates text from English to French. I fine-tune a pre-trained model from Hugging Face on a dataset that is also taken from Hugging Face. Each code cell is explained in the markdown cell above it or in its comments."],"metadata":{"id":"NbfRz78aZ5gz"}},{"cell_type":"markdown","source":["First we will check that a GPU is connected so that we can run our program. A GPU is very useful for running programs that work with very large datasets."],"metadata":{"id":"8QJEwxHIZ8cX"}},{"cell_type":"code","source":["!nvidia-smi #checking whether GPU is working or not"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Fet3K5GfHHpQ","executionInfo":{"status":"ok","timestamp":1691774881690,"user_tz":-330,"elapsed":30,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"}},"outputId":"27798fb0-ab40-4291-ec3c-369af9495511"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Fri Aug 11 17:28:17 2023 \n","+-----------------------------------------------------------------------------+\n","| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\n","|-------------------------------+----------------------+----------------------+\n","| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. 
|\n","|===============================+======================+======================|\n","| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n","| N/A 38C P8 9W / 70W | 0MiB / 15360MiB | 0% Default |\n","| | | N/A |\n","+-------------------------------+----------------------+----------------------+\n"," \n","+-----------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=============================================================================|\n","| No running processes found |\n","+-----------------------------------------------------------------------------+\n"]}]},{"cell_type":"code","source":["! pip install -q transformers accelerate sentencepiece gradio datasets evaluate sacrebleu\n","#installing all the necessary libraries"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UF0lJfd5HFXs","executionInfo":{"status":"ok","timestamp":1691774911552,"user_tz":-330,"elapsed":25826,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"}},"outputId":"f308b8a7-3b9e-4b2a-8c75-613e5d25ae17"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.4/7.4 MB\u001b[0m \u001b[31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m244.2/244.2 kB\u001b[0m \u001b[31m23.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m44.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.0/20.0 MB\u001b[0m \u001b[31m76.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.3/519.3 kB\u001b[0m 
\u001b[31m49.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m118.9/118.9 kB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m25.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m81.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m65.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.7/65.7 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m297.4/297.4 kB\u001b[0m \u001b[31m34.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.5/50.5 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m140.3/140.3 kB\u001b[0m \u001b[31m16.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.5/59.5 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m22.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.4/50.4 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.5/46.5 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/41.0 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/41.0 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.5/87.5 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.5/74.5 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Building wheel for ffmpy (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n"]}]},{"cell_type":"markdown","source":["From the datasets library, download the kde4 dataset (available on Hugging Face), which is a sample dataset for language translation, and specify the two languages."],"metadata":{"id":"-qSxRNqSIk_2"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":281,"referenced_widgets":["47ad99f6b46546adad2e835e69c4d5c2","599cd4d0d8e444fc90c40d94ab16c504","6b4500eca72a49669cbaedc926bac7cc","f92eb6b332dc461dbf5754795309b5aa","76787c1a9b9f4ca78fd860a36764cd52","a6028c653d1345919c5b63fe7b7e403e","2078096d72884602af53ba7289da25f7","921dbf6129194020a2a926727cc19026","60f8a144f9084cc390c2699a015e6af1","a7fab938d85c44749f71e51f64d0a2fa","487f71dff3dc4e30a80fd03be79205d0","d2e537aa19134472afd8b50b667e76d8","b9d9af11284d465891c4609974bf5e60","5a612eef7a844341989a75e0c99192b6","ac222746957a4079ba64f1a33b852d5d","2f117da3c67147a888a7ace0b70e8965","04c9884f174b4e3db470980dd0226df2","6d9d8908ce2b492fa1998c617732f8ea","0a6c79eefdfd4662b2bf96361643790b","d46d3f520d4644faaecf6818532d29cb","096c878917f24392a2e58a083326b88a","9f4a5bf5b5974b469b67da39f178b5d9","252301e8994c4d90882b76b3289005ee","7fa0879d2a704872811e42d66f465b5a","70137f4fd1c8401ab2026d30edfb2f6b","803cdfa3ee964950a19be5259a8c307d","11949ce885f84414ba9b6b9dc7f89466","90a8572aca4c48dead2fd960c0e95e54","2ee648f41a0c44f7856dbe625a12e250","0502e0eddfbd481f8b0922aad1657ba1","d27154bd6c0949419d13150872762aa3","1fcd4bac845b4e679188f176c96f1615","7af53643b4c546c5af204580a91fc001","91bfed8a6cc5452db08355373ad53669","ba3149153cef47e8a175ac7b39c5a4fd","eb2152329ceb4d9f88df19c5520d5282","3d8257b9260a43bbb6a08eb27ad9ad85","de8c51bcbc4c4750a62ceb2efbeac1a5","f87961aac78847c99fa26040b8d27e2a","0025ac250d8d477392ad3a9820d58757","797d3247971d4e50b81a812944f28ba4","06acf1d9c3ee43d581515120e0b5a5c4","4586c6897473439a905c44fe9a5969c5","cd212e9d4c034ef481a6db41e7282f44","f6218ea6cff14edc8225c4064ccaac7c","554aa22675444b96bad6b4b284febe
6a","f9884ffd88814aeb9ac6178e106b287a","c5be0d5d00fb40479b0302dbc37a98fc","9d33945752a04ee787e99c8dafa1a224","943a8c3fd15b4a5e9389fa6eacf5bb0e","417a643b51d44836a5e367172fd19d2d","6dd1d82f97bd4dee8064a1f58fc96c1e","3970ba2cb5be4362a2a3eef215cbdf19","1bdacb6ffec34402b339ceeea6fcd5f7","54675d5f5aaf48deafe67be32593a8c8"]},"executionInfo":{"elapsed":62975,"status":"ok","timestamp":1691774974516,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"},"user_tz":-330},"id":"NRnwM8bcnmGP","outputId":"3a7fb86c-697c-40bb-a482-33fa579dc0b3"},"outputs":[{"output_type":"display_data","data":{"text/plain":["Downloading builder script: 0%| | 0.00/4.25k [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"47ad99f6b46546adad2e835e69c4d5c2"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading metadata: 0%| | 0.00/8.45k [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d2e537aa19134472afd8b50b667e76d8"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading readme: 0%| | 0.00/5.10k [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"252301e8994c4d90882b76b3289005ee"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading data: 0%| | 0.00/7.05M [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"91bfed8a6cc5452db08355373ad53669"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Generating train split: 0%| | 0/210173 [00:00, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f6218ea6cff14edc8225c4064ccaac7c"}},"metadata":{}},{"output_type":"execute_result","data":{"text/plain":["DatasetDict({\n"," train: Dataset({\n"," features: ['id', 'translation'],\n"," num_rows: 210173\n"," })\n","})"]},"metadata":{},"execution_count":3}],"source":["from datasets import load_dataset\n","raw_datasets = load_dataset(\"kde4\", lang1=\"en\", lang2=\"fr\")\n","raw_datasets"]},{"cell_type":"markdown","source":["In the original dataset we have only train set but we want both train and test sets to compare our outputs so use train_test_split function to break the dataset into two sets train and test."],"metadata":{"id":"LJfI3lcrJJcm"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1009,"status":"ok","timestamp":1691774975886,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"},"user_tz":-330},"id":"wJ4OnmL0n1YF","outputId":"9636433b-7ad0-4d1e-deb0-c853f9c400a5"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["DatasetDict({\n"," train: Dataset({\n"," features: ['id', 'translation'],\n"," num_rows: 168138\n"," })\n"," test: Dataset({\n"," features: ['id', 'translation'],\n"," num_rows: 42035\n"," })\n","})"]},"metadata":{},"execution_count":4}],"source":["from sklearn.model_selection import train_test_split\n","split_datasets= raw_datasets[\"train\"].train_test_split(test_size=0.2, seed=20)\n","split_datasets"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":672,"status":"ok","timestamp":1691774979320,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"},"user_tz":-330},"id":"x3oI6dujodIT","outputId":"7bd739cc-8a5d-4d02-8aeb-c74b4f8f7a3c"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["{'en': 'If you 
choose the wrong settings here your articles could be unreadable or not sendable at all, so please be careful with these settings.',\n"," 'fr': 'Si vous choisissez ici les mauvais paramètres, vos articles peuvent devenir illisibles ou vous ne pourrez pas du tout les envoyer. Veuillez donc être prudent avec ces paramètres.'}"]},"metadata":{},"execution_count":5}],"source":["split_datasets[\"train\"][45][\"translation\"]"]},{"cell_type":"markdown","source":["Now we will use Helsinki-NLP/opus-mt-en-fr, a pre-trained model from Hugging Face for translating English to French."],"metadata":{"id":"cDdD0lxUJg5K"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":310,"referenced_widgets":["9117f69ef15a43fcb0254ee376f58f2f","8244721f691d4e96adba56a86b3437c0","7dc533f7608942b7a72b3c27741da395","d599c7e5ab9943678fd9501e1110d86a","17f9eb0650df4f78a42d507eef6e5f52","4605fb7b09014a32847d34b864140171","b83f3cf75b1241a8bcf7e62748f7a4db","eeee34d605024c628bdcbdd3dc99d399","272aaa2385f84b3bb93ea2ebe5ee4242","fd014cf6d89f4578a965942218cd1362","f77f1a1ae5cc44008d64f44f992d6f38","98c4a8ae12f6440c85e546202cef5be8","5a499b4fc2ea4b2c925d7f0a30465a26","cf969cae39dd4e28bea810bbe2aff519","a9ecab3510af4fc0bebbc4fb9574a287","b9a6af1794f6487ca61e53e72f51b0da","63100194d974430f8a3a3814f0a2a230","036738472fb945dea98c6aa3c4e3a07b","4a05b9f44522409aa9c73f5bcb5dad16","56d262230a154e648ae648626ab3b62f","4ba943586fbb4c0ba6622865a5224dce","1953875189374b9aa60b67bf6e633369","e15845c66a1544469d33145c7d64c83b","a5a6a772ce00427cb6d2a367324d7ce5","25ba625dfaa447f28a22ec6d4979c29d","44bf3008535043d6811dd35fff66d089","03fa4ed1e8774cbdba3d641283b4ee00","0a967e8e20004c5b8d9bb13fb2c41f8b","0274c2a91cac4838ac26a9116a4f0dab","478048c46c3d48a0821a4a23f7adb440","b6d20d2329e94a4a8983147d405ac255","eb88a1074314454abe14ef1c6c614f7c","69f264748dcf4c95a5eb3bb50f13de68","dcfae2731c6a4e988e51226993f38cef","f3d16551cc1842
eb9d0e69022ab4d1f0","33a04ef08a1b42b4a069493d975dcb65","c07d06ca41a44294b8c86ad5bfdbc2c7","8c3726beef2b4d21a69dad22c87ee609","243f70556f17403b9320e449534281e8","56af055f7103445487dba387c6c5c894","fad61732cd09489a9516ae44de735242","72acb76b240a40898c9e9d3a823a6734","5beaa61ee0c847b994ea6fe254a7e88e","e8f30a97ae20499fa60843a2f2e86774","a60bbc988839494293250f5413aecd1e","10a7a0c0eef64dbe86cce109b457dc8d","eebc5f8f167b4e8ca17e6728ee4a50e6","8d0217cd673743e583869ea4d0c2740b","9a0d5702db6643e1b1e90eefcc2af0d7","4f3a1ec8fcf241c68d9f461dca8aa3de","5c68f0bb58fa44638da17a2d7511b68d","abfcfa35d926468984e8026d7d945ed0","d874ad80d82b49569687fd3511fd2059","61904c2a529d415cbae2466d2cdc31f5","70c8121c1afd4642996b25ff63c19f95","7a9f5b8b706545b99f5e85d2dc70b72e","d93ee4c78bd64cdd8d98fad819ffd494","703c52f9c99b4c9c874f79bdac98e73f","60a81ee8f89348aaa8838b4fec1f0fb0","ed2800f4da6c47ac89baace03d73c32a","436aad457f634fa9b312117b629eb5fc","d7d64221386c430d8e9fac40977b266a","3a6d7ba70e614ee68469d4084b139529","f5aa15cecc1b49f2818b3e3cf0d39269","01cc9e7eba5a407b8c72a118490eb7c4","f8445bada66c4ad0aa201bd66d2dee31","c2232ae80d0c4cae939c0bfce888dc95","177ece608d1f4c15b28cccf0199d85fa","b1bd4f7b4862491d99c87dd2751270ae","16f0ef6041cf48e6a39b1cc1853d3744","b08d8b14f2cf41ab83b440841f8904a9","96fae90c2e0448369e619b288b687936","177e9557d0204d099699bf36488f27d2","7d455e831513485bb42e881076af121c","9604e538c8c94b3cbecba2fa5d54e3e1","458b8db0e6684afbbb0c839704a0afa9","2037546f25b3498eb952d13bc7f910a6"]},"executionInfo":{"elapsed":21003,"status":"ok","timestamp":1691775023800,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"},"user_tz":-330},"id":"X1yfmRCGo41_","outputId":"fc6c3d38-1eb5-40ea-c264-ae76398e06cc"},"outputs":[{"output_type":"display_data","data":{"text/plain":["Downloading (…)lve/main/config.json: 0%| | 0.00/1.42k [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"9117f69ef15a43fcb0254ee376f58f2f"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading pytorch_model.bin: 0%| | 0.00/301M [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"98c4a8ae12f6440c85e546202cef5be8"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading (…)neration_config.json: 0%| | 0.00/293 [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"e15845c66a1544469d33145c7d64c83b"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading (…)okenizer_config.json: 0%| | 0.00/42.0 [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"dcfae2731c6a4e988e51226993f38cef"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading (…)olve/main/source.spm: 0%| | 0.00/778k [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a60bbc988839494293250f5413aecd1e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading (…)olve/main/target.spm: 0%| | 0.00/802k [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"7a9f5b8b706545b99f5e85d2dc70b72e"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Downloading (…)olve/main/vocab.json: 0%| | 0.00/1.34M [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c2232ae80d0c4cae939c0bfce888dc95"}},"metadata":{}},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.10/dist-packages/transformers/models/marian/tokenization_marian.py:194: UserWarning: Recommended: pip install sacremoses.\n"," warnings.warn(\"Recommended: pip install 
sacremoses.\")\n"]},{"output_type":"execute_result","data":{"text/plain":["[{'translation_text': \"Si vous choisissez les mauvais paramètres ici, vos articles pourraient être illisibles ou ne pas être envoyés du tout, alors s'il vous plaît soyez prudent avec ces paramètres.\"}]"]},"metadata":{},"execution_count":6}],"source":["from transformers import pipeline\n","model=\"Helsinki-NLP/opus-mt-en-fr\"\n","translator=pipeline(\"translation\", model=model)\n","translator(\"If you choose the wrong settings here your articles could be unreadable or not sendable at all, so please be careful with these settings.\")"]},{"cell_type":"markdown","source":["We can see that the pre-trained model already gives a fairly good translation, and after fine-tuning it should get better."],"metadata":{"id":"3Qi73zBYJ8Rb"}},{"cell_type":"markdown","source":["Now use AutoTokenizer to load the same tokenizer that the pre-trained model uses, so that we can apply it to our dataset."],"metadata":{"id":"EC1cwrWTKIB_"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"kSNwMIrNsN0U"},"outputs":[],"source":["from transformers import AutoTokenizer\n","tokenizer=AutoTokenizer.from_pretrained(model, return_tensors=\"pt\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"_yokqgMF1HBS"},"outputs":[],"source":["def pre_processtext(text):\n"," inputs=[sample['en'] for sample in text['translation']]\n"," output=[sample['fr'] for sample in text['translation']]\n"," tokenized_text=tokenizer(inputs, text_target=output, max_length=128, truncation=True) #(text_target because if not done it will tokenize the french sentence according to english and so the labels will then not be correct)\n"," return 
tokenized_text"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"bp6Sv37y2AYm","colab":{"base_uri":"https://localhost:8080/","height":81,"referenced_widgets":["77a8cb1fcf634d9f94a40d17b107a3c5","ea4cd85e6e4d436ab8beba037fa2e719","8239b082d84a4c42befc11ca0a023058","1c1ac3cdcd604dcdab98cb6feccab3c6","bd3e6dcbbaa4483d8574a516c712add9","209365ee684a4de8b5111855ed7f7260","82e40eaa5699457cbbcdbb8176650cbd","3ac8bab967a8464e915927b7c3ad89fd","3bf91a6310014f1381ba48f7dc5ca5cf","c97e350975f14e0c89de2be2caa0caf6","ace5d1e2e85f464c87ed73f7633bc2df","77f7e7f0477a4c9aad516ebca4122a48","e5a3ebdd56ca43799b52c5a59940be29","5a50b270da494b6c8f51d7cc4a8083ff","1520062f580e4f748b09dbd57c30341b","fef37f76703542608ea17b2aad2758f1","fdca6ce44fd94cc6aac74e91adac2652","1e0722ba2a6a44249956f0f78bb3025d","6cbf4268db3448b4afb04a971b4a973e","ef89ecbd5a0b45ea8ba097db10020d80","e70c8edfad2b46138101b5fc5b00e490","df9cc7604ce7441ca8cf597666721796"]},"executionInfo":{"status":"ok","timestamp":1691775094027,"user_tz":-330,"elapsed":42427,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"}},"outputId":"6edb2f29-3d1d-4c54-ecf8-9ef89cd6feac"},"outputs":[{"output_type":"display_data","data":{"text/plain":["Map: 0%| | 0/168138 [00:00, ? examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"77a8cb1fcf634d9f94a40d17b107a3c5"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Map: 0%| | 0/42035 [00:00, ? 
examples/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"77f7e7f0477a4c9aad516ebca4122a48"}},"metadata":{}}],"source":["tokenized_datasets=split_datasets.map(\n"," pre_processtext,\n"," batched=True,\n"," remove_columns=split_datasets[\"train\"].column_names #(to remove extra columns)\n",")"]},{"cell_type":"markdown","source":["Now after preprocessing we have to choose a model to train and we will be using the AutoModelForSeq2SeqLM"],"metadata":{"id":"NHWsFEfRK5QJ"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"LE8WLR177Lv-"},"outputs":[],"source":["from transformers import AutoModelForSeq2SeqLM\n","model_1= AutoModelForSeq2SeqLM.from_pretrained(model)"]},{"cell_type":"markdown","source":["Data collator is also an important tool which is used for dynamic padding and adding -100 to short sentences to make it to match max length and also adding a start of sentence token which can be seen in decoder_input_ids."],"metadata":{"id":"61NDGQXwLLyw"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"KCehvmtU4Wd0"},"outputs":[],"source":["from transformers import DataCollatorForSeq2Seq\n","data_collator=DataCollatorForSeq2Seq(tokenizer,model=model_1)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":11,"status":"ok","timestamp":1691775123256,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"},"user_tz":-330},"id":"eNs1nUEq5ddQ","outputId":"e0f7fe5b-38df-468a-d68d-d5a56135e7df"},"outputs":[{"output_type":"stream","name":"stdout","text":["dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])\n","tensor([[25966, 19, 540, 8, 669, 33355, 24, 11106, 37, 583,\n"," 583, 9507, 10571, 3, 49, 19015, 3, 49, 1937, 74,\n"," 2635, 973, 529, 13518, 74, 102, 0],\n"," [14743, 301, 548, 0, -100, -100, -100, -100, -100, -100,\n"," -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100,\n"," -100, -100, -100, -100, -100, -100, -100]])\n"]},{"output_type":"execute_result","data":{"text/plain":["tensor([[59513, 25966, 19, 540, 8, 669, 33355, 24, 11106, 37,\n"," 583, 583, 9507, 10571, 3, 49, 19015, 3, 49, 1937,\n"," 74, 2635, 973, 529, 13518, 74, 102],\n"," [59513, 14743, 301, 548, 0, 59513, 59513, 59513, 59513, 59513,\n"," 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,\n"," 59513, 59513, 59513, 59513, 59513, 59513, 59513]])"]},"metadata":{},"execution_count":12}],"source":["batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(1,3)])\n","print(batch.keys())\n","print(batch['labels'])\n","batch['decoder_input_ids']"]},{"cell_type":"markdown","source":["To evaluate our model we will use the sacrebleu score. It does not check the sentences grammatically; instead it measures how many words of the predicted translation match the reference translation, and it penalises a word that is repeated more times than it appears in the reference translation."],"metadata":{"id":"5vwR9WbxLfvi"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"0I9_9UOs9-CV","colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["c83a1dc94db445c5a7c638f167cd9c4b","5898e6600d01451e8cb1c0f774840e62","44f5cea4f52e4dd1b65987a47db0ff0d","21a5f74a35954700961b87e520bf1f53","966235471ec1455aa23f6b87c009cc83","1138ccbcab194d25bc548956ecd125fd","713a1e32023c47fcbfc6adfa01918b0b","c1516ee579e644ec85874eef6e96bb63","f94fa6400f764c8bafef7beb5b6d4721","b35e6ff8dc62453586e4d975f0012f47","7939fab4575e4a84954e8ea8aa1f3923"]},"executionInfo":{"status":"ok","timestamp":1691775125698,"user_tz":-330,"elapsed":1117,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"}},"outputId":"ba993da2-ef3e-4658-8cf6-2dc37a9dcfbf"},"outputs":[{"output_type":"display_data","data":{"text/plain":["Downloading builder script: 0%| | 0.00/8.15k [00:00, 
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c83a1dc94db445c5a7c638f167cd9c4b"}},"metadata":{}}],"source":["import evaluate\n","metric_evaluate= evaluate.load(\"sacrebleu\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Lgo74zb7-9ds"},"outputs":[],"source":["import numpy as np\n","\n","def compute_metrics(eval):\n"," preds, labels= eval\n"," if isinstance(preds, tuple): #if model returns more than the prediction logits\n"," preds=preds[0]\n"," decoded_preds= tokenizer.batch_decode(preds, skip_special_tokens=True)\n","\n"," labels=np.where(labels != -100, labels,tokenizer.pad_token_id) #replacing -100 as we will not be able to decode them\n"," decoded_labels=tokenizer.batch_decode(labels, skip_special_tokens=True)\n","\n"," decoded_preds=[pred.strip() for pred in decoded_preds]\n"," decoded_labels=[[label.strip()] for label in decoded_labels] #references should be list of list of sentences\n","\n"," result=metric_evaluate.compute(predictions=decoded_preds, references=decoded_labels)\n"," return {\"bleu\": result[\"score\"]}\n"]},{"cell_type":"markdown","source":["I will be storing my model in hugging face repository so login to 
HuggingFace"],"metadata":{"id":"hVC-1fpLMBXY"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":145,"referenced_widgets":["12fe79894f0b49e2be0ca6b72422f1ea","7ffe7032b19c4cbab2890a1f920b9588","80119334d4d644a396a2a5f048cacd03","c0d57450bc2245be8ef31cbc39b6fffe","f8bf48097d4b4f449f6ccf9bfbed9821","e9681f3e3b574506a4de6e9a23f924ef","e1f7af41736542d6871430b3fac84f08","bbee0a93bc97428d8c5c40f977daa635","22d53f4c540540be8e49a5b32b23b9c2","5be6d1c9a30d4303a434183ee1b5dc42","991d889ea2744f84b06612e1f9fcd62c","883f5116e1424d058b0e5747480918a1","f57c78ca494048e8ac75ef3b53fdb6fc","945f7d423f104e11b2e94769b18d5275","93c01b2ac77340479c1531f58d1de6da","1ddcdda83ece47539ec20474a602cd4c","d927636098284e7a811cb30457571521","1d993f56cd57496f8bcb2e29c30eaac6","7ed7c6ba14cd4cefbad1d88846343c35","32c7fc26844a440f80441c181ca40f32","e31f5f5666b84773bc2a689f213c16ce","74cebf3739ef4967a29c4e30a27201a4","7abc7ea390774fc687519f1360aa1f84","26e2e3cc769a47dd82bde9ae4734ab4d","d2e26a203c554f4aa8f4c0488745a14a","bcd6dc0f2c6a4f2694b114cc0ffbcddd","5a6c2a2652d541868d54b5807ccd2998","4996b5af45fd44fe940324c01677a5ad","dae3f7ddecd648dab23b6834516cc2cc","04212a652fe24082b8bb9636cd3a4c6e","b588deb82f954cc7adf3b38b75f7caa6","7a3fce417c7a438ca4af1a5327721d0d"]},"executionInfo":{"elapsed":15,"status":"ok","timestamp":1691775127906,"user":{"displayName":"Suyash Shirish Gahankari","userId":"03254191982788877845"},"user_tz":-330},"id":"Z2br-fqVBA5r","outputId":"58becea7-4bd3-4ecb-d74f-9ad226a2c833"},"outputs":[{"output_type":"display_data","data":{"text/plain":["VBox(children=(HTML(value='
Step | \n","Training Loss | \n","
---|---|
500 | \n","1.378900 | \n","
1000 | \n","1.211500 | \n","
1500 | \n","1.162500 | \n","
2000 | \n","1.112500 | \n","
2500 | \n","1.072500 | \n","
3000 | \n","1.039900 | \n","
3500 | \n","1.024300 | \n","
4000 | \n","1.010100 | \n","
4500 | \n","0.981200 | \n","
5000 | \n","0.982100 | \n","
5500 | \n","0.918800 | \n","
6000 | \n","0.871900 | \n","
6500 | \n","0.852400 | \n","
7000 | \n","0.862000 | \n","
7500 | \n","0.869900 | \n","
8000 | \n","0.854800 | \n","
8500 | \n","0.832100 | \n","
9000 | \n","0.842000 | \n","
9500 | \n","0.852900 | \n","
10000 | \n","0.840000 | \n","
10500 | \n","0.830200 | \n","
11000 | \n","0.760900 | \n","
11500 | \n","0.763300 | \n","
12000 | \n","0.755700 | \n","
12500 | \n","0.777900 | \n","
13000 | \n","0.755600 | \n","
13500 | \n","0.758600 | \n","
14000 | \n","0.760700 | \n","
14500 | \n","0.758900 | \n","
15000 | \n","0.761500 | \n","
15500 | \n","0.773400 | \n","
"],"text/plain":["
Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.