{ "cells": [ { "cell_type": "code", "execution_count": 4, "id": "3a55acf6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/workspace/xls-r-300m-te'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "os.getcwd()" ] }, { "cell_type": "code", "execution_count": 6, "id": "8491f5f9", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 7, "id": "fed9879a", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dc35d55b7a9444128bb348a38969453f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/3.92k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset samanantar/te to /workspace/.cache/huggingface/datasets/ai4bharat___samanantar/te/0.3.0/556308f80c011cb3c32f3de18199d7b1e4cf9ca707843c92bb0bede0e47a8bd6...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "de6defc6eb934d87ab8a18cd4fe2a04d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/4.60G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0 examples [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset samanantar downloaded and prepared to /workspace/.cache/huggingface/datasets/ai4bharat___samanantar/te/0.3.0/556308f80c011cb3c32f3de18199d7b1e4cf9ca707843c92bb0bede0e47a8bd6. 
Subsequent calls will reuse this data.\n" ] } ], "source": [ "dataset = load_dataset(\"ai4bharat/samanantar\", \"te\", split=\"train\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "5c478941", "metadata": {}, "outputs": [], "source": [ "# Characters that extract_text removes. Raw string: the regex escapes must reach `re` unchanged.\n", "chars_to_ignore_regex = r'[,?.!\\-\\;\\:\"“%‘”�—’…–]'" ] }, { "cell_type": "code", "execution_count": 17, "id": "abf69ac9", "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "def extract_text(batch):\n", "    \"\"\"Add a `text` field: the example's `tgt` string lower-cased, with every\n", "    character matched by `chars_to_ignore_regex` removed. Returns the same dict.\"\"\"\n", "    text = batch[\"tgt\"]\n", "    batch[\"text\"] = re.sub(chars_to_ignore_regex, \"\", text.lower())\n", "    return batch" ] }, { "cell_type": "code", "execution_count": 16, "id": "6b4d0c6c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'వర్షాలకు చేతికి వచ్చిన పంట దెబ్బతిన్నదని రైతులు వాపోతున్నారు'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset[0]['tgt']" ] }, { "cell_type": "code", "execution_count": 18, "id": "710de6ce", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cc51f1d8191c4118b9281727e6ec4b63", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/4661986 [00:00, ?ex/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset = dataset.map(extract_text, remove_columns=dataset.column_names)" ] }, { "cell_type": "code", "execution_count": 19, "id": "bd4c05b4", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a0a50384591d42489963b8990624ab95", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='