Spaces:

syubraj
/

syubrajRomanEng2Nep-v2

Sleeping

App Files Files Community

syubraj committed on Oct 2, 2024

Commit

463cc27

verified ·

1 Parent(s): 0811fc9

Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

Notebook/01_romaneng2nep-translation.ipynb +1 -0
Notebook/02_RomanEng2Nep_Transliteration_v2.ipynb +0 -0
Notebook/dataset-conversion.ipynb +1 -0
README.md +16 -8
app.py +35 -0
requirements.txt +5 -0

Notebook/01_romaneng2nep-translation.ipynb ADDED Viewed

	@@ -0,0 +1 @@

+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[],"dockerImageVersionId":30762,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install transformers datasets evaluate sacrebleu","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2024-09-22T16:50:45.081814Z","iopub.execute_input":"2024-09-22T16:50:45.082178Z","iopub.status.idle":"2024-09-22T16:51:00.847186Z","shell.execute_reply.started":"2024-09-22T16:50:45.082142Z","shell.execute_reply":"2024-09-22T16:51:00.846189Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from huggingface_hub import notebook_login\n\nnotebook_login()","metadata":{"execution":{"iopub.status.busy":"2024-09-22T16:53:01.731576Z","iopub.execute_input":"2024-09-22T16:53:01.732612Z","iopub.status.idle":"2024-09-22T16:53:02.070092Z","shell.execute_reply.started":"2024-09-22T16:53:01.732561Z","shell.execute_reply":"2024-09-22T16:53:02.068675Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from datasets import load_dataset\n\ndata = load_dataset(\"syubraj/roman2nepali-transliteration\")","metadata":{"execution":{"iopub.status.busy":"2024-09-22T16:53:19.592468Z","iopub.execute_input":"2024-09-22T16:53:19.592905Z","iopub.status.idle":"2024-09-22T16:53:27.147254Z","shell.execute_reply.started":"2024-09-22T16:53:19.592866Z","shell.execute_reply":"2024-09-22T16:53:27.146252Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"data = 
data['train'].train_test_split(test_size=0.02)","metadata":{"execution":{"iopub.status.busy":"2024-09-22T16:53:27.159658Z","iopub.execute_input":"2024-09-22T16:53:27.159978Z","iopub.status.idle":"2024-09-22T16:53:28.254948Z","shell.execute_reply.started":"2024-09-22T16:53:27.159945Z","shell.execute_reply":"2024-09-22T16:53:28.254138Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from transformers import AutoTokenizer\n\ncheckpoint = \"google-t5/t5-small\"\ntokenizer_checkpoint = \"FacebookAI/xlm-roberta-base\"\ntokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)","metadata":{"execution":{"iopub.status.busy":"2024-09-22T16:53:28.257447Z","iopub.execute_input":"2024-09-22T16:53:28.257770Z","iopub.status.idle":"2024-09-22T16:53:33.442359Z","shell.execute_reply.started":"2024-09-22T16:53:28.257734Z","shell.execute_reply":"2024-09-22T16:53:33.441518Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"source_lang = 'roman'\ntarget_lang = 'nepali'\nprefix = \"translate Roman to Nepali: \"\n\n\ndef preprocess_function(examples):\n inputs = [prefix + example[source_lang] for example in examples[\"translation\"]]\n targets = [example[target_lang] for example in examples[\"translation\"]]\n model_inputs = tokenizer(inputs, text_target=targets, max_length=30, truncation=True)\n return model_inputs","metadata":{"execution":{"iopub.status.busy":"2024-09-22T16:53:33.443485Z","iopub.execute_input":"2024-09-22T16:53:33.443837Z","iopub.status.idle":"2024-09-22T16:53:33.449493Z","shell.execute_reply.started":"2024-09-22T16:53:33.443801Z","shell.execute_reply":"2024-09-22T16:53:33.448556Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"tokenized_data = data.map(preprocess_function, batched=True)\nprint(\"Data mapping 
done\")","metadata":{"execution":{"iopub.status.busy":"2024-09-22T16:56:35.660144Z","iopub.execute_input":"2024-09-22T16:56:35.660919Z","iopub.status.idle":"2024-09-22T17:00:01.565246Z","shell.execute_reply.started":"2024-09-22T16:56:35.660877Z","shell.execute_reply":"2024-09-22T17:00:01.564308Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from transformers import DataCollatorForSeq2Seq\n\ndata_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)","metadata":{"execution":{"iopub.status.busy":"2024-09-22T17:00:06.956005Z","iopub.execute_input":"2024-09-22T17:00:06.956402Z","iopub.status.idle":"2024-09-22T17:00:19.227805Z","shell.execute_reply.started":"2024-09-22T17:00:06.956363Z","shell.execute_reply":"2024-09-22T17:00:19.226974Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import evaluate\n\nmetric = evaluate.load(\"sacrebleu\")","metadata":{"execution":{"iopub.status.busy":"2024-09-22T17:00:26.591170Z","iopub.execute_input":"2024-09-22T17:00:26.591860Z","iopub.status.idle":"2024-09-22T17:00:29.284578Z","shell.execute_reply.started":"2024-09-22T17:00:26.591822Z","shell.execute_reply":"2024-09-22T17:00:29.283684Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import numpy as np\n\n\ndef postprocess_text(preds, labels):\n preds = [pred.strip() for pred in preds]\n labels = [[label.strip()] for label in labels]\n\n return preds, labels\n\n\ndef compute_metrics(eval_preds):\n preds, labels = eval_preds\n if isinstance(preds, tuple):\n preds = preds[0]\n decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n\n labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n\n decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)\n\n result = metric.compute(predictions=decoded_preds, references=decoded_labels)\n 
result = {\"bleu\": result[\"score\"]}\n\n prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]\n result[\"gen_len\"] = np.mean(prediction_lens)\n result = {k: round(v, 4) for k, v in result.items()}\n return result","metadata":{"execution":{"iopub.status.busy":"2024-09-22T17:00:38.702200Z","iopub.execute_input":"2024-09-22T17:00:38.702924Z","iopub.status.idle":"2024-09-22T17:00:38.712287Z","shell.execute_reply.started":"2024-09-22T17:00:38.702882Z","shell.execute_reply":"2024-09-22T17:00:38.711336Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer\n\nmodel = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)","metadata":{"execution":{"iopub.status.busy":"2024-09-22T17:00:40.951814Z","iopub.execute_input":"2024-09-22T17:00:40.952201Z","iopub.status.idle":"2024-09-22T17:00:46.435083Z","shell.execute_reply.started":"2024-09-22T17:00:40.952163Z","shell.execute_reply":"2024-09-22T17:00:46.434107Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import wandb\n\ntry:\n from kaggle_secrets import UserSecretsClient\n user_secrets = UserSecretsClient()\n api_key = user_secrets.get_secret(\"wandb_api\")\n wandb.login(key=api_key)\n anony = None\nexcept:\n anony = \"must\"\n print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
\\nGet your W&B access token from here: https://wandb.ai/authorize')","metadata":{"execution":{"iopub.status.busy":"2024-09-22T17:00:46.437207Z","iopub.execute_input":"2024-09-22T17:00:46.438049Z","iopub.status.idle":"2024-09-22T17:00:49.392150Z","shell.execute_reply.started":"2024-09-22T17:00:46.437981Z","shell.execute_reply":"2024-09-22T17:00:49.391318Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"training_args = Seq2SeqTrainingArguments(\n output_dir=\"/kaggle/working/romaneng2nep/\",\n eval_strategy=\"epoch\",\n learning_rate=2e-5,\n per_device_train_batch_size=16,\n per_device_eval_batch_size=16,\n weight_decay=0.01,\n lr_scheduler = linear,\n save_total_limit=3,\n num_train_epochs=1,\n predict_with_generate=True,\n fp16=True,\n report_to = 'wandb'\n push_to_hub = True,\n)\n\ntrainer = Seq2SeqTrainer(\n model=model,\n args=training_args,\n train_dataset=tokenized_data[\"train\"],\n eval_dataset=tokenized_data[\"test\"],\n tokenizer=tokenizer,\n data_collator=data_collator,\n compute_metrics=compute_metrics,\n)\n","metadata":{"execution":{"iopub.status.busy":"2024-09-22T17:01:55.818916Z","iopub.execute_input":"2024-09-22T17:01:55.819976Z","iopub.status.idle":"2024-09-22T17:01:55.981315Z","shell.execute_reply.started":"2024-09-22T17:01:55.819917Z","shell.execute_reply":"2024-09-22T17:01:55.980163Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"trainer.train()","metadata":{"execution":{"iopub.status.busy":"2024-09-22T17:01:57.802474Z","iopub.execute_input":"2024-09-22T17:01:57.803390Z","iopub.status.idle":"2024-09-22T17:03:13.942728Z","shell.execute_reply.started":"2024-09-22T17:01:57.803348Z","shell.execute_reply":"2024-09-22T17:03:13.940096Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"trainer.push_to_hub('syubraj/romanized_english_2_nepali')","metadata":{},"execution_count":null,"outputs":[]}]}

Notebook/02_RomanEng2Nep_Transliteration_v2.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

Notebook/dataset-conversion.ipynb ADDED Viewed

	@@ -0,0 +1 @@

+ {"cells":[{"cell_type":"code","execution_count":1,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-09-22T15:43:44.884747Z","iopub.status.busy":"2024-09-22T15:43:44.884016Z","iopub.status.idle":"2024-09-22T15:43:53.003699Z","shell.execute_reply":"2024-09-22T15:43:53.002880Z","shell.execute_reply.started":"2024-09-22T15:43:44.884711Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"10615553125b47bbb283c6d15b9d8ac3","version_major":2,"version_minor":0},"text/plain":["Downloading readme: 0%| | 0.00/624 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6ee918c192bf476fb66c29742418b4ca","version_major":2,"version_minor":0},"text/plain":["Downloading data: 0%| | 0.00/86.1M [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"7d7dcb663d2747d5bd509b4931809910","version_major":2,"version_minor":0},"text/plain":["Downloading data: 0%| | 0.00/94.2k [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"1a364f3f533f4edfa042201e1825d207","version_major":2,"version_minor":0},"text/plain":["Generating train split: 0%| | 0/2397414 [00:00<?, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"76d84909e197482ea32cdd4a4e035ee3","version_major":2,"version_minor":0},"text/plain":["Generating validation split: 0%| | 0/2804 [00:00<?, ? 
examples/s]"]},"metadata":{},"output_type":"display_data"}],"source":["from datasets import load_dataset\n","\n","ds = load_dataset(\"Saugatkafley/Nepali-Roman-Transliteration\")"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T15:48:28.037465Z","iopub.status.busy":"2024-09-22T15:48:28.036523Z","iopub.status.idle":"2024-09-22T15:48:28.042501Z","shell.execute_reply":"2024-09-22T15:48:28.041586Z","shell.execute_reply.started":"2024-09-22T15:48:28.037419Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['unique_identifier', 'native word', 'english word'],\n"," num_rows: 2397414\n"," })\n"," validation: Dataset({\n"," features: ['unique_identifier', 'native word', 'english word'],\n"," num_rows: 2804\n"," })\n","})\n"]}],"source":["print(ds)"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T15:51:52.459691Z","iopub.status.busy":"2024-09-22T15:51:52.458718Z","iopub.status.idle":"2024-09-22T15:51:52.495039Z","shell.execute_reply":"2024-09-22T15:51:52.493900Z","shell.execute_reply.started":"2024-09-22T15:51:52.459633Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"d15ba4abe5c342d5afe42cc8959365bb","version_major":2,"version_minor":0},"text/plain":["VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"]},"metadata":{},"output_type":"display_data"}],"source":["# !pip install huggingface\n","\n","from huggingface_hub import 
notebook_login\n","\n","notebook_login()"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T16:06:46.498144Z","iopub.status.busy":"2024-09-22T16:06:46.497197Z","iopub.status.idle":"2024-09-22T16:09:05.290568Z","shell.execute_reply":"2024-09-22T16:09:05.288867Z","shell.execute_reply.started":"2024-09-22T16:06:46.498097Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["100%|██████████| 2397414/2397414 [02:11<00:00, 18234.37it/s]\n","100%|██████████| 2804/2804 [00:00<00:00, 19079.34it/s]\n"]}],"source":["from datasets import DatasetDict, Dataset\n","from tqdm import tqdm\n","\n","\n","def transform_dataset(dataset):\n"," # Create a list to hold our transformed data\n"," transformed_data = []\n"," \n"," for example in tqdm(dataset):\n","# # Generate a random 5-digit ID (you may want to use a more robust method)\n","# random_id = str(random.randint(10000, 99999))\n"," \n"," transformed_example = {\n"," 'id': example['unique_identifier'],\n"," 'translation': {\n"," 'roman': example['english word'],\n"," 'nepali': example['native word'] \n"," }\n"," }\n"," transformed_data.append(transformed_example)\n"," \n"," # Create a new dataset from our transformed data\n"," return Dataset.from_list(transformed_data)\n","\n","transformed_train = transform_dataset(ds['train'])\n","transformed_validation = transform_dataset(ds['validation'])"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T16:09:19.328280Z","iopub.status.busy":"2024-09-22T16:09:19.327739Z","iopub.status.idle":"2024-09-22T16:09:19.350833Z","shell.execute_reply":"2024-09-22T16:09:19.349569Z","shell.execute_reply.started":"2024-09-22T16:09:19.328241Z"},"trusted":true},"outputs":[],"source":["transformed_dataset = DatasetDict({\n"," 'train': transformed_train,\n"," 'validation': 
transformed_validation\n","})"]},{"cell_type":"code","execution_count":17,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T16:09:23.069359Z","iopub.status.busy":"2024-09-22T16:09:23.068241Z","iopub.status.idle":"2024-09-22T16:09:23.074713Z","shell.execute_reply":"2024-09-22T16:09:23.073749Z","shell.execute_reply.started":"2024-09-22T16:09:23.069316Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['id', 'translation'],\n"," num_rows: 2397414\n"," })\n"," validation: Dataset({\n"," features: ['id', 'translation'],\n"," num_rows: 2804\n"," })\n","})\n"]}],"source":["print(transformed_dataset)"]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T16:09:33.982265Z","iopub.status.busy":"2024-09-22T16:09:33.981439Z","iopub.status.idle":"2024-09-22T16:09:42.214228Z","shell.execute_reply":"2024-09-22T16:09:42.213022Z","shell.execute_reply.started":"2024-09-22T16:09:33.982224Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"c0871bd3cfd84bbfa54346500196cc28","version_major":2,"version_minor":0},"text/plain":["Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"a1d23f24909544b19842206be23976a5","version_major":2,"version_minor":0},"text/plain":["Creating parquet from Arrow format: 0%| | 0/2398 [00:00<?, ?ba/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"641a18429d8c4fcfacdd568aa5e9e9ad","version_major":2,"version_minor":0},"text/plain":["Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3b6e0b99110644a29d97aba3ee927aa3","version_major":2,"version_minor":0},"text/plain":["Creating parquet from 
Arrow format: 0%| | 0/3 [00:00<?, ?ba/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"081765380fb64a8eb0e6122590532a0f","version_major":2,"version_minor":0},"text/plain":["README.md: 0%| | 0.00/683 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["{'id': 'nep1', 'translation': {'nepali': 'मुस्कुराउँदै', 'roman': 'muskuraundai'}}\n","{'id': 'nep1', 'translation': {'nepali': 'सर्वसाधारणसम्मले', 'roman': 'sarwasadharansammale'}}\n"]}],"source":["# Save the transformed dataset\n","transformed_dataset.push_to_hub('syubraj/roman2nepali-transliteration')\n","\n","# To verify the transformation, you can load a few examples:\n","print(transformed_dataset['train'][0])\n","print(transformed_dataset['validation'][0])"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30761,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.0"}},"nbformat":4,"nbformat_minor":4}

README.md CHANGED Viewed

@@ -1,12 +1,20 @@
 ---
-title: SyubrajRomanEng2Nep V2
-emoji: 🌖
-colorFrom: gray
-colorTo: yellow
-sdk: gradio
-sdk_version: 4.44.1
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: syubrajRomanEng2Nep-v2
 app_file: app.py
+sdk: gradio
+sdk_version: 4.44.0
 ---
+## Steps
+### 1. Clone this repo
+```
+git clone git@github.com:yubraaj11/RomanEng2Nep.git
+```
+### 2. Install requirements
+```
+pip install -r requirements.txt
+```
+### 3. Run the Gradio App
+```
+python app.py
+```

app.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import gradio as gr
+from transformers import AutoTokenizer, MT5ForConditionalGeneration
+# Load tokenizer and model
+checkpoint = "syubraj/RomanEng2Nep-v2"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = MT5ForConditionalGeneration.from_pretrained(checkpoint)
+# Set max sequence length
+max_seq_len = 20
+# Define the translation function
+def translate(text):
+    # Tokenize the input text with a max length of 20
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_seq_len)
+    # Generate translation
+    translated = model.generate(**inputs)
+    # Decode the translated tokens back to text
+    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
+    return translated_text
+# Gradio interface
+iface = gr.Interface(
+    fn=translate,  # function to use for inference
+    inputs="text",  # input type
+    outputs="text",  # output type
+    title="Romanized English to Nepali Transliterator",
+    description="Translate Romanized English text into Nepali.",
+    examples=[["ahile"],["prakriti"], ["mahasagar"], ["pradarshan"]]
+)
+# Launch the Gradio app
+iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+transformers==4.45.1
+gradio==4.44.0
+protobuf==5.28.2
+sentencepiece==0.2.0
+torch