dnzblgn committed on
Commit f672048
1 Parent(s): c6b852c

Upload 2 files

Files changed (3)
  1. .gitattributes +1 -0
  2. deu_deu.csv +3 -0
  3. machine_translation.ipynb +197 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+deu_deu.csv filter=lfs diff=lfs merge=lfs -text
deu_deu.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9187e5fb17d498a9b8d75ce2d3ac73079ceeb9aa8fa156c386a398fb0a3346e4
+size 14217492
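
The three "+" lines above are the entire checked-in contents of deu_deu.csv: a Git LFS pointer whose oid and size identify the real 14.2 MB payload in LFS storage. A minimal sketch for verifying that a locally pulled copy matches this pointer, assuming the file sits in the working directory (the oid and size are copied from the pointer above; the chunk size is arbitrary):

import hashlib

EXPECTED_OID = "9187e5fb17d498a9b8d75ce2d3ac73079ceeb9aa8fa156c386a398fb0a3346e4"  # from the pointer
EXPECTED_SIZE = 14217492  # bytes, from the pointer

sha256 = hashlib.sha256()
size = 0
with open("deu_deu.csv", "rb") as f:
    # Hash in 1 MiB chunks so the file never has to fit in memory at once
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, f"size mismatch: {size} != {EXPECTED_SIZE}"
assert sha256.hexdigest() == EXPECTED_OID, "content does not match the LFS oid"
print("deu_deu.csv matches its LFS pointer")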
machine_translation.ipynb ADDED
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "1f93b921-c3dc-487d-b813-53f542981ca2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping empty line\n"
+     ]
+    }
+   ],
+   "source": [
+    "import csv\n",
+    "\n",
+    "# Prepare the input data\n",
+    "with open('deu.txt', 'r', encoding='utf-8') as file:\n",
+    "    lines = file.read().split('\\n')\n",
+    "\n",
+    "input_texts = []\n",
+    "target_texts = []\n",
+    "\n",
+    "for line in lines:\n",
+    "    if line:\n",
+    "        parts = line.split('\\t')\n",
+    "        if len(parts) >= 2:\n",
+    "            input_texts.append(parts[0])\n",
+    "            target_texts.append(parts[1])\n",
+    "        else:\n",
+    "            print(f\"Skipping invalid line: {line}\")\n",
+    "    else:\n",
+    "        print(\"Skipping empty line\")\n",
+    "\n",
+    "# Write the sentences to a CSV file\n",
+    "with open('deu_deu.csv', 'w', newline='', encoding='utf-8') as csvfile:\n",
+    "    writer = csv.writer(csvfile)\n",
+    "    writer.writerow(['eng', 'deu']) # Write column headers\n",
+    "    for eng, ger in zip(input_texts, target_texts):\n",
+    "        writer.writerow([eng, ger])\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b8bf6832-3bae-4702-926f-b369eca4d111",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-06-26 21:59:03.360147: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b10e58922d784b20a369f41d94348364",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)ve/main/spiece.model: 0%| | 0.00/792k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f48703cdfd934af49e2071decd504eef",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)okenizer_config.json: 0%| | 0.00/2.32k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "88acf3a7ab47431ea9dd580dc15f6471",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)lve/main/config.json: 0%| | 0.00/1.21k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a576412eb34944ad9282357e2878a6d5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)\"model.safetensors\";: 0%| | 0.00/242M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bbfdfea01b2d41eb89bf5a79db977381",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)neration_config.json: 0%| | 0.00/147 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Load pre-trained T5 model and tokenizer\n",
+    "model_name = 't5-base'\n",
+    "tokenizer = T5Tokenizer.from_pretrained(model_name)\n",
+    "model = T5ForConditionalGeneration.from_pretrained(model_name)\n",
+    "\n",
+    "data = pd.read_csv(\"/users/deniz.bilgin/Machine Translation/deu_deu.csv\")\n",
+    "input_texts = data[\"eng\"]\n",
+    "target_texts = data[\"deu\"]\n",
+    "\n",
+    "# Tokenize the input and target texts\n",
+    "input_tokenized = tokenizer(input_texts.tolist(), return_tensors='pt', padding=True, truncation=True)\n",
+    "target_tokenized = tokenizer(target_texts.tolist(), return_tensors='pt', padding=True, truncation=True)\n",
+    "\n",
+    "# Fine-tune the T5 model on the translation task\n",
+    "optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)\n",
+    "\n",
+    "model.train()\n",
+    "for epoch in range(10): # Adjust the number of epochs as needed\n",
+    "    optimizer.zero_grad()\n",
+    "    outputs = model(input_tokenized.input_ids, attention_mask=input_tokenized.attention_mask, labels=target_tokenized.input_ids)\n",
+    "    loss = outputs.loss\n",
+    "    loss.backward()\n",
+    "    optimizer.step()\n",
+    "    print(f\"Epoch {epoch+1} Loss: {loss.item()}\")\n",
+    "\n",
+    "# Save the fine-tuned model\n",
+    "model.save_pretrained('translation_model')\n",
+    "tokenizer.save_pretrained('translation_model')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a8b8b6e-9f62-47c8-907b-688a9c7df95c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
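
A note on the second cell of machine_translation.ipynb: it tokenizes the entire corpus into one padded batch and passes the padded target ids straight to labels, so the loss is also computed over pad positions, and each "epoch" is a single full-batch step; its execution_count is null, i.e. the run never completed in the committed state. Below is a sketch of a batched variant with pad masking and a T5 task prefix. The batch size, epoch count, max_length, and prefix are illustrative assumptions, not taken from the notebook; the checkpoint and CSV are the ones used above:

import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

data = pd.read_csv("deu_deu.csv")
# T5 checkpoints were trained with task prefixes, so prepend one to the source side
pairs = list(zip("translate English to German: " + data["eng"], data["deu"]))

def collate(batch):
    src, tgt = zip(*batch)
    enc = tokenizer(list(src), return_tensors="pt", padding=True,
                    truncation=True, max_length=128)
    labels = tokenizer(list(tgt), return_tensors="pt", padding=True,
                       truncation=True, max_length=128).input_ids
    # Pad positions set to -100 are ignored by the model's cross-entropy loss
    labels[labels == tokenizer.pad_token_id] = -100
    enc["labels"] = labels
    return enc

loader = DataLoader(pairs, batch_size=16, shuffle=True, collate_fn=collate)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.train()
for epoch in range(3):  # illustrative epoch count
    for batch in loader:
        optimizer.zero_grad()
        loss = model(**batch).loss  # T5 builds decoder inputs from labels itself
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} loss: {loss.item():.4f}")

model.save_pretrained("translation_model")
tokenizer.save_pretrained("translation_model")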
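
Once the fine-tuned weights are saved, reloading them for inference follows the usual from_pretrained pattern. A short usage sketch: the 'translation_model' directory name comes from the notebook, while the sample sentence and generation length are made up.

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("translation_model")
model = T5ForConditionalGeneration.from_pretrained("translation_model")

inputs = tokenizer("translate English to German: How are you?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))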