dnzblgn committed on
Commit f672048
1 Parent(s): c6b852c

Upload 2 files

Files changed (3)
  1. .gitattributes +1 -0
  2. deu_deu.csv +3 -0
  3. machine_translation.ipynb +197 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+deu_deu.csv filter=lfs diff=lfs merge=lfs -text
deu_deu.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9187e5fb17d498a9b8d75ce2d3ac73079ceeb9aa8fa156c386a398fb0a3346e4
+size 14217492
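
The three "+" lines above are the entire checked-in contents of deu_deu.csv: a Git LFS pointer whose oid and size identify the real 14.2 MB payload in LFS storage. A minimal sketch for verifying that a locally pulled copy matches this pointer, assuming the file sits in the working directory (the oid and size are copied from the pointer above; the chunk size is arbitrary):

import hashlib

EXPECTED_OID = "9187e5fb17d498a9b8d75ce2d3ac73079ceeb9aa8fa156c386a398fb0a3346e4"  # from the pointer
EXPECTED_SIZE = 14217492  # bytes, from the pointer

sha256 = hashlib.sha256()
size = 0
with open("deu_deu.csv", "rb") as f:
    # Hash in 1 MiB chunks so the file never has to fit in memory at once
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, f"size mismatch: {size} != {EXPECTED_SIZE}"
assert sha256.hexdigest() == EXPECTED_OID, "content does not match the LFS oid"
print("deu_deu.csv matches its LFS pointer")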
machine_translation.ipynb ADDED
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "1f93b921-c3dc-487d-b813-53f542981ca2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skipping empty line\n"
+     ]
+    }
+   ],
+   "source": [
+    "import csv\n",
+    "\n",
+    "# Prepare the input data\n",
+    "with open('deu.txt', 'r', encoding='utf-8') as file:\n",
+    "    lines = file.read().split('\\n')\n",
+    "\n",
+    "input_texts = []\n",
+    "target_texts = []\n",
+    "\n",
+    "for line in lines:\n",
+    "    if line:\n",
+    "        parts = line.split('\\t')\n",
+    "        if len(parts) >= 2:\n",
+    "            input_texts.append(parts[0])\n",
+    "            target_texts.append(parts[1])\n",
+    "        else:\n",
+    "            print(f\"Skipping invalid line: {line}\")\n",
+    "    else:\n",
+    "        print(\"Skipping empty line\")\n",
+    "\n",
+    "# Write the sentences to a CSV file\n",
+    "with open('deu_deu.csv', 'w', newline='', encoding='utf-8') as csvfile:\n",
+    "    writer = csv.writer(csvfile)\n",
+    "    writer.writerow(['eng', 'deu']) # Write column headers\n",
+    "    for eng, ger in zip(input_texts, target_texts):\n",
+    "        writer.writerow([eng, ger])\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b8bf6832-3bae-4702-926f-b369eca4d111",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-06-26 21:59:03.360147: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b10e58922d784b20a369f41d94348364",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)ve/main/spiece.model: 0%| | 0.00/792k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f48703cdfd934af49e2071decd504eef",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)okenizer_config.json: 0%| | 0.00/2.32k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "88acf3a7ab47431ea9dd580dc15f6471",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)lve/main/config.json: 0%| | 0.00/1.21k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a576412eb34944ad9282357e2878a6d5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)\"model.safetensors\";: 0%| | 0.00/242M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bbfdfea01b2d41eb89bf5a79db977381",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)neration_config.json: 0%| | 0.00/147 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Load pre-trained T5 model and tokenizer\n",
+    "model_name = 't5-base'\n",
+    "tokenizer = T5Tokenizer.from_pretrained(model_name)\n",
+    "model = T5ForConditionalGeneration.from_pretrained(model_name)\n",
+    "\n",
+    "data = pd.read_csv(\"/users/deniz.bilgin/Machine Translation/deu_deu.csv\")\n",
+    "input_texts = data[\"eng\"]\n",
+    "target_texts = data[\"deu\"]\n",
+    "\n",
+    "# Tokenize the input and target texts\n",
+    "input_tokenized = tokenizer(input_texts.tolist(), return_tensors='pt', padding=True, truncation=True)\n",
+    "target_tokenized = tokenizer(target_texts.tolist(), return_tensors='pt', padding=True, truncation=True)\n",
+    "\n",
+    "# Fine-tune the T5 model on the translation task\n",
+    "optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)\n",
+    "\n",
+    "model.train()\n",
+    "for epoch in range(10): # Adjust the number of epochs as needed\n",
+    "    optimizer.zero_grad()\n",
+    "    outputs = model(input_tokenized.input_ids, attention_mask=input_tokenized.attention_mask, labels=target_tokenized.input_ids)\n",
+    "    loss = outputs.loss\n",
+    "    loss.backward()\n",
+    "    optimizer.step()\n",
+    "    print(f\"Epoch {epoch+1} Loss: {loss.item()}\")\n",
+    "\n",
+    "# Save the fine-tuned model\n",
+    "model.save_pretrained('translation_model')\n",
+    "tokenizer.save_pretrained('translation_model')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a8b8b6e-9f62-47c8-907b-688a9c7df95c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
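
A note on the second cell of machine_translation.ipynb: it tokenizes the entire corpus into one padded batch and passes the padded target ids straight to labels, so the loss is also computed over pad positions, and each "epoch" is a single full-batch step; its execution_count is null, i.e. the run never completed in the committed state. Below is a sketch of a batched variant with pad masking and a T5 task prefix. The batch size, epoch count, max_length, and prefix are illustrative assumptions, not taken from the notebook; the checkpoint and CSV are the ones used above:

import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

data = pd.read_csv("deu_deu.csv")
# T5 checkpoints were trained with task prefixes, so prepend one to the source side
pairs = list(zip("translate English to German: " + data["eng"], data["deu"]))

def collate(batch):
    src, tgt = zip(*batch)
    enc = tokenizer(list(src), return_tensors="pt", padding=True,
                    truncation=True, max_length=128)
    labels = tokenizer(list(tgt), return_tensors="pt", padding=True,
                       truncation=True, max_length=128).input_ids
    # Pad positions set to -100 are ignored by the model's cross-entropy loss
    labels[labels == tokenizer.pad_token_id] = -100
    enc["labels"] = labels
    return enc

loader = DataLoader(pairs, batch_size=16, shuffle=True, collate_fn=collate)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.train()
for epoch in range(3):  # illustrative epoch count
    for batch in loader:
        optimizer.zero_grad()
        loss = model(**batch).loss  # T5 builds decoder inputs from labels itself
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} loss: {loss.item():.4f}")

model.save_pretrained("translation_model")
tokenizer.save_pretrained("translation_model")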
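
Once the fine-tuned weights are saved, reloading them for inference follows the usual from_pretrained pattern. A short usage sketch: the 'translation_model' directory name comes from the notebook, while the sample sentence and generation length are made up.

from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("translation_model")
model = T5ForConditionalGeneration.from_pretrained("translation_model")

inputs = tokenizer("translate English to German: How are you?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))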