derek-thomas HF staff commited on
Commit
0c259a6
1 Parent(s): 0dd63b0

Upload baseline.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. baseline.ipynb +201 -59
baseline.ipynb CHANGED
@@ -47,23 +47,6 @@
47
  "login()"
48
  ]
49
  },
50
- {
51
- "cell_type": "code",
52
- "execution_count": null,
53
- "id": "6fc55725-216f-45dd-9c6d-dae77e16d606",
54
- "metadata": {},
55
- "outputs": [],
56
- "source": [
57
- "from huggingface_hub import HfApi\n",
58
- "api = HfApi()\n",
59
- "api.upload_file(\n",
60
- " path_or_fileobj=\"baseline.ipynb\",\n",
61
- " path_in_repo=\"baseline.ipynb\",\n",
62
- " repo_id=\"arabic-translation-prompt-engineering/atpe-notebooks\",\n",
63
- " repo_type=\"dataset\",\n",
64
- ")"
65
- ]
66
- },
67
  {
68
  "cell_type": "markdown",
69
  "id": "4c254d6f-f3a1-49c1-815e-36e41e75ca25",
@@ -108,14 +91,6 @@
108
  "For our baseline we will translate with a simple system prompt and instruction."
109
  ]
110
  },
111
- {
112
- "cell_type": "markdown",
113
- "id": "6cea05d6-afb7-4829-b130-d4bcfe549acb",
114
- "metadata": {},
115
- "source": [
116
- "### Analysis"
117
- ]
118
- },
119
  {
120
  "cell_type": "markdown",
121
  "id": "a98b9b67-e68b-43b2-b8e9-0ed1cf85591f",
@@ -129,14 +104,14 @@
129
  },
130
  {
131
  "cell_type": "code",
132
- "execution_count": null,
133
  "id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
134
  "metadata": {
135
  "tags": []
136
  },
137
  "outputs": [],
138
  "source": [
139
- "system_prompt = \"\"\"You are a skilled translator with extensive experience in English to Arabic translations. You possess a deep understanding of the linguistic, cultural, and contextual nuances essential for accurate and effective translation between these languages. Highly motivated and detail-oriented, you are committed to delivering translations that maintain the integrity and intent of the original text. Your role is crucial in ensuring clear and precise communication in our multilingual system.\"\"\""
140
  ]
141
  },
142
  {
@@ -144,69 +119,195 @@
144
  "id": "803ddeba-03de-4f13-95d1-5fb097058cf2",
145
  "metadata": {},
146
  "source": [
147
- "### Prompt\n",
148
- "> Translate this from english to arabic: {en_input}.\n",
149
  ">\n",
150
  "> Translation: \n",
151
  "\n",
152
- "Again we use a simple prompt to get a translation."
153
  ]
154
  },
155
  {
156
  "cell_type": "code",
157
- "execution_count": 53,
158
  "id": "b7f1722c-c484-4e22-a025-53f95943fc76",
159
  "metadata": {},
160
  "outputs": [],
161
  "source": [
162
- "def baseline_chat_completion(system_prompt, en_input):\n",
163
  " \"\"\"\n",
164
  " Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
165
  " \"\"\"\n",
166
  " messages = [\n",
167
  " {\"role\": \"system\", \"content\": system_prompt},\n",
168
- " {\"role\": \"user\", \"content\": f\"Translate this from english to arabic: {en_input}.\\nTranslation: \"},\n",
 
 
 
169
  " ]\n",
170
  " return client.chat_completion(messages, max_tokens=10_000)"
171
  ]
172
  },
173
  {
174
  "cell_type": "code",
175
- "execution_count": 50,
176
  "id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
177
  "metadata": {
178
  "tags": []
179
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  "outputs": [
181
  {
182
  "data": {
183
  "text/plain": [
184
- "120"
185
  ]
186
  },
187
- "execution_count": 50,
188
  "metadata": {},
189
  "output_type": "execute_result"
190
  }
191
  ],
192
  "source": [
193
- "en_input = \"Float like a butterfly sting like a bee – his hands can’t hit what his eyes can’t see.\"\n",
194
- "response = baseline_chat_completion(system_prompt, \"Float like a butterfly sting like a bee – his hands can’t hit what his eyes can’t see.\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  ]
196
  },
197
  {
198
  "cell_type": "markdown",
199
- "id": "2bca574c-461d-4822-b0dd-b12a3b9846b3",
200
  "metadata": {},
201
  "source": [
202
- "### Token Cost\n",
203
- "Here we can see that the cost is quite cheap, only 92 tokens!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  ]
205
  },
206
  {
207
  "cell_type": "code",
208
- "execution_count": 52,
209
- "id": "4e305b1e-56e0-44da-8c17-496cbcc35fad",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  "metadata": {
211
  "tags": []
212
  },
@@ -214,10 +315,10 @@
214
  {
215
  "data": {
216
  "text/plain": [
217
- "120"
218
  ]
219
  },
220
- "execution_count": 52,
221
  "metadata": {},
222
  "output_type": "execute_result"
223
  }
@@ -228,8 +329,28 @@
228
  },
229
  {
230
  "cell_type": "code",
231
- "execution_count": 51,
232
- "id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  "metadata": {
234
  "tags": []
235
  },
@@ -237,35 +358,56 @@
237
  {
238
  "data": {
239
  "text/plain": [
240
- "'يرفرف مثل الفراشة ويلسع كالنحلة - يديه لا يمكن أن تصيب ما لا تستطيع عينيه رؤيته.'"
241
  ]
242
  },
243
- "execution_count": 51,
244
  "metadata": {},
245
  "output_type": "execute_result"
246
  }
247
  ],
248
  "source": [
249
- "response.choices[0].message.content"
250
  ]
251
  },
252
  {
253
  "cell_type": "markdown",
254
- "id": "3a9cdf02-d590-4bcf-a7a8-e6b7817ba715",
255
  "metadata": {},
256
  "source": [
257
- "## Purpose Driven Translation\n",
258
- "\n",
259
- "[](https://arxiv.org/pdf/2308.01391)"
260
  ]
261
  },
262
  {
263
  "cell_type": "code",
264
- "execution_count": null,
265
- "id": "12e57d07-9e86-426b-be42-e932699d1fe2",
266
- "metadata": {},
267
- "outputs": [],
268
- "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  }
270
  ],
271
  "metadata": {
 
47
  "login()"
48
  ]
49
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  {
51
  "cell_type": "markdown",
52
  "id": "4c254d6f-f3a1-49c1-815e-36e41e75ca25",
 
91
  "For our baseline we will translate with a simple system prompt and instruction."
92
  ]
93
  },
 
 
 
 
 
 
 
 
94
  {
95
  "cell_type": "markdown",
96
  "id": "a98b9b67-e68b-43b2-b8e9-0ed1cf85591f",
 
104
  },
105
  {
106
  "cell_type": "code",
107
+ "execution_count": 69,
108
  "id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
109
  "metadata": {
110
  "tags": []
111
  },
112
  "outputs": [],
113
  "source": [
114
+ "system_prompt = \"\"\"You are a skilled translator with extensive experience in English and Arabic translations. You possess a deep understanding of the linguistic, cultural, and contextual nuances essential for accurate and effective translation between these languages. Highly motivated and detail-oriented, you are committed to delivering translations that maintain the integrity and intent of the original text. Your role is crucial in ensuring clear and precise communication in our multilingual system.\"\"\""
115
  ]
116
  },
117
  {
 
119
  "id": "803ddeba-03de-4f13-95d1-5fb097058cf2",
120
  "metadata": {},
121
  "source": [
122
+ "### Instruction\n",
123
+ "> Translate this from arabic to english: {translation_input}.\n",
124
  ">\n",
125
  "> Translation: \n",
126
  "\n",
127
+ "We will use a simple instruction to get a translation."
128
  ]
129
  },
130
  {
131
  "cell_type": "code",
132
+ "execution_count": 70,
133
  "id": "b7f1722c-c484-4e22-a025-53f95943fc76",
134
  "metadata": {},
135
  "outputs": [],
136
  "source": [
137
+ "def baseline_chat_completion(system_prompt, translation_input):\n",
138
  " \"\"\"\n",
139
  " Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
140
  " \"\"\"\n",
141
  " messages = [\n",
142
  " {\"role\": \"system\", \"content\": system_prompt},\n",
143
+ " {\n",
144
+ " \"role\": \"user\",\n",
145
+ " \"content\": f\"Translate this from arabic to english: {translation_input}.\\nTranslation: \",\n",
146
+ " },\n",
147
  " ]\n",
148
  " return client.chat_completion(messages, max_tokens=10_000)"
149
  ]
150
  },
151
  {
152
  "cell_type": "code",
153
+ "execution_count": 74,
154
  "id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
155
  "metadata": {
156
  "tags": []
157
  },
158
+ "outputs": [],
159
+ "source": [
160
+ "translation_input = (\n",
161
+ " \"'يرفرف مثل الفراشة ويلسع كالنحلة - يديه لا يمكن أن تصيب ما لا تستطيع عينيه رؤيته.'\"\n",
162
+ ")\n",
163
+ "response = baseline_chat_completion(\n",
164
+ " system_prompt,\n",
165
+ " translation_input,\n",
166
+ ")"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "id": "2bca574c-461d-4822-b0dd-b12a3b9846b3",
172
+ "metadata": {},
173
+ "source": [
174
+ "### Token Cost\n",
175
+ "Here we can see that the cost is quite cheap, only 129 prompt tokens!"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": 75,
181
+ "id": "4e305b1e-56e0-44da-8c17-496cbcc35fad",
182
+ "metadata": {
183
+ "tags": []
184
+ },
185
  "outputs": [
186
  {
187
  "data": {
188
  "text/plain": [
189
+ "129"
190
  ]
191
  },
192
+ "execution_count": 75,
193
  "metadata": {},
194
  "output_type": "execute_result"
195
  }
196
  ],
197
  "source": [
198
+ "response.usage.prompt_tokens"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 77,
204
+ "id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
205
+ "metadata": {
206
+ "tags": []
207
+ },
208
+ "outputs": [
209
+ {
210
+ "name": "stdout",
211
+ "output_type": "stream",
212
+ "text": [
213
+ "\"He floats like a butterfly and stings like a bee - his hands can't hit what his eyes can't see.\"\n"
214
+ ]
215
+ }
216
+ ],
217
+ "source": [
218
+ "print(response.choices[0].message.content)"
219
  ]
220
  },
221
  {
222
  "cell_type": "markdown",
223
+ "id": "3a9cdf02-d590-4bcf-a7a8-e6b7817ba715",
224
  "metadata": {},
225
  "source": [
226
+ "## Purpose Driven Translation\n",
227
+ "\n",
228
+ "[Optimizing Machine Translation through Prompt Engineering](https://arxiv.org/pdf/2308.01391) has done some good exploratory work in examining how prompt engineering can impact translation. They were working between Japanese and English and showed that translations influenced by prompts tailored to **specific purposes** and **target audiences** generally adhered more closely to the translation specifications, suggesting that such prompted translations could be more culturally and contextually appropriate than standard machine translations.\n",
229
+ "\n",
230
+ "### System Prompt\n",
231
+ "We will use the same system prompt as before as it is versatile.\n",
232
+ "\n",
233
+ "### Prompt\n",
234
+ "One of the approaches in the paper was to include the purpose and target audience specification. This was motivated by the author’s experience as a professional translator, leading to the conclusion that these two parameters are essential even in everyday translation work. You can find the prompt below adapted for Arabic to English:\n",
235
+ "\n",
236
+ "> Translate the following Arabic [source text] into English. Please fulfill the following conditions when translating. \n",
237
+ "> Purpose of the translation: `<Manual description>` \n",
238
+ "> Target audience: `<Manual description>` \n",
239
+ "> [source text] `{en_input}` \n",
240
+ "\n",
241
+ "You can see that we need to provide the Purpose and the Target Audience for each translation, so let's go ahead and do that."
242
  ]
243
  },
244
  {
245
  "cell_type": "code",
246
+ "execution_count": 78,
247
+ "id": "4714f6a2-fd0b-48ee-80bc-860f40ee2baa",
248
+ "metadata": {
249
+ "tags": []
250
+ },
251
+ "outputs": [],
252
+ "source": [
253
+ "dataset_to_purpose_target = {\n",
254
+ " \"ELRC-24ss\": {\n",
255
+ " \"purpose\": \"Enhancing understanding and knowledge about COVID-19 and health-related topics.\",\n",
256
+ " \"target\": \"Individuals seeking reliable and comprehensible information about COVID-19 and related health topics.\",\n",
257
+ " },\n",
258
+ " \"GNOME-25ss\": {\"purpose\": \"\", \"target\": \"\"},\n",
259
+ " \"HPLT-25ss\": {\"purpose\": \"\", \"target\": \"\"},\n",
260
+ " \"OpenSubtitles-25ss\": {\"purpose\": \"\", \"target\": \"\"},\n",
261
+ " \"TED2020-25ss\": {\"purpose\": \"\", \"target\": \"\"},\n",
262
+ " \"UNPC-24ss\": {\"purpose\": \"\", \"target\": \"\"},\n",
263
+ "}"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": 79,
269
+ "id": "f865e9f8-7c63-4e72-b539-0d5916eda44f",
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": [
273
+ "def purpose_driven_chat_completion(system_prompt, en_input, dataset):\n",
274
+ " \"\"\"\n",
275
+ " Requests an Arabic-to-English translation of `en_input` (the Arabic source text; parameter name kept for compatibility) using the purpose and target audience stored for `dataset` in `dataset_to_purpose_target`, and returns the chat completion response; an unknown `dataset` raises KeyError.\n",
276
+ " \"\"\"\n",
277
+ "\n",
278
+ " prompt = f\"\"\"Translate the following Arabic [source text] into English. Please fulfill the following conditions when translating.\n",
279
+ "Purpose of the translation: {dataset_to_purpose_target[dataset]['purpose']}\n",
280
+ "Target audience: {dataset_to_purpose_target[dataset]['target']}\n",
281
+ "[source text] `{en_input}`\n",
282
+ "\"\"\"\n",
283
+ "\n",
284
+ " messages = [\n",
285
+ " {\"role\": \"system\", \"content\": system_prompt},\n",
286
+ " {\n",
287
+ " \"role\": \"user\",\n",
288
+ " \"content\": prompt,\n",
289
+ " },\n",
290
+ " ]\n",
291
+ " return client.chat_completion(messages, max_tokens=10_000)"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 83,
297
+ "id": "4115515d-cbcd-405a-b2e0-a805880a40c4",
298
+ "metadata": {
299
+ "tags": []
300
+ },
301
+ "outputs": [],
302
+ "source": [
303
+ "ar_input = \"«لاحظنا أنه عندما تعمل مجموعات من أصحاب المصالح على تحديد.. الرؤى، فإن هذا يؤدي إلى مناقشة ما إذا كان ينبغي التأكيد على صحة النظام البيئي أو رفاهية الإنسان.. سواء كانت الأولوية هي النظم البيئية أو أن الأشخاص يؤثرون بدرجة كبيرة على تقييم أصحاب المصالح للحالات البيئية والاجتماعية المرغوبة»، وعلى سبيل المثال، «تُعتبر الذئاب بالنسبة للبعض شيئاً جوهرياً لصحة النظام البيئي وجزءاً أساسياً من الطبيعة، وأما بالنسبة للبعض الآخر فهي رمز تجاوز الحكومة الذي يهدد سبل عيشهم وقيمهم الثقافية».\"\n",
304
+ "response = purpose_driven_chat_completion(system_prompt, ar_input, \"ELRC-24ss\")"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": 84,
310
+ "id": "6296d255-d11d-4df7-aa0e-1226ef3d963a",
311
  "metadata": {
312
  "tags": []
313
  },
 
315
  {
316
  "data": {
317
  "text/plain": [
318
+ "186"
319
  ]
320
  },
321
+ "execution_count": 84,
322
  "metadata": {},
323
  "output_type": "execute_result"
324
  }
 
329
  },
330
  {
331
  "cell_type": "code",
332
+ "execution_count": 85,
333
+ "id": "1f1c6dd0-11bf-4b88-9029-8bce1e7bcb1c",
334
+ "metadata": {
335
+ "tags": []
336
+ },
337
+ "outputs": [
338
+ {
339
+ "name": "stdout",
340
+ "output_type": "stream",
341
+ "text": [
342
+ "\"He floats like a butterfly and stings like a bee - his hands can't hit what his eyes can't see.\"\n"
343
+ ]
344
+ }
345
+ ],
346
+ "source": [
347
+ "print(response.choices[0].message.content)"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": 86,
353
+ "id": "513ec227-fdc1-441b-a73b-9603925d038c",
354
  "metadata": {
355
  "tags": []
356
  },
 
358
  {
359
  "data": {
360
  "text/plain": [
361
+ "ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='eos_token', index=0, message=ChatCompletionOutputMessage(role='assistant', content='\"He floats like a butterfly and stings like a bee - his hands can\\'t hit what his eyes can\\'t see.\"', name=None, tool_calls=None), logprobs=None)], created=1719034916, id='', model='CohereForAI/c4ai-command-r-plus', object='text_completion', system_fingerprint='2.0.4-sha-f426a33', usage=ChatCompletionOutputUsage(completion_tokens=26, prompt_tokens=186, total_tokens=212))"
362
  ]
363
  },
364
+ "execution_count": 86,
365
  "metadata": {},
366
  "output_type": "execute_result"
367
  }
368
  ],
369
  "source": [
370
+ "response"
371
  ]
372
  },
373
  {
374
  "cell_type": "markdown",
375
+ "id": "1ec3b20b-8393-4fda-a51d-cf67984cc166",
376
  "metadata": {},
377
  "source": [
378
+ "# Push to the hub"
 
 
379
  ]
380
  },
381
  {
382
  "cell_type": "code",
383
+ "execution_count": 57,
384
+ "id": "6fc55725-216f-45dd-9c6d-dae77e16d606",
385
+ "metadata": {
386
+ "tags": []
387
+ },
388
+ "outputs": [
389
+ {
390
+ "data": {
391
+ "text/plain": [
392
+ "CommitInfo(commit_url='https://huggingface.co/arabic-translation-prompt-engineering/atpe-notebooks/commit/0dd63b0f4dbe2fd59ced328d04a4749cadfceac9', commit_message='Upload requirements.txt with huggingface_hub', commit_description='', oid='0dd63b0f4dbe2fd59ced328d04a4749cadfceac9', pr_url=None, pr_revision=None, pr_num=None)"
393
+ ]
394
+ },
395
+ "execution_count": 57,
396
+ "metadata": {},
397
+ "output_type": "execute_result"
398
+ }
399
+ ],
400
+ "source": [
401
+ "from huggingface_hub import HfApi\n",
402
+ "\n",
403
+ "api = HfApi()\n",
404
+ "api.upload_file(\n",
405
+ " path_or_fileobj=\"baseline.ipynb\",\n",
406
+ " path_in_repo=\"baseline.ipynb\",\n",
407
+ " repo_id=\"arabic-translation-prompt-engineering/atpe-notebooks\",\n",
408
+ " repo_type=\"model\",\n",
409
+ ")"
410
+ ]
411
  }
412
  ],
413
  "metadata": {