{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "private_outputs": true, "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "rK-kOO01qe9L" }, "outputs": [], "source": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "PWV5yLxVqfbx" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "V28eHua6qff_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install git+https://github.com/huggingface/transformers" ], "metadata": { "id": "LoXk14QLqflG" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install -r /content/a.txt" ], "metadata": { "id": "pgdUZsRrrEfJ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "Helsinki-NLP/opus-mt-en-ar\n", "\n", "sdyy/en-ar" ], "metadata": { "id": "uNfZzFhzsjUZ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "ssHEG88C1Ali" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "7u7KH1vv1AoZ" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# convet csv tojsonl" ], "metadata": { "id": "PPFPtqFBPgMx" } }, { "cell_type": "code", "source": [ "import csv\n", "import json\n", "\n", "# Specify the input and output file names\n", "csv_file_name = \"english_arabic_dataset.csv\"\n", "jsonl_file_name = \"english_arabic_dataset.jsonl\"\n", "\n", "# Read the CSV file and convert each row to a dictionary\n", "data = []\n", "with open(csv_file_name, mode='r', encoding='utf-8') as csv_file:\n", " csv_reader = csv.DictReader(csv_file)\n", " for row in csv_reader:\n", " translation = row[\"translation\"]\n", " # Assuming the translation is formatted as \"English sentence\",\"Arabic sentence\"\n", " if ',' in translation:\n", " english, arabic = translation.split(',', 1)\n", " data.append({\"English\": english.strip('\"'), \"translation\": arabic.strip('\"')})\n", " else:\n", " # Handle cases where there is no comma\n", " data.append({\"English\": translation.strip('\"'), \"translation\": \"\"})\n", "\n", "# Write the data to a JSON Lines file\n", "with open(jsonl_file_name, mode='w', encoding='utf-8') as jsonl_file:\n", " for item in data:\n", " jsonl_file.write(json.dumps(item, ensure_ascii=False) + '\\n')\n", "\n", "print(f\"JSON Lines file '{jsonl_file_name}' has been created successfully.\")\n" ], "metadata": { "id": "23eu4ZLo1ArU" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## dowload dataset" ], "metadata": { "id": "x1BkM6-5Pzx4" } }, { "cell_type": "code", "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset('wmt16', 'ro-en')" ], "metadata": { "id": "7ZRJyRGQCizE" }, "execution_count": null, "outputs": [] }, { "source": [ "!wget https://raw.githubusercontent.com/Helsinki-NLP/opus-mt-en-ar/master/run_translation.py" ], "cell_type": "code", "metadata": { "id": "LOBTHjmd80ZT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!wget https://huggingface.co/datasets/wmt/wmt16/tree/main/ro-en" ], "metadata": { "id": "zMCXLEIm-Kpl" }, "execution_count": null, "outputs": [] }, { "source": [ "import csv\n", "import json\n", "\n", "# Specify the input and output file names\n", "csv_file_name = 
\"english_arabic_dataset.csv\"\n", "jsonl_file_name = \"english_arabic_dataset.jsonl\"\n", "\n", "# Read the CSV file and convert each row to a dictionary\n", "# Read the CSV file and convert each row to a dictionary\n", "data = []\n", "with open(csv_file_name, mode='r', encoding='utf-8') as csv_file:\n", " csv_reader = csv.DictReader(csv_file)\n", " for row in csv_reader:\n", " english = row[\"English\"]\n", " arabic = row.get(\"Arabic\", \"\") # Use get() to avoid KeyError\n", " data.append({\"translation\": {\"en\": english, \"ar\": arabic}})\n", "\n", "print(f\"JSON Lines file '{jsonl_file_name}' has been created successfully.\")" ], "cell_type": "code", "metadata": { "id": "Thf3r5Fm2IV5" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "Byy-EbCC4OZE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "1F9zNSTLQBww" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "تسجيل فى huggingface" ], "metadata": { "id": "RJqZKae5QB7R" } }, { "cell_type": "code", "source": [ "!huggingface-cli login" ], "metadata": { "id": "5aR5f2h04Ofj" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "Oi1KWQR-1AxC" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "kg6Cqg0k1Azl" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "O8G9DSTM0zyW" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [], "metadata": { "id": "jlJyvpJ90z9a" } }, { "cell_type": "code", "source": [ "import csv\n", "import json\n", "\n", "# Specify the input and output file names\n", "csv_file_name = \"english_arabic_dataset.csv\"\n", "jsonl_file_name = \"english_arabic_dataset.jsonl\"\n", "\n", "# Read the CSV file and convert each row to a dictionary\n", "data = []\n", "with open(csv_file_name, mode='r', encoding='utf-8') as csv_file:\n", " csv_reader = csv.DictReader(csv_file)\n", " for row in csv_reader:\n", " translation = row[\"translation\"]\n", " # Check if the row contains a comma\n", " if ',' in translation:\n", " english, arabic = translation.split(',', 1)\n", " data.append({\"English\": english.strip('\"'), \"translation\": arabic.strip('\"')})\n", " else:\n", " # Handle cases where there is no comma\n", " data.append({\"English\": translation.strip('\"'), \"translation\": \"\"})\n", "\n", "# Write the data to a JSON Lines file\n", "with open(jsonl_file_name, mode='w', encoding='utf-8') as jsonl_file:\n", " for item in data:\n", " jsonl_file.write(json.dumps(item, ensure_ascii=False) + '\\n')\n", "\n", "print(f\"JSON Lines file '{jsonl_file_name}' has been created successfully.\")\n" ], "metadata": { "id": "dnMfgJzWwC9I" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "oltCt7rewDAB" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "0FhbGKQ7wDDA" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "QSs3f3SEu3Da" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "3VgjiZouu3Xo" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "create csv data" ], "metadata": { "id": "_N9OQERxQSrI" } }, { "cell_type": "code", "source": [ "import csv\n", "\n", "# Sample dataset with 50 
English-Arabic sentence pairs\n", "data = [\n", " (\"Hello, how are you?\", \"مرحباً، كيف حالك؟\"),\n", " (\"I am learning how to code.\", \"أنا أتعلم كيفية البرمجة.\"),\n", " (\"This is a test sentence.\", \"هذه جملة اختبار.\"),\n", " (\"The weather is nice today.\", \"الطقس جميل اليوم.\"),\n", " (\"I like to read books.\", \"أحب قراءة الكتب.\"),\n", " (\"What is your name?\", \"ما اسمك؟\"),\n", " (\"I live in a big city.\", \"أعيش في مدينة كبيرة.\"),\n", " (\"Do you speak Arabic?\", \"هل تتحدث العربية؟\"),\n", " (\"I have a pet cat.\", \"لدي قطة أليفة.\"),\n", " (\"The sun is shining brightly.\", \"الشمس تشرق بسطوع.\"),\n", " (\"I enjoy cooking.\", \"أستمتع بالطهي.\"),\n", " (\"He is a good friend.\", \"هو صديق جيد.\"),\n", " (\"She loves to travel.\", \"هي تحب السفر.\"),\n", " (\"We are going to the park.\", \"نحن ذاهبون إلى الحديقة.\"),\n", " (\"They are playing soccer.\", \"هم يلعبون كرة القدم.\"),\n", " (\"I need to buy groceries.\", \"أحتاج لشراء البقالة.\"),\n", " (\"My favorite color is blue.\", \"لوني المفضل هو الأزرق.\"),\n", " (\"I will call you tomorrow.\", \"سأتصل بك غداً.\"),\n", " (\"Please turn off the lights.\", \"من فضلك أغلق الأنوار.\"),\n", " (\"Can you help me?\", \"هل يمكنك مساعدتي؟\"),\n", " (\"I am very happy today.\", \"أنا سعيد جداً اليوم.\"),\n", " (\"The movie was interesting.\", \"الفيلم كان مثيراً.\"),\n", " (\"We are studying for the exam.\", \"نحن ندرس للامتحان.\"),\n", " (\"I like listening to music.\", \"أحب الاستماع إلى الموسيقى.\"),\n", " (\"She is reading a novel.\", \"هي تقرأ رواية.\"),\n", " (\"He works in an office.\", \"هو يعمل في مكتب.\"),\n", " (\"They are building a house.\", \"هم يبنون منزلاً.\"),\n", " (\"The car is parked outside.\", \"السيارة متوقفة بالخارج.\"),\n", " (\"I enjoy learning new languages.\", \"أستمتع بتعلم لغات جديدة.\"),\n", " (\"The cake tastes delicious.\", \"الكعكة طعمها لذيذ.\"),\n", " (\"We are planning a trip.\", \"نحن نخطط لرحلة.\"),\n", " (\"She is a talented artist.\", \"هي فنانة موهوبة.\"),\n", " (\"He is watching TV.\", \"هو يشاهد التلفاز.\"),\n", " (\"I forgot my keys.\", \"لقد نسيت مفاتيحي.\"),\n", " (\"The book is on the table.\", \"الكتاب على الطاولة.\"),\n", " (\"I need to charge my phone.\", \"أحتاج لشحن هاتفي.\"),\n", " (\"We are having dinner.\", \"نحن نتناول العشاء.\"),\n", " (\"He is writing a letter.\", \"هو يكتب رسالة.\"),\n", " (\"She is singing a song.\", \"هي تغني أغنية.\"),\n", " (\"The children are playing.\", \"الأطفال يلعبون.\"),\n", " (\"I am drinking coffee.\", \"أنا أشرب القهوة.\"),\n", " (\"The plane is taking off.\", \"الطائرة تقلع.\"),\n", " (\"We are visiting our grandparents.\", \"نحن نزور أجدادنا.\"),\n", " (\"He is wearing a suit.\", \"هو يرتدي بدلة.\"),\n", " (\"She is cooking dinner.\", \"هي تطبخ العشاء.\"),\n", " (\"I am feeling tired.\", \"أشعر بالتعب.\"),\n", " (\"The dog is barking.\", \"الكلب ينبح.\"),\n", " (\"I am going to bed.\", \"أنا ذاهب للنوم.\"),\n", " (\"We are celebrating his birthday.\", \"نحن نحتفل بعيد ميلاده.\"),\n", " (\"She is studying medicine.\", \"هي تدرس الطب.\")\n", "]\n", "\n", "# Specify the file name\n", "file_name = \"english_arabic_dataset.csv\"\n", "\n", "# Write to CSV file\n", "with open(file_name, mode='w', newline='', encoding='utf-8') as file:\n", " writer = csv.writer(file)\n", " writer.writerow([\"English\", \"translation\"]) # Write the header\n", " writer.writerows(data) # Write the data\n", "\n", "print(f\"CSV file '{file_name}' has been created successfully.\")\n" ], "metadata": { "id": "Rl4x1e_Uu3ax" }, "execution_count": null, "outputs": [] 
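}, { "cell_type": "markdown", "source": [ "A quick sanity check, not in the original notebook: read back the header and first rows to confirm the CSV has the expected `English`/`translation` columns before converting it." ], "metadata": {} }, { "cell_type": "code", "source": [ "import csv\n", "\n", "# Print the header and the first three data rows of the CSV created above\n", "with open(\"english_arabic_dataset.csv\", encoding=\"utf-8\") as f:\n", "    reader = csv.reader(f)\n", "    for i, row in enumerate(reader):\n", "        print(row)\n", "        if i == 3:\n", "            break\n" ], "metadata": {}, "execution_count": null, "outputs": [] }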
}, { "cell_type": "code", "source": [], "metadata": { "id": "Tq3T37lWEhil" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "4Rn8aFUzEhll" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "CRBuxZLBEho_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "mOxewk1EEhr0" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "شغال" ], "metadata": { "id": "9zn-QR7JGkAt" } }, { "cell_type": "code", "source": [ "!python /content/run_translation.py \\\n", " --model_name_or_path Helsinki-NLP/opus-mt-en-ro \\\n", " --do_train \\\n", " --do_eval \\\n", " --source_lang en \\\n", " --target_lang ro \\\n", " --dataset_name wmt16 \\\n", " --dataset_config_name ro-en \\\n", " --output_dir /content/tst-translation \\\n", " --per_device_train_batch_size=4 \\\n", " --per_device_eval_batch_size=4 \\\n", " --overwrite_output_dir \\\n", " --predict_with_generate" ], "metadata": { "id": "FqDQmY9WEht4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "ش" ], "metadata": { "id": "85ja_1nzGmvx" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "شغال" ], "metadata": { "id": "MU3-tXnVGnxn" } }, { "cell_type": "markdown", "source": [], "metadata": { "id": "6BNO7VXXGpOL" } }, { "cell_type": "code", "source": [ "!python /content/run_translation.py \\\n", " --model_name_or_path facebook/mbart-large-en-ro \\\n", " --do_train \\\n", " --do_eval \\\n", " --dataset_name wmt16 \\\n", " --dataset_config_name ro-en \\\n", " --source_lang en_XX \\\n", " --target_lang ro_RO \\\n", " --output_dir /tmp/tst-translation \\\n", " --per_device_train_batch_size=4 \\\n", " --per_device_eval_batch_size=4 \\\n", " --overwrite_output_dir \\\n", " --predict_with_generate" ], "metadata": { "id": "ibqJuCE7EwHa" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\"sdyy/en-ar\")" ], "metadata": { "id": "vOCRRv6CGyRm" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "eAdw0rvSNI84" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "كودنجح ف التدريب" ], "metadata": { "id": "hrR3WcbhNJKW" } }, { "cell_type": "markdown", "source": [ "# كود نجح ف التدريب" ], "metadata": { "id": "oQYg39oiQe6h" } }, { "cell_type": "code", "source": [ "!python /content/run_translation.py \\\n", " --model_name_or_path Helsinki-NLP/opus-mt-en-ar \\\n", " --do_train \\\n", " --do_eval \\\n", " --source_lang en \\\n", " --target_lang ar \\\n", " --source_prefix \"translate English to Arabic: \" \\\n", " --dataset_name sdyy/en-ar \\\n", " --dataset_config_name default \\\n", " --train_file train_file.jsonl \\\n", " --validation_file validation_dataset.jsonl \\\n", " --output_dir /content/tst-translation \\\n", " --per_device_train_batch_size 4 \\\n", " --per_device_eval_batch_size 4 \\\n", " --overwrite_output_dir \\\n", " --predict_with_generate\n" ], "metadata": { "id": "P0daa-JoL1kJ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "efOp8eT2Mw13" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "r6JXWBi5Mw4g" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from transformers import MarianMTModel, 
MarianTokenizer\n", "\n", "# تحميل نموذج الترجمة\n", "model_name = \"Helsinki-NLP/opus-mt-en-ar\" # تحديد نموذج الترجمة الذي يترجم من الإنجليزية إلى العربية\n", "model = MarianMTModel.from_pretrained(model_name)\n", "tokenizer = MarianTokenizer.from_pretrained(model_name)\n", "\n", "# الجملة التي تريد ترجمتها\n", "sentence = \"Others have dismissed him as a joke.\"\n", "\n", "# تحويل الجملة إلى توكنات\n", "inputs = tokenizer(sentence, return_tensors=\"pt\", padding=True, truncation=True)\n", "\n", "# الترجمة\n", "translated = model.generate(**inputs)\n", "translated_sentence = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]\n", "\n", "print(\"الجملة المترجمة:\", translated_sentence)\n" ], "metadata": { "id": "OnvrW8gOMw7q" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "JpnN5jvhNxdX" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### طلب نجح ترجمة جملة من الداتاسيت" ], "metadata": { "id": "Gpbx9jHyQoV2" } }, { "cell_type": "code", "source": [ "from transformers import MarianMTModel, MarianTokenizer\n", "\n", "# تحميل نموذج الترجمة\n", "model_path = \"/content/tst-translation/checkpoint-9\"\n", "model = MarianMTModel.from_pretrained(model_path)\n", "tokenizer = MarianTokenizer.from_pretrained(model_path)\n", "\n", "# الجملة التي تريد ترجمتها\n", "sentence = \"Others have dismissed him as a joke.\"\n", "\n", "# تحويل الجملة إلى توكنات\n", "inputs = tokenizer(sentence, return_tensors=\"pt\", padding=True, truncation=True)\n", "\n", "# الترجمة\n", "translated = model.generate(**inputs)\n", "translated_sentence = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]\n", "\n", "print(\"الجملة المترجمة:\", translated_sentence)\n" ], "metadata": { "id": "iqUc-K5sNkH4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "vjiaStGGNkK_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "BaSd47OSNkOB" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "AIhWySn5NkQ3" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "jcIT8JXBNkTo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "pL5I0XNDNkV-" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import tensorflow as tf\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "\n", "# البيانات المعطاة\n", "data = [\n", " {\"translation\": {\"en\": \"Others have dismissed him as a joke.\", \"ar\": \"اعتبره البعض مجرد مزحة.\"}},\n", " {\"translation\": {\"en\": \"And some are holding out for an implosion.\", \"ar\": \"وبعضهم ينتظر الانهيار الداخلي.\"}},\n", " {\"translation\": {\"en\": \"She dismissed his concerns as trivial.\", \"ar\": \"اعتبرت مخاوفه تافهة.\"}},\n", " {\"translation\": {\"en\": \"Don't dismiss his ideas outright; they might have some merit.\", \"ar\": \"لا تستهتر بأفكاره مباشرة؛ فقد تكون لها قيمة.\"}},\n", " {\"translation\": {\"en\": \"The jury dismissed the case due to lack of evidence.\", \"ar\": \"رفضت المحكمة القضية بسبب عدم وجود أدلة.\"}}\n", "]\n", "\n", "# استخراج الجمل بالإنجليزية والعربية\n", "english_sentences = [item[\"translation\"][\"en\"] for item in data]\n", "arabic_sentences = [item[\"translation\"][\"ar\"] for item in data]\n", "\n", "# 
, { "cell_type": "code", "source": [ "import tensorflow as tf\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "\n", "# The given data\n", "data = [\n", "    {\"translation\": {\"en\": \"Others have dismissed him as a joke.\", \"ar\": \"اعتبره البعض مجرد مزحة.\"}},\n", "    {\"translation\": {\"en\": \"And some are holding out for an implosion.\", \"ar\": \"وبعضهم ينتظر الانهيار الداخلي.\"}},\n", "    {\"translation\": {\"en\": \"She dismissed his concerns as trivial.\", \"ar\": \"اعتبرت مخاوفه تافهة.\"}},\n", "    {\"translation\": {\"en\": \"Don't dismiss his ideas outright; they might have some merit.\", \"ar\": \"لا تستهتر بأفكاره مباشرة؛ فقد تكون لها قيمة.\"}},\n", "    {\"translation\": {\"en\": \"The jury dismissed the case due to lack of evidence.\", \"ar\": \"رفضت المحكمة القضية بسبب عدم وجود أدلة.\"}}\n", "]\n", "\n", "# Extract the English and Arabic sentences\n", "english_sentences = [item[\"translation\"][\"en\"] for item in data]\n", "arabic_sentences = [item[\"translation\"][\"ar\"] for item in data]\n", "\n", "# Tokenizer and sequence parameters\n", "max_words = 1000\n", "max_sequence_length = 100\n", "\n", "# Fit one tokenizer per language\n", "tokenizer_en = Tokenizer(num_words=max_words)\n", "tokenizer_en.fit_on_texts(english_sentences)\n", "tokenizer_ar = Tokenizer(num_words=max_words)\n", "tokenizer_ar.fit_on_texts(arabic_sentences)\n", "\n", "# Convert the sentences to integer sequences\n", "sequences_en = tokenizer_en.texts_to_sequences(english_sentences)\n", "sequences_ar = tokenizer_ar.texts_to_sequences(arabic_sentences)\n", "\n", "# Zero-pad the sequences so all sentences have the same length\n", "padded_sequences_en = pad_sequences(sequences_en, maxlen=max_sequence_length)\n", "padded_sequences_ar = pad_sequences(sequences_ar, maxlen=max_sequence_length)\n", "\n", "# Build a toy Arabic-to-English model. It predicts one English token per position,\n", "# so the output must keep the time dimension; the GlobalAveragePooling1D layer in\n", "# the original draft collapsed it and made the loss shapes mismatch.\n", "embedding_dim = 16\n", "model = tf.keras.Sequential([\n", "    tf.keras.layers.Embedding(max_words, embedding_dim, input_length=max_sequence_length),\n", "    tf.keras.layers.Dense(16, activation='relu'),\n", "    tf.keras.layers.Dense(max_words, activation='softmax')\n", "])\n", "\n", "# Configure training\n", "model.compile(optimizer='adam',\n", "              loss='sparse_categorical_crossentropy',\n", "              metrics=['accuracy'])\n", "\n", "# Train the model (a five-sentence toy run, not a usable translator)\n", "model.fit(padded_sequences_ar, padded_sequences_en, epochs=10, validation_split=0.2)\n", "\n", "# Evaluate the model\n", "# eval_loss, eval_accuracy = model.evaluate(padded_sequences_ar, padded_sequences_en)\n", "# print(f'Evaluation Loss: {eval_loss}, Evaluation Accuracy: {eval_accuracy}')\n" ], "metadata": { "id": "MdGsqm-4Mw-Q" }, "execution_count": null, "outputs": [] } ] }