diff --git "a/Benchmark2/benchmark-2-openchat.ipynb" "b/Benchmark2/benchmark-2-openchat.ipynb" new file mode 100644--- /dev/null +++ "b/Benchmark2/benchmark-2-openchat.ipynb" @@ -0,0 +1,6377 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b6920725", + "metadata": { + "papermill": { + "duration": 0.011813, + "end_time": "2024-05-31T18:42:32.343085", + "exception": false, + "start_time": "2024-05-31T18:42:32.331272", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "

Benchmark 2: Openchat

\n" + ] + }, + { + "cell_type": "markdown", + "id": "de1daec0", + "metadata": { + "papermill": { + "duration": 0.011375, + "end_time": "2024-05-31T18:42:32.366254", + "exception": false, + "start_time": "2024-05-31T18:42:32.354879", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "
\n", + "

\n", + " Notebook Gool\n", + "

\n", + "

\n", + "The objective of this notebook is to evaluate the performance of openchat using the Table-extract Benchmark dataset available at Hugging Face.

\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "dd1a5e0e", + "metadata": { + "papermill": { + "duration": 0.010999, + "end_time": "2024-05-31T18:42:32.388876", + "exception": false, + "start_time": "2024-05-31T18:42:32.377877", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#
Table of Content
\n", + "\n", + "* [I. Loading and Importing Libraries](#1)\n", + "* [II. Definition and Implementation of Metrics](#2)\n", + "* [III. Clean Response Obtained by LLM](#3)\n", + "* [IV. Data Preparation](#5)\n", + "* [V. Benchmark](#6)\n", + " * [Prompt](#61)\n", + " * [OpenChat](#62)\n", + " * [Gemini-Pro](#63)\n", + " * [OpenHermes-Mistral](#64)\n", + " * [ Mistral-7B-Instruct-v0.2](#65)" + ] + }, + { + "cell_type": "markdown", + "id": "b4005591", + "metadata": { + "papermill": { + "duration": 0.011603, + "end_time": "2024-05-31T18:42:32.411521", + "exception": false, + "start_time": "2024-05-31T18:42:32.399918", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
I | Loading and Importing Libraries
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "49687d79", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:42:32.435657Z", + "iopub.status.busy": "2024-05-31T18:42:32.434871Z", + "iopub.status.idle": "2024-05-31T18:43:39.462522Z", + "shell.execute_reply": "2024-05-31T18:43:39.461516Z" + }, + "papermill": { + "duration": 67.042525, + "end_time": "2024-05-31T18:43:39.465027", + "exception": false, + "start_time": "2024-05-31T18:42:32.422502", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install google-generativeai\n", + "!pip install --upgrade pip\n", + "!pip install bitsandbytes\n", + "!pip install transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e33c869d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:39.490175Z", + "iopub.status.busy": "2024-05-31T18:43:39.489848Z", + "iopub.status.idle": "2024-05-31T18:43:45.419675Z", + "shell.execute_reply": "2024-05-31T18:43:45.418870Z" + }, + "papermill": { + "duration": 5.945366, + "end_time": "2024-05-31T18:43:45.422072", + "exception": false, + "start_time": "2024-05-31T18:43:39.476706", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import re\n", + "import json\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "from datasets import load_dataset, Dataset\n", + "from wand.image import Image as WImage\n", + "import torch\n", + "import pandas as pd\n", + "from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", + "import time \n", + "import random\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9a5a0def", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:45.446829Z", + "iopub.status.busy": "2024-05-31T18:43:45.445980Z", + "iopub.status.idle": "2024-05-31T18:43:46.064310Z", + "shell.execute_reply": "2024-05-31T18:43:46.063503Z" + }, + "papermill": { + "duration": 0.632813, + "end_time": "2024-05-31T18:43:46.066514", + "exception": false, + "start_time": "2024-05-31T18:43:45.433701", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import google.generativeai as genai\n", + "import time \n", + "genai.configure(api_key=\"AIzaSyAhz9UBzkEIYI886zZRm40qqB1Kd_9Y4-0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b4178da4", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.092857Z", + "iopub.status.busy": "2024-05-31T18:43:46.091786Z", + "iopub.status.idle": "2024-05-31T18:43:46.099836Z", + "shell.execute_reply": "2024-05-31T18:43:46.099131Z" + }, + "papermill": { + "duration": 0.023475, + "end_time": "2024-05-31T18:43:46.101850", + "exception": false, + "start_time": "2024-05-31T18:43:46.078375", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Set random seed for reproducibility\n", + "random.seed(42)\n", + "np.random.seed(42)\n", + "torch.manual_seed(42)\n", + "torch.cuda.manual_seed_all(42)\n", + "torch.backends.cudnn.deterministic = True\n", + "torch.backends.cudnn.benchmark = False" + ] + }, + { + "cell_type": "markdown", + "id": "b88ffeeb", + "metadata": { + "papermill": { + "duration": 0.011136, + "end_time": "2024-05-31T18:43:46.125097", + "exception": false, + "start_time": "2024-05-31T18:43:46.113961", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
II | Definition and Implementation of Metrics
\n", + "So, let's begin by providing an example of the example output." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0d55fbfd", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.148246Z", + "iopub.status.busy": "2024-05-31T18:43:46.147962Z", + "iopub.status.idle": "2024-05-31T18:43:46.154390Z", + "shell.execute_reply": "2024-05-31T18:43:46.153555Z" + }, + "papermill": { + "duration": 0.020108, + "end_time": "2024-05-31T18:43:46.156203", + "exception": false, + "start_time": "2024-05-31T18:43:46.136095", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "desired_output = [{'aircraft': 'robinson r - 22',\n", + " 'description': 'light utility helicopter',\n", + " 'max gross weight': '1370 lb (635 kg)',\n", + " 'total disk area': '497 ft square (46.2 m square)',\n", + " 'max disk loading': '2.6 lb / ft square (14 kg / m square)'},\n", + " {'aircraft': 'bell 206b3 jetranger',\n", + " 'description': 'turboshaft utility helicopter',\n", + " 'max gross weight': '3200 lb (1451 kg)',\n", + " 'total disk area': '872 ft square (81.1 m square)',\n", + " 'max disk loading': '3.7 lb / ft square (18 kg / m square)'},\n", + " {'aircraft': 'ch - 47d chinook',\n", + " 'description': 'tandem rotor helicopter',\n", + " 'max gross weight': '50000 lb (22680 kg)',\n", + " 'total disk area': '5655 ft square (526 m square)',\n", + " 'max disk loading': '8.8 lb / ft square (43 kg / m square)'},\n", + " {'aircraft': 'mil mi - 26',\n", + " 'description': 'heavy - lift helicopter',\n", + " 'max gross weight': '123500 lb (56000 kg)',\n", + " 'total disk area': '8495 ft square (789 m square)',\n", + " 'max disk loading': '14.5 lb / ft square (71 kg / m square)'},\n", + " {'aircraft': 'ch - 53e super stallion',\n", + " 'description': 'heavy - lift helicopter',\n", + " 'max gross weight': '73500 lb (33300 kg)',\n", + " 'total disk area': '4900 ft square (460 m square)',\n", + " 'max disk loading': '15 lb / ft square (72 kg / m square)'}]\n" + ] + }, + { + "cell_type": "markdown", + "id": "7eab15f8", + "metadata": { + "papermill": { + "duration": 0.011226, + "end_time": "2024-05-31T18:43:46.178915", + "exception": false, + "start_time": "2024-05-31T18:43:46.167689", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "To compare between the expected list of records and the predicted list of records, we first need to verify the percentage of predicted keys relative to the desired keys" + ] + }, + { + "cell_type": "markdown", + "id": "aa514ea1", + "metadata": { + "papermill": { + "duration": 0.011529, + "end_time": "2024-05-31T18:43:46.202339", + "exception": false, + "start_time": "2024-05-31T18:43:46.190810", + "status": "completed" + }, + "tags": [] + }, + "source": [ + ">## Percentage of predicted keys" + ] + }, + { + "cell_type": "markdown", + "id": "92f9cc82", + "metadata": { + "papermill": { + "duration": 0.01121, + "end_time": "2024-05-31T18:43:46.225678", + "exception": false, + "start_time": "2024-05-31T18:43:46.214468", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Let's begin by defining a function to retrieve all keys of record" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "144501e7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.248870Z", + "iopub.status.busy": "2024-05-31T18:43:46.248595Z", + "iopub.status.idle": "2024-05-31T18:43:46.254946Z", + "shell.execute_reply": "2024-05-31T18:43:46.254035Z" + }, + "papermill": { + "duration": 0.02004, + "end_time": "2024-05-31T18:43:46.256806", + "exception": false, + "start_time": "2024-05-31T18:43:46.236766", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def get_keys(d):\n", + " # Iterate over each key-value pair in the dictionary\n", + " for k, v in d.items():\n", + " # Append the key to the list of all_keys\n", + " all_keys.append(k)\n", + " # If the value is a dictionary, recursively call get_keys\n", + " if isinstance(v, dict):\n", + " get_keys(v)\n", + " # If the value is a list, iterate over each item\n", + " elif isinstance(v, list):\n", + " for item in v:\n", + " # If the item is a dictionary, recursively call get_keys\n", + " if isinstance(item, dict):\n", + " get_keys(item)\n", + "# Define a function to retrieve all unique keys from a nested dictionary\n", + "def get_all_keys(d):\n", + " # Declare all_keys as a global variable\n", + " global all_keys\n", + " # Initialize all_keys as an empty list\n", + " all_keys = []\n", + " # Call the helper function get_keys to populate all_keys\n", + " get_keys(d)\n", + " # Return a list containing the unique keys by converting all_keys to a set and then back to a list\n", + " return list(set(all_keys))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e3bb288e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.280499Z", + "iopub.status.busy": "2024-05-31T18:43:46.280236Z", + "iopub.status.idle": "2024-05-31T18:43:46.286509Z", + "shell.execute_reply": "2024-05-31T18:43:46.285865Z" + }, + "papermill": { + "duration": 0.020532, + "end_time": "2024-05-31T18:43:46.288468", + "exception": false, + "start_time": "2024-05-31T18:43:46.267936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['max disk loading',\n", + " 'aircraft',\n", + " 'max gross weight',\n", + " 'total disk area',\n", + " 'description']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Testing our function\n", + "get_all_keys(desired_output[0])" + ] + }, + { + "cell_type": "markdown", + "id": "3ce46b02", + "metadata": { + "papermill": { + "duration": 0.01119, + "end_time": "2024-05-31T18:43:46.311091", + "exception": false, + "start_time": "2024-05-31T18:43:46.299901", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now, we define the percentage of predicted keys as follows:\n", + "\n", + "$$\\Large \\text{Percentage of predicted keys} = \\frac{\\text{Number of correctly predicted keys}}{\\text{Total number of true keys}}$$\n", + "This percentage is calculated for every record in the list, then summed and divided by the number of records in the list." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "38e453a1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.334813Z", + "iopub.status.busy": "2024-05-31T18:43:46.334510Z", + "iopub.status.idle": "2024-05-31T18:43:46.340513Z", + "shell.execute_reply": "2024-05-31T18:43:46.339627Z" + }, + "papermill": { + "duration": 0.019887, + "end_time": "2024-05-31T18:43:46.342328", + "exception": false, + "start_time": "2024-05-31T18:43:46.322441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def process_dict(data):\n", + " if isinstance(data, dict):\n", + " for key, value in data.items():\n", + " if isinstance(value, str):\n", + " data[key] = value.strip().lower()\n", + " elif isinstance(value, list):\n", + " data[key] = [process_dict(item) for item in value]\n", + " elif isinstance(value, dict):\n", + " data[key] = process_dict(value)\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f3476fb5", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.366091Z", + "iopub.status.busy": "2024-05-31T18:43:46.365417Z", + "iopub.status.idle": "2024-05-31T18:43:46.371401Z", + "shell.execute_reply": "2024-05-31T18:43:46.370549Z" + }, + "papermill": { + "duration": 0.019948, + "end_time": "2024-05-31T18:43:46.373463", + "exception": false, + "start_time": "2024-05-31T18:43:46.353515", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def percentage_of_predicted_keys(true_dic, pred_dic):\n", + " true_dic=process_dict(true_dic)\n", + " pred_dic=process_dict(pred_dic)\n", + " # Get all keys of the true dictionary\n", + " all_keys_of_true_dic = get_all_keys(true_dic)\n", + " # Get all keys of the predicted dictionary\n", + " all_keys_of_pred_dic = get_all_keys(pred_dic)\n", + " \n", + " # Check if there are no keys in the true dictionary to avoid division by zero\n", + " if len(all_keys_of_true_dic) == 0:\n", + " return 0 # Avoid division by zero\n", + " \n", + " # Initialize count of predicted keys\n", + " p_keys = 0\n", + " # Iterate through all keys in the predicted dictionary\n", + " for key in all_keys_of_pred_dic:\n", + " # Check if the key is also present in the true dictionary\n", + " if key in all_keys_of_true_dic:\n", + " # Increment count if the key is found in both dictionaries\n", + " p_keys += 1\n", + " \n", + " # Calculate the percentage of predicted keys compared to true keys\n", + " p_keys /= len(all_keys_of_true_dic)\n", + " # Return the percentage of predicted keys\n", + " return p_keys" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7ecfcbc7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.397542Z", + "iopub.status.busy": "2024-05-31T18:43:46.396938Z", + "iopub.status.idle": "2024-05-31T18:43:46.401826Z", + "shell.execute_reply": "2024-05-31T18:43:46.400960Z" + }, + "papermill": { + "duration": 0.018768, + "end_time": "2024-05-31T18:43:46.403757", + "exception": false, + "start_time": "2024-05-31T18:43:46.384989", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def average_percentage_key(true_list, pred_list):\n", + " min_length = min(len(true_list), len(pred_list)) # Find the minimum length of the two lists\n", + " score = 0\n", + " for i in range(min_length):\n", + " score += percentage_of_predicted_keys(true_list[i], pred_list[i])\n", + " return score / len(true_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "80d9f735", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.428184Z", + "iopub.status.busy": "2024-05-31T18:43:46.427524Z", + "iopub.status.idle": "2024-05-31T18:43:46.433961Z", + "shell.execute_reply": "2024-05-31T18:43:46.433116Z" + }, + "papermill": { + "duration": 0.020494, + "end_time": "2024-05-31T18:43:46.436009", + "exception": false, + "start_time": "2024-05-31T18:43:46.415515", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average percentage of keys: 1.0\n" + ] + } + ], + "source": [ + "# Example true and predicted lists\n", + "true_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 6}, {'key1': 7, 'key2': 8, 'key3': 9}]\n", + "pred_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 7}, {'key1': 7, 'key2': 8, 'key3': 9}]\n", + "\n", + "# Test the function\n", + "result = average_percentage_key(true_list, pred_list)\n", + "print(\"Average percentage of keys:\", result)" + ] + }, + { + "cell_type": "markdown", + "id": "3347362f", + "metadata": { + "papermill": { + "duration": 0.011022, + "end_time": "2024-05-31T18:43:46.458274", + "exception": false, + "start_time": "2024-05-31T18:43:46.447252", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now we will define the principal metrics used to compare the values of two list recods." + ] + }, + { + "cell_type": "markdown", + "id": "7c2c9671", + "metadata": { + "papermill": { + "duration": 0.011035, + "end_time": "2024-05-31T18:43:46.480486", + "exception": false, + "start_time": "2024-05-31T18:43:46.469451", + "status": "completed" + }, + "tags": [] + }, + "source": [ + ">## Percentage of predicted values\n", + "\n", + "The function calculates the percentage of correctly predicted values compared to the total number of true values across different types of data structures.\n", + "\n", + "The formula for calculating the percentage of values is as follows:\n", + "\n", + "$$\n", + "\\text{Average percentage of values} = \\frac{\\sum_{i=1}^{\\text{Total number of records}} p_i }{Total number of records}\n", + "$$\n", + "\n", + "Here, $p_i$ represents the percentage of correctly predicted values for each key. It's calculated as:\n", + "\n", + "$$p_i = \\frac{\\text{Number of correctly predicted values of item i}}{\\text{Total number of true values of item i}}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "da12189d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.504433Z", + "iopub.status.busy": "2024-05-31T18:43:46.504056Z", + "iopub.status.idle": "2024-05-31T18:43:46.510074Z", + "shell.execute_reply": "2024-05-31T18:43:46.509209Z" + }, + "papermill": { + "duration": 0.020467, + "end_time": "2024-05-31T18:43:46.512170", + "exception": false, + "start_time": "2024-05-31T18:43:46.491703", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def calculate_percentage_of_values(true_dic, pred_dic):\n", + " total_percentage = 0 # Initialize total percentage\n", + " # Type 1: Single string values\n", + " for key, true_value in true_dic.items(): # Loop through key-value pairs in true_dic\n", + " \n", + " # Check if the key exists in pred_dic, if its value is a string and if it matches the true value\n", + " if key in pred_dic and str(pred_dic[key]) == str(true_value):\n", + " match = 1 # Assign perfect match\n", + " else:\n", + " match = 0 # Assign no match\n", + " total_percentage += match\n", + " return total_percentage / len(true_dic) # Calculate and return the average percentage" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "520856dd", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.537035Z", + "iopub.status.busy": "2024-05-31T18:43:46.536699Z", + "iopub.status.idle": "2024-05-31T18:43:46.543001Z", + "shell.execute_reply": "2024-05-31T18:43:46.541938Z" + }, + "papermill": { + "duration": 0.021493, + "end_time": "2024-05-31T18:43:46.545462", + "exception": false, + "start_time": "2024-05-31T18:43:46.523969", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def average_percentage_value(true_list, pred_list):\n", + " min_length = min(len(true_list), len(pred_list)) # Find the minimum length of the two lists\n", + " score = 0\n", + " for i in range(min_length):\n", + " score += calculate_percentage_of_values(true_list[i], pred_list[i])\n", + " return score / len(true_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "3295e756", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.572677Z", + "iopub.status.busy": "2024-05-31T18:43:46.572322Z", + "iopub.status.idle": "2024-05-31T18:43:46.578952Z", + "shell.execute_reply": "2024-05-31T18:43:46.577947Z" + }, + "papermill": { + "duration": 0.020978, + "end_time": "2024-05-31T18:43:46.580874", + "exception": false, + "start_time": "2024-05-31T18:43:46.559896", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average percentage of keys: 0.8888888888888888\n" + ] + } + ], + "source": [ + "# Example true and predicted lists\n", + "true_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 6}, {'key1': 7, 'key2': 8, 'key3': 9}]\n", + "pred_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 7}, {'key1': 7, 'key2': 8, 'key3': 9}]\n", + "\n", + "# Test the function\n", + "result = average_percentage_value(true_list, pred_list)\n", + "print(\"Average percentage of keys:\", result)" + ] + }, + { + "cell_type": "markdown", + "id": "6d7781eb", + "metadata": { + "papermill": { + "duration": 0.011, + "end_time": "2024-05-31T18:43:46.603194", + "exception": false, + "start_time": "2024-05-31T18:43:46.592194", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
III | Clean Response Obtained by LLM
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "74d01368", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.626862Z", + "iopub.status.busy": "2024-05-31T18:43:46.626490Z", + "iopub.status.idle": "2024-05-31T18:43:46.632581Z", + "shell.execute_reply": "2024-05-31T18:43:46.631758Z" + }, + "papermill": { + "duration": 0.020219, + "end_time": "2024-05-31T18:43:46.634511", + "exception": false, + "start_time": "2024-05-31T18:43:46.614292", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "\n", + "def parse_json(data_str):\n", + " # Remove leading/trailing whitespace and newlines\n", + " data_str = data_str.strip()\n", + "\n", + " # Check if the string is enclosed within triple backticks (\"```json\" and \"```\")\n", + " if data_str.startswith(\"```json\") and data_str.endswith(\"```\"):\n", + " # Remove the leading/trailing \"```json\" and \"```\"\n", + " data_str = data_str[len(\"```json\"): -len(\"```\")]\n", + "\n", + " try:\n", + " # Parse JSON\n", + " data = json.loads(data_str)\n", + " return data\n", + " except json.JSONDecodeError as e:\n", + " print(\"JSON parsing error:\", e)\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5ca3e906", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.659623Z", + "iopub.status.busy": "2024-05-31T18:43:46.658864Z", + "iopub.status.idle": "2024-05-31T18:43:46.666973Z", + "shell.execute_reply": "2024-05-31T18:43:46.666195Z" + }, + "papermill": { + "duration": 0.022205, + "end_time": "2024-05-31T18:43:46.668839", + "exception": false, + "start_time": "2024-05-31T18:43:46.646634", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'aircraft': 'robinson r - 22',\n", + " 'description': 'light utility helicopter',\n", + " 'max gross weight': '1370 lb (635 kg)',\n", + " 'total disk area': '497 ft square (46.2 m square)',\n", + " 'max disk loading': '2.6 lb / ft square (14 kg / m square)'},\n", + " {'aircraft': 'bell 206b3 jetranger',\n", + " 'description': 'turboshaft utility helicopter',\n", + " 'max gross weight': '3200 lb (1451 kg)',\n", + " 'total disk area': '872 ft square (81.1 m square)',\n", + " 'max disk loading': '3.7 lb / ft square (18 kg / m square)'},\n", + " {'aircraft': 'ch - 47d chinook',\n", + " 'description': 'tandem rotor helicopter',\n", + " 'max gross weight': '50000 lb (22680 kg)',\n", + " 'total disk area': '5655 ft square (526 m square)',\n", + " 'max disk loading': '8.8 lb / ft square (43 kg / m square)'},\n", + " {'aircraft': 'mil mi - 26',\n", + " 'description': 'heavy - lift helicopter',\n", + " 'max gross weight': '123500 lb (56000 kg)',\n", + " 'total disk area': '8495 ft square (789 m square)',\n", + " 'max disk loading': '14.5 lb / ft square (71 kg / m square)'},\n", + " {'aircraft': 'ch - 53e super stallion',\n", + " 'description': 'heavy - lift helicopter',\n", + " 'max gross weight': '73500 lb (33300 kg)',\n", + " 'total disk area': '4900 ft square (460 m square)',\n", + " 'max disk loading': '15 lb / ft square (72 kg / m square)'}]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response_str = \"\"\"[{\"aircraft\": \"robinson r - 22\",\n", + " \"description\": \"light utility helicopter\",\n", + " \"max gross weight\": \"1370 lb (635 kg)\",\n", + " \"total disk area\": \"497 ft square (46.2 m square)\",\n", + " \"max disk loading\": \"2.6 lb / ft square (14 kg / m square)\"},\n", + "{\"aircraft\": \"bell 206b3 jetranger\",\n", + " \"description\": \"turboshaft utility helicopter\",\n", + " \"max gross weight\": \"3200 lb (1451 kg)\",\n", + " \"total disk area\": \"872 ft square (81.1 m square)\",\n", + " \"max disk loading\": \"3.7 lb / ft square (18 kg / m square)\"},\n", + "{\"aircraft\": \"ch - 47d chinook\",\n", + " \"description\": \"tandem rotor helicopter\",\n", + " \"max gross weight\": \"50000 lb (22680 kg)\",\n", + " \"total disk area\": \"5655 ft square (526 m square)\",\n", + " \"max disk loading\": \"8.8 lb / ft square (43 kg / m square)\"},\n", + "{\"aircraft\": \"mil mi - 26\",\n", + " \"description\": \"heavy - lift helicopter\",\n", + " \"max gross weight\": \"123500 lb (56000 kg)\",\n", + " \"total disk area\": \"8495 ft square (789 m square)\",\n", + " \"max disk loading\": \"14.5 lb / ft square (71 kg / m square)\"},\n", + "{\"aircraft\": \"ch - 53e super stallion\",\n", + " \"description\": \"heavy - lift helicopter\",\n", + " \"max gross weight\": \"73500 lb (33300 kg)\",\n", + " \"total disk area\": \"4900 ft square (460 m square)\",\n", + " \"max disk loading\": \"15 lb / ft square (72 kg / m square)\"}]\"\"\"\n", + "\n", + "# Convert the string representation to a list of dictionaries\n", + "parse_json(response_str)" + ] + }, + { + "cell_type": "markdown", + "id": "daecb5c0", + "metadata": { + "papermill": { + "duration": 0.013403, + "end_time": "2024-05-31T18:43:46.693812", + "exception": false, + "start_time": "2024-05-31T18:43:46.680409", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
IV | Data Preparation
\n" + ] + }, + { + "cell_type": "markdown", + "id": "40c089cd", + "metadata": { + "papermill": { + "duration": 0.014721, + "end_time": "2024-05-31T18:43:46.724815", + "exception": false, + "start_time": "2024-05-31T18:43:46.710094", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "I'll extract a sample of 100 records from the dataset excluding those with Arabic names, and then simplify the output to enhance performance." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "937a9d87", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:46.756186Z", + "iopub.status.busy": "2024-05-31T18:43:46.755526Z", + "iopub.status.idle": "2024-05-31T18:43:47.978038Z", + "shell.execute_reply": "2024-05-31T18:43:47.977150Z" + }, + "papermill": { + "duration": 1.240895, + "end_time": "2024-05-31T18:43:47.980091", + "exception": false, + "start_time": "2024-05-31T18:43:46.739196", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contextanswer
0aircraft ...{\"aircraft\":{\"0\":\"robinson r - 22\",\"1\":\"bell 2...
1order year manufacturer mod...{\"order year\":{\"0\":\"1992 - 93\",\"1\":\"1996\",\"2\":...
2player no nationality ...{\"player\":{\"0\":\"quincy acy\",\"1\":\"hassan adams\"...
3player no nationali...{\"player\":{\"0\":\"patrick o'bryant\",\"1\":\"jermain...
4player no nationality ...{\"player\":{\"0\":\"mark baker\",\"1\":\"marcus banks\"...
\n", + "
" + ], + "text/plain": [ + " context \\\n", + "0 aircraft ... \n", + "1 order year manufacturer mod... \n", + "2 player no nationality ... \n", + "3 player no nationali... \n", + "4 player no nationality ... \n", + "\n", + " answer \n", + "0 {\"aircraft\":{\"0\":\"robinson r - 22\",\"1\":\"bell 2... \n", + "1 {\"order year\":{\"0\":\"1992 - 93\",\"1\":\"1996\",\"2\":... \n", + "2 {\"player\":{\"0\":\"quincy acy\",\"1\":\"hassan adams\"... \n", + "3 {\"player\":{\"0\":\"patrick o'bryant\",\"1\":\"jermain... \n", + "4 {\"player\":{\"0\":\"mark baker\",\"1\":\"marcus banks\"... " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"/kaggle/input/table-extraction/table_extract.csv\")\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "c70acdfc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:48.005676Z", + "iopub.status.busy": "2024-05-31T18:43:48.005166Z", + "iopub.status.idle": "2024-05-31T18:43:48.009839Z", + "shell.execute_reply": "2024-05-31T18:43:48.009009Z" + }, + "papermill": { + "duration": 0.019566, + "end_time": "2024-05-31T18:43:48.011735", + "exception": false, + "start_time": "2024-05-31T18:43:47.992169", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def is_arabic_name(name):\n", + " \"\"\"\n", + " Checks if a name contains Arabic characters.\n", + "\n", + " Args:\n", + " name: The name string to check.\n", + "\n", + " Returns:\n", + " True if Arabic characters are found, False otherwise.\n", + " \"\"\"\n", + " # Regular expression to match Arabic characters\n", + " arabic_pattern = re.compile(\"[\\u0600-\\u06FF]+\")\n", + "\n", + " # Search for Arabic characters in the name\n", + " match = arabic_pattern.search(name)\n", + "\n", + " # Return True if a match is found, False otherwise\n", + " return bool(match)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b2cd7f4f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:48.036321Z", + "iopub.status.busy": "2024-05-31T18:43:48.035722Z", + "iopub.status.idle": "2024-05-31T18:43:48.878798Z", + "shell.execute_reply": "2024-05-31T18:43:48.877862Z" + }, + "papermill": { + "duration": 0.857601, + "end_time": "2024-05-31T18:43:48.880980", + "exception": false, + "start_time": "2024-05-31T18:43:48.023379", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df = df[~df['context'].apply(lambda x: is_arabic_name(x))]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "5ff7b30d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:48.907676Z", + "iopub.status.busy": "2024-05-31T18:43:48.907365Z", + "iopub.status.idle": "2024-05-31T18:43:48.912778Z", + "shell.execute_reply": "2024-05-31T18:43:48.911977Z" + }, + "papermill": { + "duration": 0.019861, + "end_time": "2024-05-31T18:43:48.914742", + "exception": false, + "start_time": "2024-05-31T18:43:48.894881", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_sample =df.loc[:100]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "36e74deb", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:48.939496Z", + "iopub.status.busy": "2024-05-31T18:43:48.938769Z", + "iopub.status.idle": "2024-05-31T18:43:48.944817Z", + "shell.execute_reply": "2024-05-31T18:43:48.944090Z" + }, + "papermill": { + "duration": 0.020103, + "end_time": "2024-05-31T18:43:48.946542", + "exception": false, + "start_time": "2024-05-31T18:43:48.926439", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def transform_json_to_records(json_data):\n", + " \"\"\"\n", + " Transforms a structured JSON object into a list of records.\n", + "\n", + " The function assumes the structure of the JSON object is a dictionary of dictionaries,\n", + " where each top-level key is a field name, and its value is a dictionary mapping indices\n", + " to field values. All sub-dictionaries must have the same keys.\n", + "\n", + " Parameters:\n", + " - json_data: A dictionary representing the structured JSON object to transform.\n", + "\n", + " Returns:\n", + " - A list of dictionaries, where each dictionary represents a record with fields and values\n", + " derived from the input JSON.\n", + " \"\"\"\n", + " json_data = json.loads(json_data)\n", + " # Extract keys from the first dictionary item to use as indices\n", + " indices = list(next(iter(json_data.values())).keys())\n", + " # Initialize the list to store transformed records\n", + " records = []\n", + "\n", + " # Loop over each index to create a record\n", + " for index in indices:\n", + " record = {field: values[index] for field, values in json_data.items()}\n", + " records.append(record)\n", + "\n", + " return records" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ffd53e33", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:48.971043Z", + "iopub.status.busy": "2024-05-31T18:43:48.970775Z", + "iopub.status.idle": "2024-05-31T18:43:48.982528Z", + "shell.execute_reply": "2024-05-31T18:43:48.981700Z" + }, + "papermill": { + "duration": 0.026522, + "end_time": "2024-05-31T18:43:48.984677", + "exception": false, + "start_time": "2024-05-31T18:43:48.958155", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_sample.loc[:, 'answer'] = df_sample['answer'].map(transform_json_to_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "40483cb0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:49.010744Z", + "iopub.status.busy": "2024-05-31T18:43:49.010286Z", + "iopub.status.idle": "2024-05-31T18:43:49.042579Z", + "shell.execute_reply": "2024-05-31T18:43:49.041778Z" + }, + "papermill": { + "duration": 0.047173, + "end_time": "2024-05-31T18:43:49.044758", + "exception": false, + "start_time": "2024-05-31T18:43:48.997585", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contextanswer
0aircraft ...[{'aircraft': 'robinson r - 22', 'description'...
1order year manufacturer mod...[{'order year': '1992 - 93', 'manufacturer': '...
2player no nationality ...[{'player': 'quincy acy', 'no': '4', 'national...
3player no nationali...[{'player': 'patrick o'bryant', 'no': 13, 'nat...
4player no nationality ...[{'player': 'mark baker', 'no': '3', 'national...
\n", + "
" + ], + "text/plain": [ + " context \\\n", + "0 aircraft ... \n", + "1 order year manufacturer mod... \n", + "2 player no nationality ... \n", + "3 player no nationali... \n", + "4 player no nationality ... \n", + "\n", + " answer \n", + "0 [{'aircraft': 'robinson r - 22', 'description'... \n", + "1 [{'order year': '1992 - 93', 'manufacturer': '... \n", + "2 [{'player': 'quincy acy', 'no': '4', 'national... \n", + "3 [{'player': 'patrick o'bryant', 'no': 13, 'nat... \n", + "4 [{'player': 'mark baker', 'no': '3', 'national... " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sample.head()" + ] + }, + { + "cell_type": "markdown", + "id": "43c4fc4d", + "metadata": { + "papermill": { + "duration": 0.056482, + "end_time": "2024-05-31T18:43:49.113347", + "exception": false, + "start_time": "2024-05-31T18:43:49.056865", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
V | Benchmark
\n" + ] + }, + { + "cell_type": "markdown", + "id": "f191a75f", + "metadata": { + "papermill": { + "duration": 0.011795, + "end_time": "2024-05-31T18:43:49.137322", + "exception": false, + "start_time": "2024-05-31T18:43:49.125527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + ">## Prompt" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ade0a612", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:49.163630Z", + "iopub.status.busy": "2024-05-31T18:43:49.162872Z", + "iopub.status.idle": "2024-05-31T18:43:49.169637Z", + "shell.execute_reply": "2024-05-31T18:43:49.168786Z" + }, + "papermill": { + "duration": 0.021972, + "end_time": "2024-05-31T18:43:49.171493", + "exception": false, + "start_time": "2024-05-31T18:43:49.149521", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "prompt = \"\"\"Your task is to extract relevant information from the provided context and format it into a list of records, following the template below.\n", + " A JSON object representing the extracted table structure. The list of records follows this format: \n", + " [ { \"column_1\": \"val1\",\"column_2\": \"val1\",\"column_3\": \"val1\",...},\n", + " { \"column_1\": \"val2\",\"column_2\": \"val2\",\"column_3\": \"val3\",...},\n", + " ...\n", + " ]\n", + " Each key in the records represents a column header, and the corresponding value is another object containing key-value pairs for each row in that column.\n", + "\n", + "INPUT example:\n", + "# do not use the data from the examples & template; they are just for reference only. The following data contains actual information. If a value is not found, leave it empty. \n", + "\n", + " aircraft description max gross weight total disk area max disk loading\n", + "0 robinson r - 22 light utility helicopter 1370 lb (635 kg) 497 ft square (46.2 m square) 2.6 lb / ft square (14 kg / m square)\n", + "1 bell 206b3 jetranger turboshaft utility helicopter 3200 lb (1451 kg) 872 ft square (81.1 m square) 3.7 lb / ft square (18 kg / m square)\n", + "2 ch - 47d chinook tandem rotor helicopter 50000 lb (22680 kg) 5655 ft square (526 m square) 8.8 lb / ft square (43 kg / m square)\n", + "3 mil mi - 26 heavy - lift helicopter 123500 lb (56000 kg) 8495 ft square (789 m square) 14.5 lb / ft square (71 kg / m square)\n", + "4 ch - 53e super stallion heavy - lift helicopter 73500 lb (33300 kg) 4900 ft square (460 m square) 15 lb / ft square (72 kg / m square)\n", + "\n", + "OUTPUT example:\n", + "# do not use the data from the examples & template; they are just for reference only. The following data contains actual information. If a value is not found, leave it empty. \n", + "[{\"aircraft\": \"robinson r - 22\",\n", + " \"description\": \"light utility helicopter\",\n", + " \"max gross weight\": \"1370 lb (635 kg)\",\n", + " \"total disk area\": \"497 ft square (46.2 m square)\",\n", + " \"max disk loading\": \"2.6 lb / ft square (14 kg / m square)\"},\n", + "{\"aircraft\": \"bell 206b3 jetranger\",\n", + " \"description\": \"turboshaft utility helicopter\",\n", + " \"max gross weight\": \"3200 lb (1451 kg)\",\n", + " \"total disk area\": \"872 ft square (81.1 m square)\",\n", + " \"max disk loading\": \"3.7 lb / ft square (18 kg / m square)\"},\n", + "{\"aircraft\": \"ch - 47d chinook\",\n", + " \"description\": \"tandem rotor helicopter\",\n", + " \"max gross weight\": \"50000 lb (22680 kg)\",\n", + " \"total disk area\": \"5655 ft square (526 m square)\",\n", + " \"max disk loading\": \"8.8 lb / ft square (43 kg / m square)\"},\n", + "{\"aircraft\": \"mil mi - 26\",\n", + " \"description\": \"heavy - lift helicopter\",\n", + " \"max gross weight\": \"123500 lb (56000 kg)\",\n", + " \"total disk area\": \"8495 ft square (789 m square)\",\n", + " \"max disk loading\": \"14.5 lb / ft square (71 kg / m square)\"},\n", + "{\"aircraft\": \"ch - 53e super stallion\",\n", + " \"description\": \"heavy - lift helicopter\",\n", + " \"max gross weight\": \"73500 lb (33300 kg)\",\n", + " \"total disk area\": \"4900 ft square (460 m square)\",\n", + " \"max disk loading\": \"15 lb / ft square (72 kg / m square)\"}]\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "abc85dbc", + "metadata": { + "papermill": { + "duration": 0.012075, + "end_time": "2024-05-31T18:43:49.195733", + "exception": false, + "start_time": "2024-05-31T18:43:49.183658", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + ">## OpenChat" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "4d7641e4", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:49.222862Z", + "iopub.status.busy": "2024-05-31T18:43:49.222025Z", + "iopub.status.idle": "2024-05-31T18:43:49.226803Z", + "shell.execute_reply": "2024-05-31T18:43:49.225878Z" + }, + "papermill": { + "duration": 0.020487, + "end_time": "2024-05-31T18:43:49.228685", + "exception": false, + "start_time": "2024-05-31T18:43:49.208198", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_sample =df.loc[:50]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "af6ad0b5", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:43:49.254939Z", + "iopub.status.busy": "2024-05-31T18:43:49.254611Z", + "iopub.status.idle": "2024-05-31T18:45:21.314757Z", + "shell.execute_reply": "2024-05-31T18:45:21.313701Z" + }, + "papermill": { + "duration": 92.075473, + "end_time": "2024-05-31T18:45:21.316713", + "exception": false, + "start_time": "2024-05-31T18:43:49.241240", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "802f3f2be634485785f3d334448f2cfc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/623 [00:00