diff --git "a/Benchmark2/benchmark-2-hermes-2-pro-mistral-7b.ipynb" "b/Benchmark2/benchmark-2-hermes-2-pro-mistral-7b.ipynb" new file mode 100644--- /dev/null +++ "b/Benchmark2/benchmark-2-hermes-2-pro-mistral-7b.ipynb" @@ -0,0 +1,6468 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "52c8d1b1", + "metadata": { + "papermill": { + "duration": 0.016827, + "end_time": "2024-05-31T18:39:22.003858", + "exception": false, + "start_time": "2024-05-31T18:39:21.987031", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "

Benchmark 2: Hermes-2-Pro-Mistral-7B

\n" + ] + }, + { + "cell_type": "markdown", + "id": "b7e6cd12", + "metadata": { + "papermill": { + "duration": 0.014296, + "end_time": "2024-05-31T18:39:22.033571", + "exception": false, + "start_time": "2024-05-31T18:39:22.019275", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "
\n", + "

\n", + " Notebook Goal\n", + "

\n", + "

\n", + "The objective of this notebook is to evaluate the performance of Hermes-2-Pro-Mistral-7B using the Table-extract Benchmark dataset available at Hugging Face.

\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "42012452", + "metadata": { + "papermill": { + "duration": 0.014112, + "end_time": "2024-05-31T18:39:22.062056", + "exception": false, + "start_time": "2024-05-31T18:39:22.047944", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#
Table of Contents
\n", + "\n", + "* [I. Loading and Importing Libraries](#1)\n", + "* [II. Definition and Implementation of Metrics](#2)\n", + "* [III. Clean Response Obtained by LLM](#3)\n", + "* [IV. Data Preparation](#5)\n", + "* [V. Benchmark](#6)\n", + " * [Prompt](#61)\n", + " * [Hermes-2-Pro-Mistral-7B](#62)" + ] + }, + { + "cell_type": "markdown", + "id": "13141fb0", + "metadata": { + "papermill": { + "duration": 0.014248, + "end_time": "2024-05-31T18:39:22.091712", + "exception": false, + "start_time": "2024-05-31T18:39:22.077464", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
I | Loading and Importing Libraries
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9b693727", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:39:22.122508Z", + "iopub.status.busy": "2024-05-31T18:39:22.122146Z", + "iopub.status.idle": "2024-05-31T18:40:34.765250Z", + "shell.execute_reply": "2024-05-31T18:40:34.763990Z" + }, + "papermill": { + "duration": 72.661722, + "end_time": "2024-05-31T18:40:34.767888", + "exception": false, + "start_time": "2024-05-31T18:39:22.106166", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install google-generativeai\n", + "!pip install --upgrade pip\n", + "!pip install bitsandbytes\n", + "!pip install transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e916e5ec", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:34.797469Z", + "iopub.status.busy": "2024-05-31T18:40:34.797142Z", + "iopub.status.idle": "2024-05-31T18:40:41.438022Z", + "shell.execute_reply": "2024-05-31T18:40:41.437060Z" + }, + "papermill": { + "duration": 6.658375, + "end_time": "2024-05-31T18:40:41.440614", + "exception": false, + "start_time": "2024-05-31T18:40:34.782239", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import json\n", + "from tqdm import tqdm\n", + "import pandas as pd\n", + "from datasets import load_dataset, Dataset\n", + "from wand.image import Image as WImage\n", + "import torch\n", + "import pandas as pd\n", + "from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", + "import time \n", + "import random\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "888d83f5", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:41.471951Z", + "iopub.status.busy": "2024-05-31T18:40:41.470895Z", + "iopub.status.idle": "2024-05-31T18:40:42.166871Z", + "shell.execute_reply": 
"2024-05-31T18:40:42.166015Z" + }, + "papermill": { + "duration": 0.714082, + "end_time": "2024-05-31T18:40:42.169464", + "exception": false, + "start_time": "2024-05-31T18:40:41.455382", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import google.generativeai as genai\n", + "import time \n", + "\n", + "# Credentials must never be committed with the notebook: the key is read from the\n", + "# GOOGLE_API_KEY environment variable (a missing variable raises KeyError).\n", + "# The key that used to be hard-coded here is exposed in version control and\n", + "# should be revoked.\n", + "genai.configure(api_key=os.environ[\"GOOGLE_API_KEY\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8ad6ddf8", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.200934Z", + "iopub.status.busy": "2024-05-31T18:40:42.200298Z", + "iopub.status.idle": "2024-05-31T18:40:42.208803Z", + "shell.execute_reply": "2024-05-31T18:40:42.207974Z" + }, + "papermill": { + "duration": 0.026714, + "end_time": "2024-05-31T18:40:42.211164", + "exception": false, + "start_time": "2024-05-31T18:40:42.184450", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Set random seed for reproducibility\n", + "random.seed(42)\n", + "np.random.seed(42)\n", + "torch.manual_seed(42)\n", + "torch.cuda.manual_seed_all(42)\n", + "torch.backends.cudnn.deterministic = True\n", + "torch.backends.cudnn.benchmark = False" + ] + }, + { + "cell_type": "markdown", + "id": "5a3cd96c", + "metadata": { + "papermill": { + "duration": 0.014153, + "end_time": "2024-05-31T18:40:42.239702", + "exception": false, + "start_time": "2024-05-31T18:40:42.225549", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
II | Definition and Implementation of Metrics
\n", + "So, let's begin by providing an example of the example output." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e91811b6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.270826Z", + "iopub.status.busy": "2024-05-31T18:40:42.270480Z", + "iopub.status.idle": "2024-05-31T18:40:42.277602Z", + "shell.execute_reply": "2024-05-31T18:40:42.276589Z" + }, + "papermill": { + "duration": 0.024967, + "end_time": "2024-05-31T18:40:42.279982", + "exception": false, + "start_time": "2024-05-31T18:40:42.255015", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "desired_output = [{'aircraft': 'robinson r - 22',\n", + " 'description': 'light utility helicopter',\n", + " 'max gross weight': '1370 lb (635 kg)',\n", + " 'total disk area': '497 ft square (46.2 m square)',\n", + " 'max disk loading': '2.6 lb / ft square (14 kg / m square)'},\n", + " {'aircraft': 'bell 206b3 jetranger',\n", + " 'description': 'turboshaft utility helicopter',\n", + " 'max gross weight': '3200 lb (1451 kg)',\n", + " 'total disk area': '872 ft square (81.1 m square)',\n", + " 'max disk loading': '3.7 lb / ft square (18 kg / m square)'},\n", + " {'aircraft': 'ch - 47d chinook',\n", + " 'description': 'tandem rotor helicopter',\n", + " 'max gross weight': '50000 lb (22680 kg)',\n", + " 'total disk area': '5655 ft square (526 m square)',\n", + " 'max disk loading': '8.8 lb / ft square (43 kg / m square)'},\n", + " {'aircraft': 'mil mi - 26',\n", + " 'description': 'heavy - lift helicopter',\n", + " 'max gross weight': '123500 lb (56000 kg)',\n", + " 'total disk area': '8495 ft square (789 m square)',\n", + " 'max disk loading': '14.5 lb / ft square (71 kg / m square)'},\n", + " {'aircraft': 'ch - 53e super stallion',\n", + " 'description': 'heavy - lift helicopter',\n", + " 'max gross weight': '73500 lb (33300 kg)',\n", + " 'total disk area': '4900 ft square (460 m square)',\n", + " 'max disk loading': '15 lb / ft square 
(72 kg / m square)'}]\n" + ] + }, + { + "cell_type": "markdown", + "id": "d36195db", + "metadata": { + "papermill": { + "duration": 0.015241, + "end_time": "2024-05-31T18:40:42.309558", + "exception": false, + "start_time": "2024-05-31T18:40:42.294317", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "To compare between the expected list of records and the predicted list of records, we first need to verify the percentage of predicted keys relative to the desired keys" + ] + }, + { + "cell_type": "markdown", + "id": "32e1dc12", + "metadata": { + "papermill": { + "duration": 0.014044, + "end_time": "2024-05-31T18:40:42.338066", + "exception": false, + "start_time": "2024-05-31T18:40:42.324022", + "status": "completed" + }, + "tags": [] + }, + "source": [ + ">## Percentage of predicted keys" + ] + }, + { + "cell_type": "markdown", + "id": "3615db22", + "metadata": { + "papermill": { + "duration": 0.013961, + "end_time": "2024-05-31T18:40:42.366521", + "exception": false, + "start_time": "2024-05-31T18:40:42.352560", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Let's begin by defining a function to retrieve all keys of record" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5015ce26", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.397397Z", + "iopub.status.busy": "2024-05-31T18:40:42.397003Z", + "iopub.status.idle": "2024-05-31T18:40:42.404578Z", + "shell.execute_reply": "2024-05-31T18:40:42.403580Z" + }, + "papermill": { + "duration": 0.025562, + "end_time": "2024-05-31T18:40:42.406894", + "exception": false, + "start_time": "2024-05-31T18:40:42.381332", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def get_keys(d, all_keys=None):\n", + "    \"\"\"Collect every key of a (possibly nested) dictionary, depth-first.\n", + "\n", + "    Args:\n", + "        d: Dictionary to walk. Nested dictionaries, and dictionaries stored\n", + "            inside list values, are walked recursively.\n", + "        all_keys: Optional list to append to; a new list is created when omitted.\n", + "\n", + "    Returns:\n", + "        The list of keys in encounter order (duplicates included).\n", + "    \"\"\"\n", + "    if all_keys is None:\n", + "        all_keys = []\n", + "    for k, v in d.items():\n", + "        all_keys.append(k)\n", + "        if isinstance(v, dict):\n", + "            get_keys(v, all_keys)\n", + "        elif isinstance(v, list):\n", + "            # Only dictionaries inside a list carry keys; other items are skipped.\n", + "            for item in v:\n", + "                if isinstance(item, dict):\n", + "                    get_keys(item, all_keys)\n", + "    return all_keys\n", 
+ "\n", + "\n", + "def get_all_keys(d):\n", + "    \"\"\"Return the unique keys of a nested dictionary.\n", + "\n", + "    Keys are returned in first-seen order, so the result is the same on every\n", + "    run (going through a set would make the order depend on string hashing).\n", + "    No module-level state is used, so the function is safe to call repeatedly.\n", + "    \"\"\"\n", + "    # dict.fromkeys removes duplicates while preserving insertion order.\n", + "    return list(dict.fromkeys(get_keys(d)))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fe0d9a28", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.436989Z", + "iopub.status.busy": "2024-05-31T18:40:42.436675Z", + "iopub.status.idle": "2024-05-31T18:40:42.443907Z", + "shell.execute_reply": "2024-05-31T18:40:42.442951Z" + }, + "papermill": { + "duration": 0.02481, + "end_time": "2024-05-31T18:40:42.446142", + "exception": false, + "start_time": "2024-05-31T18:40:42.421332", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['total disk area',\n", + " 'aircraft',\n", + " 'max gross weight',\n", + " 'max disk loading',\n", + " 'description']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Testing our function\n", + "get_all_keys(desired_output[0])" + ] + }, + { + "cell_type": "markdown", + "id": "0ffb7d75", + "metadata": { + "papermill": { + "duration": 0.014511, + "end_time": "2024-05-31T18:40:42.475565", + "exception": false, + "start_time": "2024-05-31T18:40:42.461054", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now, we define the percentage of predicted keys 
as follows:\n", + "\n", + "$$\\Large \\text{Percentage of predicted keys} = \\frac{\\text{Number of correctly predicted keys}}{\\text{Total number of true keys}}$$\n", + "This percentage is calculated for every record in the list, then summed and divided by the number of records in the list." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3968ea95", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.506652Z", + "iopub.status.busy": "2024-05-31T18:40:42.506288Z", + "iopub.status.idle": "2024-05-31T18:40:42.513054Z", + "shell.execute_reply": "2024-05-31T18:40:42.512049Z" + }, + "papermill": { + "duration": 0.0248, + "end_time": "2024-05-31T18:40:42.515253", + "exception": false, + "start_time": "2024-05-31T18:40:42.490453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def process_dict(data):\n", + "    \"\"\"Strip and lower-case every string value of a nested dictionary.\n", + "\n", + "    The dictionary is modified in place and also returned. List values are\n", + "    rebuilt with each dictionary item normalised; anything that is not a\n", + "    dictionary (including a bare string) is returned unchanged.\n", + "    \"\"\"\n", + "    if not isinstance(data, dict):\n", + "        return data\n", + "    for key, value in data.items():\n", + "        if isinstance(value, str):\n", + "            data[key] = value.strip().lower()\n", + "        elif isinstance(value, list):\n", + "            data[key] = [process_dict(item) for item in value]\n", + "        elif isinstance(value, dict):\n", + "            data[key] = process_dict(value)\n", + "    return data" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 9, + "id": "16cb4433", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.545797Z", + "iopub.status.busy": "2024-05-31T18:40:42.545456Z", + "iopub.status.idle": "2024-05-31T18:40:42.552252Z", + "shell.execute_reply": "2024-05-31T18:40:42.551244Z" + }, + "papermill": { + "duration": 0.024542, + "end_time": "2024-05-31T18:40:42.554540", + "exception": false, + "start_time": "2024-05-31T18:40:42.529998", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def percentage_of_predicted_keys(true_dic, pred_dic):\n", + "    \"\"\"Fraction of the true record's keys that also appear in the prediction.\n", + "\n", + "    Both records are normalised in place with process_dict first.\n", + "\n", + "    Args:\n", + "        true_dic: Expected record.\n", + "        pred_dic: Predicted record; anything that is not a dictionary scores 0.\n", + "\n", + "    Returns:\n", + "        Number of shared unique keys divided by the number of unique true keys,\n", + "        or 0 when the true record has no keys.\n", + "    \"\"\"\n", + "    true_dic = process_dict(true_dic)\n", + "    # A malformed prediction (string, list, None, ...) has no keys to match.\n", + "    if not isinstance(pred_dic, dict):\n", + "        return 0\n", + "    pred_dic = process_dict(pred_dic)\n", + "\n", + "    true_keys = set(get_all_keys(true_dic))\n", + "    if not true_keys:\n", + "        return 0  # Avoid division by zero\n", + "    pred_keys = set(get_all_keys(pred_dic))\n", + "    return len(true_keys & pred_keys) / len(true_keys)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d1bc1b87", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.585210Z", + "iopub.status.busy": "2024-05-31T18:40:42.584858Z", + "iopub.status.idle": "2024-05-31T18:40:42.590375Z", + "shell.execute_reply": "2024-05-31T18:40:42.589425Z" + }, + "papermill": { + "duration": 0.023648, + "end_time": "2024-05-31T18:40:42.592651", + "exception": false, + "start_time": "2024-05-31T18:40:42.569003", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def average_percentage_key(true_list, pred_list):\n", + "    \"\"\"Average, over the true records, of the per-record key score.\n", + "\n", + "    Records are compared position by position. True records without a\n", + "    predicted counterpart contribute 0, so missing rows lower the score.\n", + "\n", + "    Args:\n", + "        true_list: Expected records (list of dictionaries).\n", + "        pred_list: Predicted records. None or any non-list value (for example\n", + "            the result of a failed JSON parse) scores 0.\n", + "\n", + "    Returns:\n", + "        The mean key score, or 0.0 when there is nothing to compare.\n", + "    \"\"\"\n", + "    # A failed parse yields None, and an empty ground truth would divide by zero.\n", + "    if not true_list or not isinstance(pred_list, (list, tuple)):\n", + "        return 0.0\n", + "    score = sum(\n", + "        percentage_of_predicted_keys(true_rec, pred_rec)\n", + "        for true_rec, pred_rec in zip(true_list, pred_list)\n", + "    )\n", + "    return score / len(true_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "34050c8a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.624198Z", + "iopub.status.busy": 
"2024-05-31T18:40:42.623835Z", + "iopub.status.idle": "2024-05-31T18:40:42.631695Z", + "shell.execute_reply": "2024-05-31T18:40:42.630537Z" + }, + "papermill": { + "duration": 0.026297, + "end_time": "2024-05-31T18:40:42.633992", + "exception": false, + "start_time": "2024-05-31T18:40:42.607695", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average percentage of keys: 1.0\n" + ] + } + ], + "source": [ + "# Example true and predicted lists\n", + "true_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 6}, {'key1': 7, 'key2': 8, 'key3': 9}]\n", + "pred_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 7}, {'key1': 7, 'key2': 8, 'key3': 9}]\n", + "\n", + "# Test the function\n", + "result = average_percentage_key(true_list, pred_list)\n", + "print(\"Average percentage of keys:\", result)" + ] + }, + { + "cell_type": "markdown", + "id": "ddf139b4", + "metadata": { + "papermill": { + "duration": 0.01488, + "end_time": "2024-05-31T18:40:42.663805", + "exception": false, + "start_time": "2024-05-31T18:40:42.648925", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now we will define the principal metrics used to compare the values of two list recods." 
+ ] + }, + { + "cell_type": "markdown", + "id": "76790f13", + "metadata": { + "papermill": { + "duration": 0.063724, + "end_time": "2024-05-31T18:40:42.742500", + "exception": false, + "start_time": "2024-05-31T18:40:42.678776", + "status": "completed" + }, + "tags": [] + }, + "source": [ + ">## Percentage of predicted values\n", + "\n", + "The function calculates the percentage of correctly predicted values compared to the total number of true values across different types of data structures.\n", + "\n", + "The formula for calculating the percentage of values is as follows:\n", + "\n", + "$$\n", + "\\text{Average percentage of values} = \\frac{\\sum_{i=1}^{\\text{Total number of records}} p_i }{Total number of records}\n", + "$$\n", + "\n", + "Here, $p_i$ represents the percentage of correctly predicted values for each key. It's calculated as:\n", + "\n", + "$$p_i = \\frac{\\text{Number of correctly predicted values of item i}}{\\text{Total number of true values of item i}}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "178e2bfc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.774935Z", + "iopub.status.busy": "2024-05-31T18:40:42.774155Z", + "iopub.status.idle": "2024-05-31T18:40:42.780768Z", + "shell.execute_reply": "2024-05-31T18:40:42.779755Z" + }, + "papermill": { + "duration": 0.025171, + "end_time": "2024-05-31T18:40:42.782982", + "exception": false, + "start_time": "2024-05-31T18:40:42.757811", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def calculate_percentage_of_values(true_dic, pred_dic):\n", + "    \"\"\"Fraction of the true record's values reproduced by the prediction.\n", + "\n", + "    A value counts as correct when its key exists in pred_dic and both values\n", + "    are equal once converted with str().\n", + "\n", + "    Args:\n", + "        true_dic: Expected record.\n", + "        pred_dic: Predicted record; anything that is not a dictionary scores 0.\n", + "\n", + "    Returns:\n", + "        Matches divided by the number of keys in true_dic, or 0.0 when the true\n", + "        record is empty or the prediction is not a dictionary.\n", + "    \"\"\"\n", + "    # An empty true record would divide by zero; a non-dict prediction has no values.\n", + "    if not true_dic or not isinstance(pred_dic, dict):\n", + "        return 0.0\n", + "    matches = sum(\n", + "        1\n", + "        for key, true_value in true_dic.items()\n", + "        if key in pred_dic and str(pred_dic[key]) == str(true_value)\n", + "    )\n", + "    return matches / len(true_dic)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 13, + "id": "ad2a1633", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.813899Z", + "iopub.status.busy": "2024-05-31T18:40:42.813594Z", + "iopub.status.idle": "2024-05-31T18:40:42.819198Z", + "shell.execute_reply": "2024-05-31T18:40:42.818152Z" + }, + "papermill": { + "duration": 0.023552, + "end_time": "2024-05-31T18:40:42.821428", + "exception": false, + "start_time": "2024-05-31T18:40:42.797876", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def average_percentage_value(true_list, pred_list):\n", + "    \"\"\"Average, over the true records, of the per-record value score.\n", + "\n", + "    Records are compared position by position. True records without a\n", + "    predicted counterpart contribute 0, so missing rows lower the score.\n", + "\n", + "    Args:\n", + "        true_list: Expected records (list of dictionaries).\n", + "        pred_list: Predicted records. None or any non-list value (for example\n", + "            the result of a failed JSON parse) scores 0.\n", + "\n", + "    Returns:\n", + "        The mean value score, or 0.0 when there is nothing to compare.\n", + "    \"\"\"\n", + "    # A failed parse yields None, and an empty ground truth would divide by zero.\n", + "    if not true_list or not isinstance(pred_list, (list, tuple)):\n", + "        return 0.0\n", + "    score = sum(\n", + "        calculate_percentage_of_values(true_rec, pred_rec)\n", + "        for true_rec, pred_rec in zip(true_list, pred_list)\n", + "    )\n", + "    return score / len(true_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "625f043d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.853399Z", + "iopub.status.busy": "2024-05-31T18:40:42.853054Z", + "iopub.status.idle": "2024-05-31T18:40:42.859876Z", + "shell.execute_reply": "2024-05-31T18:40:42.858917Z" + }, + "papermill": { + "duration": 0.026314, + "end_time": "2024-05-31T18:40:42.862573", + "exception": false, + "start_time": "2024-05-31T18:40:42.836259", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average percentage of keys: 0.8888888888888888\n" + ] + } + ], + "source": [ + "# Example true and predicted lists\n", + "true_list = [{'key1': 1, 'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 6}, {'key1': 7, 'key2': 8, 'key3': 9}]\n", + "pred_list = [{'key1': 1, 
'key2': 2, 'key3': 3}, {'key1': 4, 'key2': 5, 'key3': 7}, {'key1': 7, 'key2': 8, 'key3': 9}]\n", + "\n", + "# Test the function\n", + "result = average_percentage_value(true_list, pred_list)\n", + "print(\"Average percentage of keys:\", result)" + ] + }, + { + "cell_type": "markdown", + "id": "7e474034", + "metadata": { + "papermill": { + "duration": 0.014957, + "end_time": "2024-05-31T18:40:42.892682", + "exception": false, + "start_time": "2024-05-31T18:40:42.877725", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
III | Clean Response Obtained by LLM
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ddf6ed1f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.924356Z", + "iopub.status.busy": "2024-05-31T18:40:42.923980Z", + "iopub.status.idle": "2024-05-31T18:40:42.930541Z", + "shell.execute_reply": "2024-05-31T18:40:42.929557Z" + }, + "papermill": { + "duration": 0.025015, + "end_time": "2024-05-31T18:40:42.932843", + "exception": false, + "start_time": "2024-05-31T18:40:42.907828", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "\n", + "def parse_json(data_str):\n", + "    \"\"\"Parse the JSON payload of a model response.\n", + "\n", + "    The text is first parsed as it is. If that fails and the text contains a\n", + "    Markdown code fence (with or without a json language tag, possibly\n", + "    surrounded by other text), the content of the first fenced block is parsed.\n", + "\n", + "    Args:\n", + "        data_str: Raw response text.\n", + "\n", + "    Returns:\n", + "        The decoded Python object, or None when the input is not a string or no\n", + "        candidate is valid JSON (the last decoding error is printed).\n", + "    \"\"\"\n", + "    if not isinstance(data_str, str):\n", + "        return None\n", + "    # Remove leading/trailing whitespace and newlines\n", + "    data_str = data_str.strip()\n", + "\n", + "    # Raw text goes first so that valid JSON containing backticks is left alone.\n", + "    candidates = [data_str]\n", + "    fenced = re.search(r\"```(?:json)?\\s*(.*?)\\s*```\", data_str, re.DOTALL | re.IGNORECASE)\n", + "    if fenced:\n", + "        candidates.append(fenced.group(1))\n", + "\n", + "    error = None\n", + "    for candidate in candidates:\n", + "        try:\n", + "            return json.loads(candidate)\n", + "        except json.JSONDecodeError as e:\n", + "            error = e\n", + "    print(\"JSON parsing error:\", error)\n", + "    return None" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "70196b87", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:42.965065Z", + "iopub.status.busy": "2024-05-31T18:40:42.964393Z", + "iopub.status.idle": "2024-05-31T18:40:42.973613Z", + "shell.execute_reply": "2024-05-31T18:40:42.972644Z" + }, + "papermill": { + "duration": 0.027671, + "end_time": "2024-05-31T18:40:42.975918", + "exception": false, + "start_time": "2024-05-31T18:40:42.948247", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'aircraft': 'robinson r - 22',\n", + " 'description': 'light utility helicopter',\n", + " 'max gross weight': '1370 lb (635 kg)',\n", + " 'total disk area': '497 ft square (46.2 m 
square)',\n", + " 'max disk loading': '2.6 lb / ft square (14 kg / m square)'},\n", + " {'aircraft': 'bell 206b3 jetranger',\n", + " 'description': 'turboshaft utility helicopter',\n", + " 'max gross weight': '3200 lb (1451 kg)',\n", + " 'total disk area': '872 ft square (81.1 m square)',\n", + " 'max disk loading': '3.7 lb / ft square (18 kg / m square)'},\n", + " {'aircraft': 'ch - 47d chinook',\n", + " 'description': 'tandem rotor helicopter',\n", + " 'max gross weight': '50000 lb (22680 kg)',\n", + " 'total disk area': '5655 ft square (526 m square)',\n", + " 'max disk loading': '8.8 lb / ft square (43 kg / m square)'},\n", + " {'aircraft': 'mil mi - 26',\n", + " 'description': 'heavy - lift helicopter',\n", + " 'max gross weight': '123500 lb (56000 kg)',\n", + " 'total disk area': '8495 ft square (789 m square)',\n", + " 'max disk loading': '14.5 lb / ft square (71 kg / m square)'},\n", + " {'aircraft': 'ch - 53e super stallion',\n", + " 'description': 'heavy - lift helicopter',\n", + " 'max gross weight': '73500 lb (33300 kg)',\n", + " 'total disk area': '4900 ft square (460 m square)',\n", + " 'max disk loading': '15 lb / ft square (72 kg / m square)'}]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response_str = \"\"\"[{\"aircraft\": \"robinson r - 22\",\n", + " \"description\": \"light utility helicopter\",\n", + " \"max gross weight\": \"1370 lb (635 kg)\",\n", + " \"total disk area\": \"497 ft square (46.2 m square)\",\n", + " \"max disk loading\": \"2.6 lb / ft square (14 kg / m square)\"},\n", + "{\"aircraft\": \"bell 206b3 jetranger\",\n", + " \"description\": \"turboshaft utility helicopter\",\n", + " \"max gross weight\": \"3200 lb (1451 kg)\",\n", + " \"total disk area\": \"872 ft square (81.1 m square)\",\n", + " \"max disk loading\": \"3.7 lb / ft square (18 kg / m square)\"},\n", + "{\"aircraft\": \"ch - 47d chinook\",\n", + " \"description\": \"tandem rotor helicopter\",\n", + " 
\"max gross weight\": \"50000 lb (22680 kg)\",\n", + " \"total disk area\": \"5655 ft square (526 m square)\",\n", + " \"max disk loading\": \"8.8 lb / ft square (43 kg / m square)\"},\n", + "{\"aircraft\": \"mil mi - 26\",\n", + " \"description\": \"heavy - lift helicopter\",\n", + " \"max gross weight\": \"123500 lb (56000 kg)\",\n", + " \"total disk area\": \"8495 ft square (789 m square)\",\n", + " \"max disk loading\": \"14.5 lb / ft square (71 kg / m square)\"},\n", + "{\"aircraft\": \"ch - 53e super stallion\",\n", + " \"description\": \"heavy - lift helicopter\",\n", + " \"max gross weight\": \"73500 lb (33300 kg)\",\n", + " \"total disk area\": \"4900 ft square (460 m square)\",\n", + " \"max disk loading\": \"15 lb / ft square (72 kg / m square)\"}]\"\"\"\n", + "\n", + "# Convert the string representation to a list of dictionaries\n", + "parse_json(response_str)" + ] + }, + { + "cell_type": "markdown", + "id": "8925860e", + "metadata": { + "papermill": { + "duration": 0.015084, + "end_time": "2024-05-31T18:40:43.006542", + "exception": false, + "start_time": "2024-05-31T18:40:42.991458", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
IV | Data Preparation
\n" + ] + }, + { + "cell_type": "markdown", + "id": "f74aa569", + "metadata": { + "papermill": { + "duration": 0.015037, + "end_time": "2024-05-31T18:40:43.036960", + "exception": false, + "start_time": "2024-05-31T18:40:43.021923", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "I'll extract a sample of 100 records from the dataset excluding those with Arabic names, and then simplify the output to enhance performance." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "72e33f2c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:43.069719Z", + "iopub.status.busy": "2024-05-31T18:40:43.068798Z", + "iopub.status.idle": "2024-05-31T18:40:44.436289Z", + "shell.execute_reply": "2024-05-31T18:40:44.435163Z" + }, + "papermill": { + "duration": 1.386678, + "end_time": "2024-05-31T18:40:44.439016", + "exception": false, + "start_time": "2024-05-31T18:40:43.052338", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contextanswer
0aircraft ...{\"aircraft\":{\"0\":\"robinson r - 22\",\"1\":\"bell 2...
1order year manufacturer mod...{\"order year\":{\"0\":\"1992 - 93\",\"1\":\"1996\",\"2\":...
2player no nationality ...{\"player\":{\"0\":\"quincy acy\",\"1\":\"hassan adams\"...
3player no nationali...{\"player\":{\"0\":\"patrick o'bryant\",\"1\":\"jermain...
4player no nationality ...{\"player\":{\"0\":\"mark baker\",\"1\":\"marcus banks\"...
\n", + "
" + ], + "text/plain": [ + " context \\\n", + "0 aircraft ... \n", + "1 order year manufacturer mod... \n", + "2 player no nationality ... \n", + "3 player no nationali... \n", + "4 player no nationality ... \n", + "\n", + " answer \n", + "0 {\"aircraft\":{\"0\":\"robinson r - 22\",\"1\":\"bell 2... \n", + "1 {\"order year\":{\"0\":\"1992 - 93\",\"1\":\"1996\",\"2\":... \n", + "2 {\"player\":{\"0\":\"quincy acy\",\"1\":\"hassan adams\"... \n", + "3 {\"player\":{\"0\":\"patrick o'bryant\",\"1\":\"jermain... \n", + "4 {\"player\":{\"0\":\"mark baker\",\"1\":\"marcus banks\"... " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"/kaggle/input/table-extraction/table_extract.csv\")\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f7df4563", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:44.473102Z", + "iopub.status.busy": "2024-05-31T18:40:44.472727Z", + "iopub.status.idle": "2024-05-31T18:40:44.478238Z", + "shell.execute_reply": "2024-05-31T18:40:44.477222Z" + }, + "papermill": { + "duration": 0.024353, + "end_time": "2024-05-31T18:40:44.480590", + "exception": false, + "start_time": "2024-05-31T18:40:44.456237", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def is_arabic_name(name):\n", + "    \"\"\"\n", + "    Checks if a name contains Arabic characters.\n", + "\n", + "    Args:\n", + "        name: The text to check. Non-string values (for example NaN from an\n", + "            empty cell) are treated as containing no Arabic characters.\n", + "\n", + "    Returns:\n", + "        True if a character in the range U+0600-U+06FF is found, False otherwise.\n", + "    \"\"\"\n", + "    if not isinstance(name, str):\n", + "        return False\n", + "    # The re module caches compiled patterns, so no explicit compile is needed.\n", + "    return re.search(\"[\\u0600-\\u06FF]+\", name) is not None" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": 
"b840a20f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:44.512468Z", + "iopub.status.busy": "2024-05-31T18:40:44.512121Z", + "iopub.status.idle": "2024-05-31T18:40:45.498736Z", + "shell.execute_reply": "2024-05-31T18:40:45.497677Z" + }, + "papermill": { + "duration": 1.005554, + "end_time": "2024-05-31T18:40:45.501403", + "exception": false, + "start_time": "2024-05-31T18:40:44.495849", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df = df[~df['context'].apply(is_arabic_name)]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b80dcb76", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:45.534927Z", + "iopub.status.busy": "2024-05-31T18:40:45.534019Z", + "iopub.status.idle": "2024-05-31T18:40:45.540997Z", + "shell.execute_reply": "2024-05-31T18:40:45.540000Z" + }, + "papermill": { + "duration": 0.026075, + "end_time": "2024-05-31T18:40:45.543295", + "exception": false, + "start_time": "2024-05-31T18:40:45.517220", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# NOTE(review): .loc slices by label and includes the end label, so this keeps the\n", + "# rows whose index label is <= 100 that survived the filter above, which is not\n", + "# necessarily 100 records. Selection is left unchanged to keep the benchmark sample.\n", + "# The copy makes df_sample independent of df, so the assignment to its 'answer'\n", + "# column below does not trigger SettingWithCopyWarning.\n", + "df_sample = df.loc[:100].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1e0fa1a2", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:45.575761Z", + "iopub.status.busy": "2024-05-31T18:40:45.575439Z", + "iopub.status.idle": "2024-05-31T18:40:45.582393Z", + "shell.execute_reply": "2024-05-31T18:40:45.581339Z" + }, + "papermill": { + "duration": 0.025629, + "end_time": "2024-05-31T18:40:45.584568", + "exception": false, + "start_time": "2024-05-31T18:40:45.558939", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def transform_json_to_records(json_data):\n", + "    \"\"\"\n", + "    Transforms a column-oriented JSON object into a list of records.\n", + "\n", + "    The input is a dictionary of dictionaries, where each top-level key is a\n", + "    field name and its value maps row indices to field values. All\n", + "    sub-dictionaries must have the same keys.\n", + "\n", + "    Parameters:\n", + "    - json_data: The JSON text to transform, or the already-decoded dictionary.\n", + "      A list is treated as already transformed and returned unchanged, so\n", + "      re-running the cell that maps this function over a column is harmless.\n", + "\n", + "    Returns:\n", + "    - A list of dictionaries, one per row index, in the order of the first\n", + "      field's indices. An empty object yields an empty list.\n", + "    \"\"\"\n", + "    if isinstance(json_data, list):\n", + "        return json_data\n", + "    if isinstance(json_data, (str, bytes, bytearray)):\n", + "        json_data = json.loads(json_data)\n", + "    if not json_data:\n", + "        return []\n", + "    # Row indices come from the first field; every other field must share them.\n", + "    indices = list(next(iter(json_data.values())).keys())\n", + "    return [\n", + "        {field: values[index] for field, values in json_data.items()}\n", + "        for index in indices\n", + "    ]" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a1af7731", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:45.617438Z", + "iopub.status.busy": "2024-05-31T18:40:45.616869Z", + "iopub.status.idle": "2024-05-31T18:40:45.630866Z", + "shell.execute_reply": "2024-05-31T18:40:45.629727Z" + }, + "papermill": { + "duration": 0.032824, + "end_time": "2024-05-31T18:40:45.633055", + "exception": false, + "start_time": "2024-05-31T18:40:45.600231", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_sample.loc[:, 'answer'] = df_sample['answer'].map(transform_json_to_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "1f38b5a7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-05-31T18:40:45.666356Z", + "iopub.status.busy": "2024-05-31T18:40:45.665670Z", + "iopub.status.idle": "2024-05-31T18:40:45.704282Z", + "shell.execute_reply": "2024-05-31T18:40:45.703240Z" + }, + "papermill": { + "duration": 0.057883, + "end_time": "2024-05-31T18:40:45.706581", + "exception": 
false, + "start_time": "2024-05-31T18:40:45.648698", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contextanswer
0aircraft ...[{'aircraft': 'robinson r - 22', 'description'...
1order year manufacturer mod...[{'order year': '1992 - 93', 'manufacturer': '...
2player no nationality ...[{'player': 'quincy acy', 'no': '4', 'national...
3player no nationali...[{'player': 'patrick o'bryant', 'no': 13, 'nat...
4player no nationality ...[{'player': 'mark baker', 'no': '3', 'national...
\n", + "
" + ], + "text/plain": [ + " context \\\n", + "0 aircraft ... \n", + "1 order year manufacturer mod... \n", + "2 player no nationality ... \n", + "3 player no nationali... \n", + "4 player no nationality ... \n", + "\n", + " answer \n", + "0 [{'aircraft': 'robinson r - 22', 'description'... \n", + "1 [{'order year': '1992 - 93', 'manufacturer': '... \n", + "2 [{'player': 'quincy acy', 'no': '4', 'national... \n", + "3 [{'player': 'patrick o'bryant', 'no': 13, 'nat... \n", + "4 [{'player': 'mark baker', 'no': '3', 'national... " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sample.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ca8754f9", + "metadata": { + "papermill": { + "duration": 0.015771, + "end_time": "2024-05-31T18:40:45.738071", + "exception": false, + "start_time": "2024-05-31T18:40:45.722300", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + "#
V | Benchmark
# Few-shot instruction prompt: states the task, gives a list-of-records
# template, then one INPUT example (a whitespace-separated table) and the
# matching OUTPUT example (a JSON list with one object per table row).
# NOTE(review): the sentence starting "Each key in the records ..." describes a
# column -> {row: value} mapping, while the template and the OUTPUT example are
# row-oriented records. Confirm the intended wording before editing; any change
# to this text changes what the model is asked.
# NOTE(review): the "# do not use the data ..." lines are inside the string
# literal, so they are part of the prompt text, not Python comments.
prompt = """Your task is to extract relevant information from the provided context and format it into a list of records, following the template below.
 A JSON object representing the extracted table structure. The list of records follows this format: 
 [ { "column_1": "val1","column_2": "val1","column_3": "val1",...},
 { "column_1": "val2","column_2": "val2","column_3": "val3",...},
 ...
 ]
 Each key in the records represents a column header, and the corresponding value is another object containing key-value pairs for each row in that column.

INPUT example:
# do not use the data from the examples & template; they are just for reference only. The following data contains actual information. If a value is not found, leave it empty. 

 aircraft description max gross weight total disk area max disk loading
0 robinson r - 22 light utility helicopter 1370 lb (635 kg) 497 ft square (46.2 m square) 2.6 lb / ft square (14 kg / m square)
1 bell 206b3 jetranger turboshaft utility helicopter 3200 lb (1451 kg) 872 ft square (81.1 m square) 3.7 lb / ft square (18 kg / m square)
2 ch - 47d chinook tandem rotor helicopter 50000 lb (22680 kg) 5655 ft square (526 m square) 8.8 lb / ft square (43 kg / m square)
3 mil mi - 26 heavy - lift helicopter 123500 lb (56000 kg) 8495 ft square (789 m square) 14.5 lb / ft square (71 kg / m square)
4 ch - 53e super stallion heavy - lift helicopter 73500 lb (33300 kg) 4900 ft square (460 m square) 15 lb / ft square (72 kg / m square)

OUTPUT example:
# do not use the data from the examples & template; they are just for reference only. The following data contains actual information. If a value is not found, leave it empty. 
[{"aircraft": "robinson r - 22",
 "description": "light utility helicopter",
 "max gross weight": "1370 lb (635 kg)",
 "total disk area": "497 ft square (46.2 m square)",
 "max disk loading": "2.6 lb / ft square (14 kg / m square)"},
{"aircraft": "bell 206b3 jetranger",
 "description": "turboshaft utility helicopter",
 "max gross weight": "3200 lb (1451 kg)",
 "total disk area": "872 ft square (81.1 m square)",
 "max disk loading": "3.7 lb / ft square (18 kg / m square)"},
{"aircraft": "ch - 47d chinook",
 "description": "tandem rotor helicopter",
 "max gross weight": "50000 lb (22680 kg)",
 "total disk area": "5655 ft square (526 m square)",
 "max disk loading": "8.8 lb / ft square (43 kg / m square)"},
{"aircraft": "mil mi - 26",
 "description": "heavy - lift helicopter",
 "max gross weight": "123500 lb (56000 kg)",
 "total disk area": "8495 ft square (789 m square)",
 "max disk loading": "14.5 lb / ft square (71 kg / m square)"},
{"aircraft": "ch - 53e super stallion",
 "description": "heavy - lift helicopter",
 "max gross weight": "73500 lb (33300 kg)",
 "total disk area": "4900 ft square (460 m square)",
 "max disk loading": "15 lb / ft square (72 kg / m square)"}]
"""

# Rebinds df_sample to a label-based slice of df; .loc slicing includes the end
# label, and labels dropped by the earlier Arabic filter leave gaps, so the row
# count is not necessarily 51.
# NOTE(review): an earlier cell built df_sample from df.loc[:100] and mapped its
# 'answer' column through transform_json_to_records. Whether that conversion is
# still present after this rebind depends on whether the earlier assignment
# wrote through to df — verify before relying on 'answer' being a list here.
# NOTE(review): this cell's execution count is 25 and the next one is 27, so a
# cell appears to have been removed or run out of order — confirm the notebook
# reproduces under Restart & Run All.
df_sample =df.loc[:50]
"application/vnd.jupyter.widget-view+json": { + "model_id": "b3839b3add664f1fa87d7e97c0f8e4fc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/642 [00:00