File size: 7,598 Bytes

b0c0df0

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This notebook shows how to package a dataset in the correct Hugging Face format: stored as Parquet and viewable on the Hugging Face dataset hub.\n",
    "# As an example, we convert human-annotated video captions into the \"lmms-lab/VideoDetailDescription\" dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tiger/miniconda3/envs/llava/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "100%|██████████| 499/499 [00:18<00:00, 26.87it/s]\n"
     ]
    }
   ],
   "source": [
    "from datasets import Dataset, Features, Value, Image\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "\n",
    "# Schema of the dataset: one row per video, every column a plain string.\n",
    "features = Features(\n",
    "    {\n",
    "        \"video_name\": Value(dtype=\"string\"),\n",
    "        \"question\": Value(dtype=\"string\"),\n",
    "        \"answer\": Value(dtype=\"string\"),\n",
    "    }\n",
    ")\n",
    "\n",
    "# Directory that is scanned for caption files (one entry per video).\n",
    "description_root = \"/mnt/bn/vl-research/workspace/yhzhang/data/llava_video/video_detail_description/Test_Human_Annotated_Captions\"\n",
    "# The same prompt is paired with every caption, so it is defined once outside the loop.\n",
    "question = \"Please provide a detailed description of the video, focusing on the main subjects, their actions, and the background scenes\"\n",
    "\n",
    "# os.listdir() returns entries in arbitrary order; sort so the row order is reproducible.\n",
    "videos = sorted(os.listdir(description_root))\n",
    "\n",
    "# Collect one record per caption file and build the DataFrame once at the end.\n",
    "records = []\n",
    "for cur_video_name in tqdm(videos):\n",
    "    description_path = os.path.join(description_root, cur_video_name)\n",
    "    # Skip sub-directories (e.g. .ipynb_checkpoints): open() would fail on them.\n",
    "    if not os.path.isfile(description_path):\n",
    "        continue\n",
    "    # splitext() drops only the final extension, so dots inside the name are kept.\n",
    "    video_name = os.path.splitext(cur_video_name)[0]\n",
    "    # utf-8-sig removes a leading byte-order mark if the file has one.\n",
    "    with open(description_path, encoding=\"utf-8-sig\") as f:\n",
    "        # The caption is the first line of the file; drop its line terminator.\n",
    "        description = f.readline().rstrip(\"\\r\\n\")\n",
    "    if not description:\n",
    "        # Fail loudly with the offending path instead of a bare IndexError.\n",
    "        raise ValueError(f\"No caption on the first line of {description_path}\")\n",
    "    records.append(\n",
    "        {\n",
    "            \"video_name\": video_name,\n",
    "            \"question\": question,\n",
    "            \"answer\": description,\n",
    "        }\n",
    "    )\n",
    "    # Add other fields as necessary\n",
    "\n",
    "# Take the column list from the schema so the frame keeps all columns even when no files were found.\n",
    "df_items = pd.DataFrame(records, columns=list(features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>video_name</th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>v_-6dz6tBH77I</td>\n",
       "      <td>Please provide a detailed description of the v...</td>\n",
       "      <td>The video is of a man in athletic clothes stan...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>v_-D1gdv_gQyw</td>\n",
       "      <td>Please provide a detailed description of the v...</td>\n",
       "      <td>The video begins with a man holding a knife in...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>v_-HpCLXdtcas</td>\n",
       "      <td>Please provide a detailed description of the v...</td>\n",
       "      <td>A man is standing behind a barbell placed on t...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>v_-IMXSEIabMM</td>\n",
       "      <td>Please provide a detailed description of the v...</td>\n",
       "      <td>The video starts with two people standing behi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>v_-MbZ-W0AbN0</td>\n",
       "      <td>Please provide a detailed description of the v...</td>\n",
       "      <td>The video starts with an advertisement for fur...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      video_name                                           question  \\\n",
       "0  v_-6dz6tBH77I  Please provide a detailed description of the v...   \n",
       "1  v_-D1gdv_gQyw  Please provide a detailed description of the v...   \n",
       "2  v_-HpCLXdtcas  Please provide a detailed description of the v...   \n",
       "3  v_-IMXSEIabMM  Please provide a detailed description of the v...   \n",
       "4  v_-MbZ-W0AbN0  Please provide a detailed description of the v...   \n",
       "\n",
       "                                              answer  \n",
       "0  The video is of a man in athletic clothes stan...  \n",
       "1  The video begins with a man holding a knife in...  \n",
       "2  A man is standing behind a barbell placed on t...  \n",
       "3  The video starts with two people standing behi...  \n",
       "4  The video starts with an advertisement for fur...  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows to sanity-check the collected captions.\n",
    "df_items.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the pandas frame into a Hugging Face Dataset using the schema held in `features`.\n",
    "dataset = Dataset.from_pandas(\n",
    "    df=df_items,\n",
    "    features=features,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 340.67ba/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.46it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/VideoDetailDescription/commit/ad8e58fa42ad8daf56808724a4bcf4724688194e', commit_message='Upload dataset', commit_description='', oid='ad8e58fa42ad8daf56808724a4bcf4724688194e', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Target repository on the Hugging Face Hub.\n",
    "hub_dataset_path = \"lmms-lab/VideoDetailDescription\"\n",
    "# Upload the rows as the \"test\" split; the call's return value is shown as the cell output.\n",
    "dataset.push_to_hub(\n",
    "    repo_id=hub_dataset_path,\n",
    "    split=\"test\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "lmms-eval",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}