File size: 35,608 Bytes
002bd9b |
|
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"import json\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"db_path = \"../../exp/annotations.db\"\n",
"\n",
"# Open the database read-only through a SQLite URI. A plain connect(db_path) silently\n",
"# creates a new, empty database when the path is wrong; with mode=ro a missing file\n",
"# raises sqlite3.OperationalError here instead. This notebook only reads.\n",
"# NOTE: a path containing '?', '#' or '%' would need percent-encoding in the URI.\n",
"conn = sqlite3.connect(\"file:{}?mode=ro\".format(db_path), uri=True)\n",
"cursor = conn.cursor()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Table Names:\n",
"visual_genome_densecap_local_train\n",
"visual_genome_densecap_local_eval_visual_genome_densecap_local_densecap_test\n",
"\n",
"Schema for table visual_genome_densecap_local_train\n",
"region_id INTEGER\n",
"image_id INTEGER\n",
"width INTEGER\n",
"height INTEGER\n",
"file_name TEXT\n",
"coco_url TEXT\n",
"task_type TEXT\n",
"phrases TEXT\n",
"tokenized_phrases TEXT\n",
"x REAL\n",
"y REAL\n",
"region_width REAL\n",
"region_height REAL\n",
"\n",
"Schema for table visual_genome_densecap_local_eval_visual_genome_densecap_local_densecap_test\n",
"region_id INTEGER\n",
"image_id INTEGER\n",
"width INTEGER\n",
"height INTEGER\n",
"file_name TEXT\n",
"coco_url TEXT\n",
"task_type TEXT\n",
"phrases TEXT\n",
"tokenized_phrases TEXT\n",
"x REAL\n",
"y REAL\n",
"region_width REAL\n",
"region_height REAL\n"
]
}
],
"source": [
"# Get the table names\n",
"table_names = [\n",
"    name\n",
"    for (name,) in cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
"]\n",
"print(\"Table Names:\")\n",
"for name in table_names:\n",
"    print(name)\n",
"\n",
"# Get the schema of each table\n",
"for name in table_names:\n",
"    print(\"\\nSchema for table\", name)\n",
"    # Double-quote the identifier (doubling any embedded quote) so table names that\n",
"    # contain spaces, hyphens or SQL keywords still form a valid statement.\n",
"    quoted_name = '\"{}\"'.format(name.replace('\"', '\"\"'))\n",
"    schema = cursor.execute(\"PRAGMA table_info({})\".format(quoted_name)).fetchall()\n",
"    # Each table_info row is (cid, name, type, notnull, dflt_value, pk).\n",
"    for column in schema:\n",
"        print(column[1], column[2])"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Rows for table visual_genome_densecap_local_train\n"
]
},
{
"data": {
"text/plain": [
"(929, 3684063, 0.00025216724035392445)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"table_id = 0\n",
"# Double-quote the table name (doubling any embedded quote) so identifiers with\n",
"# spaces, hyphens or SQL keywords still form a valid statement.\n",
"quoted_table = '\"{}\"'.format(table_names[table_id].replace('\"', '\"\"'))\n",
"fetch = cursor.execute(\"SELECT tokenized_phrases FROM {}\".format(quoted_table))\n",
"print(\"\\nRows for table\", table_names[table_id])\n",
"# Every row holds one JSON-encoded list of phrases; collect len() of each phrase\n",
"# (its token count, assuming each phrase is itself a list of tokens).\n",
"num_tokens = []\n",
"for (phrases_json,) in fetch:\n",
"    num_tokens.extend(len(phrase) for phrase in json.loads(phrases_json))\n",
"threshold = 20\n",
"num_tokens_array = np.array(num_tokens)\n",
"# Count the phrases longer than the threshold once and reuse it for the fraction.\n",
"num_over_threshold = np.sum(num_tokens_array > threshold)\n",
"# Displayed as (count over threshold, total phrases, fraction over threshold).\n",
"num_over_threshold, len(num_tokens_array), num_over_threshold / len(num_tokens_array)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# One bin per integer token count from 0 to threshold. Edges sit on the half-integers\n",
"# because the final histogram bin is closed on both sides: with bins=20, range=(0, 20)\n",
"# the counts for 19 and 20 tokens were merged into a single bar.\n",
"fig, ax = plt.subplots()\n",
"out = ax.hist(num_tokens, bins=np.arange(threshold + 2) - 0.5)\n",
"ax.set_xlabel(\"Number of tokens\")\n",
"ax.set_ylabel(\"Number of phrases\")\n",
"# Title with the table that num_tokens was computed from, so the figure stands alone.\n",
"ax.set_title(table_names[table_id])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Rows for table visual_genome_densecap_local_eval_visual_genome_densecap_local_densecap_test\n"
]
},
{
"data": {
"text/plain": [
"(60, 238069, 0.0002520277734606354)"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"table_id = 1\n",
"# Double-quote the table name (doubling any embedded quote) so identifiers with\n",
"# spaces, hyphens or SQL keywords still form a valid statement.\n",
"quoted_table = '\"{}\"'.format(table_names[table_id].replace('\"', '\"\"'))\n",
"fetch = cursor.execute(\"SELECT tokenized_phrases FROM {}\".format(quoted_table))\n",
"print(\"\\nRows for table\", table_names[table_id])\n",
"# Every row holds one JSON-encoded list of phrases; collect len() of each phrase\n",
"# (its token count, assuming each phrase is itself a list of tokens).\n",
"num_tokens = []\n",
"for (phrases_json,) in fetch:\n",
"    num_tokens.extend(len(phrase) for phrase in json.loads(phrases_json))\n",
"threshold = 20\n",
"num_tokens_array = np.array(num_tokens)\n",
"# Count the phrases longer than the threshold once and reuse it for the fraction.\n",
"num_over_threshold = np.sum(num_tokens_array > threshold)\n",
"# Displayed as (count over threshold, total phrases, fraction over threshold).\n",
"num_over_threshold, len(num_tokens_array), num_over_threshold / len(num_tokens_array)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "module 'matplotlib.pyplot' has no attribute 'set_xlabel'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/t-yutonglin/xiaoke/segment-caption-anything-v2/scripts/notebooks/dataset_statstics_db.ipynb Cell 7\u001b[0m line \u001b[0;36m2\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2Bazure_tunnel/home/t-yutonglin/xiaoke/segment-caption-anything-v2/scripts/notebooks/dataset_statstics_db.ipynb#X21sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m plt\u001b[39m.\u001b[39mhist(num_tokens, bins\u001b[39m=\u001b[39m\u001b[39m20\u001b[39m, \u001b[39mrange\u001b[39m\u001b[39m=\u001b[39m(\u001b[39m0\u001b[39m, \u001b[39m20\u001b[39m))\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2Bazure_tunnel/home/t-yutonglin/xiaoke/segment-caption-anything-v2/scripts/notebooks/dataset_statstics_db.ipynb#X21sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m plt\u001b[39m.\u001b[39;49mset_xlabel(\u001b[39m\"\u001b[39m\u001b[39mNumber of tokens\u001b[39m\u001b[39m\"\u001b[39m)\n",
"\u001b[0;31mAttributeError\u001b[0m: module 'matplotlib.pyplot' has no attribute 'set_xlabel'"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# One bin per integer token count from 0 to threshold. Edges sit on the half-integers\n",
"# because the final histogram bin is closed on both sides: with bins=20, range=(0, 20)\n",
"# the counts for 19 and 20 tokens were merged into a single bar.\n",
"fig, ax = plt.subplots()\n",
"ax.hist(num_tokens, bins=np.arange(threshold + 2) - 0.5)\n",
"# pyplot has no set_xlabel(); that method belongs to the Axes object\n",
"# (the pyplot-level equivalent is plt.xlabel).\n",
"ax.set_xlabel(\"Number of tokens\")\n",
"ax.set_ylabel(\"Number of phrases\")\n",
"# Title with the table that num_tokens was computed from, so the figure stands alone.\n",
"ax.set_title(table_names[table_id])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "sca-v2",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|