{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-overview-0001",
   "metadata": {},
   "source": [
    "# Build a fixed-size subset of the BLIP caption file\n",
    "\n",
    "This notebook loads `metadata.json` and the BLIP caption JSON, inspects a few records, keeps the first `SUBSET_SIZE` caption records, and writes them to `llava-phi/pretrain_data/blip_sample.json`.\n",
    "\n",
    "Values observed in an earlier run of this notebook (outputs were cleared when the cells were reorganised; re-run to confirm against your copy of the data):\n",
    "\n",
    "- `metadata.json`: 595,375 records; record 67 had the keys `id`, `image`, `caption`, `blip_caption`, `url`\n",
    "- caption file: a `list` of 558,128 records; records 0, 67 and 234 had the keys `id`, `image`, `conversations`, where `conversations` held one `human` prompt and one `gpt` caption\n",
    "- written subset: 200,000 records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61c272f2-edbe-4b7d-8fec-3ab431400cd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-paths-0002",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might need to change lives in this cell.\n",
    "# All paths are relative to the notebook's working directory.\n",
    "METADATA_PATH = Path(\"metadata.json\")\n",
    "\n",
    "# NOTE(review): this file name repeats blip_laion_cc_sbu_558k.json twice, which looks\n",
    "# like an accidental double paste. It is kept exactly as originally written because the\n",
    "# saved notebook shows no error for the cell that used it; confirm the name on disk.\n",
    "CAPTIONS_PATH = Path(\"blip_laion_cc_sbu_558k.jsonblip_laion_cc_sbu_558k.json\")\n",
    "\n",
    "SUBSET_PATH = Path(\"llava-phi/pretrain_data/blip_sample.json\")\n",
    "SUBSET_SIZE = 200_000  # number of leading caption records to keep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "helper-load-json-0003",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_json(path: Path):\n",
    "    \"\"\"Parse the JSON document stored at ``path`` and return the resulting object.\n",
    "\n",
    "    The file is opened in binary mode, so ``json.load`` works out the text\n",
    "    encoding (UTF-8, UTF-16 or UTF-32) by itself.\n",
    "\n",
    "    Raises ``FileNotFoundError`` if ``path`` does not exist and\n",
    "    ``json.JSONDecodeError`` if the content is not valid JSON.\n",
    "    \"\"\"\n",
    "    with path.open(\"rb\") as f:\n",
    "        return json.load(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-metadata-0004",
   "metadata": {},
   "source": [
    "## Load and inspect the metadata\n",
    "\n",
    "The metadata is only inspected here; it is not used to build the subset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9dfd7d7-1685-4fc7-bbb9-3905c32d8ba1",
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata = load_json(METADATA_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70bdba48-db01-42ac-8d89-edc69d7d7672",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11ed106d-6bef-482c-a456-5eaaf2025534",
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata[67]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-captions-0005",
   "metadata": {},
   "source": [
    "## Load and inspect the caption records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59e193cc-0dd8-4f7e-959a-fbad0133d76c",
   "metadata": {},
   "outputs": [],
   "source": [
    "captions = load_json(CAPTIONS_PATH)\n",
    "\n",
    "# The subset below is taken by slicing, so the top-level JSON value must be a list.\n",
    "assert isinstance(captions, list), f\"expected a list, got {type(captions).__name__}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e6d5664-4583-49a6-93cc-079ee2d1ff6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(captions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3157f41-269b-4f7a-b3ba-9be711babe02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot-check the same three records the original exploration looked at.\n",
    "[captions[i] for i in (0, 67, 234)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-subset-0006",
   "metadata": {},
   "source": [
    "## Take the subset\n",
    "\n",
    "The subset is the **first** `SUBSET_SIZE` records in file order. Despite the output file name `blip_sample.json`, it is not a random draw."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ec33b51-4a0b-4a1e-81f7-2fda7cddb25f",
   "metadata": {},
   "outputs": [],
   "source": [
    "captions_subset = captions[:SUBSET_SIZE]\n",
    "\n",
    "# A slice silently returns fewer items when the source is shorter than SUBSET_SIZE.\n",
    "assert len(captions_subset) == min(SUBSET_SIZE, len(captions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "095685e5-40f1-4d84-8280-ef74fa56c5a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(captions_subset)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-write-0007",
   "metadata": {},
   "source": [
    "## Write the subset and verify the round trip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffbad552-23fd-475f-8e9a-7118bcc4f51e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# open() does not create missing parent folders, so make sure the target directory exists.\n",
    "SUBSET_PATH.parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "with SUBSET_PATH.open(\"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(captions_subset, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69a05d25-6f3b-40c0-a3b5-e185ff526471",
   "metadata": {},
   "outputs": [],
   "source": [
    "captions_reloaded = load_json(SUBSET_PATH)\n",
    "\n",
    "# Reading the file back must reproduce exactly what was written.\n",
    "assert captions_reloaded == captions_subset, \"subset changed during the JSON round trip\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "200eea06-dfd6-4b3a-bb91-82af7d363951",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(captions_reloaded)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}