{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Details" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`/raw/DocLayNet_core.zip` downloaded from [DocLayNet_core.zip dataset](https://codait-cos-dax.s3.us.cloud-object-storage.appdomain.cloud/dax-doclaynet/1.0.0/DocLayNet_core.zip)\n", "\n", "`/raw/RVL-CDIP-invoice.zip` downloaded from [chainyo/rvl-cdip-invoice](https://huggingface.co/datasets/chainyo/rvl-cdip-invoice). It can also be downloaded from [aharley/rvl_cdip](https://huggingface.co/datasets/aharley/rvl_cdip).\n", "\n", "`/processed/vectors/RVL-CDIP-invoice.json.zip` generated using `/raw/RVL-CDIP-invoice.zip`, and the model to create the following features." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preview" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "ename": "BadZipFile", "evalue": "File is not a zip file", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mBadZipFile\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[3], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mzipfile\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m filelist \u001b[38;5;241m=\u001b[39m \u001b[43mzipfile\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mZipFile\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m./raw/DocLayNet_core.zip\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mfilelist\n\u001b[1;32m 4\u001b[0m filelist\n", "File \u001b[0;32m~/miniconda3/envs/dss-env/lib/python3.10/zipfile.py:1269\u001b[0m, in \u001b[0;36mZipFile.__init__\u001b[0;34m(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps)\u001b[0m\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1268\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m-> 1269\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_RealGetContents\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1270\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m mode \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 1271\u001b[0m \u001b[38;5;66;03m# set the modified flag so central directory gets written\u001b[39;00m\n\u001b[1;32m 1272\u001b[0m \u001b[38;5;66;03m# even if no files are added to the archive\u001b[39;00m\n\u001b[1;32m 1273\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_didModify \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", "File \u001b[0;32m~/miniconda3/envs/dss-env/lib/python3.10/zipfile.py:1336\u001b[0m, in \u001b[0;36mZipFile._RealGetContents\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1334\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m BadZipFile(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile is not a zip file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1335\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m endrec:\n\u001b[0;32m-> 1336\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m BadZipFile(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile is not a zip file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1337\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdebug \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 1338\u001b[0m \u001b[38;5;28mprint\u001b[39m(endrec)\n", "\u001b[0;31mBadZipFile\u001b[0m: File is not a zip file" ] } ], "source": [ "import zipfile\n", "\n", "filelist = zipfile.ZipFile('./raw/DocLayNet_core.zip', 'r').filelist\n", "filelist" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from utils.read_zip_file import read_zip_file\n", "\n", "read_zip_file('./raw/DocLayNet_core.zip', './raw/DocLayNet_core')" ] } ], "metadata": { "kernelspec": { "display_name": "dss-env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 2 }