Charles Kabui committed on
Commit
810a7a1
1 Parent(s): dec87ec

read zip files

Browse files
data/preview.ipynb ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Details"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "`/raw/DocLayNet_core.zip` downloaded from [DocLayNet_core.zip dataset](https://codait-cos-dax.s3.us.cloud-object-storage.appdomain.cloud/dax-doclaynet/1.0.0/DocLayNet_core.zip)\n",
15
+ "\n",
16
+ "`/raw/RVL-CDIP-invoice.zip` downloaded from [chainyo/rvl-cdip-invoice](https://huggingface.co/datasets/chainyo/rvl-cdip-invoice). It can also be downloaded from [aharley/rvl_cdip](https://huggingface.co/datasets/aharley/rvl_cdip).\n",
17
+ "\n",
18
+ "`/processed/vectors/RVL-CDIP-invoice.json.zip` was generated from `/raw/RVL-CDIP-invoice.zip` using the model that creates the following features."
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {},
24
+ "source": [
25
+ "### Preview"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 3,
31
+ "metadata": {},
32
+ "outputs": [
33
+ {
34
+ "ename": "BadZipFile",
35
+ "evalue": "File is not a zip file",
36
+ "output_type": "error",
37
+ "traceback": [
38
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
39
+ "\u001b[0;31mBadZipFile\u001b[0m Traceback (most recent call last)",
40
+ "Cell \u001b[0;32mIn[3], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mzipfile\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m filelist \u001b[38;5;241m=\u001b[39m \u001b[43mzipfile\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mZipFile\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m./raw/DocLayNet_core.zip\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mfilelist\n\u001b[1;32m 4\u001b[0m filelist\n",
41
+ "File \u001b[0;32m~/miniconda3/envs/dss-env/lib/python3.10/zipfile.py:1269\u001b[0m, in \u001b[0;36mZipFile.__init__\u001b[0;34m(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps)\u001b[0m\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1268\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m mode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m-> 1269\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_RealGetContents\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1270\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m mode \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 1271\u001b[0m \u001b[38;5;66;03m# set the modified flag so central directory gets written\u001b[39;00m\n\u001b[1;32m 1272\u001b[0m \u001b[38;5;66;03m# even if no files are added to the archive\u001b[39;00m\n\u001b[1;32m 1273\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_didModify \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
42
+ "File \u001b[0;32m~/miniconda3/envs/dss-env/lib/python3.10/zipfile.py:1336\u001b[0m, in \u001b[0;36mZipFile._RealGetContents\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1334\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m BadZipFile(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile is not a zip file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1335\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m endrec:\n\u001b[0;32m-> 1336\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m BadZipFile(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFile is not a zip file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1337\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdebug \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 1338\u001b[0m \u001b[38;5;28mprint\u001b[39m(endrec)\n",
43
+ "\u001b[0;31mBadZipFile\u001b[0m: File is not a zip file"
44
+ ]
45
+ }
46
+ ],
47
+ "source": [
48
+ "import zipfile\n",
49
+ "\n",
50
+ "filelist = zipfile.ZipFile('./raw/DocLayNet_core.zip', 'r').filelist\n",
51
+ "filelist"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "from utils.read_zip_file import read_zip_file\n",
61
+ "\n",
62
+ "read_zip_file('./raw/DocLayNet_core.zip', './raw/DocLayNet_core')"
63
+ ]
64
+ }
65
+ ],
66
+ "metadata": {
67
+ "kernelspec": {
68
+ "display_name": "dss-env",
69
+ "language": "python",
70
+ "name": "python3"
71
+ },
72
+ "language_info": {
73
+ "codemirror_mode": {
74
+ "name": "ipython",
75
+ "version": 3
76
+ },
77
+ "file_extension": ".py",
78
+ "mimetype": "text/x-python",
79
+ "name": "python",
80
+ "nbconvert_exporter": "python",
81
+ "pygments_lexer": "ipython3",
82
+ "version": "3.10.13"
83
+ }
84
+ },
85
+ "nbformat": 4,
86
+ "nbformat_minor": 2
87
+ }
data/raw/README.md DELETED
@@ -1,33 +0,0 @@
1
- How the data got here:
2
-
3
- 1. Downloading the [DocLayNet_core.zip dataset](https://codait-cos-dax.s3.us.cloud-object-storage.appdomain.cloud/dax-doclaynet/1.0.0/DocLayNet_core.zip) using `wget`.
4
- 2. Uploading the downloaded `DocLayNet_core.zip` file to Hugging Face space.
5
- 3. Loading a dataset (`chainyo/rvl-cdip-invoice`) using the `load_dataset` function from the Hugging Face library.
6
- 4. Extracting and saving each image from the `train` portion of the loaded dataset into the `RVL-CDIP-invoice` directory.
7
- 5. Compressing the `RVL-CDIP-invoice` directory into a zip file (`RVL-CDIP-invoice.zip`) using the `zip` command.
8
- 6. Uploading the zip file to Hugging Face space.
9
-
10
- ```
11
- # # !wget https://codait-cos-dax.s3.us.cloud-object-storage.appdomain.cloud/dax-doclaynet/1.0.0/DocLayNet_core.zip -O DocLayNet_core.zip
12
- # upload_to_huggingface_space(
13
- # space_name = HUGGINGFACE_SPACE_NAME,
14
- # private = True,
15
- # path_or_fileobj = './DocLayNet_core.zip',
16
- # path_in_repo = 'data/raw/DocLayNet_core.zip')
17
- #
18
- #
19
- # invoices = load_dataset('chainyo/rvl-cdip-invoice', revision="fad615c9ceaecb4476b0a01f29c0a15b276b3a2b")
20
- # # can also be found at: https://huggingface.co/datasets/aharley/rvl_cdip
21
- # os.mkdir('./RVL-CDIP-invoice')
22
- # for index, invoice in enumerate(tqdm(invoices['train'])):
23
- # invoice['image'].save(f'./RVL-CDIP-invoice/{index}.png', format="png")
24
- #
25
- # !ls ./RVL-CDIP-invoice -1 | wc -l
26
- # !zip -r RVL-CDIP-invoice.zip ./RVL-CDIP-invoice
27
- #
28
- # upload_to_huggingface_space(
29
- # space_name = HUGGINGFACE_SPACE_NAME,
30
- # private = True,
31
- # path_or_fileobj = './RVL-CDIP-invoice.zip',
32
- # path_in_repo = f'data/raw/RVL-CDIP-invoice.zip')
33
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/raw/preview.ipynb DELETED
File without changes
utils/read_zip_file.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import zipfile
3
+
4
def read_zip_file(archive_path: str, file_path: str) -> io.BytesIO:
    """
    Read a single file (member) from a zip archive into memory.

    Args:
        archive_path (str): The path to the zip archive.
        file_path (str): The path to the file inside the archive, i.e. the
            member name as stored in the archive.

    Returns:
        io.BytesIO: The member's bytes as an in-memory buffer positioned at
        offset 0. An empty member yields an empty buffer, so callers always
        receive the same file-like type.

    Raises:
        zipfile.BadZipFile: If ``archive_path`` is not a valid zip archive.
        KeyError: If ``file_path`` is not a member of the archive.
    """
    with zipfile.ZipFile(archive_path) as zip_file:
        # ZipFile.read returns bytes (possibly empty), never None.
        data = zip_file.read(file_path)
    # Wrap unconditionally: returning the raw empty bytes for an empty member
    # would break callers that use the documented file-like interface.
    return io.BytesIO(data)