ZeroCool94 keturn committed on
Commit
abbf29d
0 Parent(s):

Duplicate from keturn/INED-datasette

Browse files

Co-authored-by: Kevin Turner <keturn@users.noreply.huggingface.co>

Files changed (7) hide show
  1. .gitattributes +34 -0
  2. Dockerfile +31 -0
  3. README.md +11 -0
  4. metadata.json +14 -0
  5. settings.json +3 -0
  6. src/import-git.sh +16 -0
  7. src/textdir2sql/loading.py +90 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image that builds the INE caption database at build time and serves it
# with Datasette on the Hugging Face Spaces default port.
FROM datasetteproject/datasette:0.64.1

# huggingface spaces run as user 1000
RUN adduser hf-space --uid 1000 --disabled-password --gecos '' && \
    mkdir /home/hf-space/app && \
    chown hf-space: /home/hf-space/app
WORKDIR /home/hf-space/app

# Datasette plugins installed into the image.
RUN datasette install datasette-configure-fts && \
    datasette install datasette-render-image-tags

# git is required by src/import-git.sh, which clones the dataset repository.
# The apt/dpkg metadata is deleted afterwards (presumably to keep the layer
# small); package management is not expected to be used after this step.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    apt-get clean && \
    rm -rf /var/lib/apt && \
    rm -rf /var/lib/dpkg/info/*

# Everything below, including the database build, runs as the unprivileged user.
USER hf-space

# spaces default port
EXPOSE 7860
ENTRYPOINT ["datasette", "--host=0.0.0.0", "--port=7860"]
# Default argument appended to the entrypoint: serve the working directory.
# NOTE(review): presumably relies on Datasette's configuration-directory mode
# to pick up *.db, metadata.json, settings.json and inspect-data.json - confirm.
CMD ["."]

# Lets `python3 -m textdir2sql.loading` (invoked by import-git.sh) resolve the
# package copied to src/.
ENV PYTHONPATH=/home/hf-space/app/src/

COPY src src
COPY metadata.json settings.json ./

# Build the SQLite database during the image build, then record inspection
# data for the resulting *.db files.
RUN src/import-git.sh && \
    datasette inspect *.db --inspect-file=inspect-data.json
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: INED Datasette
3
+ emoji: 🐢
4
+ colorFrom: indigo
5
+ colorTo: pink
6
+ sdk: docker
7
+ pinned: false
8
+ duplicated_from: keturn/INED-datasette
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
metadata.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "title": "Imaginary Network Expanded Dataset",
3
+ "description": "Curated by Sygil",
4
+ "source_url": "https://github.com/Sygil-Dev/INE-dataset",
5
+ "databases": {
6
+ "INE": {
7
+ "tables": {
8
+ "images": {
9
+ "fts_table": "captions_fts"
10
+ }
11
+ }
12
+ }
13
+ }
14
+ }
settings.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ {
2
+ "default_page_size": 20
3
+ }
src/import-git.sh ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -x -e -o pipefail
3
+
4
+ REPO="https://github.com/Sygil-Dev/INE-dataset.git"
5
+ IMAGE_HOST="https://raw.githubusercontent.com/Sygil-Dev/INE-dataset/main/data/"
6
+
7
+ # avoid cloning all the image files
8
+ git clone --no-checkout --filter=blob:none --depth 1 "${REPO}" dataset
9
+
10
+ # Beware `--no-cone` is deprecated, so this may stop working someday
11
+ # https://git-scm.com/docs/git-sparse-checkout#_internalsnon_cone_problems
12
+ git -C dataset sparse-checkout set --no-cone '/data/*.txt'
13
+ git -C dataset checkout main
14
+
15
+ python3 -m textdir2sql.loading dataset/data INE.db \
16
+ --image-host="${IMAGE_HOST}"
src/textdir2sql/loading.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ from functools import partial
3
+ from itertools import islice
4
+ from pathlib import Path
5
+
6
+ import click
7
+
8
# Number of caption files read and inserted per database transaction.
BATCH_SIZE = 1024
9
+
10
+
11
@click.command()
@click.argument('input_dir', type=click.Path(exists=True, file_okay=False, path_type=Path))
@click.argument('output', type=click.Path(dir_okay=False, writable=True, path_type=Path))
@click.option('--image-host', help="base URL of images")
@click.option('--explicit/--no-explicit', default=False,
              help="keep captions tagged as explicit/NSFW (removed by default)")
def main(input_dir: Path, output: Path, image_host: str, explicit: bool):
    """Load the caption text files in INPUT_DIR into the SQLite database OUTPUT.

    Every *.txt file becomes one row keyed by its file name without the
    extension, and a full-text search index over the captions is built.
    """
    # NOTE: click uses the docstring above as the command's --help text, so it
    # is written for end users; image_host is None when --image-host is omitted.
    connection = sqlite3.connect(output)
    try:
        _main_with_connection(input_dir, connection, image_host, explicit)
    finally:
        # Release the database handle even when the import fails part-way.
        connection.close()
22
+
23
def _main_with_connection(input_dir: Path, connection: sqlite3.Connection,
                          image_host: str = None, explicit=False):
    """
    Import caption files into the database and build its full-text index.

    Not re-runnable against a database that already holds these rows: the
    plain INSERT and the unconditional CREATE VIRTUAL TABLE both fail on
    existing data.

    :param input_dir: Directory whose ``*.txt`` files are imported; the file
        stem becomes ``image_key`` and the file content becomes ``caption``
    :param connection: Open SQLite connection to write to
    :param image_host: Base URL of images; when truthy, an ``images`` view is
        created that exposes ``image_host || image_key || '.jpg'`` as ``image``
    :param explicit: When false, rows whose caption matches one of the
        rating/NSFW tags are deleted before indexing
    """
    connection.execute("CREATE TABLE IF NOT EXISTS "
                       " captions(image_key text PRIMARY KEY, caption text NOT NULL);")

    if image_host:
        # A view definition cannot take bound parameters, so the host is
        # embedded as a literal that SQLite itself has quoted.
        connection.execute(f"""
            CREATE VIEW IF NOT EXISTS images AS
            SELECT {sql_quote(connection, image_host)} || image_key || '.jpg' AS image,
            caption,
            rowid
            FROM captions
        """)

    text_files = input_dir.glob("*.txt")

    with click.progressbar(chunked(text_files, BATCH_SIZE)) as progress:
        for batch in progress:
            text_file: Path
            # Decode explicitly as UTF-8 instead of relying on the locale's
            # default encoding, which may mangle or reject non-ASCII captions.
            pairs = ((text_file.stem, text_file.read_text(encoding="utf-8"))
                     for text_file in batch)
            # One transaction per batch: committed on success, rolled back on error.
            with connection:
                connection.executemany("INSERT INTO captions(image_key, caption) "
                                       "VALUES(?, ?) ", pairs)

    if not explicit:
        # Each entry is wrapped in %...% below; the % inside "subreddit:%nsfw"
        # is itself a LIKE wildcard.
        ratings = ["rating:unsafe", "rating:explicit", "rating:mature", "meta:nsfw",
                   "subreddit:%nsfw"]
        for rating in ratings:
            with connection:
                c = connection.execute("DELETE FROM captions WHERE caption LIKE ?",
                                       (f"%{rating}%",))
            print(f"Removed {c.rowcount} {rating} rows")

    with connection:
        # Add full-text search index
        connection.execute("""CREATE VIRTUAL TABLE
            captions_fts USING
            fts5(caption, image_key UNINDEXED, content=captions)
        """)
        # Populate the index from the rows that survived the filtering above.
        connection.execute("""
            INSERT INTO "captions_fts" (rowid, image_key, caption)
            SELECT rowid, image_key, caption
            FROM captions
        """)
67
+
68
+
69
def chunked(iterable, n):
    """
    Split an iterable into consecutive lists of at most ``n`` items.
    :param iterable: Source of items; consumed lazily, one batch at a time
    :param n: Maximum number of items per batch
    :return: Iterator of lists that ends once the source is exhausted
    """
    source = iter(iterable)

    def next_batch():
        return list(islice(source, n))

    # Two-argument iter(): keep calling next_batch() until it returns [].
    return iter(next_batch, [])
71
+
72
def take(n, iterable):
    """
    Return up to the first ``n`` items of ``iterable`` as a list.
    :param n: Maximum number of items to take
    :param iterable: Source of items; an iterator is advanced past what is taken
    """
    head = islice(iterable, n)
    return [*head]
74
+
75
def sql_quote(connection, value: str) -> str:
    """
    Return ``value`` as a SQLite string literal, surrounding single quotes included.
    :param connection: SQLite connection used to evaluate ``quote()``
    :param value: String to quote
    """
    # Binding parameters with .execute(sql, [params]) is the usual way to
    # escape values, but it is not available where the literal must be part
    # of the SQL text itself, e.g. "... DEFAULT 'value'" in a column definition.
    params = {"value": value}
    # Let SQLite do the escaping so the result is exactly what it expects.
    cursor = connection.execute("SELECT quote(:value)", params)
    (quoted,) = cursor.fetchone()
    return quoted
88
+
89
# Run the click command when the module is executed directly, e.g. via
# `python3 -m textdir2sql.loading`.
if __name__ == "__main__":
    main()