fil committed
Commit 66bfd23 · unverified · 2 parents: 13b32a7 c9c3380

uses eleventy's cache location for now

Files changed (2)
  1. docs/data/presse.parquet.sh +25 -22
  2. docs/index.md +1 -1
docs/data/presse.parquet.sh CHANGED
@@ -1,27 +1,30 @@
-# install duckdb if not already present
-export PATH=.cache:$PATH
-command -v duckdb || $(
-  mkdir -p .cache
-  curl --location --output .cache/duckdb.zip \
-    https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip && \
-  unzip -qq .cache/duckdb.zip && chmod +x .cache/duckdb
-)
-
-export TMPDIR="dist"
+# Use "eleventy" .cache to store our temp files
+export TMPDIR=".cache"
 mkdir -p $TMPDIR
 
-echo """
-CREATE TABLE presse AS (
-  SELECT title
-       , author
-       , LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
-  FROM read_parquet(
-    [('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
-  ORDER BY title, author, year
-);
+if [ ! -f "$TMPDIR/presse.parquet" ]; then
+
+# install duckdb if not already present
+export PATH=.cache:$PATH
+command -v duckdb || $(
+  curl --location --output duckdb.zip \
+    https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip && \
+  unzip -qq duckdb.zip && chmod +x duckdb && mkdir -p .cache && mv duckdb .cache/
+)
+
+echo """
+CREATE TABLE presse AS (
+  SELECT title
+       , author
+       , LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
+  FROM read_parquet(
+    [('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
+  ORDER BY title, author, year
+);
 
-COPY presse TO '$TMPDIR/presse.parquet' (COMPRESSION 'ZSTD', row_group_size 10000000);
-""" | duckdb
+COPY presse TO '$TMPDIR/presse.parquet' (COMPRESSION 'ZSTD', row_group_size 10000000);
+""" | duckdb
+fi
 
 # isatty
 if [ -t 1 ]; then
@@ -29,5 +32,5 @@ if [ -t 1 ]; then
   echo "duckdb -csv :memory: \"SELECT * FROM '$TMPDIR/presse.parquet'\""
 else
   cat $TMPDIR/presse.parquet
-  rm $TMPDIR/presse.parquet
+  #rm $TMPDIR/presse.parquet
 fi
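
Note: with the new guard, the full scan of the source files only happens when `.cache/presse.parquet` is absent; later builds reuse the cached file. As a minimal sketch of sanity-checking that cached output — assuming the duckdb binary the script installs into .cache/ and the `year` column its CREATE TABLE produces — one could run:

# Sketch: query the cached parquet produced by the loader above.
# Assumes .cache/duckdb exists (installed by the script) and that the
# loader has already written .cache/presse.parquet with a "year" column.
export PATH=.cache:$PATH
duckdb -csv :memory: \
  "SELECT year, COUNT(*) AS n FROM '.cache/presse.parquet' GROUP BY year ORDER BY year LIMIT 5"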
docs/index.md CHANGED
@@ -25,7 +25,7 @@
 
 <p class=signature>by <a href="https://observablehq.com/@fil">Fil</a>
 
-This new and fascinating dataset just dropped on Hugging Face&nbsp;: [French public domain newspapers](https://huggingface.co/datasets/PleIAs/French-PD-Newspapers) 🤗 references about **3&nbsp;million newspapers and periodicals** with their full text OCR’ed and some meta-data.
+This new fascinating dataset just dropped on Hugging Face&nbsp;: [French public domain newspapers](https://huggingface.co/datasets/PleIAs/French-PD-Newspapers) 🤗 references about **3&nbsp;million newspapers and periodicals** with their full text OCR’ed and some meta-data.
 
 The data is stored in 320 large parquet files. The data loader for this [Observable framework](https://observablehq.com/framework) project uses [DuckDB](https://duckdb.org/) to read these files (altogether about 200GB) and combines a minimal subset of their metadata — title and year of publication, most importantly without the text contents&nbsp;—, into a single highly optimized parquet file. This takes only about 1 minute to run in a hugging-face Space.
 
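
To make the loader's first step concrete, here is a hypothetical one-liner (not part of the project) for peeking at the schema of a single source file. It assumes a local duckdb CLI that can load the httpfs extension (autoloaded in recent DuckDB releases):

# Sketch: describe one of the 320 source parquet files without downloading
# it in full; DuckDB only needs the parquet footer/metadata over HTTP.
duckdb :memory: \
  "DESCRIBE SELECT * FROM read_parquet('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_1.parquet')"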