fil committed on
Commit
c14a6cc
·
unverified ·
1 Parent(s): 37692ec

Update presse.parquet.sh

Browse files
Files changed (1) hide show
  1. docs/data/presse.parquet.sh +20 -23
docs/data/presse.parquet.sh CHANGED
@@ -1,30 +1,27 @@
1
  # Use "eleventy" .cache to store our temp files
2
- export TMPDIR=".cache"
3
  mkdir -p $TMPDIR
4
 
5
- if [ ! -f "$TMPDIR/presse.parquet" ]; then
 
 
 
 
 
 
6
 
7
- # install duckdb if not already present
8
- export PATH=.cache:$PATH
9
- command -v duckdb || $(
10
- curl --location --output duckdb.zip \
11
- https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip && \
12
- unzip -qq duckdb.zip && chmod +x duckdb && mkdir -p .cache && mv duckdb .cache/
13
- )
 
 
14
 
15
- echo """
16
- CREATE TABLE presse AS (
17
- SELECT title
18
- , author
19
- , LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
20
- FROM read_parquet(
21
- [('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
22
- ORDER BY title, author, year
23
- );
24
-
25
- COPY presse TO '$TMPDIR/presse.parquet' (COMPRESSION 'ZSTD', row_group_size 10000000);
26
- """ | duckdb
27
- fi
28
 
29
  # isatty
30
  if [ -t 1 ]; then
@@ -32,5 +29,5 @@ if [ -t 1 ]; then
32
  echo "duckdb -csv :memory: \"SELECT * FROM '$TMPDIR/presse.parquet'\""
33
  else
34
  cat $TMPDIR/presse.parquet
35
- #rm $TMPDIR/presse.parquet
36
  fi
 
# Build presse.parquet with DuckDB and stream it to stdout.
#
# Reads the 320 gallica_presse_N.parquet shards of the
# PleIAs/French-PD-Newspapers dataset over HTTPS, keeps title, author and a
# derived "year" date, and writes one ZSTD-compressed parquet file under
# $TMPDIR. When stdout is a terminal, a ready-to-paste inspection command is
# printed instead of the binary file; otherwise the file is copied to stdout
# and then deleted.
#
# Requires: curl and unzip (only when duckdb is not already on PATH).
# NOTE(review): presumably run as a data loader whose stdout becomes the
# generated file — confirm against the caller. Either way, nothing except the
# parquet bytes may reach stdout in the non-terminal branch.

# Abort on the first failing command or unset variable (POSIX options only,
# since the script carries no shebang and may be run by plain sh).
set -eu

# Scratch directory for the DuckDB binary and the intermediate parquet file.
# Exported, so child processes see it as their TMPDIR as well.
# NOTE(review): "observablehhq" looks like a typo for "observablehq" — confirm
# before changing the path.
export TMPDIR="docs/.observablehhq/.cache"
mkdir -p "$TMPDIR"

# Install duckdb if not already present.
# "command -v" prints the resolved path, so it is silenced to keep stdout clean.
export PATH="$TMPDIR:$PATH"
if ! command -v duckdb >/dev/null; then
  echo "duckdb not found on PATH; downloading v0.10.0 into $TMPDIR" >&2
  # --fail: an HTTP error must not be saved as if it were the zip archive.
  # The release asset is the linux-amd64 build only.
  curl --fail --location --output "$TMPDIR/duckdb.zip" \
    https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip
  unzip -qq -o "$TMPDIR/duckdb.zip" -d "$TMPDIR"
  chmod +x "$TMPDIR/duckdb"
  rm -f -- "$TMPDIR/duckdb.zip"
fi

# Unquoted here-doc delimiter on purpose: $TMPDIR must expand inside the SQL.
# "year" is the first 1xxx four-digit run found in "date", turned into
# January 1st of that year; LPAD zero-fills the string to 10 characters
# before the cast to DATE.
duckdb <<SQL
CREATE TABLE presse AS (
  SELECT title
       , author
       , LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
    FROM read_parquet(
      [('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
   ORDER BY title, author, year
);

COPY presse TO '$TMPDIR/presse.parquet' (COMPRESSION 'ZSTD', row_group_size 10000000);
SQL

# isatty
if [ -t 1 ]; then
  # Interactive run: do not dump binary data on the terminal; show how to
  # inspect the file that was left in place.
  echo "duckdb -csv :memory: \"SELECT * FROM '$TMPDIR/presse.parquet'\""
else
  cat -- "$TMPDIR/presse.parquet"
  # The file is rebuilt on every run, so drop it once it has been emitted.
  rm -- "$TMPDIR/presse.parquet"
fi