Spaces:
Running
Running
Update presse.parquet.sh
Browse files- docs/data/presse.parquet.sh +20 -23
docs/data/presse.parquet.sh
CHANGED
@@ -1,30 +1,27 @@
|
|
1 |
# Use "eleventy" .cache to store our temp files
|
2 |
-
export TMPDIR="
|
3 |
mkdir -p $TMPDIR
|
4 |
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
SELECT title
|
18 |
-
, author
|
19 |
-
, LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
|
20 |
-
FROM read_parquet(
|
21 |
-
[('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
|
22 |
-
ORDER BY title, author, year
|
23 |
-
);
|
24 |
-
|
25 |
-
COPY presse TO '$TMPDIR/presse.parquet' (COMPRESSION 'ZSTD', row_group_size 10000000);
|
26 |
-
""" | duckdb
|
27 |
-
fi
|
28 |
|
29 |
# isatty
|
30 |
if [ -t 1 ]; then
|
@@ -32,5 +29,5 @@ if [ -t 1 ]; then
|
|
32 |
echo "duckdb -csv :memory: \"SELECT * FROM '$TMPDIR/presse.parquet'\""
|
33 |
else
|
34 |
cat $TMPDIR/presse.parquet
|
35 |
-
|
36 |
fi
|
|
|
1 |
# Build presse.parquet: one row per newspaper issue (title, author, year),
# extracted with DuckDB from the PleIAs/French-PD-Newspapers parquet shards.
#
# stdout is reserved for the loader's payload (the parquet bytes are written
# to stdout further down when it is not a terminal), so nothing in this
# section may print to stdout; diagnostics go to stderr.

# Use "eleventy" .cache to store our temp files
# NOTE(review): "observablehhq" looks like a typo for "observablehq"; the path
# is kept unchanged here -- confirm before renaming.
export TMPDIR="docs/.observablehhq/.cache"
mkdir -p "$TMPDIR" || exit 1

# install duckdb if not already present
# TMPDIR is put first on PATH so a previously downloaded binary is found.
export PATH="$TMPDIR:$PATH"
# `command -v` prints the resolved path on success; discard it so it cannot
# end up in front of the parquet bytes on stdout.
if ! command -v duckdb >/dev/null; then
  {
    # --fail: do not save an HTTP error page as if it were the archive.
    curl --location --fail --output "$TMPDIR/duckdb.zip" \
      https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip &&
      # Extract straight into the cache dir instead of littering the cwd.
      unzip -qq -o "$TMPDIR/duckdb.zip" -d "$TMPDIR" &&
      chmod +x "$TMPDIR/duckdb" &&
      rm -f "$TMPDIR/duckdb.zip"
  } >&2 || exit 1
fi

# Feed the SQL to duckdb on stdin. The heredoc delimiter is unquoted on
# purpose so that $TMPDIR is expanded inside the COPY target.
#
# year: the first 4-digit run starting with "1" found in `date`, suffixed with
# '-01-01' and cast to DATE; when nothing matches, LPAD turns the bare
# '-01-01' into '0000-01-01' before the cast.
# The shard list is built from range(1, 321) -- presumably shards 1..320;
# verify against the dataset listing.
duckdb <<SQL || exit 1
CREATE TABLE presse AS (
  SELECT title
       , author
       , LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
    FROM read_parquet(
      [('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
   ORDER BY title, author, year
);

COPY presse TO '$TMPDIR/presse.parquet' (COMPRESSION 'ZSTD', row_group_size 10000000);
SQL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
# isatty
|
27 |
if [ -t 1 ]; then
|
|
|
29 |
echo "duckdb -csv :memory: \"SELECT * FROM '$TMPDIR/presse.parquet'\""
|
30 |
else
|
31 |
cat $TMPDIR/presse.parquet
|
32 |
+
rm $TMPDIR/presse.parquet
|
33 |
fi
|