Spaces:
Running
Running
test if file exists
Browse files
- docs/data/presse.parquet.sh +27 -21
docs/data/presse.parquet.sh
CHANGED
@@ -1,26 +1,32 @@
|
|
1 |
-
#
|
2 |
-
export
|
3 |
-
command -v duckdb || $(
|
4 |
-
curl --location --output duckdb.zip \
|
5 |
-
https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip && \
|
6 |
-
unzip -qq duckdb.zip && chmod +x duckdb && mkdir -p .cache && mv duckdb .cache/
|
7 |
-
)
|
8 |
-
|
9 |
-
export TMPDIR="dist"
|
10 |
mkdir -p $TMPDIR
|
11 |
|
12 |
-
echo """
|
13 |
-
CREATE TABLE presse AS (
|
14 |
-
SELECT title
|
15 |
-
, author
|
16 |
-
, LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
|
17 |
-
FROM read_parquet(
|
18 |
-
[('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
|
19 |
-
ORDER BY title, author, year
|
20 |
-
);
|
21 |
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
# isatty
|
26 |
if [ -t 1 ]; then
|
@@ -28,5 +34,5 @@ if [ -t 1 ]; then
|
|
28 |
echo "duckdb -csv :memory: \"SELECT * FROM '$TMPDIR/presse.parquet'\""
|
29 |
else
|
30 |
cat $TMPDIR/presse.parquet
|
31 |
-
rm $TMPDIR/presse.parquet
|
32 |
fi
|
|
|
1 |
+
# Use "eleventy" .cache to store our temp files.
# TMPDIR is exported, so every child process started by this script inherits it.
export TMPDIR=".cache"
# Quoted (and guarded with --) so the directory name is always passed to mkdir
# as exactly one operand, whatever value TMPDIR holds.
mkdir -p -- "$TMPDIR"
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
+
# Build the cached Parquet file only when it is not already present in $TMPDIR.
#
# Stdout must stay clean inside this block: further down, the script streams
# $TMPDIR/presse.parquet to stdout when stdout is not a terminal, so anything
# printed here would end up in front of the Parquet bytes.
if [ ! -f "$TMPDIR/presse.parquet" ]; then

  # Install duckdb into .cache if it is not already reachable through PATH.
  export PATH=".cache:$PATH"
  # command -v prints the resolved path on success; discard it.
  if ! command -v duckdb >/dev/null 2>&1; then
    # A plain command group replaces the former $( ... ): a command
    # substitution would try to execute whatever the install commands print.
    # Any output of the group is sent to stderr to keep stdout clean.
    {
      curl --fail --location --output duckdb.zip \
        https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip &&
        unzip -qq duckdb.zip &&
        chmod +x duckdb &&
        mkdir -p .cache &&
        mv duckdb .cache/
    } >&2 || {
      echo "presse.parquet.sh: failed to install duckdb" >&2
      exit 1
    }
  fi

  # Feed the SQL to duckdb on stdin. The here-document is unquoted on purpose
  # so that $TMPDIR is expanded inside the COPY target path.
  duckdb_status=0
  duckdb >&2 <<SQL || duckdb_status=$?

CREATE TABLE presse AS (
  SELECT title
       , author
       , LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
    FROM read_parquet(
      [('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
   ORDER BY title, author, year
);

COPY presse TO '$TMPDIR/presse.parquet' (COMPRESSION 'ZSTD', row_group_size 10000000);

SQL

  # Do not keep a result we cannot trust: the existence test above would
  # otherwise treat a leftover file from a failed run as a valid cache.
  if [ "$duckdb_status" -ne 0 ] || [ ! -f "$TMPDIR/presse.parquet" ]; then
    rm -f -- "$TMPDIR/presse.parquet"
    echo "presse.parquet.sh: duckdb did not produce $TMPDIR/presse.parquet" >&2
    exit 1
  fi

fi
|
30 |
|
31 |
# isatty
|
32 |
if [ -t 1 ]; then
|
|
|
34 |
echo "duckdb -csv :memory: \"SELECT * FROM '$TMPDIR/presse.parquet'\""
|
35 |
else
|
36 |
cat $TMPDIR/presse.parquet
|
37 |
+
#rm $TMPDIR/presse.parquet
|
38 |
fi
|