cache
uses eleventy's cache location for now
- docs/data/presse.parquet.sh +25 -22
- docs/index.md +1 -1
docs/data/presse.parquet.sh
CHANGED
```diff
@@ -1,27 +1,30 @@
-#
-export
-command -v duckdb || $(
-  mkdir -p .cache
-  curl --location --output .cache/duckdb.zip \
-    https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip && \
-  unzip -qq .cache/duckdb.zip && chmod +x .cache/duckdb
-)
-
-export TMPDIR="dist"
 mkdir -p $TMPDIR
 
-
-
-
-
-
-
-
-
-)
+# Use "eleventy" .cache to store our temp files
+export TMPDIR=".cache"
+if [ ! -f "$TMPDIR/presse.parquet" ]; then
+
+  # install duckdb if not already present
+  export PATH=.cache:$PATH
+  command -v duckdb || $(
+    curl --location --output duckdb.zip \
+      https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip && \
+    unzip -qq duckdb.zip && chmod +x duckdb && mkdir -p .cache && mv duckdb .cache/
+  )
+
+  echo """
+  CREATE TABLE presse AS (
+    SELECT title
+    , author
+    , LPAD((REGEXP_EXTRACT(date, '1[0-9][0-9][0-9]') || '-01-01'), 10, '0')::DATE AS year
+    FROM read_parquet(
+      [('https://huggingface.co/datasets/PleIAs/French-PD-Newspapers/resolve/main/gallica_presse_{:d}.parquet').format(n) for n in range(1, 321)])
+    ORDER BY title, author, year
+  );
 
-COPY presse TO '$TMPDIR/presse.parquet' (COMPRESSION 'ZSTD', row_group_size 10000000);
-""" | duckdb
+  COPY presse TO '$TMPDIR/presse.parquet' (COMPRESSION 'ZSTD', row_group_size 10000000);
+  """ | duckdb
+fi
 
 # isatty
 if [ -t 1 ]; then
@@ -29,5 +32,5 @@ if [ -t 1 ]; then
   echo "duckdb -csv :memory: \"SELECT * FROM '$TMPDIR/presse.parquet'\""
 else
   cat $TMPDIR/presse.parquet
-  rm $TMPDIR/presse.parquet
+  #rm $TMPDIR/presse.parquet
 fi
```
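The most distinctive part of the loader is the argument to `read_parquet(...)`: DuckDB's SQL dialect supports Python-style list comprehensions, a list-returning `range()` function, and fmt-style `format()`, so the 320 remote URLs are generated inside the query itself rather than in shell. A minimal, self-contained sketch of the same idiom (the short file names below are placeholders, not the real Hugging Face URLs):

```sh
# Sketch only: the list-comprehension + format() idiom used by the loader.
# Placeholder names; the real loader formats full huggingface.co URLs.
duckdb :memory: \
  "SELECT [('gallica_presse_{:d}.parquet').format(n) for n in range(1, 4)] AS urls"
# -> [gallica_presse_1.parquet, gallica_presse_2.parquet, gallica_presse_3.parquet]
```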
docs/index.md
CHANGED
```diff
@@ -25,7 +25,7 @@
 
 <p class=signature>by <a href="https://observablehq.com/@fil">Fil</a>
 
-This new
+This new fascinating dataset just dropped on Hugging Face : [French public domain newspapers](https://huggingface.co/datasets/PleIAs/French-PD-Newspapers) 🤗 references about **3 million newspapers and periodicals** with their full text OCR’ed and some meta-data.
 
 The data is stored in 320 large parquet files. The data loader for this [Observable framework](https://observablehq.com/framework) project uses [DuckDB](https://duckdb.org/) to read these files (altogether about 200GB) and combines a minimal subset of their metadata — title and year of publication, most importantly without the text contents —, into a single highly optimized parquet file. This takes only about 1 minute to run in a hugging-face Space.
 
```
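For anyone trying this by hand: because of the `isatty` check at the end of the loader, the script emits the parquet bytes only when stdout is redirected (which is how Observable Framework consumes data loader output) and otherwise just prints a query hint. A hypothetical manual run from the repository root:

```sh
# Assumes the repo root as working directory; both invocations are illustrative.
bash docs/data/presse.parquet.sh                   # tty: prints the duckdb hint
bash docs/data/presse.parquet.sh > presse.parquet  # redirected: writes the file
# Inspect the output with the command the hint suggests:
duckdb -csv :memory: "SELECT * FROM 'presse.parquet'"
```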