function usage {
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
    echo "n | tokens"
    echo "--- | ---"
    echo "1 | 6230"
    echo "2 | 23619"
    echo "5 | 25859"
    echo "10 | 36888"
    echo "15 | 50188"
    echo "20 | 59094"
    echo "25 | 88764"
    echo "30 | 103121"
    echo "32 | 108338"
    echo "35 | 113403"
    echo "40 | 127699"
    echo "45 | 135896"
    exit 1
}
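# has_cmd aborts with an error if a required tool is missing: "command -v"
# resolves a name to its path (and prints nothing if the name is not found),
# and the -x test checks that the result is executable.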
function has_cmd {
    if ! [ -x "$(command -v "$1")" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}
# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt
if [ $# -ne 1 ]; then
    usage
fi

n=$1
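# Optional hardening (an assumption, not part of the original script): reject
# empty or non-numeric arguments with a POSIX pattern match before using n:
#   case "$n" in ''|*[!0-9]*) usage ;; esac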
# get urls
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n "$n")"
printf "urls:\n%s\n" "$urls"
if [ -f pg.txt ]; then
    rm pg.txt
fi
c=1
for url in $urls; do
    echo "processing $url"

    cc=$(printf "%03d" "$c")

    # fetch the page, convert the HTML to plain text, drop the first three
    # lines, strip leading whitespace, and re-wrap the text at 80 columns
    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt

    cat pg-$cc-one.txt >> pg.txt
    cp -v pg.txt pg-$cc-all.txt

    c=$((c+1))

    # don't flood the server
    sleep 1
done
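# Per-essay artifacts from the loop above: pg-NNN-one.txt holds a single
# essay, while pg-NNN-all.txt snapshots the cumulative pg.txt as it stood
# after essay number NNN was appended.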
| echo "done. data in pg.txt" | |
| exit 0 | |
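# Example invocation (assuming the script is saved as get-pg.sh and made
# executable): "./get-pg.sh 10" downloads the first 10 essays listed in the
# feed and, per the table in usage, leaves roughly 37k tokens in pg.txt.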