Spaces:
Runtime error
Runtime error
Create create_corpora.sh
Browse files- server/create_corpora.sh +30 -0
server/create_corpora.sh
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
#
# Runs data_processing/create_corpus.py once for every (model, corpus) pair
# listed below.
#
# Usage: create_corpora.sh
#
# Inputs:  raw_data/<corpus>.txt for each corpus name in CORPORA.
# Outputs: whatever create_corpus.py writes into corpora/ (passed via -o).
# Env:     PYTHON_BIN - interpreter to use (default: python).
# Exit:    0 if every invocation succeeded, 1 if any of them failed.

set -uo pipefail

# Every path below is relative to the directory that holds this script, so
# switch there first. This makes the script safe to call from any working
# directory, including through an absolute path.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")" || {
  printf 'error: cannot cd to the script directory\n' >&2
  exit 1
}

readonly SCRIPT_DIR="."
readonly WOZ_NAME="woz"
readonly WIKI_NAME="wiki"
readonly -a CORPORA=("$WOZ_NAME" "$WIKI_NAME")
# Full model list, currently trimmed down:
# MODELS=(bert-base-cased gpt2 distilgpt2 roberta-base distilroberta-base distilbert-base-uncased)
readonly -a MODELS=(gpt2 distilgpt2)
readonly OUT_DIR="$SCRIPT_DIR/corpora"
readonly RAW_TEXT_DIR="$SCRIPT_DIR/raw_data"
readonly PYTHON_SCRIPT="$SCRIPT_DIR/data_processing/create_corpus.py"
readonly PYTHON_BIN="${PYTHON_BIN:-python}"

# Download the raw text files (disabled; URLs are hardcoded for now)
# mkdir -p "$RAW_TEXT_DIR"
# WOZURL="https://ibm.box.com/shared/static/uchx6xdvb1ghhrv3ztxk9dvyvfxy31ce.txt"
# WIKIURL="https://ibm.box.com/shared/static/3rfbn3v3h6wpjalwob1pl0geppzx9746.txt"
#
# wget -O "$RAW_TEXT_DIR/$WOZ_NAME.txt" -L "$WOZURL"
# wget -O "$RAW_TEXT_DIR/$WIKI_NAME.txt" -L "$WIKIURL"

# Create the corpus
mkdir -p -- "$OUT_DIR" || {
  printf 'error: cannot create output directory: %s\n' "$OUT_DIR" >&2
  exit 1
}

# Keep going after a failed pair so the remaining corpora still get built,
# but remember the failure so the exit status reflects it.
failures=0
for model in "${MODELS[@]}"; do
  for corpus in "${CORPORA[@]}"; do
    corpus_file="$RAW_TEXT_DIR/$corpus.txt"
    if ! "$PYTHON_BIN" "$PYTHON_SCRIPT" \
      -f "$corpus_file" -o "$OUT_DIR" -m "$model" -n "$corpus" --force; then
      printf 'error: create_corpus.py failed for model=%s corpus=%s\n' \
        "$model" "$corpus" >&2
      failures=$((failures + 1))
    fi
  done
done

if ((failures > 0)); then
  printf 'error: %d corpus build(s) failed\n' "$failures" >&2
  exit 1
fi
|