Start-GPT committed on
Commit
3a29c11
1 Parent(s): df92393

Create create_corpora.sh

Browse files
Files changed (1) hide show
  1. server/create_corpora.sh +30 -0
server/create_corpora.sh ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#
# Runs data_processing/create_corpus.py once for every (model, corpus) pair
# listed below, writing the results into the corpora/ output directory.
#
# WARNING: Do not call this as an absolute path. Every path below is relative
# to the current working directory, so invoke the script from the directory
# that holds raw_data/ and data_processing/.
#
# Exit status: 0 when every run succeeded, 1 when the output directory could
# not be created or at least one create_corpus.py run failed.

# Fail loudly on typos in variable names instead of expanding to "".
set -u

SCRIPT_DIR="./"
WOZ_NAME="woz"
WIKI_NAME="wiki"
CORPORA=("$WOZ_NAME" "$WIKI_NAME")
# MODELS=(bert-base-cased gpt2 distilgpt2 roberta-base distilroberta-base distilbert-base-uncased)
MODELS=(gpt2 distilgpt2)
OUT_DIR="./$SCRIPT_DIR/corpora"
RAW_TEXT_DIR="./$SCRIPT_DIR/raw_data"
PYTHON_SCRIPT="./$SCRIPT_DIR/data_processing/create_corpus.py"

# Download the raw corpus text files, hardcoded for now.
# Currently disabled: the loop below expects "$RAW_TEXT_DIR/<corpus>.txt" to
# already exist.
# mkdir -p "$RAW_TEXT_DIR"
# WOZURL="https://ibm.box.com/shared/static/uchx6xdvb1ghhrv3ztxk9dvyvfxy31ce.txt"
# WIKIURL="https://ibm.box.com/shared/static/3rfbn3v3h6wpjalwob1pl0geppzx9746.txt"
#
# wget -O "$RAW_TEXT_DIR/$WOZ_NAME.txt" -L "$WOZURL"
# wget -O "$RAW_TEXT_DIR/$WIKI_NAME.txt" -L "$WIKIURL"

# Create the corpus
if ! mkdir -p "$OUT_DIR"; then
  printf 'create_corpora: cannot create output directory %s\n' "$OUT_DIR" >&2
  exit 1
fi

# Keep going after a failed run so one bad pair does not block the others,
# but remember the failure so the exit status reflects it.
failures=0
for model in "${MODELS[@]}"; do
  for corpus in "${CORPORA[@]}"; do
    corpus_file="$RAW_TEXT_DIR/$corpus.txt"
    if ! python "$PYTHON_SCRIPT" -f "$corpus_file" -o "$OUT_DIR" -m "$model" -n "$corpus" --force; then
      printf 'create_corpora: failed for model=%s corpus=%s\n' "$model" "$corpus" >&2
      failures=$((failures + 1))
    fi
  done
done

if ((failures > 0)); then
  printf 'create_corpora: %d run(s) failed\n' "$failures" >&2
  exit 1
fi