Spaces:
Paused
Paused
# Downloads the train/dev/test data files, preprocesses them, derives the label
# list, then trains and runs prediction with run_ner.py.
#
# Run with bash from the directory that contains run_ner.py and
# scripts/preprocess.py; ../requirements.txt and ../ (added to PYTHONPATH) are
# resolved relative to that directory.

# Abort on the first failing command, unset variable, or failing pipeline stage,
# so a failed download or preprocessing step cannot silently leave empty data
# files for training to consume.
set -euo pipefail

# Install Python dependencies (the seqeval metrics import needs them).
pip install -r ../requirements.txt

## The relevant files are currently on a shared Google
## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
## Monitor for changes and eventually migrate to use the `datasets` library

#######################################
# Download one data file from Google Drive and reduce it to two columns.
# Lines starting with '#' are dropped, tab-separated fields 2 and 3 are kept,
# and the tab between them is replaced by a single space.
# Arguments: $1 - Google Drive file id
#            $2 - output file path
# Returns:   non-zero if the download or any filter stage fails
#######################################
fetch_split() {
  local file_id=$1
  local out_file=$2
  # -f makes curl fail on HTTP errors instead of piping an error page into the
  # data file; -L follows the redirect Google Drive issues for downloads.
  curl -fL "https://drive.google.com/uc?export=download&id=${file_id}" \
    | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > "${out_file}"
}

fetch_split '1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' train.txt.tmp
fetch_split '1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' dev.txt.tmp
fetch_split '1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' test.txt.tmp

export MAX_LENGTH=128
export BERT_MODEL=bert-base-multilingual-cased

# Run each raw split through scripts/preprocess.py, which is given the model
# name and the maximum sequence length (presumably to re-split sentences that
# would exceed MAX_LENGTH after tokenization -- confirm in preprocess.py).
for split in train dev test; do
  python3 scripts/preprocess.py "${split}.txt.tmp" "${BERT_MODEL}" "${MAX_LENGTH}" \
    > "${split}.txt"
done

# Build the label inventory: the sorted, de-duplicated, non-empty values of the
# second space-separated column across all three splits.
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$" | sort | uniq > labels.txt

export BATCH_SIZE=32
export NUM_EPOCHS=3
export SEED=1

export OUTPUT_DIR_NAME=germeval-model
export CURRENT_DIR=${PWD}
export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
mkdir -p "${OUTPUT_DIR}"

# Add parent directory to python path to access lightning_base.py.
# ${VAR:+...} appends the previous value only when one exists, avoiding a
# trailing ':' (an empty path entry) and an unset-variable error under set -u.
export PYTHONPATH="../${PYTHONPATH:+:${PYTHONPATH}}"

python3 run_ner.py --data_dir ./ \
  --labels ./labels.txt \
  --model_name_or_path "${BERT_MODEL}" \
  --output_dir "${OUTPUT_DIR}" \
  --max_seq_length "${MAX_LENGTH}" \
  --num_train_epochs "${NUM_EPOCHS}" \
  --train_batch_size "${BATCH_SIZE}" \
  --seed "${SEED}" \
  --gpus 1 \
  --do_train \
  --do_predict