antonlabate
commited on
Commit
•
3124aa4
1
Parent(s):
082b881
training
Browse files- generate_text2sql_dataset_amr.sh +38 -0
- preprocess.sh +19 -0
- train_text2sql_schema_item_classifier.sh +21 -0
- train_text2sql_t5_base.sh +31 -0
generate_text2sql_dataset_amr.sh
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
set -e
|
2 |
+
|
3 |
+
# generate text2sql training dataset with noise_rate 0.2
|
4 |
+
python text2sql_data_generator.py \
|
5 |
+
--input_dataset_path "./data/preprocessed_data/preprocessed_train_spider_amr.json" \
|
6 |
+
--output_dataset_path "./data/preprocessed_data/resdsql_train_spider_amr.json" \
|
7 |
+
--topk_table_num 4 \
|
8 |
+
--topk_column_num 5 \
|
9 |
+
--mode "train" \
|
10 |
+
--noise_rate 0.2 \
|
11 |
+
--use_contents \
|
12 |
+
--add_fk_info \
|
13 |
+
--output_skeleton \
|
14 |
+
--target_type "sql"
|
15 |
+
|
16 |
+
# predict probability for each schema item in the eval set
|
17 |
+
python schema_item_classifier.py \
|
18 |
+
--batch_size 32 \
|
19 |
+
--device "0" \
|
20 |
+
--seed 42 \
|
21 |
+
--save_path "./models/text2sql_schema_item_classifier_semantic" \
|
22 |
+
--dev_filepath "./data/preprocessed_data/preprocessed_dev_amr.json" \
|
23 |
+
--output_filepath "./data/preprocessed_data/dev_with_probs_amr.json" \
|
24 |
+
--use_contents \
|
25 |
+
--add_fk_info \
|
26 |
+
--mode "eval"
|
27 |
+
|
28 |
+
# generate text2sql development dataset
|
29 |
+
python text2sql_data_generator.py \
|
30 |
+
--input_dataset_path "./data/preprocessed_data/dev_with_probs_amr.json" \
|
31 |
+
--output_dataset_path "./data/preprocessed_data/resdsql_dev_amr.json" \
|
32 |
+
--topk_table_num 4 \
|
33 |
+
--topk_column_num 5 \
|
34 |
+
--mode "eval" \
|
35 |
+
--use_contents \
|
36 |
+
--add_fk_info \
|
37 |
+
--output_skeleton \
|
38 |
+
--target_type "sql"
|
preprocess.sh
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
set -e
|
2 |
+
|
3 |
+
# preprocess train_spider dataset
|
4 |
+
python preprocessing.py \
|
5 |
+
--mode "train" \
|
6 |
+
--table_path "./data/spider_amr/tables.json" \
|
7 |
+
--input_dataset_path "./data/spider_amr/train_spider.json" \
|
8 |
+
--output_dataset_path "./data/preprocessed_data/preprocessed_train_spider_amr.json" \
|
9 |
+
--db_path "./database" \
|
10 |
+
--target_type "sql"
|
11 |
+
|
12 |
+
# preprocess dev dataset
|
13 |
+
python preprocessing.py \
|
14 |
+
--mode "eval" \
|
15 |
+
--table_path "./data/spider_amr/tables.json" \
|
16 |
+
--input_dataset_path "./data/spider_amr/dev.json" \
|
17 |
+
--output_dataset_path "./data/preprocessed_data/preprocessed_dev_amr.json" \
|
18 |
+
--db_path "./database"\
|
19 |
+
--target_type "sql"
|
train_text2sql_schema_item_classifier.sh
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
set -e
|
2 |
+
|
3 |
+
# train schema item classifier
|
4 |
+
python -u schema_item_classifier.py \
|
5 |
+
--batch_size 8 \
|
6 |
+
--gradient_descent_step 2 \
|
7 |
+
--device "0" \
|
8 |
+
--learning_rate 1e-5 \
|
9 |
+
--gamma 2.0 \
|
10 |
+
--alpha 0.75 \
|
11 |
+
--epochs 32 \
|
12 |
+
--patience 16 \
|
13 |
+
--seed 42 \
|
14 |
+
--save_path "./models/text2sql_schema_item_classifier_semantic" \
|
15 |
+
--tensorboard_save_path "./tensorboard_log/text2sql_schema_item_classifier_semantic" \
|
16 |
+
--train_filepath "./data/preprocessed_data/preprocessed_train_spider_amr.json" \
|
17 |
+
--dev_filepath "./data/preprocessed_data/preprocessed_dev_amr.json" \
|
18 |
+
--model_name_or_path "roberta-large" \
|
19 |
+
--use_contents \
|
20 |
+
--add_fk_info \
|
21 |
+
--mode "train"
|
train_text2sql_t5_base.sh
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
set -e
|
2 |
+
|
3 |
+
# train text2sql-t5-base model
|
4 |
+
python -u text2sql_inputgrande.py \
|
5 |
+
--batch_size 8 \
|
6 |
+
--gradient_descent_step 2 \
|
7 |
+
--device "0" \
|
8 |
+
--learning_rate 1e-4 \
|
9 |
+
--epochs 128 \
|
10 |
+
--seed 42 \
|
11 |
+
--save_path "./models/text2sql-t5-amr" \
|
12 |
+
--tensorboard_save_path "./tensorboard_log/text2sql-t5-amr" \
|
13 |
+
--model_name_or_path "t5-base" \
|
14 |
+
--use_adafactor \
|
15 |
+
--mode train \
|
16 |
+
--train_filepath "./data/preprocessed_data/resdsql_train_spider_amr.json"
|
17 |
+
|
18 |
+
# select the best text2sql-t5-base ckpt
|
19 |
+
python -u evaluate_text2sql_ckpts_inputgrande.py \
|
20 |
+
--batch_size 8 \
|
21 |
+
--device "0" \
|
22 |
+
--seed 42 \
|
23 |
+
--save_path "./models/text2sql-t5-amr" \
|
24 |
+
--eval_results_path "./eval_results/text2sql-t5-amr" \
|
25 |
+
--mode eval \
|
26 |
+
--dev_filepath "./data/preprocessed_data/resdsql_dev_amr.json" \
|
27 |
+
--original_dev_filepath "./data/spider_amr/dev.json" \
|
28 |
+
--db_path "./database" \
|
29 |
+
--num_beams 8 \
|
30 |
+
--num_return_sequences 8 \
|
31 |
+
--target_type "sql"
|