PeteBleackley committed
Commit 8454a19
Parent(s): 5c2caf3

Removed train_base_model task. Not needed with RoBERTa models

Browse files: scripts.py (+1 -28)

scripts.py CHANGED
@@ -6,7 +6,6 @@ import tokenizers
 import transformers
 import huggingface_hub
 import qarac.corpora.BNCorpus
-import qarac.corpora.Batcher
 import qarac.models.qarac_base_model
 import qarac.models.QaracTrainerModel
 import qarac.corpora.CombinedCorpus
@@ -72,30 +71,6 @@ def prepare_wiki_qa(filename,outfilename):
     data[['Cleaned_question','Resolved_answer','Label']].to_csv(outfilename)
 
 
-def train_base_model(task,filename):
-    tokenizer = tokenizers.Tokenizer.from_pretrained('xlm-roberta-base')
-    tokenizer.add_special_tokens(['<start>','<end>','<pad>'])
-    tokenizer.save('/'.join([os.environ['HOME'],
-                             'QARAC',
-                             'models',
-                             'tokenizer.json']))
-    bnc = qarac.corpora.BNCorpus.BNCorpus(tokenizer=tokenizer,
-                                          task=task)
-    (train,test)=bnc.split(0.01)
-    train_data=qarac.corpora.Batcher.Batcher(train)
-    model = qarac.models.qarac_base_model.qarac_base_model(tokenizer.get_vocab_size(),
-                                                           768,
-                                                           12,
-                                                           task=='decode')
-    #optimizer = keras.optimizers.Nadam(learning_rate=keras.optimizers.schedules.ExponentialDecay(1.0e-5, 100, 0.99))
-    #model.compile(optimizer=optimizer,loss='sparse_categorical_crossentropy',metrics='accuracy')
-    #model.fit(train_data,
-    #          epochs=100,
-    #          workers = 16,
-    #          use_multiprocessing=True)
-    test_data=qarac.corpora.Batcher.Batcher(test)
-    print(model.evaluate(test_data))
-    model.save(filename)
 
 
 def prepare_training_datasets():
     wikiqa = pandas.read_csv('corpora/WikiQA.csv')
@@ -478,9 +453,7 @@ if __name__ == '__main__':
     parser.add_argument('-t','--training-task')
     parser.add_argument('-o','--outputfile')
     args = parser.parse_args()
-    if args.task == 'train_base_model':
-        train_base_model(args.training_task,args.filename)
-    elif args.task == 'prepare_wiki_qa':
+    if args.task == 'prepare_wiki_qa':
         prepare_wiki_qa(args.filename,args.outputfile)
     elif args.task == 'prepare_training_datasets':
         prepare_training_datasets()
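
The rationale in the commit message is that the base encoder no longer needs to be trained from scratch on the BNC, because a pretrained RoBERTa checkpoint can be loaded directly. A minimal sketch of that idea, not taken from the repository (the helper name load_pretrained_base is hypothetical; it assumes only the standard transformers Auto classes):

import transformers

def load_pretrained_base(model_name='xlm-roberta-base'):
    # Hypothetical helper, not part of scripts.py: fetch pretrained weights and
    # tokenizer from the Hub instead of running the removed train_base_model() routine.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModel.from_pretrained(model_name)
    return tokenizer, model

With the checkpoint obtained this way, the tokenizer setup, BNCorpus/Batcher pipeline, and Keras fitting in the removed function become unnecessary, which is why the qarac.corpora.Batcher import and the train_base_model dispatch branch are dropped as well.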