# -*- coding: utf-8 -*-
"""Fine-tune ``bert-base-uncased`` on GLUE SST-2 with the Hugging Face Trainer.

Running this module loads the dataset and the checkpoint, tokenizes the
``sentence`` column, and starts training immediately. Evaluation runs on the
``validation`` split at the end of every epoch.
"""
import gradio as gr
import numpy as np
from datasets import load_dataset
from datasets import load_metric
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer
from transformers import TrainingArguments
from transformers import pipeline

checkpoint = "bert-base-uncased"

# --- Data -------------------------------------------------------------------
raw_datasets = load_dataset("glue", "sst2")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``sentence`` field of a (batched) dataset example.

    No padding is requested here; sequences are only truncated. Padding is
    left to the data collator so each batch is padded on its own.
    """
    return tokenizer(example["sentence"], truncation=True)


# The raw text and index columns are dropped once tokenized, leaving only the
# tokenizer outputs and the remaining dataset columns for the Trainer.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence"],
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- Model and training configuration ---------------------------------------
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# Loaded once up front: the metric definition does not change between
# evaluation passes, so there is no reason to reload it on every call to
# compute_metrics.
sst2_metric = load_metric("glue", "sst2")


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE SST-2 metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair as handed over by the Trainer.

    Returns:
        Whatever ``sst2_metric.compute`` returns for the arg-max predictions
        measured against ``labels``.
    """
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit on the last axis.
    predictions = np.argmax(logits, axis=-1)
    return sst2_metric.compute(predictions=predictions, references=labels)


# --- Train ------------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Disabled Gradio front-end sketch, kept for reference.
# NOTE(review): "training" does not look like a Gradio output component
# shortcut, so this likely needs rework before it can be enabled -- confirm.
# gr.Interface(
#     fn=trainer.train,
#     inputs=None,
#     outputs="training",
#     title="test",
# ).launch()