smhavens committed on
Commit
bfc9cbd
·
1 Parent(s): 5b3034b

switch dataset

Browse files
Files changed (1) hide show
  1. app.py +12 -14
app.py CHANGED
@@ -40,34 +40,34 @@ def compute_metrics(eval_pred):
40
 
41
 
42
  def training():
43
- dataset_id = "glue-cola"
44
- dataset = load_dataset("glue", "cola")
45
- dataset = dataset["train"]
46
  # tokenized_datasets = dataset.map(tokenize_function, batched=True)
47
 
48
- # print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
49
- # print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['set'])} as value.")
50
- # print(f"- Examples look like this: {dataset['train'][0]}")
51
 
52
  # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
53
  # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
54
 
55
  train_examples = []
56
- train_data = dataset
57
  # For agility we only 1/2 of our available data
58
- n_examples = dataset.num_rows // 2
59
 
60
  for i in range(n_examples):
61
  example = train_data[i]
62
- train_examples.append(InputExample(texts=[example['sentence'], example['label'], example['idx']]))
63
 
64
- train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
65
 
66
 
67
 
68
- embeddings = finetune(train_dataloader)
69
 
70
- return (dataset.num_rows, type(dataset[0]), type(dataset[0]['set']), dataset[0], embeddings)
71
 
72
 
73
  def finetune(train_dataloader):
@@ -82,8 +82,6 @@ def finetune(train_dataloader):
82
 
83
  train_loss = losses.TripletLoss(model=model)
84
 
85
- print(train_dataloader)
86
-
87
  model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
88
 
89
  # accuracy = compute_metrics(eval, metric)
 
40
 
41
 
42
def training():
    """Fine-tune a sentence-transformer on the AG News train split.

    Returns a tuple of (num_rows, type of first example, type of its
    'text' field, the first example, embeddings from finetune) for
    display/debugging in the app.
    """
    # The Hugging Face hub id uses an underscore: "ag_news", not "ag-news".
    dataset_id = "ag_news"
    dataset = load_dataset(dataset_id)
    train_split = dataset["train"]

    print(f"- The {dataset_id} dataset has {train_split.num_rows} examples.")
    # ag_news rows have 'text' and 'label' fields; there is no 'set' key.
    print(f"- Each example is a {type(train_split[0])} with a {type(train_split[0]['text'])} as value.")
    print(f"- Examples look like this: {train_split[0]}")

    train_examples = []
    # Iterate the rows (dicts), not dataset['train']['text'] — that is a
    # list of strings, and indexing a str with example['text'] raises
    # TypeError. ag_news also has no 'id' field.
    train_data = train_split
    # For agility we only use 1/2 of our available data
    n_examples = train_split.num_rows // 2

    for i in range(n_examples):
        example = train_data[i]  # {'text': str, 'label': int}
        train_examples.append(
            InputExample(texts=[example["text"]], label=example["label"])
        )

    # model.fit consumes a DataLoader, not a raw list of InputExamples,
    # so wrap the examples before handing them to finetune.
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    embeddings = finetune(train_dataloader)

    return (
        train_split.num_rows,
        type(train_split[0]),
        type(train_split[0]["text"]),
        train_split[0],
        embeddings,
    )
71
 
72
 
73
  def finetune(train_dataloader):
 
82
 
83
  train_loss = losses.TripletLoss(model=model)
84
 
 
 
85
  model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
86
 
87
  # accuracy = compute_metrics(eval, metric)