Spaces:
Sleeping
Sleeping
smhavens
committed on
Commit
·
bfc9cbd
1
Parent(s):
5b3034b
switch dataset
Browse files
app.py
CHANGED
@@ -40,34 +40,34 @@ def compute_metrics(eval_pred):
|
|
40 |
|
41 |
|
42 |
def training():
|
43 |
-
dataset_id = "
|
44 |
-
dataset = load_dataset(
|
45 |
-
dataset = dataset["train"]
|
46 |
# tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
|
52 |
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
|
53 |
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
|
54 |
|
55 |
train_examples = []
|
56 |
-
train_data = dataset
|
57 |
# For agility we only 1/2 of our available data
|
58 |
-
n_examples = dataset.num_rows // 2
|
59 |
|
60 |
for i in range(n_examples):
|
61 |
example = train_data[i]
|
62 |
-
train_examples.append(InputExample(texts=[example['
|
63 |
|
64 |
-
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
|
65 |
|
66 |
|
67 |
|
68 |
-
embeddings = finetune(
|
69 |
|
70 |
-
return (dataset.num_rows, type(dataset[0]), type(dataset[0]['
|
71 |
|
72 |
|
73 |
def finetune(train_dataloader):
|
@@ -82,8 +82,6 @@ def finetune(train_dataloader):
|
|
82 |
|
83 |
train_loss = losses.TripletLoss(model=model)
|
84 |
|
85 |
-
print(train_dataloader)
|
86 |
-
|
87 |
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
|
88 |
|
89 |
# accuracy = compute_metrics(eval, metric)
|
|
|
40 |
|
41 |
|
42 |
def training():
    """Load the ag_news dataset, build sentence-transformers InputExamples
    from half of the training split, and hand them to finetune().

    Returns a tuple of debug info (row count, example types, first example)
    plus whatever finetune() returns.

    NOTE(review): this is work-in-progress debug code — the DataLoader step
    is commented out and finetune() currently receives a plain list.
    """
    dataset_id = "ag-news"  # NOTE(review): canonical HF hub id is "ag_news" — confirm it resolves
    dataset = load_dataset(dataset_id)
    # dataset = dataset["train"]
    # tokenized_datasets = dataset.map(tokenize_function, batched=True)

    print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
    # BUG FIX: the debug print indexed key 'set', which this block's own
    # return statement shows is not in the row schema — use 'text' instead.
    print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['text'])} as value.")
    print(f"- Examples look like this: {dataset['train'][0]}")

    # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

    train_examples = []
    # BUG FIX: dataset['train']['text'] is a plain list of strings, so the
    # loop's example['...'] lookups raised "string indices must be integers".
    # Iterate the split itself so each example is a dict row.
    train_data = dataset['train']
    # For agility we only use 1/2 of our available data
    n_examples = dataset['train'].num_rows // 2

    for i in range(n_examples):
        example = train_data[i]
        # BUG FIX: ag_news rows carry 'text' and 'label', not 'id'.
        # NOTE(review): TripletLoss in finetune() expects triplet texts —
        # confirm the intended anchor/positive/negative pairing.
        train_examples.append(InputExample(texts=[example['text']], label=example['label']))

    # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    # NOTE(review): finetune's parameter is named train_dataloader but
    # receives a bare list here — model.fit likely needs a real DataLoader.
    embeddings = finetune(train_examples)

    return (dataset['train'].num_rows, type(dataset['train'][0]), type(dataset['train'][0]['text']), dataset['train'][0], embeddings)
73 |
def finetune(train_dataloader):
|
|
|
82 |
|
83 |
train_loss = losses.TripletLoss(model=model)
|
84 |
|
|
|
|
|
85 |
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
|
86 |
|
87 |
# accuracy = compute_metrics(eval, metric)
|