vincenttruum
committed on
Commit
•
6a06532
1
Parent(s):
043aa62
test
Browse files- newtest.py +40 -18
newtest.py
CHANGED
@@ -1,40 +1,62 @@
|
|
1 |
-
from datasets import load_dataset
|
2 |
from sentence_transformers.losses import CosineSimilarityLoss
|
3 |
-
|
4 |
-
from setfit import SetFitModel, SetFitTrainer, sample_dataset
|
5 |
-
|
6 |
|
7 |
# Load a dataset from the Hugging Face Hub
|
8 |
-
dataset = load_dataset("
|
9 |
-
|
10 |
-
#
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# Load a SetFit model from Hub
|
15 |
-
|
|
|
16 |
|
17 |
# Create trainer
|
18 |
trainer = SetFitTrainer(
|
19 |
model=model,
|
20 |
train_dataset=train_dataset,
|
21 |
-
eval_dataset=
|
22 |
loss_class=CosineSimilarityLoss,
|
23 |
metric="accuracy",
|
24 |
-
batch_size=
|
25 |
-
num_iterations=20, # The number of text pairs to generate for contrastive learning
|
26 |
-
num_epochs=1,
|
27 |
-
column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
|
28 |
)
|
29 |
|
30 |
# Train and evaluate
|
31 |
trainer.train()
|
32 |
metrics = trainer.evaluate()
|
33 |
|
|
|
|
|
|
|
|
|
34 |
# Push model to the Hub
|
35 |
-
trainer.push_to_hub("
|
36 |
|
37 |
# Download from Hub and run inference
|
38 |
-
model = SetFitModel.from_pretrained("
|
39 |
# Run inference
|
40 |
-
preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
|
|
|
|
1 |
+
from datasets import load_dataset, concatenate_datasets
|
2 |
from sentence_transformers.losses import CosineSimilarityLoss
|
3 |
+
from setfit import SetFitModel, SetFitTrainer
|
|
|
|
|
4 |
|
5 |
# Load a dataset from the Hugging Face Hub
|
6 |
+
dataset = load_dataset("ag_news")
|
7 |
+
|
8 |
+
# create train dataset
|
9 |
+
seed = 20
|
10 |
+
labels = 4
|
11 |
+
samples_per_label = 8
|
12 |
+
sampled_datasets = []
|
13 |
+
# find the number of samples per label
|
14 |
+
for i in range(labels):
|
15 |
+
sampled_datasets.append(
|
16 |
+
dataset["train"].filter(lambda x: x["label"] == i).shuffle(seed=seed).select(range(samples_per_label)))
|
17 |
+
|
18 |
+
# concatenate the sampled datasets
|
19 |
+
train_dataset = concatenate_datasets(sampled_datasets)
|
20 |
+
|
21 |
+
# create test dataset
|
22 |
+
labels = 4
|
23 |
+
samples_per_label = 8
|
24 |
+
sampled_datasets = []
|
25 |
+
# find the number of samples per label
|
26 |
+
for i in range(labels):
|
27 |
+
sampled_datasets.append(
|
28 |
+
dataset["test"].filter(lambda x: x["label"] == i).shuffle(seed=seed).select(range(samples_per_label)))
|
29 |
+
test_dataset = concatenate_datasets(sampled_datasets)
|
30 |
|
31 |
# Load a SetFit model from Hub
|
32 |
+
model_id = "sentence-transformers/all-mpnet-base-v2"
|
33 |
+
model = SetFitModel.from_pretrained(model_id)
|
34 |
|
35 |
# Create trainer
|
36 |
trainer = SetFitTrainer(
|
37 |
model=model,
|
38 |
train_dataset=train_dataset,
|
39 |
+
eval_dataset=test_dataset,
|
40 |
loss_class=CosineSimilarityLoss,
|
41 |
metric="accuracy",
|
42 |
+
batch_size=64,
|
43 |
+
num_iterations=1, # 20, # The number of text pairs to generate for contrastive learning
|
44 |
+
num_epochs=1, # The number of epochs to use for constrastive learning
|
|
|
45 |
)
|
46 |
|
47 |
# Train and evaluate
|
48 |
trainer.train()
|
49 |
metrics = trainer.evaluate()
|
50 |
|
51 |
+
print(f"model used: {model_id}")
|
52 |
+
print(f"train dataset: {len(train_dataset)} samples")
|
53 |
+
print(f"accuracy: {metrics['accuracy']}")
|
54 |
+
|
55 |
# Push model to the Hub
|
56 |
+
trainer.push_to_hub("MyFirstModel")
|
57 |
|
58 |
# Download from Hub and run inference
|
59 |
+
model = SetFitModel.from_pretrained("VinceItsMe/MyFirstModel")
|
60 |
# Run inference
|
61 |
+
preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
|
62 |
+
q = 1
|