TomSmail committed on
Commit
656f752
1 Parent(s): 8e22a7c

Rename psy.ipynb to psy.py

Browse files
Files changed (2) hide show
  1. psy.ipynb +0 -4
  2. psy.py +21 -0
psy.ipynb DELETED
@@ -1,4 +0,0 @@
1
-
2
-
3
-
4
-
 
 
 
 
 
psy.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Data preparation script: loads the MBTI classification dataset, tokenises
# its "text" column with the Llama 3 8B tokenizer, and exposes shuffled
# train/test splits (optionally reduced to a small subset for quick runs).

from datasets import load_dataset
from transformers import AutoTokenizer

# Seed passed to every shuffle so the selected subsets are reproducible.
DATA_SEED = 9843203
# When True, each split is cut down to at most QUICK_TEST_SIZE rows.
QUICK_TEST = True
# Upper bound on rows kept per split in quick-test mode.
QUICK_TEST_SIZE = 1000

# This is our baseline dataset.
dataset = load_dataset("ClaudiaRichard/mbti_classification_v2")

# Llama 3 8B tokenizer.
tokeniser = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
if tokeniser.pad_token is None:
    # Padding needs a pad token; fall back to EOS when none is configured.
    # NOTE(review): the Llama 3 tokenizer appears to ship without a pad
    # token -- confirm this fallback matches the intended training setup.
    tokeniser.pad_token = tokeniser.eos_token


def tokenise_function(examples):
    """Tokenise the "text" field of a batch of examples.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; it is
            indexed with the key ``"text"``.

    Returns:
        Whatever ``tokeniser`` returns for that text when called with
        ``padding="max_length"`` and ``truncation=True``.
    """
    # NOTE(review): no explicit max_length is given, so the padded/truncated
    # length depends on the tokenizer's configured maximum -- confirm this is
    # the intended sequence length.
    return tokeniser(examples["text"], padding="max_length", truncation=True)


tokenised_dataset = dataset.map(tokenise_function, batched=True)


def _shuffled_split(split):
    """Shuffle *split* with DATA_SEED; in quick-test mode keep a small subset.

    Different sized datasets allow for different training times. The subset
    size is clamped to the split length so that splits with fewer than
    QUICK_TEST_SIZE rows do not cause an out-of-range selection.
    """
    shuffled = split.shuffle(seed=DATA_SEED)
    if not QUICK_TEST:
        return shuffled
    return shuffled.select(range(min(QUICK_TEST_SIZE, len(shuffled))))


# The original referenced an undefined name (`tokenized_datasets`); the mapped
# dataset is bound to `tokenised_dataset` above.
train_dataset = _shuffled_split(tokenised_dataset["train"])
test_dataset = _shuffled_split(tokenised_dataset["test"])