MoodChartAI committed on
Commit
41d7246
·
verified ·
1 Parent(s): 74c6d23

Upload 2 files

Browse files
Files changed (2) hide show
  1. format_data.py +105 -0
  2. run_model.py +44 -0
format_data.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fine-tune GPT-Neo 1.3B with LoRA adapters on happy/sad mood sentences.

The script reads newline-separated example sentences from the ``mood.db``
shelve file (keys ``'happy'`` and ``'sad'``; the text is taken from index 1
of each stored value), wraps every sentence in a prompt/completion template,
tokenizes the result and trains LoRA adapters on a 4-bit quantized model.
"""

import shelve

import datasets
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

BASE_MODEL = "EleutherAI/gpt-neo-1.3B"

# Same layout as the prompt built at inference time: the sentence is separated
# from the "Completion:" marker by a space so the two do not run together.
EXAMPLE_TEMPLATE = "Prompt: {text} Completion: You're feeling {mood}"


def build_examples(lines, mood):
    """Return ``(training_text, completion)`` pairs for one mood.

    Blank lines (for instance the empty string produced by a trailing newline
    when splitting) are skipped so they do not become empty training samples.
    """
    return [
        (EXAMPLE_TEMPLATE.format(text=line.strip(), mood=mood), f"You're {mood}")
        for line in lines
        if line.strip()
    ]


def tokenize(sample):
    """Tokenize the 'Prompt' column of a batch, truncating to 512 tokens."""
    return tokenizer(sample['Prompt'], padding=True, truncation=True, max_length=512)


tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# GPT-Neo ships without a pad token; reuse EOS so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Use the shelf as a context manager so the database is closed after reading.
with shelve.open('mood.db') as moodb:
    happy_lines = moodb['happy'][1].split('\n')
    sad_lines = moodb['sad'][1].split('\n')

happy = build_examples(happy_lines, "happy")
sad = build_examples(sad_lines, "sad")

data = sad + happy
df = pd.DataFrame(data, columns=['Prompt', 'Completion'])

data = Dataset.from_pandas(df)

# Only the token columns are kept; the full training text already lives in
# the 'Prompt' column, so the raw columns can be dropped.
tokenized_data = data.map(
    tokenize,
    batched=True,
    desc="Tokenizing data",
    remove_columns=data.column_names,
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map={"": 0},
    trust_remote_code=True,
    quantization_config=bnb_config,
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    # GPT-Neo's attention projections are q_proj/k_proj/v_proj/out_proj.
    # The previous "Wqkv" entry matches no module in this architecture, so
    # only out_proj was being adapted.
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

training_arguments = TrainingArguments(
    output_dir="Multi-lingual-finetuned-med-text",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=1000,
    # NOTE(review): a positive max_steps overrides num_train_epochs, so the
    # epoch count below has no effect on run length - confirm which is meant.
    max_steps=55550,
    num_train_epochs=1,
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()

# To merge trained adapters back into the base weights afterwards:
#peft_model = PeftModel.from_pretrained(model, "/root/projects/Multi-lingual-finetuned-med-text/checkpoint-10/", from_transformers=True)
#model = peft_model.merge_and_unload()
run_model.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Interactive mood prediction with the fine-tuned model.

Loads the model and its PEFT adapters on the CPU, then repeatedly reads a
sentence from the terminal and prints the model's continuation of the
"You're feeling" prompt.
"""

import gc
import os

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "MoodChartAI/basicmood"
# NOTE(review): an empty adapter path makes PEFT look for the adapter files
# relative to the current working directory - confirm this is intended.
adapters_name = ""

# Model and inputs must live on the same device. The former "cpu:7" and
# "cpu:8" indices were inconsistent and carry no meaning for the CPU.
device = torch.device("cpu")

# The expected answer is a single mood word, so a few new tokens suffice.
MAX_NEW_TOKENS = 4

gc.collect()
torch.cuda.empty_cache()

# Resets swap before loading the model. Requires `import os`, which was
# missing and raised NameError.
# NOTE(review): `swapon -a` runs without sudo here - confirm that is intended.
os.system("sudo swapoff -a; swapon -a")

print(f"Starting to load the model {model_name} into memory")

m = AutoModelForCausalLM.from_pretrained(
    model_name,
    #load_in_4bit=True,
).to(device)

print(f"Loading the adapters from {adapters_name}")
m = PeftModel.from_pretrained(m, adapters_name)

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", trust_remote_code=True)

while True:
    try:
        mood_input = input("Mood: ")
    except (EOFError, KeyboardInterrupt):
        # Leave the prompt loop cleanly on Ctrl-D / Ctrl-C.
        print()
        break

    # "Completion:" (singular) matches the marker used in the training data.
    inputs = tokenizer(
        f"Prompt: {mood_input} Completion: You're feeling",
        return_tensors="pt",
        return_attention_mask=True,
    ).to(device)

    # max_length would count the prompt tokens as well and leave no room for
    # the answer; max_new_tokens bounds only the generated continuation.
    outputs = m.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        pad_token_id=tokenizer.eos_token_id,
    )

    print(tokenizer.batch_decode(outputs)[0])