from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, \
    TrainingArguments
import logging
from transformers import logging as hf_logging
import os
from torch.quantization import quantize_dynamic
import torch

# Setup logging
logging.basicConfig(level=logging.INFO)  # Adjust as per the desired verbosity
hf_logging.set_verbosity_info()
hf_logging.enable_default_handler()
hf_logging.enable_explicit_format()


# Define the custom data collator: cast floating-point tensors to bfloat16 while leaving
# integer tensors (input_ids, labels) untouched so embedding lookups and the loss still
# receive long tensors.
class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    def torch_call(self, examples):
        batch = super().torch_call(examples)
        batch = {
            k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) and v.is_floating_point() else v
            for k, v in batch.items()
        }
        return batch


# Use GPT-2 Medium
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(torch.bfloat16)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
# GPT-2 has no pad token by default; reuse EOS so batching utilities can pad if needed
tokenizer.pad_token = tokenizer.eos_token

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="formatted_data_small.txt",
    block_size=256)  # Increased block size for the larger model

# Instantiate the custom data collator
data_collator = CustomDataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./gpt2-medium-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=4,  # Adjusted for potential memory constraints
    gradient_accumulation_steps=4,  # Increased accumulation to handle the larger model size
    learning_rate=2e-4,
    save_steps=1_000,
    save_total_limit=3,
    logging_dir='./logs',
    logging_steps=50,
    fp16=False  # Ensure this is false since we're using bfloat16 manually
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

# Save model and tokenizer
model_path = "./saved_gpt2_medium_nice_model_directory"
if not os.path.exists(model_path):
    os.makedirs(model_path)
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Dynamic quantization runs on CPU and expects float32 weights
model = model.cpu().float()
model.eval()  # Ensure the model is in evaluation mode
quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
quantized_model_path = "./quantized_model_directory"
if not os.path.exists(quantized_model_path):
    os.makedirs(quantized_model_path)
torch.save(quantized_model.state_dict(), os.path.join(quantized_model_path, 'quantized_nice_medium_model.pth'))
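
# Illustrative sketch (not part of the original run): reloading the dynamically quantized
# checkpoint for CPU inference. A quantized state_dict can only be loaded into a model that
# has already been converted with the same quantize_dynamic configuration; the paths reuse
# model_path / quantized_model_path from above, and the prompt is just a placeholder.
reloaded_model = GPT2LMHeadModel.from_pretrained(model_path, torch_dtype=torch.float32)
reloaded_model = quantize_dynamic(reloaded_model, {torch.nn.Linear}, dtype=torch.qint8)
reloaded_model.load_state_dict(
    torch.load(os.path.join(quantized_model_path, 'quantized_nice_medium_model.pth'))
)
reloaded_model.eval()

# Quick generation sanity check with the quantized model
sample_ids = tokenizer("The results of the experiment show", return_tensors="pt").input_ids
with torch.no_grad():
    generated = reloaded_model.generate(sample_ids, max_new_tokens=30, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(generated[0], skip_special_tokens=True))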
# from transformers import BertForMaskedLM, BertTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# import logging
# from transformers import logging as hf_logging
# import os
# from torch.quantization import quantize_dynamic
# import torch
#
# # Setup logging
# logging.basicConfig(level=logging.INFO)
# hf_logging.set_verbosity_info()
# hf_logging.enable_default_handler()
# hf_logging.enable_explicit_format()
#
#
# # Define the custom data collator for masked language modeling: cast floating-point
# # tensors to bfloat16 while keeping input_ids and labels as long tensors.
# class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
#     def torch_call(self, examples):
#         batch = super().torch_call(examples)
#         batch = {
#             k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) and v.is_floating_point() else v
#             for k, v in batch.items()
#         }
#         return batch
#
#
# # Load BioBERT
# model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1').to(torch.bfloat16)
# tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
#
# # Prepare the dataset
# train_dataset = TextDataset(
#     tokenizer=tokenizer,
#     file_path="papers_data_mountain.txt",
#     block_size=512)  # Adjust block_size if necessary
#
# data_collator = CustomDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
#
# training_args = TrainingArguments(
#     output_dir="./biobert-finetuned",
#     overwrite_output_dir=True,
#     num_train_epochs=4,
#     per_device_train_batch_size=8,
#     gradient_accumulation_steps=2,
#     learning_rate=2e-4,
#     save_steps=1_000,
#     save_total_limit=3,
#     logging_dir='./logs',
#     logging_steps=50,
#     fp16=False  # Ensure this is false since we're using bfloat16 manually
# )
#
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=train_dataset,
# )
#
# trainer.train()
#
# # Save model and tokenizer
# model_path = "./saved_mountain_model_directory"
# if not os.path.exists(model_path):
#     os.makedirs(model_path)
# model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)
#
# # Quantize the model: dynamic quantization runs on CPU and expects float32 weights
# model = model.cpu().float()
# model.eval()  # Ensure the model is in evaluation mode
# quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
# quantized_model_path = "./quantized_model_directory"
# if not os.path.exists(quantized_model_path):
#     os.makedirs(quantized_model_path)
# torch.save(quantized_model.state_dict(), os.path.join(quantized_model_path, 'quantized_mountain_model.pth'))
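#
# # Illustrative sketch (not part of the original script): a quick fill-mask sanity check on
# # the fine-tuned BioBERT checkpoint saved to model_path above; the sentence is a placeholder.
# from transformers import pipeline
# fill_mask = pipeline("fill-mask", model=model_path, tokenizer=model_path)
# print(fill_mask("The drug binds to the [MASK] receptor."))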
# from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# import logging
# from transformers import logging as hf_logging
# import os
# import torch
# from torch.utils.data import Dataset
#
#
# class TextDataset(Dataset):
#     def __init__(self, tokenizer, file_path, block_size=512):
#         self.tokenizer = tokenizer
#         self.block_size = block_size
#         self.input_ids = []
#
#         # Read and tokenize the file content in chunks, keeping a flat list of token ids
#         with open(file_path, 'r', encoding='utf-8') as f:
#             while True:
#                 text = f.read(1024 * 1024)  # Read approximately 1MB of text at a time
#                 if not text:
#                     break
#                 tokens = tokenizer(text, add_special_tokens=True)
#                 self.input_ids.extend(tokens["input_ids"])
#
#     def __len__(self):
#         # Ensure we return a non-negative value
#         return max(0, len(self.input_ids) - self.block_size + 1)
#
#     def __getitem__(self, idx):
#         # Slice a block_size window of token ids starting at idx
#         input_ids = self.input_ids[idx:idx + self.block_size]
#         return {"input_ids": torch.tensor(input_ids, dtype=torch.long)}
#
#
# # Setup logging
# logging.basicConfig(level=logging.INFO)
# hf_logging.set_verbosity_info()
# hf_logging.enable_default_handler()
# hf_logging.enable_explicit_format()
#
# # Load Phi-3 with its pretrained weights and causal LM head
# model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct", torch_dtype=torch.bfloat16)
#
# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token
#
# # Initialize the dataset
# train_dataset = TextDataset(tokenizer, "papers_data_mountain.txt", block_size=512)
#
#
# # Custom data collator function (simplified for generality)
# def custom_collate_fn(examples):
#     batch = tokenizer.pad(
#         examples,
#         return_tensors='pt',
#         padding=True,
#         max_length=512
#     )
#     # Causal-LM training needs labels; mask out padded positions with -100
#     labels = batch["input_ids"].clone()
#     if "attention_mask" in batch:
#         labels[batch["attention_mask"] == 0] = -100
#     batch["labels"] = labels
#     # Cast only floating-point tensors; input_ids and labels must stay integer
#     batch = {
#         k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) and v.is_floating_point() else v
#         for k, v in batch.items()
#     }
#     return batch
#
#
# training_args = TrainingArguments(
#     output_dir="./phi3-finetuned",
#     overwrite_output_dir=True,
#     num_train_epochs=4,
#     per_device_train_batch_size=8,
#     gradient_accumulation_steps=2,
#     learning_rate=2e-4,
#     save_steps=1_000,
#     save_total_limit=3,
#     logging_dir='./logs',
#     logging_steps=50,
#     fp16=False  # bfloat16 usage is manual
# )
#
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     data_collator=custom_collate_fn,
#     train_dataset=train_dataset,
# )
#
# trainer.train()
#
# # Save model and tokenizer
# model_path = "./saved_phi3_model_directory"
# if not os.path.exists(model_path):
#     os.makedirs(model_path)
# model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)
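#
# # Illustrative sketch (not part of the original script): reloading the fine-tuned Phi-3
# # checkpoint from model_path above and running a short generation as a sanity check;
# # the prompt is a placeholder.
# reloaded = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
# reloaded.eval()
# inputs = tokenizer("The key findings of this paper are", return_tensors="pt")
# with torch.no_grad():
#     output_ids = reloaded.generate(**inputs, max_new_tokens=40)
# print(tokenizer.decode(output_ids[0], skip_special_tokens=True))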