import os
import logging

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

# ✅ Set a writable cache directory inside the container
os.environ["HF_HOME"] = "/app/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/hf_cache"
os.environ["HF_DATASETS_CACHE"] = "/app/hf_cache"

# Ensure the cache directory exists
os.makedirs("/app/hf_cache", exist_ok=True)

# Set verbose logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Loading dataset...")
ds = load_dataset("facebook/natural_reasoning")  # Replace with your dataset
logger.info(f"Dataset loaded successfully! Dataset info:\n{ds}")

# This dataset ships only a "train" split; carve out a small evaluation split
# so the Trainer has something to evaluate on.
if "test" not in ds:
    ds = ds["train"].train_test_split(test_size=0.01, seed=42)

# Load tokenizer
logger.info("Loading tokenizer...")
# NOTE: DeepSeek-R1 is a very large model; consider a smaller checkpoint
# for single-node experiments.
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Causal-LM tokenizers often define no pad token; fall back to EOS so that
# padding="max_length" below does not fail.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
logger.info("Tokenizer loaded successfully!")

# Tokenization function: build "Q: ... A: ..." strings and tokenize them
def preprocess_function(examples):
    input_texts = [
        f"Q: {q} A: {a}"
        for q, a in zip(examples["question"], examples["reference_answer"])
    ]
    tokenized = tokenizer(input_texts, truncation=True, padding="max_length", max_length=512)
    # For causal-LM fine-tuning the labels are the input ids themselves;
    # without them the Trainer cannot compute a loss.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize dataset (dropping the raw text columns)
logger.info("Tokenizing dataset...")
tokenized_datasets = ds.map(
    preprocess_function,
    batched=True,
    remove_columns=ds["train"].column_names,
)
logger.info("Dataset tokenized successfully!")

# Load model
logger.info("Loading model...")
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
logger.info("Model loaded successfully!")

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to eval_strategy in recent transformers releases
    save_strategy="epoch",
    per_device_train_batch_size=4,  # Adjust based on available GPU memory
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=True,  # requires `huggingface-cli login` or an HF_TOKEN in the environment
    report_to="none",
    logging_first_step=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# Start training
logger.info("Starting training...")
trainer.train()
logger.info("Training completed!")

# Push trained model to Hugging Face Hub
logger.info("Pushing trained model to Hugging Face Hub...")
trainer.push_to_hub()
logger.info("Model push completed! Training process finished successfully.")