Vibi007 committed on
Commit
035761e
0 Parent(s):

first commit

Files changed (8)
  1. .gitignore +93 -0
  2. .gradio/certificate.pem +31 -0
  3. README.md +49 -0
  4. config_smollm2_135.yaml +128 -0
  5. inference.py +157 -0
  6. model.py +203 -0
  7. requirements.txt +0 -0
  8. train_smollm2.py +12 -0
.gitignore ADDED
@@ -0,0 +1,93 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Training artifacts
+ checkpoints/
+ runs/
+ logs/
+ *.ckpt
+ *.pt
+ *.pth
+ wandb/
+ lightning_logs/
+ final_model/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+ *~
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+ *.ipynb_checkpoints/
+ *.ipynb
+
+ # OS
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Logs
+ *.log
+ *.logs
+ log.txt
+ logs.txt
+
+ # Data
+ data/
+ datasets/
+ *.csv
+ *.h5
+ *.pkl
+ *.npz
+
+ # Environment (.env itself is already ignored above)
+ .env.local
+ .env.*.local
+ .env.development.local
+ .env.test.local
+ .env.production.local
+
+ # Misc
+ *.bak
+ *.tmp
+ *.temp
+ .coverage
+ htmlcov/
+ .pytest_cache/
+ .mypy_cache/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
README.md ADDED
@@ -0,0 +1,49 @@
+ <!-- Use uv to create and activate a virtual environment -->
+ ```
+ uv venv
+ source .venv/bin/activate
+ ```
+ <!-- Train the SmolLM2 model -->
+ Use the dataset from https://huggingface.co/datasets/HuggingFaceTB/smollm-corpus/tree/main/cosmopedia-v2:
+ ```
+ dataset = load_dataset("HuggingFaceTB/smollm-corpus", "cosmopedia-v2")
+ ```
+
+ Use the tokenizer from https://huggingface.co/HuggingFaceTB/cosmo2-tokenizer:
+ ```
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
+ ```
+ Use the config from https://huggingface.co/HuggingFaceTB/SmolLM2-135M/blob/main/config_smollm2_135M.yaml
+
+ Create the model from the above parameters, then train it with PyTorch Lightning (see the sketch below).
+
+ <!-- Model architecture -->
+
+ ```
+ LlamaForCausalLM(
+   (model): LlamaModel(
+     (embed_tokens): Embedding(49152, 576)
+     (layers): ModuleList(
+       (0-29): 30 x LlamaDecoderLayer(
+         (self_attn): LlamaAttention(
+           (q_proj): Linear(in_features=576, out_features=576, bias=False)
+           (k_proj): Linear(in_features=576, out_features=192, bias=False)
+           (v_proj): Linear(in_features=576, out_features=192, bias=False)
+           (o_proj): Linear(in_features=576, out_features=576, bias=False)
+         )
+         (mlp): LlamaMLP(
+           (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
+           (up_proj): Linear(in_features=576, out_features=1536, bias=False)
+           (down_proj): Linear(in_features=1536, out_features=576, bias=False)
+           (act_fn): SiLU()
+         )
+         (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
+         (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
+       )
+     )
+     (norm): LlamaRMSNorm((576,), eps=1e-05)
+     (rotary_emb): LlamaRotaryEmbedding()
+   )
+   (lm_head): Linear(in_features=576, out_features=49152, bias=False)
+ )
+ ```
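+
+ <!-- Building the model (sketch) -->
+ A minimal sketch of constructing the model from the YAML above. It mirrors `create_model_config` in model.py; all fields are standard `LlamaConfig` parameters read from `config_smollm2_135.yaml`:
+ ```
+ import yaml
+ from transformers import AutoModelForCausalLM, LlamaConfig
+
+ with open("config_smollm2_135.yaml") as f:
+     mc = yaml.safe_load(f)["model"]["model_config"]
+
+ model_config = LlamaConfig(
+     vocab_size=mc["vocab_size"],
+     hidden_size=mc["hidden_size"],
+     intermediate_size=mc["intermediate_size"],
+     num_hidden_layers=mc["num_hidden_layers"],
+     num_attention_heads=mc["num_attention_heads"],
+     num_key_value_heads=mc["num_key_value_heads"],
+     max_position_embeddings=mc["max_position_embeddings"],
+     rms_norm_eps=mc["rms_norm_eps"],
+     tie_word_embeddings=mc["tie_word_embeddings"],
+ )
+ model = AutoModelForCausalLM.from_config(model_config)
+ print(sum(p.numel() for p in model.parameters()))  # ~134.5M with tied embeddings
+ ```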
config_smollm2_135.yaml ADDED
@@ -0,0 +1,128 @@
+ checkpoints:
+   checkpoint_interval: 2000
+   checkpoints_path: checkpoints
+   checkpoints_path_is_shared_file_system: false
+   resume_checkpoint_path: null
+   save_final_state: false
+   save_initial_state: false
+ data_stages:
+ - data:
+     dataset:
+       dataset_folder:
+       - datasets/smollm2-corpus
+       dataset_weights:
+       - 1.0
+     num_loading_workers: 0
+     seed: 8
+   name: stable phase
+   start_training_step: 1
+ general:
+   benchmark_csv_path: null
+   consumed_train_samples: null
+   ignore_sanity_checks: true
+   project: smollm2
+   run: smollm2-135M
+   seed: 8
+   step: null
+ logging:
+   iteration_step_info_interval: 1
+   log_level: info
+   log_level_replica: info
+ model:
+   ddp_bucket_cap_mb: 25
+   dtype: bfloat16
+   init_method:
+     std: 0.041666666666666664
+   make_vocab_size_divisible_by: 1
+   model_config:
+     bos_token_id: 0
+     eos_token_id: 0
+     hidden_act: silu
+     hidden_size: 576
+     initializer_range: 0.041666666666666664
+     intermediate_size: 1536
+     is_llama_config: true
+     max_position_embeddings: 2048
+     num_attention_heads: 9
+     num_hidden_layers: 30
+     num_key_value_heads: 3
+     pad_token_id: null
+     pretraining_tp: 1
+     rms_norm_eps: 1.0e-05
+     rope_interleaved: false
+     rope_scaling: null
+     rope_theta: 10000.0
+     tie_word_embeddings: true
+     use_cache: true
+     vocab_size: 49152
+ optimizer:
+   accumulate_grad_in_fp32: true
+   clip_grad: 1.0
+   learning_rate_scheduler:
+     learning_rate: 0.003
+     lr_decay_starting_step: 1600000
+     lr_decay_steps: 400000
+     lr_decay_style: linear
+     lr_warmup_steps: 2000
+     lr_warmup_style: linear
+     min_decay_lr: 0
+   optimizer_factory:
+     adam_beta1: 0.9
+     adam_beta2: 0.95
+     adam_eps: 1.0e-08
+     name: adamW
+     torch_adam_is_fused: true
+   weight_decay: 0.01
+   zero_stage: 0
+ parallelism:
+   dp: 64
+   expert_parallel_size: 1
+   pp: 1
+   pp_engine: 1f1b
+   recompute_layer: false
+   tp: 1
+   tp_linear_async_communication: true
+   tp_mode: REDUCE_SCATTER
+   tp_recompute_allgather: true
+ profiler: null
+ tokenizer:
+   tokenizer_max_length: null
+   tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
+   tokenizer_revision: null
+ tokens:
+   batch_accumulation_per_replica: 1
+   limit_test_batches: 0
+   limit_val_batches: 0
+   micro_batch_size: 8
+   sequence_length: 2048
+   train_steps: 2000000
+   val_check_interval: 1000
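+
+ # Derived from the values above: effective global batch =
+ # dp (64) * micro_batch_size (8) * batch_accumulation_per_replica (1)
+ # = 512 sequences per step, i.e. 512 * 2048 = 1,048,576 tokens per optimizer step.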
+
+ # model:
+
+ # LlamaForCausalLM(
+ #   (model): LlamaModel(
+ #     (embed_tokens): Embedding(49152, 576)
+ #     (layers): ModuleList(
+ #       (0-29): 30 x LlamaDecoderLayer(
+ #         (self_attn): LlamaAttention(
+ #           (q_proj): Linear(in_features=576, out_features=576, bias=False)
+ #           (k_proj): Linear(in_features=576, out_features=192, bias=False)
+ #           (v_proj): Linear(in_features=576, out_features=192, bias=False)
+ #           (o_proj): Linear(in_features=576, out_features=576, bias=False)
+ #         )
+ #         (mlp): LlamaMLP(
+ #           (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
+ #           (up_proj): Linear(in_features=576, out_features=1536, bias=False)
+ #           (down_proj): Linear(in_features=1536, out_features=576, bias=False)
+ #           (act_fn): SiLU()
+ #         )
+ #         (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
+ #         (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
+ #       )
+ #     )
+ #     (norm): LlamaRMSNorm((576,), eps=1e-05)
+ #     (rotary_emb): LlamaRotaryEmbedding()
+ #   )
+ #   (lm_head): Linear(in_features=576, out_features=49152, bias=False)
+ # )
inference.py ADDED
@@ -0,0 +1,157 @@
+ import os
+ import glob
+
+ import gradio as gr
+ import torch
+ import yaml
+ from transformers import AutoTokenizer
+
+ from model import SmolLMModule
+
+ # Load config
+ with open("config_smollm2_135.yaml", "r") as file:
+     config = yaml.safe_load(file)
+
+ # Load tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
+ tokenizer.pad_token = tokenizer.eos_token
+
+
+ def load_model_from_checkpoint(checkpoint_path):
+     """Load model from checkpoint"""
+     model = SmolLMModule.load_from_checkpoint(checkpoint_path, config=config)
+     model.eval()  # Set to evaluation mode
+     return model
+
+
+ def get_step_number(filepath):
+     """Extract the training step number from a checkpoint filename."""
+     try:
+         filename = os.path.basename(filepath).replace(".ckpt", "")
+         if "step=" in filename:
+             return int(filename.split("step=")[1])
+         elif "-step-" in filename:
+             return int(filename.split("-step-")[1])
+         else:
+             return int("".join(filter(str.isdigit, filename)))
+     except (ValueError, IndexError):
+         return 0
+
+
+ def get_available_checkpoints():
+     """Get list of available checkpoints sorted by step number"""
+     checkpoints = sorted(glob.glob("checkpoints/*.ckpt"), key=get_step_number)
+     display_names = [f"Step {get_step_number(x)}" for x in checkpoints]
+     return display_names, checkpoints
+
+
+ def generate_text(
+     prompt, checkpoint_choice, max_length=100, temperature=0.7, top_p=0.9
+ ):
+     """Generate text based on prompt using selected checkpoint"""
+     if not checkpoint_choice:
+         return "Please select a checkpoint first!"
+     if not prompt:
+         return "Please enter a prompt!"
+
+     try:
+         # Resolve the display name ("Step N") back to a checkpoint path by
+         # comparing parsed step numbers; plain substring matching would
+         # confuse e.g. step 500 with step 5000.
+         step_num = int("".join(filter(str.isdigit, checkpoint_choice)))
+         checkpoint_path = None
+         for ckpt in glob.glob("checkpoints/*.ckpt"):
+             if get_step_number(ckpt) == step_num:
+                 checkpoint_path = ckpt
+                 break
+
+         if not checkpoint_path or not os.path.exists(checkpoint_path):
+             return f"Checkpoint for step {step_num} not found!"
+
+         # Load model from checkpoint and move it to GPU if available
+         model = load_model_from_checkpoint(checkpoint_path)
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         model = model.to(device)
+
+         # Tokenize input and move it to the same device as the model
+         inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
+         inputs = {k: v.to(device) for k, v in inputs.items()}
+
+         # Generate; do_sample=True is required for temperature/top_p to apply
+         with torch.no_grad():
+             outputs = model.model.generate(
+                 inputs["input_ids"],
+                 max_length=max_length,
+                 do_sample=True,
+                 temperature=temperature,
+                 top_p=top_p,
+                 pad_token_id=tokenizer.pad_token_id,
+                 eos_token_id=tokenizer.eos_token_id,
+             )
+
+         # Decode and return generated text
+         return tokenizer.decode(outputs[0], skip_special_tokens=True)
+     except Exception as e:
+         return f"Error during generation: {str(e)}"
+
+
+ # Get available checkpoints
+ display_names, _ = get_available_checkpoints()
+
+ # Create Gradio interface
+ with gr.Blocks(title="SmolLM2 Inference") as demo:
+     gr.Markdown("# SmolLM2 Text Generation")
+
+     if not display_names:
+         gr.Markdown("⚠️ No checkpoints found! Please train the model first.")
+     else:
+         gr.Markdown(
+             f"Found {len(display_names)} checkpoints. Select one and enter a prompt to generate text."
+         )
+
+     with gr.Row():
+         with gr.Column():
+             checkpoint_dropdown = gr.Dropdown(
+                 choices=display_names,
+                 label="Select Checkpoint",
+                 value=display_names[-1] if display_names else None,
+                 interactive=True,
+             )
+             prompt = gr.Textbox(
+                 lines=3, placeholder="Enter your prompt here...", label="Input Prompt"
+             )
+             max_length = gr.Slider(
+                 minimum=10, maximum=500, value=100, step=10, label="Max Length"
+             )
+             temperature = gr.Slider(
+                 minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"
+             )
+             top_p = gr.Slider(
+                 minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top-p"
+             )
+             generate_btn = gr.Button("Generate")
+
+         with gr.Column():
+             output = gr.Textbox(lines=8, label="Generated Text")
+
+     generate_btn.click(
+         fn=generate_text,
+         inputs=[prompt, checkpoint_dropdown, max_length, temperature, top_p],
+         outputs=output,
+     )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
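+
+ # Note: share=True also opens a temporary public *.gradio.live URL in
+ # addition to the local server; drop the argument to serve locally only.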
model.py ADDED
@@ -0,0 +1,203 @@
+ # import libraries
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig
+ import pytorch_lightning as pl
+ import yaml
+ from pytorch_lightning.callbacks import LearningRateMonitor
+ from pytorch_lightning.callbacks import RichProgressBar
+ import torch
+ from torch.utils.data import DataLoader
+
+ # The dataset is loaded inside the __main__ block below, so that importing
+ # this module (e.g. from inference.py) does not start streaming the corpus
+ # as a side effect.
+
+ # load tokenizer at module level, since collate_fn below closes over it
+ # use tokenizer from https://huggingface.co/HuggingFaceTB/cosmo2-tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
+ # Set padding token to be the same as EOS token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # load config
+ # use config from https://huggingface.co/HuggingFaceTB/SmolLM2-135M/blob/main/config_smollm2_135M.yaml
+ # config = AutoConfig.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+
+
+ def collate_fn(examples):
+     # Tokenize the texts
+     encoding = tokenizer(
+         [example["text"] for example in examples],
+         padding=True,
+         truncation=True,
+         max_length=512,
+         return_tensors="pt",
+     )
+
+     # Create labels (same as input_ids for causal language modeling)
+     encoding["labels"] = encoding["input_ids"].clone()
+
+     return encoding
+
+
+ def create_model_config(config):
+     model_config = config["model"]["model_config"]
+     return LlamaConfig(
+         vocab_size=model_config["vocab_size"],
+         hidden_size=model_config["hidden_size"],
+         intermediate_size=model_config["intermediate_size"],
+         num_hidden_layers=model_config["num_hidden_layers"],
+         num_attention_heads=model_config["num_attention_heads"],
+         num_key_value_heads=model_config["num_key_value_heads"],
+         hidden_act=model_config["hidden_act"],
+         max_position_embeddings=model_config["max_position_embeddings"],
+         initializer_range=model_config["initializer_range"],
+         rms_norm_eps=model_config["rms_norm_eps"],
+         rope_theta=model_config["rope_theta"],
+         tie_word_embeddings=model_config["tie_word_embeddings"],
+         use_cache=True,
+         pad_token_id=model_config["pad_token_id"],
+         bos_token_id=model_config["bos_token_id"],
+         eos_token_id=model_config["eos_token_id"],
+     )
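+
+
+ # Note: hidden_size=576 with 9 attention heads gives head_dim = 64; having
+ # only 3 key/value heads makes this grouped-query attention (3 query heads
+ # per KV head), which is why k_proj and v_proj project to 3 * 64 = 192 dims.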
+
+
+ # create model
+ class SmolLMModule(pl.LightningModule):
+     def __init__(self, config, learning_rate=1e-4):
+         super().__init__()
+         self.config = config
+         self.learning_rate = learning_rate
+         self.save_hyperparameters()  # Save hyperparameters for resuming
+
+         # Create model from config
+         model_config = create_model_config(config)
+         self.model = AutoModelForCausalLM.from_config(model_config)
+
+     def forward(self, **inputs):
+         return self.model(**inputs)
+
+     def training_step(self, batch, batch_idx):
+         outputs = self.model(**batch)
+         loss = outputs.loss
+         self.log("train_loss", loss, prog_bar=True)
+         return loss
+
+     def configure_optimizers(self):
+         optimizer = torch.optim.AdamW(
+             self.model.parameters(),
+             lr=self.learning_rate,
+             betas=(0.9, 0.95),
+             eps=1e-8,
+             weight_decay=0.1,
+         )
+         return optimizer
+
+     def on_save_checkpoint(self, checkpoint):
+         # Save additional info if needed
+         checkpoint["step"] = self.global_step
+         checkpoint["model_config"] = self.config
+
+     def on_load_checkpoint(self, checkpoint):
+         # Restore additional info if needed. Lightning restores global_step
+         # on its own; it is a read-only property, so assigning it here would
+         # raise an AttributeError.
+         self.config = checkpoint.get("model_config", self.config)
+
+
+ # training script: train the model, checkpointing as we go, then save it
+ if __name__ == "__main__":
+     import os
+     from pytorch_lightning.callbacks import ModelCheckpoint
+
+     # load parameters from config file
+     with open("config_smollm2_135.yaml", "r") as file:
+         config = yaml.safe_load(file)
+     max_steps = 5000  # Total training steps
+
+     # Create checkpoint directory if it doesn't exist
+     checkpoint_dir = "checkpoints"
+     os.makedirs(checkpoint_dir, exist_ok=True)
+
+     # Checkpoint callback
+     checkpoint_callback = ModelCheckpoint(
+         dirpath=checkpoint_dir,
+         filename="model-step={step}",
+         save_top_k=-1,  # Save all checkpoints
+         every_n_train_steps=500,  # Save every 500 steps
+         save_weights_only=False,  # Save the full model state
+     )
+
+     # load dataset (streaming, so nothing is downloaded up front)
+     dataset = load_dataset(
+         "HuggingFaceTB/smollm-corpus", "cosmopedia-v2", streaming=True
+     )
+     train_dataset = dataset["train"]
+
+     # Create DataLoader
+     train_loader = DataLoader(
+         train_dataset,
+         batch_size=4,  # Small batch size for testing
+         collate_fn=collate_fn,
+         num_workers=2,
+     )
+
+     # create model
+     model = SmolLMModule(config, learning_rate=1e-4)
+
+     # progress bar
+     progress_bar = RichProgressBar(leave=False, refresh_rate=1, console_kwargs=None)
+
+     # Find the latest checkpoint if one exists; filenames look like
+     # "model-step=500.ckpt", so parse the number after "step=".
+     latest_checkpoint = None
+     if os.path.exists(checkpoint_dir):
+         checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith(".ckpt")]
+         if checkpoints:
+             latest_checkpoint = os.path.join(
+                 checkpoint_dir,
+                 sorted(
+                     checkpoints,
+                     key=lambda x: int(x.split("step=")[-1].split(".")[0]),
+                 )[-1],
+             )
+             print(f"Resuming from checkpoint: {latest_checkpoint}")
+
+     # create trainer
+     trainer = pl.Trainer(
+         max_steps=max_steps,
+         accelerator="gpu",
+         devices=1,
+         precision="bf16-mixed",
+         callbacks=[
+             LearningRateMonitor(logging_interval="step"),
+             progress_bar,
+             checkpoint_callback,
+         ],
+         log_every_n_steps=1,
+         enable_progress_bar=True,
+         enable_model_summary=True,
+     )
+
+     # train model, resuming from the latest checkpoint if one was found
+     if latest_checkpoint:
+         trainer.fit(model, train_loader, ckpt_path=latest_checkpoint)
+     else:
+         trainer.fit(model, train_loader)
+
+     # Save final model and tokenizer
+     if trainer.is_global_zero:  # Only save on main process
+         output_dir = "final_model"
+         os.makedirs(output_dir, exist_ok=True)
+         model.model.save_pretrained(os.path.join(output_dir, "model"))
+         tokenizer.save_pretrained(os.path.join(output_dir, "tokenizer"))
requirements.txt ADDED
File without changes
train_smollm2.py ADDED
@@ -0,0 +1,12 @@
+ from transformers import AutoModelForCausalLM
+
+ # from datasets import load_dataset
+ # dataset = load_dataset("HuggingFaceTB/smollm-corpus", "cosmopedia-v2")
+
+ # use the tokenizer from https://huggingface.co/HuggingFaceTB/cosmo2-tokenizer
+ # from transformers import AutoTokenizer
+ # tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
+
+ # Load the reference model to inspect its architecture
+ model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+
+ print(model)
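+
+ # Quick sanity check: with tied embeddings the 135M config should come to
+ # roughly 134.5M parameters.
+ n_params = sum(p.numel() for p in model.parameters())
+ print(f"{n_params / 1e6:.1f}M parameters")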