--- language: en tags: - mistral - gguf - mistral-trimegistrus-7b license: - apache-2.0 datasets: - pharaouk/dharma-1/dharma_1_mini.json metrics: - adam_beta1=0.9 - adam_beta2=0.999 - adam_epsilon=0.00000001 - add_cross_attention=false - loss=1.4308836460113523 - runtime=11.3829 - samples_per_second=8.522 - steps_per_second=2.196 --- # Model Card for mistral-trimegistus-7b-gguf This model repo holds GGUF-quantized versions of [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B). ## Model Details Transcendence is All You Need! Mistral Trismegistus is a model made for people interested in the esoteric, occult, and spiritual. ### Model Description - The First Powerful Occult Expert Model: ~10,000 high-quality, deep, rich instructions on the occult, esoteric, and spiritual. - Fast: Trained on Mistral, a state-of-the-art 7B-parameter model, so you can run this model FAST even on a CPU. - Not a positivity-nazi: This model was trained on all forms of esoteric tasks and knowledge, and is not burdened by the flowery nature of many other models, which chose positivity over creativity. 
### Model Sources All credit goes to the original model, [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B). ## Usage USER: ASSISTANT: OR USER: ASSISTANT: ## Training Details #### Training Hyperparameters "_name_or_path": { "desc": null, "value": "mistralai/Mistral-7B-v0.1" }, "architectures": { "desc": null, "value": [ "MistralForCausalLM" ] }, "bad_words_ids": { "desc": null, "value": null }, "bench_dataset": { "desc": null, "value": "pharaouk/dharma-1/dharma_1_mini.json" }, "learning_rate": { "desc": null, "value": 0.0004 }, "max_grad_norm": { "desc": null, "value": 1 }, "fp16_opt_level": { "desc": null, "value": "O1" }, "length_penalty": { "desc": null, "value": 1 }, "max_seq_length": { "desc": null, "value": 4096 }, "sliding_window": { "desc": null, "value": 4096 }, "num_beam_groups": { "desc": null, "value": 1 }, "initializer_range": { "desc": null, "value": 0.02 }, "intermediate_size": { "desc": null, "value": 14336 }, "lr_scheduler_type": { "desc": null, "value": "cosine" }, "num_hidden_layers": { "desc": null, "value": 32 }, "repetition_penalty": { "desc": null, "value": 1 }, "evaluation_strategy": { "desc": null, "value": "steps" }, "num_attention_heads": { "desc": null, "value": 32 }, "num_key_value_heads": { "desc": null, "value": 8 }, "quantization_config": { "desc": null, "value": { "load_in_4bit": true, "load_in_8bit": false, "quant_method": "QuantizationMethod.BITS_AND_BYTES", "llm_int8_threshold": 6, "bnb_4bit_quant_type": "nf4", "llm_int8_skip_modules": null, "bnb_4bit_compute_dtype": "bfloat16", "llm_int8_has_fp16_weight": false, "bnb_4bit_use_double_quant": true, "llm_int8_enable_fp32_cpu_offload": false } } #### Speeds, Sizes, Times { "_step": 9589, "_wandb.runtime": 12960, "_runtime": 12960.192620515823, "eval/loss": 1.4308836460113523, "train/train_steps_per_second": 0.739, "train/train_samples_per_second": 2.956, "train/loss": 0.3396, "train/epoch": 4, "train/total_flos": 1757020072120942600, "train/train_loss": 0.8929485179171377, 
"train/learning_rate": 0, "eval/steps_per_second": 2.196, "_timestamp": 1696542775.2713604, "eval/runtime": 11.3829, "train/global_step": 9584, "train/train_runtime": 12962.7813, "eval/samples_per_second": 8.522 }