Upload 5 files

Files changed (5) hide show

README.md ADDED Viewed

+---
+license: mit
+---
+This repository presents the result of an experimental merging algorithm that combines the weights of two distinct language models using the add-difference technique. Weight merging is an innovative approach that integrates knowledge from multiple models, yielding a more dynamic and advanced language model.
+  Proto-Synthia demonstrates that this kind of optimization can be achieved in as little as 10 minutes, which in many cases removes the need for the conventional, time-intensive training process.

config.json ADDED Viewed

+{
+  "_name_or_path": "gpt2-xl",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 1600,
+  "n_head": 25,
+  "n_inner": null,
+  "n_layer": 48,
+  "n_positions": 1024,
+  "output_past": true,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

model.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:b67a0e2ea20c28fd91e737737937ca67eb80a8e0e5c72681858fcbcdc926ab7c
+size 6280847584

pytorch_model.bin ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:c05937f1dd96505d5f35728c6bfbabe604ccf5997e63b4fe103d35b2918ac759
+size 6280990705

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff