Transformers · English · Inference Endpoints
Commit 5a00b33 by jncraton (1 parent: e71d673)

Upload folder using huggingface_hub
Files changed (6):

1. README.md (+9, -12)
2. model.bin (+2, -2)
3. special_tokens_map.json (+3, -21)
4. tokenizer.json (+20, -2)
5. tokenizer_config.json (+53, -23)
6. vocabulary.json (+3, -1)
README.md CHANGED
@@ -3,7 +3,7 @@ license: apache-2.0
 datasets:
 - cerebras/SlimPajama-627B
 - bigcode/starcoderdata
-- timdettmers/openassistant-guanaco
+- OpenAssistant/oasst_top1_2023-08-25
 language:
 - en
 ---
@@ -16,23 +16,20 @@ https://github.com/jzhang38/TinyLlama

 The TinyLlama project aims to **pretrain** a **1.1B Llama model on 3 trillion tokens**. With some proper optimization, we can achieve this within a span of "just" 90 days using 16 A100-40G GPUs 🚀🚀. The training has started on 2023-09-01.

-<div align="center">
-  <img src="./TinyLlama_logo.png" width="300"/>
-</div>

 We adopted exactly the same architecture and tokenizer as Llama 2. This means TinyLlama can be plugged and played in many open-source projects built upon Llama. Besides, TinyLlama is compact with only 1.1B parameters. This compactness allows it to cater to a multitude of applications demanding a restricted computation and memory footprint.

 #### This Model
-This is the chat model finetuned on [PY007/TinyLlama-1.1B-intermediate-step-240k-503b](https://huggingface.co/PY007/TinyLlama-1.1B-intermediate-step-240k-503b). The dataset used is [openassistant-guananco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco).
-
+This is the chat model finetuned on top of [PY007/TinyLlama-1.1B-intermediate-step-480k-1T](https://huggingface.co/PY007/TinyLlama-1.1B-intermediate-step-480k-1T).
+The dataset used is [OpenAssistant/oasst_top1_2023-08-25](https://huggingface.co/datasets/OpenAssistant/oasst_top1_2023-08-25) following the [chatml](https://github.com/openai/openai-python/blob/main/chatml.md) format.
 #### How to use
 You will need the transformers>=4.31
 Do check the [TinyLlama](https://github.com/jzhang38/TinyLlama) github page for more information.
-```python
+```
 from transformers import AutoTokenizer
 import transformers
 import torch
-model = "PY007/TinyLlama-1.1B-Chat-v0.1"
+model = "PY007/TinyLlama-1.1B-Chat-v0.3"
 tokenizer = AutoTokenizer.from_pretrained(model)
 pipeline = transformers.pipeline(
     "text-generation",
@@ -41,9 +38,9 @@ pipeline = transformers.pipeline(
     device_map="auto",
 )

-prompt = "What are the values in open source projects?"
+prompt = "How to get in a good university?"
 formatted_prompt = (
-    f"### Human: {prompt}### Assistant:"
+    f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
 )


@@ -51,10 +48,10 @@ sequences = pipeline(
     formatted_prompt,
     do_sample=True,
     top_k=50,
-    top_p = 0.7,
+    top_p = 0.9,
     num_return_sequences=1,
     repetition_penalty=1.1,
-    max_new_tokens=500,
+    max_new_tokens=1024,
 )
 for seq in sequences:
     print(f"Result: {seq['generated_text']}")
model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:71b5ffae759f0a4f0d85b5ff20bdc36fbd14167fa439da58af4e597f9d05f8bc
-size 1102182891
+oid sha256:89eef765f34bef31eab7e1ec4a1c9209d75d702e322eb29ae1926b477eb1b821
+size 1102191099
special_tokens_map.json CHANGED
@@ -1,24 +1,6 @@
 {
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
+  "bos_token": "<s>",
+  "eos_token": "</s>",
   "pad_token": "[PAD]",
-  "unk_token": {
-    "content": "<unk>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
+  "unk_token": "<unk>"
 }
tokenizer.json CHANGED
@@ -34,10 +34,28 @@
       "id": 32000,
       "content": "[PAD]",
       "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
+      "lstrip": true,
+      "rstrip": true,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 32001,
+      "content": "<|im_start|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
+    },
+    {
+      "id": 32002,
+      "content": "<|im_end|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": true,
+      "special": false
     }
   ],
   "normalizer": {
tokenizer_config.json CHANGED
@@ -1,34 +1,64 @@
 {
-  "bos_token": {
-    "__type": "AddedToken",
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "[PAD]",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "32002": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
   },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
-  "eos_token": {
-    "__type": "AddedToken",
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
+  "eos_token": "</s>",
   "legacy": false,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": null,
   "padding_side": "right",
   "sp_model_kwargs": {},
   "tokenizer_class": "LlamaTokenizer",
-  "unk_token": {
-    "__type": "AddedToken",
-    "content": "<unk>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
+  "unk_token": "<unk>",
   "use_default_system_prompt": true
 }
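Here the special tokens collapse from full AddedToken objects to plain strings, and the added tokens (including the new chatml pair) move under `added_tokens_decoder`. A small sanity check under the same placeholder-path assumption:

```python
# Sketch: the flattened special tokens should load as plain strings.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./")  # placeholder: path or id of this repo
print(tok.bos_token, tok.eos_token, tok.unk_token)  # expected: <s> </s> <unk>
print(len(tok))  # expected: 32003 (32000 base ids plus [PAD], <|im_start|>, <|im_end|>)
```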
vocabulary.json CHANGED
@@ -31999,5 +31999,7 @@
   "\u6536",
   "\u5f18",
   "\u7ed9",
-  "[PAD]"
+  "[PAD]",
+  "<|im_start|>",
+  "<|im_end|>"
 ]
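vocabulary.json, a flat JSON array of tokens, gains the same two entries at the end. A quick check of the updated file from a local clone (sketch; indices assumed to align with the ids in tokenizer.json):

```python
# Sketch: the vocabulary should now end with the two chatml tokens.
import json

with open("vocabulary.json", encoding="utf-8") as f:
    vocab = json.load(f)

print(len(vocab))                  # expected: 32003 entries after this commit
print(vocab[32001], vocab[32002])  # expected: <|im_start|> <|im_end|>
```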