Bitsy committed on
Commit 1916043
1 Parent(s): fc9fcea

Upload 7 files

CITATION.bib ADDED
@@ -0,0 +1,29 @@
+ @misc{Yannic_2022,
+   title = {GPT-4chan},
+   url = {https://gpt-4chan.com/},
+   author = {Kilcher, Yannic},
+   year = 2022
+ }
+
+ @inproceedings{papasavva2020raiders,
+   title = {Raiders of the Lost Kek: 3.5 Years of Augmented 4chan Posts from the Politically Incorrect Board},
+   author = {Papasavva, Antonis and Zannettou, Savvas and De Cristofaro, Emiliano and Stringhini, Gianluca and Blackburn, Jeremy},
+   booktitle = {14th International AAAI Conference on Web and Social Media (ICWSM)},
+   year = 2020
+ }
+
+ @misc{mesh-transformer-jax,
+   author = {Wang, Ben},
+   title = {{Mesh-Transformer-JAX: Model-Parallel Implementation of Transformer Language Model with JAX}},
+   howpublished = {\url{https://github.com/kingoflolz/mesh-transformer-jax}},
+   year = 2021,
+   month = May
+ }
+
+ @misc{gpt-j,
+   author = {Wang, Ben and Komatsuzaki, Aran},
+   title = {{GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model}},
+   howpublished = {\url{https://github.com/kingoflolz/mesh-transformer-jax}},
+   year = 2021,
+   month = May
+ }
config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPTJForCausalLM"
+   ],
+   "attn_pdrop": 0.0,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.0,
+   "eos_token_id": 50256,
+   "gradient_checkpointing": false,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gptj",
+   "n_embd": 4096,
+   "n_head": 16,
+   "n_layer": 28,
+   "n_positions": 2048,
+   "rotary_dim": 64,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "transformers_version": "4.10.0.dev0",
+   "tokenizer_class": "GPT2Tokenizer",
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "temperature": 1.0,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "float32",
+   "use_cache": true,
+   "vocab_size": 50400
+ }
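This is the stock GPT-J-6B architecture: 28 transformer layers, 4096-dimensional embeddings, 16 attention heads, a 2048-token context window, and rotary position embeddings on the first 64 dimensions of each head. A minimal sketch of inspecting these fields with `transformers`; the local directory name is a hypothetical placeholder for wherever this repo's files are downloaded:

```python
# A sketch, not part of this commit. Assumes the files above sit in
# ./gpt4chan_model (hypothetical path).
from transformers import GPTJConfig

config = GPTJConfig.from_pretrained("./gpt4chan_model")

# These values come straight from config.json:
print(config.model_type)   # gptj
print(config.n_layer)      # 28 transformer blocks
print(config.n_embd)       # 4096-dimensional hidden states
print(config.n_positions)  # 2048-token context window
print(config.rotary_dim)   # rotary embeddings on the first 64 dims per head
print(config.vocab_size)   # 50400, padded above GPT-2's 50257 base entries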
gpt4chan_model_meta.sqlite ADDED
Binary file (20.5 kB).
gpt4chan_model_meta.xml ADDED
@@ -0,0 +1,18 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <metadata>
+   <identifier>gpt4chan_model</identifier>
+   <collection>datasets_unsorted</collection>
+   <licenseurl>https://www.apache.org/licenses/LICENSE-2.0</licenseurl>
+   <scanner>Internet Archive Python library 3.0.1</scanner>
+   <mediatype>data</mediatype>
+   <uploader>valentino.giudice96@gmail.com</uploader>
+   <title>GPT-4chan Model</title>
+   <publicdate>2022-06-07 01:56:14</publicdate>
+   <addeddate>2022-06-07 01:56:14</addeddate>
+   <curation>[curator]validator@archive.org[/curator][date]20220607020703[/date][comment]checked for malware[/comment]</curation>
+   <creator>Yannic Kilcher</creator>
+   <description>&lt;div&gt;&lt;div&gt;GPT-4chan is a language model fine-tuned from &lt;a href="https://huggingface.co/EleutherAI/gpt-j-6B" rel="nofollow"&gt;GPT-J 6B&lt;/a&gt; on 3.5 years worth of data from 4chan's politically incorrect (/pol/) board, as included in the dataset &lt;span style="border-style:solid;border-color:rgb(229,231,235);"&gt;&lt;a href="https://zenodo.org/record/3606810" rel="nofollow"&gt;Raiders of the Lost Kek: 3.5 Years of Augmented 4chan Posts from the Politically Incorrect Board&lt;/a&gt;&lt;/span&gt;.&lt;/div&gt;&lt;/div&gt;</description>
+   <publisher>Yannic Kilcher</publisher>
+   <language>English</language>
+   <collection>datasets</collection>
+ </metadata>
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}}
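As is standard for GPT-2-style tokenizers, the beginning-of-sequence, end-of-sequence, and unknown tokens all map to the single `<|endoftext|>` token. A quick sanity check, under the same hypothetical path assumption as above:

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("./gpt4chan_model")  # hypothetical path

# bos, eos, and unk all resolve to the same token, id 50256,
# matching bos_token_id and eos_token_id in config.json.
assert tokenizer.bos_token == tokenizer.eos_token == tokenizer.unk_token == "<|endoftext|>"
print(tokenizer.eos_token_id)  # 50256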
tokenizer.json ADDED
The diff for this file is too large to render.
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "model_max_length": 2048, "special_tokens_map_file": null, "name_or_path": "gpt-j-6B", "from_slow": true, "tokenizer_class": "GPT2Tokenizer"}
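`tokenizer_config.json` caps `model_max_length` at 2048, matching `n_positions` in config.json, and names the `GPT2Tokenizer` class converted from the slow GPT-J tokenizer. Tying the files together, a hedged end-to-end sketch: it assumes the model weights (which are not among the 7 files in this commit) are present in the same hypothetical directory, and reproduces the sampling settings declared under `task_specific_params.text-generation` in config.json. Note that loading a 6-billion-parameter model in float32 needs roughly 24 GB of memory.

```python
import torch
from transformers import GPT2Tokenizer, GPTJForCausalLM

# Hypothetical local path; the weight files themselves are not in this commit.
model_dir = "./gpt4chan_model"

tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPTJForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float32)
model.eval()

inputs = tokenizer("The quick brown fox", return_tensors="pt")
with torch.no_grad():
    # Mirrors task_specific_params.text-generation in config.json.
    output_ids = model.generate(
        **inputs,
        do_sample=True,
        temperature=1.0,
        max_length=50,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))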