wissamantoun committed
Commit c59ebda
1 Parent(s): 5bff47f

added language generation
.github/workflows/push_to_hf_hub.yml CHANGED
@@ -17,4 +17,4 @@ jobs:
       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: git push --force https://wissamantoun:$HF_TOKEN@huggingface.co/spaces/wissamantoun/Arabic-NLP main
+        run: git push --force https://aubmindlab:$HF_TOKEN@huggingface.co/spaces/aubmindlab/Arabic-NLP main
.gitignore CHANGED
@@ -129,4 +129,5 @@ dmypy.json
 .pyre/


-.vscode/
+.vscode/
+add_key.bat
app.py CHANGED
@@ -1,24 +1,36 @@
-import streamlit as st
 import awesome_streamlit as ast
-import pages.home
-import pages.processor
+import streamlit as st
 
+import backend.aragpt
+import backend.home
+import backend.processor
 
 st.set_page_config(
     page_title="TEST", page_icon="📖", initial_sidebar_state="expanded", layout="wide"
 )
 
-PAGES = {"Home": pages.home, "Arabic Text Preprocessor": pages.processor}
+PAGES = {
+    "Home": backend.home,
+    "Arabic Text Preprocessor": backend.processor,
+    "Arabic Language Generation": backend.aragpt,
+}
 
 
 st.sidebar.title("Navigation")
 selection = st.sidebar.radio("Pages", list(PAGES.keys()))
 
 page = PAGES[selection]
-with st.spinner(f"Loading {selection} ..."):
-    ast.shared.components.write_page(page)
+# with st.spinner(f"Loading {selection} ..."):
+ast.shared.components.write_page(page)
 
 st.sidebar.header("Info")
 st.sidebar.write("Made by [Wissam Antoun](https://twitter.com/wissam_antoun)")
-st.sidebar.write("[Models Repo](https://github.com/aub-mind/arabert)")
-st.sidebar.write("Source Code [GitHub](https://github.com/WissamAntoun/Arabic-NLP-app)")
+st.sidebar.write(
+    "Pre-trained models are available on [HF Hub](https://huggingface.co/aubmindlab)"
+)
+st.sidebar.write(
+    "Models source code available on [GitHub](https://github.com/aub-mind/arabert)"
+)
+st.sidebar.write(
+    "App source code available on [GitHub](https://github.com/WissamAntoun/Arabic-NLP-app)"
+)
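
Note on the refactor above: `awesome_streamlit`'s `ast.shared.components.write_page(page)` essentially calls `page.write()`, so each module registered in `PAGES` (`backend.home`, `backend.processor`, and the new `backend.aragpt`) is expected to expose a module-level `write()` function. A minimal sketch of that contract, illustration only and not part of this commit (the page title below is made up):

import streamlit as st


def write():
    # Rendered when this page is picked in the sidebar radio of app.py.
    st.title("Example page")
    st.write("Page body goes here.")
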
backend.py DELETED
File without changes
{pages → backend}/__init__.py RENAMED
File without changes
backend/aragpt.py ADDED
@@ -0,0 +1,182 @@

import streamlit as st
from .services import TextGeneration
from tokenizers import Tokenizer
from functools import lru_cache

# @st.cache(allow_output_mutation=False, hash_funcs={Tokenizer: str})
@lru_cache(maxsize=1)
def load_text_generator():
    generator = TextGeneration()
    generator.load()
    return generator


generator = load_text_generator()

qa_prompt = """
أجب عن السؤال التالي:
"""
qa_prompt_post = """ الجواب هو """
qa_prompt_post_year = """ في سنة: """


def write():
    # Sidebar

    # Taken from https://huggingface.co/spaces/flax-community/spanish-gpt2/blob/main/app.py
    st.sidebar.subheader("Configurable parameters")

    model_name = st.sidebar.selectbox(
        "Model Selector",
        options=[
            "AraGPT2-Base",
            "AraGPT2-Medium",
            "Aragpt2-Large",
            "AraGPT2-Mega",
        ],
        index=0,
    )

    max_new_tokens = st.sidebar.number_input(
        "Maximum length",
        min_value=0,
        max_value=1024,
        value=100,
        help="The maximum length of the sequence to be generated.",
    )
    temp = st.sidebar.slider(
        "Temperature",
        value=1.0,
        min_value=0.1,
        max_value=100.0,
        help="The value used to modulate the next token probabilities.",
    )
    top_k = st.sidebar.number_input(
        "Top k",
        value=10,
        help="The number of highest probability vocabulary tokens to keep for top-k filtering.",
    )
    top_p = st.sidebar.number_input(
        "Top p",
        value=0.95,
        help="If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
    )
    do_sample = st.sidebar.selectbox(
        "Sampling?",
        (True, False),
        help="Whether or not to use sampling; use greedy decoding otherwise.",
    )
    num_beams = st.sidebar.number_input(
        "Number of beams",
        min_value=1,
        max_value=10,
        value=3,
        help="The number of beams to use for beam search.",
    )
    repetition_penalty = st.sidebar.number_input(
        "Repetition Penalty",
        min_value=0.0,
        value=3.0,
        step=0.1,
        help="The parameter for repetition penalty. 1.0 means no penalty.",
    )
    no_repeat_ngram_size = st.sidebar.number_input(
        "No Repeat N-Gram Size",
        min_value=0,
        value=3,
        help="If set to int > 0, all ngrams of that size can only occur once.",
    )

    st.write("#")

    col = st.beta_columns(2)

    col[0].image("images/AraGPT2.png", width=200)

    st.markdown(
        """

        <h3 style="text-align:left;">AraGPT2 is a GPT2 model trained from scratch on 77GB of Arabic text.</h3>
        <h4 style="text-align:left;"> More details in our <a href="https://github.com/aub-mind/arabert/tree/master/aragpt2">repo</a>.</h4>

        <p style="text-align:left;"><p>
        <p style="text-align:left;">Use the generation parameters on the sidebar to adjust generation quality.</p>
        <p style="text-align:right;"><p>
        """,
        unsafe_allow_html=True,
    )

    # col[0].write(
    #     "AraGPT2 is trained from scratch on 77GB of Arabic text. More details in our [repo](https://github.com/aub-mind/arabert/tree/master/aragpt2)."
    # )
    # st.write("## Generate Arabic Text")

    st.markdown(
        """
        <style>
        p, div, input, label, textarea{
            text-align: right;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    prompt = st.text_area(
        "Prompt",
        "يحكى أن مزارعا مخادعا قام ببيع بئر الماء الموجود في أرضه لجاره مقابل مبلغ كبير من المال",
    )
    if st.button("Generate"):
        with st.spinner("Generating..."):
            generated_text = generator.generate(
                prompt=prompt,
                model_name=model_name,
                max_new_tokens=max_new_tokens,
                temperature=temp,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                do_sample=do_sample,
                num_beams=num_beams,
                no_repeat_ngram_size=no_repeat_ngram_size,
            )
        st.write(generated_text)

    st.markdown("---")
    st.subheader("")
    st.markdown(
        """
        <p style="text-align:left;"><p>
        <h2 style="text-align:left;">Zero-Shot Question Answering</h2>

        <p style="text-align:left;">Adjust the maximum length to closely match the expected output length. Setting the Sampling parameter to False is recommended.</p>
        <p style="text-align:left;"><p>
        """,
        unsafe_allow_html=True,
    )

    question = st.text_input(
        "Question", "من كان رئيس ألمانيا النازية في الحرب العالمية الثانية ؟"
    )
    is_date = st.checkbox("Help the model: Is the answer a date?")
    if st.button("Answer"):

        prompt = qa_prompt + question + qa_prompt_post
        if is_date:
            prompt += qa_prompt_post_year
        else:
            prompt += " : "
        with st.spinner("Thinking..."):
            answer = generator.generate(
                prompt=prompt,
                model_name=model_name,
                max_new_tokens=max_new_tokens,
                temperature=temp,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                do_sample=do_sample,
                num_beams=num_beams,
                no_repeat_ngram_size=no_repeat_ngram_size,
            )
        st.write(answer)
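
For clarity, the zero-shot QA section above builds its prompt by plain string concatenation. With the default question and the date checkbox ticked, the string handed to the generator is (worked example derived directly from the code above):

prompt = qa_prompt + question + qa_prompt_post + qa_prompt_post_year
# i.e. "\nأجب عن السؤال التالي:\n" + question + " الجواب هو " + " في سنة: "
# =>   "\nأجب عن السؤال التالي:\nمن كان رئيس ألمانيا النازية في الحرب العالمية الثانية ؟ الجواب هو  في سنة: "
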
{pages → backend}/home.py RENAMED
File without changes
backend/modeling_gpt2.py ADDED
@@ -0,0 +1,1196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """
18
+ PyTorch OpenAI GPT-2 model.
19
+ Adapted from https://github.com/huggingface/transformers/blob/v4.0.1/src/transformers/models/gpt2/modeling_gpt2.py
20
+ and https://github.com/ghosthamlet/gpt2-ml-torch/blob/master/gpt2_ml_torch/modeling_gpt2.py
21
+ """
22
+
23
+
24
+ import logging
25
+ import os
26
+
27
+ from dataclasses import dataclass
28
+ from typing import List, Optional, Tuple
29
+
30
+ import torch
31
+ import torch.nn as nn
32
+ from torch.nn import CrossEntropyLoss, MSELoss
33
+
34
+
35
+
36
+ from transformers.activations import ACT2FN
37
+ from transformers import GPT2Config
38
+
39
+ from transformers.modeling_utils import (
40
+ Conv1D,
41
+ PreTrainedModel,
42
+ SequenceSummary,
43
+ prune_conv1d_layer,
44
+ find_pruneable_heads_and_indices
45
+ )
46
+
47
+ from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model
48
+
49
+ from transformers.modeling_outputs import (
50
+ BaseModelOutputWithPastAndCrossAttentions,
51
+ CausalLMOutputWithCrossAttentions,
52
+ SequenceClassifierOutputWithPast
53
+ )
54
+
55
+ from transformers.file_utils import (
56
+ ModelOutput,
57
+ add_start_docstrings,
58
+ add_start_docstrings_to_model_forward,
59
+ add_code_sample_docstrings,
60
+ replace_return_docstrings
61
+ )
62
+
63
+ # THe Difference from Transformers is code under _USE_GROVER
64
+ _USE_GROVER = True
65
+
66
+ logger = logging.getLogger(__name__)
67
+
68
+ _CONFIG_FOR_DOC = "GPT2Config"
69
+ _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
70
+
71
+ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
72
+ "gpt2",
73
+ "gpt2-medium",
74
+ "gpt2-large",
75
+ "gpt2-xl",
76
+ "distilgpt2",
77
+ # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
78
+ ]
79
+
80
+ logger.setLevel(logging.INFO)
81
+ console = logging.StreamHandler()
82
+ console.setLevel(logging.INFO)
83
+ logger.addHandler(console)
84
+
85
+ _GPT2_ML_TF_TO_TORCH = {
86
+ 'LayerNorm_embed_norm': 'emb_norm',
87
+ 'pos_embed': 'wpe.weight',
88
+ 'word_embed': 'wte.weight',
89
+
90
+ 'layer': 'h',
91
+ # Most importently This two layer norm must be put on the same position as gpt2-ml
92
+ # or generated data is bad, just repeat the last token
93
+ 'LayerNorm_mlp_ln0': 'ln_1',
94
+ 'LayerNorm_mlp_ln1': 'ln_2',
95
+ 'intermediate': 'mlp.c_fc',
96
+ 'output': 'mlp.c_proj',
97
+ 'query_layer': 'attn.c_attn',
98
+ 'key_layer': 'attn.c_attn',
99
+ 'value_layer': 'attn.c_attn',
100
+ 'context_projection_layer': 'attn.c_proj',
101
+
102
+ 'gamma': 'weight',
103
+ 'kernel': 'weight',
104
+ 'beta': 'bias',
105
+ 'bias': 'bias',
106
+ }
107
+
108
+
109
+ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
110
+ # Construct model
111
+ if gpt2_config_file == "":
112
+ config = GPT2Config()
113
+ else:
114
+ config = GPT2Config.from_json_file(gpt2_config_file)
115
+ model = GPT2Model(config)
116
+
117
+ # Load weights from numpy
118
+ load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
119
+
120
+ # Save pytorch-model
121
+ pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
122
+ pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
123
+ print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
124
+ torch.save(model.state_dict(), pytorch_weights_dump_path)
125
+ print("Save configuration file to {}".format(pytorch_config_dump_path))
126
+ with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
127
+ f.write(config.to_json_string())
128
+
129
+
130
+ # XXX: MUST do like: convert_gpt2_checkpoint_to_pytorch('./model.ckpt-100000', './mega.json', './')
131
+ # https://github.com/tensorflow/models/issues/2675#issuecomment-516595597
132
+ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
133
+ """ Load tf checkpoints in a pytorch model
134
+ """
135
+ try:
136
+ import re
137
+ import tensorflow as tf
138
+ except ImportError:
139
+ logger.error(
140
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
141
+ "https://www.tensorflow.org/install/ for installation instructions."
142
+ )
143
+ raise
144
+ tf_path = os.path.abspath(gpt2_checkpoint_path)
145
+ logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
146
+ # Load weights from TF model
147
+ init_vars = tf.train.list_variables(tf_path)
148
+ names = []
149
+ arrays = []
150
+ for name, shape in init_vars:
151
+ logger.info("Loading TF weight {} with shape {}".format(name, shape))
152
+ array = tf.train.load_variable(tf_path, name)
153
+ names.append(name)
154
+ arrays.append(array.squeeze())
155
+
156
+ import copy
157
+ orig_model = copy.deepcopy(model)
158
+
159
+ for name, array in zip(names, arrays):
160
+ name = name[6:] # skip "model/"
161
+ name = name.split("/")
162
+ pointer = model
163
+
164
+ attn_layer = ''
165
+ for m_name in name:
166
+ if re.fullmatch(r"[A-Za-z]+\d+", m_name):
167
+ scope_names = re.split(r"(\d+)", m_name)
168
+ else:
169
+ scope_names = [m_name]
170
+ sname = scope_names[0]
171
+
172
+ if sname == '' or sname == 'embeddings':
173
+ continue
174
+ elif sname not in _GPT2_ML_TF_TO_TORCH:
175
+ print('=========================================================')
176
+ logger.info('Skip var name {}'.format(scope_names))
177
+ pointer = None
178
+ break
179
+ else:
180
+ tname = _GPT2_ML_TF_TO_TORCH[sname]
181
+ if '.' in tname:
182
+ parent, child = tname.split('.')
183
+ pointer = getattr(pointer, parent)
184
+ pointer = getattr(pointer, child)
185
+ else:
186
+ pointer = getattr(pointer, tname)
187
+
188
+ if tname == 'attn.c_attn':
189
+ attn_layer = sname
190
+
191
+ if len(scope_names) >= 2:
192
+ num = int(scope_names[1])
193
+ pointer = pointer[num]
194
+
195
+ if pointer is None:
196
+ continue
197
+ if attn_layer == '':
198
+ try:
199
+ assert pointer.shape == array.shape
200
+ except AssertionError as e:
201
+ e.args += (pointer.shape, array.shape)
202
+ raise
203
+ logger.info("Initialize PyTorch weight {}, {}, {}".format(name, array.mean(), pointer.mean()))
204
+ if attn_layer == '':
205
+ pointer.data = torch.from_numpy(array)
206
+ else:
207
+ shape = pointer.shape
208
+ d = torch.from_numpy(array)
209
+ is_bias = len(shape) == 1
210
+ end = int(shape[0 if is_bias else 1]/3)
211
+ m = dict(
212
+ query_layer=0,
213
+ key_layer=end,
214
+ value_layer=end*2,
215
+ )
216
+ start = m[attn_layer]
217
+ end = start + end
218
+ if is_bias:
219
+ pointer.data[start:end] = d
220
+ else:
221
+ pointer.data[:, start:end] = d
222
+ logger.info("Initialize PyTorch weight {}, {}, {}".format(name, array.mean(), pointer.mean()))
223
+
224
+ for name, params in orig_model.named_parameters():
225
+ for n, p in model.named_parameters():
226
+ if name == n:
227
+ if params.equal(p):
228
+ print('--------------------------')
229
+ print(' %s not changed!' % n)
230
+ return model
231
+
232
+
233
+ class Attention(nn.Module):
234
+ def __init__(self, nx, n_ctx, config, scale=False, is_cross_attention=False):
235
+ super().__init__()
236
+
237
+ n_state = nx # in Attention: n_state=768 (nx=n_embd)
238
+ # [switch nx => n_state from Block to Attention to keep identical to TF implem]
239
+ assert n_state % config.n_head == 0
240
+ self.register_buffer(
241
+ "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx)
242
+ )
243
+ self.register_buffer("masked_bias", torch.tensor(-1e4))
244
+ self.n_head = config.n_head
245
+ self.split_size = n_state
246
+ self.scale = scale
247
+ self.is_cross_attention = is_cross_attention
248
+ if self.is_cross_attention:
249
+ self.c_attn = Conv1D(2 * n_state, nx)
250
+ self.q_attn = Conv1D(n_state, nx)
251
+ else:
252
+ self.c_attn = Conv1D(3 * n_state, nx)
253
+ self.c_proj = Conv1D(n_state, nx)
254
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
255
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
256
+ self.pruned_heads = set()
257
+
258
+ def prune_heads(self, heads):
259
+ if len(heads) == 0:
260
+ return
261
+ heads, index = find_pruneable_heads_and_indices(
262
+ heads, self.n_head, self.split_size // self.n_head, self.pruned_heads
263
+ )
264
+ index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
265
+
266
+ # Prune conv1d layers
267
+ self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
268
+ self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
269
+
270
+ # Update hyper params
271
+ self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
272
+ self.n_head = self.n_head - len(heads)
273
+ self.pruned_heads = self.pruned_heads.union(heads)
274
+
275
+ def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False):
276
+ w = torch.matmul(q, k)
277
+ if self.scale:
278
+ w = w / (float(v.size(-1)) ** 0.5)
279
+ nd, ns = w.size(-2), w.size(-1)
280
+
281
+ if not self.is_cross_attention:
282
+ # if only "normal" attention layer implements causal mask
283
+ mask = self.bias[:, :, ns - nd : ns, :ns]
284
+ w = torch.where(mask.bool(), w, self.masked_bias.to(w.dtype))
285
+
286
+ if attention_mask is not None:
287
+ # Apply the attention mask
288
+ w = w + attention_mask
289
+
290
+ w = nn.Softmax(dim=-1)(w)
291
+ w = self.attn_dropout(w)
292
+
293
+ # Mask heads if we want to
294
+ if head_mask is not None:
295
+ w = w * head_mask
296
+
297
+ outputs = [torch.matmul(w, v)]
298
+ if output_attentions:
299
+ outputs.append(w)
300
+ return outputs
301
+
302
+ def merge_heads(self, x):
303
+ x = x.permute(0, 2, 1, 3).contiguous()
304
+ new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
305
+ return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
306
+
307
+ def split_heads(self, x, k=False):
308
+ new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
309
+ x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
310
+ if k:
311
+ return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length)
312
+ else:
313
+ return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
314
+
315
+ def forward(
316
+ self,
317
+ hidden_states,
318
+ layer_past=None,
319
+ attention_mask=None,
320
+ head_mask=None,
321
+ encoder_hidden_states=None,
322
+ encoder_attention_mask=None,
323
+ use_cache=False,
324
+ output_attentions=False,
325
+ ):
326
+ if encoder_hidden_states is not None:
327
+ assert hasattr(
328
+ self, "q_attn"
329
+ ), "If class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `Attention(..., is_cross_attention=True)`."
330
+ query = self.q_attn(hidden_states)
331
+ key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
332
+ attention_mask = encoder_attention_mask
333
+ else:
334
+ query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
335
+
336
+ query = self.split_heads(query)
337
+ key = self.split_heads(key, k=True)
338
+ value = self.split_heads(value)
339
+ if layer_past is not None:
340
+ past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below
341
+ key = torch.cat((past_key, key), dim=-1)
342
+ value = torch.cat((past_value, value), dim=-2)
343
+
344
+ if use_cache is True:
345
+ present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
346
+ else:
347
+ present = (None,)
348
+
349
+ attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions)
350
+ a = attn_outputs[0]
351
+
352
+ a = self.merge_heads(a)
353
+ a = self.c_proj(a)
354
+ a = self.resid_dropout(a)
355
+
356
+ outputs = [a, present] + attn_outputs[1:]
357
+ return outputs # a, present, (attentions)
358
+
359
+
360
+ class MLP(nn.Module):
361
+ def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd)
362
+ super().__init__()
363
+ nx = config.n_embd
364
+ self.c_fc = Conv1D(n_state, nx)
365
+ self.c_proj = Conv1D(nx, n_state)
366
+ self.act = ACT2FN[config.activation_function]
367
+ self.dropout = nn.Dropout(config.resid_pdrop)
368
+
369
+ def forward(self, x):
370
+ h = self.act(self.c_fc(x))
371
+ h2 = self.c_proj(h)
372
+ return self.dropout(h2)
373
+
374
+
375
+ class Block(nn.Module):
376
+ def __init__(self, n_ctx, config, scale=False):
377
+ super().__init__()
378
+ hidden_size = config.n_embd
379
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
380
+ self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
381
+ self.attn = Attention(hidden_size, n_ctx, config, scale)
382
+ self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
383
+ if config.add_cross_attention:
384
+ self.crossattention = Attention(hidden_size, n_ctx, config, scale, is_cross_attention=True)
385
+ self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
386
+ self.mlp = MLP(inner_dim, config)
387
+
388
+ def forward(
389
+ self,
390
+ hidden_states,
391
+ layer_past=None,
392
+ attention_mask=None,
393
+ head_mask=None,
394
+ encoder_hidden_states=None,
395
+ encoder_attention_mask=None,
396
+ use_cache=False,
397
+ output_attentions=False,
398
+ ):
399
+ attn_outputs = self.attn(
400
+ hidden_states,
401
+ layer_past=layer_past,
402
+ attention_mask=attention_mask,
403
+ head_mask=head_mask,
404
+ use_cache=use_cache,
405
+ output_attentions=output_attentions,
406
+ )
407
+ attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
408
+ outputs = attn_outputs[1:]
409
+ # residual connection
410
+ hidden_states = attn_output + hidden_states
411
+
412
+ if encoder_hidden_states is not None:
413
+ # add one self-attention block for cross-attention
414
+ assert hasattr(
415
+ self, "crossattention"
416
+ ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
417
+ cross_attn_outputs = self.crossattention(
418
+ self.ln_cross_attn(hidden_states),
419
+ attention_mask=attention_mask,
420
+ head_mask=head_mask,
421
+ encoder_hidden_states=encoder_hidden_states,
422
+ encoder_attention_mask=encoder_attention_mask,
423
+ output_attentions=output_attentions,
424
+ )
425
+ attn_output = cross_attn_outputs[0]
426
+ # residual connection
427
+ hidden_states = hidden_states + attn_output
428
+ outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights
429
+
430
+ feed_forward_hidden_states = self.mlp(self.ln_1(hidden_states))
431
+ # residual connection
432
+ hidden_states = hidden_states + feed_forward_hidden_states
433
+
434
+ hidden_states = self.ln_2(hidden_states)
435
+
436
+ outputs = [hidden_states] + outputs
437
+ return outputs # hidden_states, present, (attentions, cross_attentions)
438
+
439
+
440
+ class GPT2PreTrainedModel(PreTrainedModel):
441
+ """
442
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
443
+ models.
444
+ """
445
+
446
+ config_class = GPT2Config
447
+ load_tf_weights = load_tf_weights_in_gpt2
448
+ base_model_prefix = "transformer"
449
+
450
+ def __init__(self, *inputs, **kwargs):
451
+ super().__init__(*inputs, **kwargs)
452
+
453
+ def _init_weights(self, module):
454
+ """Initialize the weights."""
455
+ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
456
+ # Slightly different from the TF version which uses truncated_normal for initialization
457
+ # cf https://github.com/pytorch/pytorch/pull/5617
458
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
459
+ if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
460
+ module.bias.data.zero_()
461
+ elif isinstance(module, nn.LayerNorm):
462
+ module.bias.data.zero_()
463
+ module.weight.data.fill_(1.0)
464
+
465
+
466
+ @dataclass
467
+ class GPT2DoubleHeadsModelOutput(ModelOutput):
468
+ """
469
+ Base class for outputs of models predicting if two sentences are consecutive or not.
470
+
471
+ Args:
472
+ loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
473
+ Language modeling loss.
474
+ mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
475
+ Multiple choice classification loss.
476
+ logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
477
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
478
+ mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
479
+ Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
480
+ past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
481
+ List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
482
+ batch_size, num_heads, sequence_length, embed_size_per_head)`).
483
+
484
+ Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
485
+ :obj:`past_key_values` input) to speed up sequential decoding.
486
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
487
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
488
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
489
+
490
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
491
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
492
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
493
+ sequence_length, sequence_length)`.
494
+
495
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
496
+ heads.
497
+ """
498
+
499
+ loss: Optional[torch.FloatTensor] = None
500
+ mc_loss: Optional[torch.FloatTensor] = None
501
+ logits: torch.FloatTensor = None
502
+ mc_logits: torch.FloatTensor = None
503
+ past_key_values: Optional[List[torch.FloatTensor]] = None
504
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
505
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
506
+
507
+
508
+ GPT2_START_DOCSTRING = r"""
509
+
510
+ This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
511
+ methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
512
+ pruning heads etc.)
513
+
514
+ This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
515
+ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
516
+ general usage and behavior.
517
+
518
+ Parameters:
519
+ config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
520
+ Initializing with a config file does not load the weights associated with the model, only the
521
+ configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
522
+ weights.
523
+ """
524
+
525
+ GPT2_INPUTS_DOCSTRING = r"""
526
+ Args:
527
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
528
+ :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
529
+ ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
530
+ sequence tokens in the vocabulary.
531
+
532
+ If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
533
+ passed as ``input_ids``.
534
+
535
+ Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
536
+ :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
537
+ details.
538
+
539
+ `What are input IDs? <../glossary.html#input-ids>`__
540
+ past_key_values (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
541
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
542
+ :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
543
+ have their past given to this model should not be passed as ``input_ids`` as they have already been
544
+ computed.
545
+ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
546
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
547
+
548
+ - 1 for tokens that are **not masked**,
549
+ - 0 for tokens that are **masked**.
550
+
551
+ `What are attention masks? <../glossary.html#attention-mask>`__
552
+ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`):
553
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
554
+ 1]``:
555
+
556
+ - 0 corresponds to a `sentence A` token,
557
+ - 1 corresponds to a `sentence B` token.
558
+
559
+ `What are token type IDs? <../glossary.html#token-type-ids>`_
560
+ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
561
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
562
+ config.max_position_embeddings - 1]``.
563
+
564
+ `What are position IDs? <../glossary.html#position-ids>`_
565
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
566
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
567
+
568
+ - 1 indicates the head is **not masked**,
569
+ - 0 indicates the head is **masked**.
570
+
571
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
572
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
573
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
574
+ vectors than the model's internal embedding lookup matrix.
575
+
576
+ If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
577
+ :obj:`past_key_values`).
578
+ use_cache (:obj:`bool`, `optional`):
579
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
580
+ decoding (see :obj:`past_key_values`).
581
+ output_attentions (:obj:`bool`, `optional`):
582
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
583
+ tensors for more detail.
584
+ output_hidden_states (:obj:`bool`, `optional`):
585
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
586
+ more detail.
587
+ return_dict (:obj:`bool`, `optional`):
588
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
589
+ """
590
+
591
+
592
+ @add_start_docstrings(
593
+ "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
594
+ GPT2_START_DOCSTRING,
595
+ )
596
+ class GPT2Model(GPT2PreTrainedModel):
597
+ def __init__(self, config):
598
+ super().__init__(config)
599
+
600
+ self.wte = nn.Embedding(config.vocab_size, config.n_embd)
601
+ self.wpe = nn.Embedding(config.n_positions, config.n_embd)
602
+ if _USE_GROVER:
603
+ self.emb_norm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
604
+
605
+ self.drop = nn.Dropout(config.embd_pdrop)
606
+ self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
607
+ if not _USE_GROVER:
608
+ self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
609
+
610
+ self.init_weights()
611
+
612
+ def get_input_embeddings(self):
613
+ return self.wte
614
+
615
+ def set_input_embeddings(self, new_embeddings):
616
+ self.wte = new_embeddings
617
+
618
+ def _prune_heads(self, heads_to_prune):
619
+ """
620
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
621
+ """
622
+ for layer, heads in heads_to_prune.items():
623
+ self.h[layer].attn.prune_heads(heads)
624
+
625
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
626
+ @add_code_sample_docstrings(
627
+ tokenizer_class=_TOKENIZER_FOR_DOC,
628
+ checkpoint="gpt2",
629
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
630
+ config_class=_CONFIG_FOR_DOC,
631
+ )
632
+ def forward(
633
+ self,
634
+ input_ids=None,
635
+ past_key_values=None,
636
+ attention_mask=None,
637
+ token_type_ids=None,
638
+ position_ids=None,
639
+ head_mask=None,
640
+ inputs_embeds=None,
641
+ encoder_hidden_states=None,
642
+ encoder_attention_mask=None,
643
+ use_cache=None,
644
+ output_attentions=None,
645
+ output_hidden_states=None,
646
+ return_dict=None,
647
+ ):
648
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
649
+ output_hidden_states = (
650
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
651
+ )
652
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
653
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
654
+
655
+ if input_ids is not None and inputs_embeds is not None:
656
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
657
+ elif input_ids is not None:
658
+ input_shape = input_ids.size()
659
+ input_ids = input_ids.view(-1, input_shape[-1])
660
+ batch_size = input_ids.shape[0]
661
+ elif inputs_embeds is not None:
662
+ input_shape = inputs_embeds.size()[:-1]
663
+ batch_size = inputs_embeds.shape[0]
664
+ else:
665
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
666
+
667
+ if token_type_ids is not None:
668
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
669
+ if position_ids is not None:
670
+ position_ids = position_ids.view(-1, input_shape[-1])
671
+
672
+ if past_key_values is None:
673
+ past_length = 0
674
+ past_key_values = [None] * len(self.h)
675
+ else:
676
+ past_length = past_key_values[0][0].size(-2)
677
+ if position_ids is None:
678
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
679
+ position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
680
+ position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
681
+
682
+ # Attention mask.
683
+ if attention_mask is not None:
684
+ assert batch_size > 0, "batch_size has to be defined and > 0"
685
+ attention_mask = attention_mask.view(batch_size, -1)
686
+ # We create a 3D attention mask from a 2D tensor mask.
687
+ # Sizes are [batch_size, 1, 1, to_seq_length]
688
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
689
+ # this attention mask is more simple than the triangular masking of causal attention
690
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
691
+ attention_mask = attention_mask[:, None, None, :]
692
+
693
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
694
+ # masked positions, this operation will create a tensor which is 0.0 for
695
+ # positions we want to attend and -10000.0 for masked positions.
696
+ # Since we are adding it to the raw scores before the softmax, this is
697
+ # effectively the same as removing these entirely.
698
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
699
+ attention_mask = (1.0 - attention_mask) * -10000.0
700
+
701
+ # If a 2D ou 3D attention mask is provided for the cross-attention
702
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
703
+ if self.config.add_cross_attention and encoder_hidden_states is not None:
704
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
705
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
706
+ if encoder_attention_mask is None:
707
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
708
+ encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
709
+ else:
710
+ encoder_attention_mask = None
711
+
712
+ # Prepare head mask if needed
713
+ # 1.0 in head_mask indicate we keep the head
714
+ # attention_probs has shape bsz x n_heads x N x N
715
+ # head_mask has shape n_layer x batch x n_heads x N x N
716
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
717
+
718
+ if inputs_embeds is None:
719
+ inputs_embeds = self.wte(input_ids)
720
+ position_embeds = self.wpe(position_ids)
721
+ hidden_states = inputs_embeds + position_embeds
722
+
723
+ if token_type_ids is not None:
724
+ token_type_embeds = self.wte(token_type_ids)
725
+ hidden_states = hidden_states + token_type_embeds
726
+
727
+ hidden_states = self.drop(hidden_states)
728
+ if _USE_GROVER:
729
+ hidden_states = self.emb_norm(hidden_states)
730
+ output_shape = input_shape + (hidden_states.size(-1),)
731
+
732
+ presents = () if use_cache else None
733
+ all_self_attentions = () if output_attentions else None
734
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
735
+ all_hidden_states = () if output_hidden_states else None
736
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
737
+ if output_hidden_states:
738
+ all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
739
+
740
+ if getattr(self.config, "gradient_checkpointing", False):
741
+
742
+ def create_custom_forward(module):
743
+ def custom_forward(*inputs):
744
+ # checkpointing only works with tuple returns, not with lists
745
+ return tuple(output for output in module(*inputs, use_cache, output_attentions))
746
+
747
+ return custom_forward
748
+
749
+ outputs = torch.utils.checkpoint.checkpoint(
750
+ create_custom_forward(block),
751
+ hidden_states,
752
+ layer_past,
753
+ attention_mask,
754
+ head_mask[i],
755
+ encoder_hidden_states,
756
+ encoder_attention_mask,
757
+ )
758
+ else:
759
+ outputs = block(
760
+ hidden_states,
761
+ layer_past=layer_past,
762
+ attention_mask=attention_mask,
763
+ head_mask=head_mask[i],
764
+ encoder_hidden_states=encoder_hidden_states,
765
+ encoder_attention_mask=encoder_attention_mask,
766
+ use_cache=use_cache,
767
+ output_attentions=output_attentions,
768
+ )
769
+
770
+ hidden_states, present = outputs[:2]
771
+ if use_cache is True:
772
+ presents = presents + (present,)
773
+
774
+ if output_attentions:
775
+ all_self_attentions = all_self_attentions + (outputs[2],)
776
+ if self.config.add_cross_attention:
777
+ all_cross_attentions = all_cross_attentions + (outputs[3],)
778
+
779
+ if not _USE_GROVER:
780
+ hidden_states = self.ln_f(hidden_states)
781
+
782
+ hidden_states = hidden_states.view(*output_shape)
783
+ # Add last hidden state
784
+ if output_hidden_states:
785
+ all_hidden_states = all_hidden_states + (hidden_states,)
786
+
787
+ if not return_dict:
788
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
789
+
790
+ return BaseModelOutputWithPastAndCrossAttentions(
791
+ last_hidden_state=hidden_states,
792
+ past_key_values=presents,
793
+ hidden_states=all_hidden_states,
794
+ attentions=all_self_attentions,
795
+ cross_attentions=all_cross_attentions,
796
+ )
797
+
798
+
799
+ @add_start_docstrings(
800
+ """
801
+ The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
802
+ embeddings).
803
+ """,
804
+ GPT2_START_DOCSTRING,
805
+ )
806
+ class GPT2LMHeadModel(GPT2PreTrainedModel):
807
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
808
+
809
+ def __init__(self, config):
810
+ super().__init__(config)
811
+ self.transformer = GPT2Model(config)
812
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
813
+
814
+ self.init_weights()
815
+
816
+ def get_output_embeddings(self):
817
+ return self.lm_head
818
+
819
+ def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
820
+ token_type_ids = kwargs.get("token_type_ids", None)
821
+ # only last token for inputs_ids if past is defined in kwargs
822
+ if past:
823
+ input_ids = input_ids[:, -1].unsqueeze(-1)
824
+ if token_type_ids is not None:
825
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
826
+
827
+ attention_mask = kwargs.get("attention_mask", None)
828
+ position_ids = kwargs.get("position_ids", None)
829
+
830
+ if attention_mask is not None and position_ids is None:
831
+ # create position_ids on the fly for batch generation
832
+ position_ids = attention_mask.long().cumsum(-1) - 1
833
+ position_ids.masked_fill_(attention_mask == 0, 1)
834
+ if past:
835
+ position_ids = position_ids[:, -1].unsqueeze(-1)
836
+ else:
837
+ position_ids = None
838
+ return {
839
+ "input_ids": input_ids,
840
+ "past_key_values": past,
841
+ "use_cache": kwargs.get("use_cache"),
842
+ "position_ids": position_ids,
843
+ "attention_mask": attention_mask,
844
+ "token_type_ids": token_type_ids,
845
+ }
846
+
847
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
848
+ @add_code_sample_docstrings(
849
+ tokenizer_class=_TOKENIZER_FOR_DOC,
850
+ checkpoint="gpt2",
851
+ output_type= CausalLMOutputWithCrossAttentions,
852
+ config_class=_CONFIG_FOR_DOC,
853
+ )
854
+ def forward(
855
+ self,
856
+ input_ids=None,
857
+ past_key_values=None,
858
+ attention_mask=None,
859
+ token_type_ids=None,
860
+ position_ids=None,
861
+ head_mask=None,
862
+ inputs_embeds=None,
863
+ encoder_hidden_states=None,
864
+ encoder_attention_mask=None,
865
+ labels=None,
866
+ use_cache=None,
867
+ output_attentions=None,
868
+ output_hidden_states=None,
869
+ return_dict=None,
870
+ ):
871
+ r"""
872
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
873
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
874
+ ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
875
+ ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
876
+ """
877
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
878
+
879
+ transformer_outputs = self.transformer(
880
+ input_ids,
881
+ past_key_values=past_key_values,
882
+ attention_mask=attention_mask,
883
+ token_type_ids=token_type_ids,
884
+ position_ids=position_ids,
885
+ head_mask=head_mask,
886
+ inputs_embeds=inputs_embeds,
887
+ encoder_hidden_states=encoder_hidden_states,
888
+ encoder_attention_mask=encoder_attention_mask,
889
+ use_cache=use_cache,
890
+ output_attentions=output_attentions,
891
+ output_hidden_states=output_hidden_states,
892
+ return_dict=return_dict,
893
+ )
894
+ hidden_states = transformer_outputs[0]
895
+
896
+ lm_logits = self.lm_head(hidden_states)
897
+
898
+ loss = None
899
+ if labels is not None:
900
+ # Shift so that tokens < n predict n
901
+ shift_logits = lm_logits[..., :-1, :].contiguous()
902
+ shift_labels = labels[..., 1:].contiguous()
903
+ # Flatten the tokens
904
+ loss_fct = CrossEntropyLoss()
905
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
906
+
907
+ if not return_dict:
908
+ output = (lm_logits,) + transformer_outputs[1:]
909
+ return ((loss,) + output) if loss is not None else output
910
+
911
+ return CausalLMOutputWithCrossAttentions(
912
+ loss=loss,
913
+ logits=lm_logits,
914
+ past_key_values=transformer_outputs.past_key_values,
915
+ hidden_states=transformer_outputs.hidden_states,
916
+ attentions=transformer_outputs.attentions,
917
+ cross_attentions=transformer_outputs.cross_attentions,
918
+ )
919
+
920
+
921
+ @add_start_docstrings(
922
+ """
923
+ The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
924
+ RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
925
+ input embeddings, the classification head takes as input the input of a specified classification token index in the
926
+ input sequence).
927
+ """,
928
+ GPT2_START_DOCSTRING,
929
+ )
930
+ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
931
+ def __init__(self, config):
932
+ super().__init__(config)
933
+ config.num_labels = 1
934
+ self.transformer = GPT2Model(config)
935
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
936
+ self.multiple_choice_head = SequenceSummary(config)
937
+
938
+ self.init_weights()
939
+
940
+ def get_output_embeddings(self):
941
+ return self.lm_head
942
+
943
+ def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
944
+ token_type_ids = kwargs.get("token_type_ids", None)
945
+ # only last token for inputs_ids if past is defined in kwargs
946
+ if past:
947
+ input_ids = input_ids[:, -1].unsqueeze(-1)
948
+ if token_type_ids is not None:
949
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
950
+
951
+ attention_mask = kwargs.get("attention_mask", None)
952
+ position_ids = kwargs.get("position_ids", None)
953
+
954
+ if attention_mask is not None and position_ids is None:
955
+ # create position_ids on the fly for batch generation
956
+ position_ids = attention_mask.long().cumsum(-1) - 1
957
+ position_ids.masked_fill_(attention_mask == 0, 1)
958
+ if past:
959
+ position_ids = position_ids[:, -1].unsqueeze(-1)
960
+ else:
961
+ position_ids = None
962
+
963
+ return {
964
+ "input_ids": input_ids,
965
+ "past_key_values": past,
966
+ "use_cache": kwargs.get("use_cache"),
967
+ "position_ids": position_ids,
968
+ "attention_mask": attention_mask,
969
+ "token_type_ids": token_type_ids,
970
+ }
971
+
972
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
973
+ @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
974
+ def forward(
975
+ self,
976
+ input_ids=None,
977
+ past_key_values=None,
978
+ attention_mask=None,
979
+ token_type_ids=None,
980
+ position_ids=None,
981
+ head_mask=None,
982
+ inputs_embeds=None,
983
+ mc_token_ids=None,
984
+ labels=None,
985
+ mc_labels=None,
986
+ use_cache=None,
987
+ output_attentions=None,
988
+ output_hidden_states=None,
989
+ return_dict=None,
990
+ **kwargs,
991
+ ):
992
+ r"""
993
+ mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
994
+ Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) -
995
+ 1[``.
996
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
997
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
998
+ ``labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to
999
+ ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
1000
+ mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`):
1001
+ Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
1002
+ num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see
1003
+ `input_ids` above)
1004
+
1005
+ Return:
1006
+
1007
+ Example::
1008
+
1009
+ >>> import torch
1010
+ >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
1011
+
1012
+ >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
1013
+ >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
1014
+
1015
+ >>> # Add a [CLS] to the vocabulary (we should train it also!)
1016
+ >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
1017
+
1018
+ >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
1019
+
1020
+ >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
1021
+ >>> encoded_choices = [tokenizer.encode(s) for s in choices]
1022
+ >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
1023
+
1024
+ >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
1025
+ >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
1026
+
1027
+ >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
1028
+ >>> lm_logits = outputs.lm_logits
1029
+ >>> mc_logits = outputs.mc_logits
1030
+
1031
+ """
1032
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1033
+
1034
+ transformer_outputs = self.transformer(
1035
+ input_ids,
1036
+ past_key_values=past_key_values,
1037
+ attention_mask=attention_mask,
1038
+ token_type_ids=token_type_ids,
1039
+ position_ids=position_ids,
1040
+ head_mask=head_mask,
1041
+ inputs_embeds=inputs_embeds,
1042
+ use_cache=use_cache,
1043
+ output_attentions=output_attentions,
1044
+ output_hidden_states=output_hidden_states,
1045
+ return_dict=return_dict,
1046
+ )
1047
+
1048
+ hidden_states = transformer_outputs[0]
1049
+
1050
+ lm_logits = self.lm_head(hidden_states)
1051
+ mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
1052
+
1053
+ mc_loss = None
1054
+ if mc_labels is not None:
1055
+ loss_fct = CrossEntropyLoss()
1056
+ mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
1057
+ lm_loss = None
1058
+ if labels is not None:
1059
+ shift_logits = lm_logits[..., :-1, :].contiguous()
1060
+ shift_labels = labels[..., 1:].contiguous()
1061
+ loss_fct = CrossEntropyLoss()
1062
+ lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
1063
+
1064
+ if not return_dict:
1065
+ output = (lm_logits, mc_logits) + transformer_outputs[1:]
1066
+ if mc_loss is not None:
1067
+ output = (mc_loss,) + output
1068
+ return ((lm_loss,) + output) if lm_loss is not None else output
1069
+
1070
+ return GPT2DoubleHeadsModelOutput(
1071
+ loss=lm_loss,
1072
+ mc_loss=mc_loss,
1073
+ logits=lm_logits,
1074
+ mc_logits=mc_logits,
1075
+ past_key_values=transformer_outputs.past_key_values,
1076
+ hidden_states=transformer_outputs.hidden_states,
1077
+ attentions=transformer_outputs.attentions,
1078
+ )
1079
+
1080
+
1081
+ @add_start_docstrings(
1082
+ """
1083
+ The GPT2 Model transformer with a sequence classification head on top (linear layer).
1084
+
1085
+ :class:`~transformers.GPT2ForSequenceClassification` uses the last token in order to do the classification, as
1086
+ other causal models (e.g. GPT-1) do.
1087
+
1088
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1089
+ :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
1090
+ row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
1091
+ guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
1092
+ the last value in each row of the batch).
1093
+ """,
1094
+ GPT2_START_DOCSTRING,
1095
+ )
1096
+ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
1097
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]
1098
+
1099
+ def __init__(self, config):
1100
+ super().__init__(config)
1101
+ self.num_labels = config.num_labels
1102
+ self.transformer = GPT2Model(config)
1103
+ self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
1104
+
1105
+ self.init_weights()
1106
+
1107
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
1108
+ @add_code_sample_docstrings(
1109
+ tokenizer_class=_TOKENIZER_FOR_DOC,
1110
+ checkpoint="microsoft/dialogrpt",
1111
+ output_type=SequenceClassifierOutputWithPast,
1112
+ config_class=_CONFIG_FOR_DOC,
1113
+ )
1114
+ def forward(
1115
+ self,
1116
+ input_ids=None,
1117
+ past_key_values=None,
1118
+ attention_mask=None,
1119
+ token_type_ids=None,
1120
+ position_ids=None,
1121
+ head_mask=None,
1122
+ inputs_embeds=None,
1123
+ labels=None,
1124
+ use_cache=None,
1125
+ output_attentions=None,
1126
+ output_hidden_states=None,
1127
+ return_dict=None,
1128
+ ):
1129
+ r"""
1130
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1131
+ Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
1132
+ config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
1133
+ If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1134
+ """
1135
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1136
+
1137
+ transformer_outputs = self.transformer(
1138
+ input_ids,
1139
+ past_key_values=past_key_values,
1140
+ attention_mask=attention_mask,
1141
+ token_type_ids=token_type_ids,
1142
+ position_ids=position_ids,
1143
+ head_mask=head_mask,
1144
+ inputs_embeds=inputs_embeds,
1145
+ use_cache=use_cache,
1146
+ output_attentions=output_attentions,
1147
+ output_hidden_states=output_hidden_states,
1148
+ return_dict=return_dict,
1149
+ )
1150
+ hidden_states = transformer_outputs[0]
1151
+ logits = self.score(hidden_states)
1152
+
1153
+ if input_ids is not None:
1154
+ batch_size, sequence_length = input_ids.shape[:2]
1155
+ else:
1156
+ batch_size, sequence_length = inputs_embeds.shape[:2]
1157
+
1158
+ assert (
1159
+ self.config.pad_token_id is not None or batch_size == 1
1160
+ ), "Cannot handle batch sizes > 1 if no padding token is defined."
1161
+ if self.config.pad_token_id is None:
1162
+ sequence_lengths = -1
1163
+ else:
1164
+ if input_ids is not None:
1165
+ sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
1166
+ else:
1167
+ sequence_lengths = -1
1168
+ logger.warning(
1169
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
1170
+ f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
1171
+ )
1172
+
1173
+ pooled_logits = logits[range(batch_size), sequence_lengths]
1174
+
1175
+ loss = None
1176
+ if labels is not None:
1177
+ if self.num_labels == 1:
1178
+ # We are doing regression
1179
+ loss_fct = MSELoss()
1180
+ loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
1181
+ else:
1182
+ loss_fct = CrossEntropyLoss()
1183
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1184
+
1185
+ if not return_dict:
1186
+ output = (pooled_logits,) + transformer_outputs[1:]
1187
+ return ((loss,) + output) if loss is not None else output
1188
+
1189
+ return SequenceClassifierOutputWithPast(
1190
+ loss=loss,
1191
+ logits=pooled_logits,
1192
+ past_key_values=transformer_outputs.past_key_values,
1193
+ hidden_states=transformer_outputs.hidden_states,
1194
+ attentions=transformer_outputs.attentions,
1195
+ )
1196
+
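For readers skimming the diff: the docstring above describes how GPT2ForSequenceClassification pools the logits of the last non-padding token. Below is a minimal, self-contained sketch of that pooling rule — it is not part of the commit, and the example tensors and pad_token_id = 0 are illustrative assumptions.

import torch

pad_token_id = 0
input_ids = torch.tensor([
    [5, 7, 9, pad_token_id, pad_token_id],  # real length 3 -> last real index 2
    [3, 4, 6, 8, 2],                        # real length 5 -> last real index 4
])
batch_size, seq_len = input_ids.shape

# Same rule as in forward() above: count non-pad tokens per row, minus one.
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1  # tensor([2, 4])

# logits would normally come from self.score(hidden_states); random values stand in here.
num_labels = 3
logits = torch.randn(batch_size, seq_len, num_labels)
pooled_logits = logits[range(batch_size), sequence_lengths]  # shape (batch_size, num_labels)
print(pooled_logits.shape)  # torch.Size([2, 3])

If no pad_token_id is configured, the model falls back to sequence_lengths = -1, i.e. the last position in each row, exactly as the docstring warns for inputs passed via inputs_embeds.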
{pages → backend}/preprocess.py RENAMED
File without changes
{pages → backend}/processor.py RENAMED
@@ -122,8 +122,7 @@ def write():
122
 
123
  st.sidebar.title("Model Selector")
124
  model_selector = st.sidebar.selectbox(
125
- """Select None to enable further filters""",
126
- options=MODELS_to_SELECT,
127
  )
128
  if model_selector == "None":
129
  keep_emojis = st.sidebar.checkbox("Keep emojis", False)
122
 
123
  st.sidebar.title("Model Selector")
124
  model_selector = st.sidebar.selectbox(
125
+ """Select None to enable further filters""", options=MODELS_to_SELECT, index=3
126
  )
127
  if model_selector == "None":
128
  keep_emojis = st.sidebar.checkbox("Keep emojis", False)
backend/services.py ADDED
@@ -0,0 +1,174 @@
1
+ import json
2
+ import os
3
+
4
+ import requests
5
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, set_seed
6
+ from .modeling_gpt2 import GPT2LMHeadModel as GROVERLMHeadModel
7
+ from .preprocess import ArabertPreprocessor
8
+
9
+
10
+ # Taken and Modified from https://huggingface.co/spaces/flax-community/chef-transformer/blob/main/app.py
11
+ class TextGeneration:
12
+ def __init__(self):
13
+ self.debug = False
14
+ self.generation_pipline = {}
15
+ self.preprocessor = ArabertPreprocessor(model_name="aragpt2-mega")
16
+ self.tokenizer = GPT2Tokenizer.from_pretrained(
17
+ "aubmindlab/aragpt2-mega", use_fast=False
18
+ )
19
+ self.tokenizer.pad_token = self.tokenizer.eos_token
20
+ self.API_KEY = os.getenv("API_KEY")
21
+ self.headers = {"Authorization": f"Bearer {self.API_KEY}"}
22
+ # self.model_names_or_paths = {
23
+ # "aragpt2-medium": "D:/ML/Models/aragpt2-medium",
24
+ # "aragpt2-base": "D:/ML/Models/aragpt2-base",
25
+ # }
26
+ self.model_names_or_paths = {
27
+ "aragpt2-medium": "https://huggingface.co/aubmindlab/aragpt2-medium",
28
+ "aragpt2-base": "https://huggingface.co/aubmindlab/aragpt2-base",
29
+ "aragpt2-large": "https://huggingface.co/aubmindlab/aragpt2-large",
30
+ "aragpt2-mega": "https://huggingface.co/aubmindlab/aragpt2-mega",
31
+ }
32
+ set_seed(42)
33
+
34
+ def load_pipeline(self):
35
+ for model_name, model_path in self.model_names_or_paths.items():
36
+ if "base" in model_name or "medium" in model_name:
37
+ self.generation_pipline[model_name] = pipeline(
38
+ "text-generation",
39
+ model=GPT2LMHeadModel.from_pretrained(model_path),
40
+ tokenizer=self.tokenizer,
41
+ device=-1,
42
+ )
43
+ else:
44
+ self.generation_pipline[model_name] = pipeline(
45
+ "text-generation",
46
+ model=GROVERLMHeadModel.from_pretrained(model_path),
47
+ tokenizer=self.tokenizer,
48
+ device=-1,
49
+ )
50
+
51
+ def load(self):
52
+ if not self.debug:
53
+ self.load_pipeline()
54
+
55
+ def generate(
56
+ self,
57
+ model_name,
58
+ prompt,
59
+ max_new_tokens: int,
60
+ temperature: float,
61
+ top_k: int,
62
+ top_p: float,
63
+ repetition_penalty: float,
64
+ no_repeat_ngram_size: int,
65
+ do_sample: bool,
66
+ num_beams: int,
67
+ ):
68
+ prompt = self.preprocessor.preprocess(prompt)
69
+ return_full_text = False
70
+ return_text = True
71
+ num_return_sequences = 1
72
+ pad_token_id = 0
73
+ eos_token_id = 0
74
+ input_tok = self.tokenizer.tokenize(prompt)
75
+ max_length = len(input_tok) + max_new_tokens
76
+ if max_length > 1024:
77
+ max_length = 1024
78
+ if not self.debug:
79
+ generated_text = self.generation_pipline[model_name.lower()](
80
+ prompt,
81
+ max_length=max_length,
82
+ temperature=temperature,
83
+ top_k=top_k,
84
+ top_p=top_p,
85
+ repetition_penalty=repetition_penalty,
86
+ no_repeat_ngram_size=no_repeat_ngram_size,
87
+ pad_token_id=pad_token_id,
88
+ eos_token_id=eos_token_id,
89
+ return_full_text=return_full_text,
90
+ return_text=return_text,
91
+ do_sample=do_sample,
92
+ num_beams=num_beams,
93
+ num_return_sequences=num_return_sequences,
94
+ )[0]["generated_text"]
95
+ else:
96
+ generated_text = self.generate_by_query(
97
+ prompt,
98
+ model_name,
99
+ max_length=max_length,
100
+ temperature=temperature,
101
+ top_k=top_k,
102
+ top_p=top_p,
103
+ repetition_penalty=repetition_penalty,
104
+ no_repeat_ngram_size=no_repeat_ngram_size,
105
+ pad_token_id=pad_token_id,
106
+ eos_token_id=eos_token_id,
107
+ return_full_text=return_full_text,
108
+ return_text=return_text,
109
+ do_sample=do_sample,
110
+ num_beams=num_beams,
111
+ num_return_sequences=num_return_sequences,
112
+ )
113
+ # print(generated_text)
114
+ if isinstance(generated_text, dict):
115
+ if "error" in generated_text:
116
+ if "is currently loading" in generated_text["error"]:
117
+ return f"Model is currently loading, estimated time is {generated_text['estimated_time']}"
118
+ return generated_text["error"]
119
+ else:
120
+ return "Something happened 🤷‍♂️!!"
121
+ else:
122
+ generated_text = generated_text[0]["generated_text"]
123
+ return self.preprocessor.unpreprocess(generated_text)
124
+
125
+ def query(self, payload, model_name):
126
+ data = json.dumps(payload)
127
+ url = (
128
+ "https://api-inference.huggingface.co/models/aubmindlab/"
129
+ + model_name.lower()
130
+ )
131
+ response = requests.request("POST", url, headers=self.headers, data=data)
132
+ return json.loads(response.content.decode("utf-8"))
133
+
134
+ def generate_by_query(
135
+ self,
136
+ prompt: str,
137
+ model_name: str,
138
+ max_length: int,
139
+ temperature: float,
140
+ top_k: int,
141
+ top_p: float,
142
+ repetition_penalty: float,
143
+ no_repeat_ngram_size: int,
144
+ pad_token_id: int,
145
+ eos_token_id: int,
146
+ return_full_text: int,
147
+ return_text: int,
148
+ do_sample: bool,
149
+ num_beams: int,
150
+ num_return_sequences: int,
151
+ ):
152
+ payload = {
153
+ "inputs": prompt,
154
+ "parameters": {
155
+ "max_length ": max_length,
156
+ "top_k": top_k,
157
+ "top_p": top_p,
158
+ "temperature": temperature,
159
+ "repetition_penalty": repetition_penalty,
160
+ "no_repeat_ngram_size": no_repeat_ngram_size,
161
+ "pad_token_id": pad_token_id,
162
+ "eos_token_id": eos_token_id,
163
+ "return_full_text": return_full_text,
164
+ "return_text": return_text,
166
+ "do_sample": do_sample,
167
+ "num_beams": num_beams,
168
+ "num_return_sequences": num_return_sequences,
169
+ },
170
+ "options": {
171
+ "use_cache": True,
172
+ },
173
+ }
174
+ return self.query(payload, model_name)
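A usage sketch for the new TextGeneration service — not part of the commit; the Arabic prompt and sampling values are placeholders, and it assumes the aubmindlab checkpoints can be downloaded (or that API_KEY is set when the debug path queries the hosted Inference API).

from backend.services import TextGeneration

generator = TextGeneration()
generator.load()  # builds one text-generation pipeline per AraGPT2 checkpoint (CPU, device=-1)

text = generator.generate(
    model_name="aragpt2-base",
    prompt="يحكى أن مزارعا",
    max_new_tokens=64,
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
    do_sample=True,
    num_beams=1,
)
print(text)

When self.debug is True, the same parameters are forwarded as a JSON payload to api-inference.huggingface.co instead of running the local pipelines.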
requirements.txt CHANGED
@@ -4,4 +4,6 @@ python-bidi==0.4.2
4
  PyArabic
5
  farasapy==0.0.14
6
  emoji==1.4.2
7
- awesome_streamlit
4
  PyArabic
5
  farasapy==0.0.14
6
  emoji==1.4.2
7
+ awesome_streamlit
8
+ torch
9
+ transformers==4.10.0
test.py ADDED
@@ -0,0 +1,10 @@
1
+ #%%
2
+ from transformers import GPT2Tokenizer
3
+
4
+ # %%
5
+ tok = GPT2Tokenizer.from_pretrained("D:/ML/Models/aragpt2-medium", use_fast=False)
6
+ # %%
7
+ tok.pad_token = tok.eos_token
8
+ #%%
9
+ tok.pad_token_id = [tok.eos_token_id]
10
+ # %%
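One note on the scratch cells above: pad_token_id is conventionally an integer, so the list assignment in the last cell is likely a leftover experiment. A minimal sketch of the usual setup follows; the hub id is an assumption on my part (the commit points at a local D:/ML/Models path).

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("aubmindlab/aragpt2-base", use_fast=False)
tok.pad_token = tok.eos_token        # reuse EOS as the padding token
tok.pad_token_id = tok.eos_token_id  # integer id, matching pad_token
print(tok.pad_token, tok.pad_token_id)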