', '#', '▃', '▁', '▂', ' ']
+ for char in special_chars:
+ msg = msg.replace(char, '')
+ return msg
+
+ def submit_API(self, prompt, trun=[]):
+        """Submit a prompt to the Yuan API and obtain a pure-text reply.
+ :prompt: Question or any content a user may input.
+ :return: pure text response."""
+ query = self.craft_query(prompt)
+ res = self.response(query, engine=self.engine,
+ max_tokens=self.max_tokens,
+ temperature=self.temperature,
+ topP=self.topP,
+ topK=self.topK,
+ frequencyPenalty=self.frequencyPenalty,
+ responsePenalty=self.responsePenalty,
+ noRepeatNgramSize=self.noRepeatNgramSize)
+        if 'resData' in res and res['resData'] is not None:
+ txt = res['resData']
+ else:
+ txt = '模型返回为空,请尝试修改输入'
+        # Post-processing specific to the translation engine
+ if self.engine == 'translate':
+ txt = txt.replace(' ##', '').replace(' "', '"').replace(": ", ":").replace(" ,", ",") \
+ .replace('英文:', '').replace('文:', '').replace("( ", "(").replace(" )", ")")
+ else:
+ txt = txt.replace(' ', '')
+ txt = self.del_special_chars(txt)
+
+        # Truncate the model output at the first occurrence of any stop string in `trun`
+        if isinstance(trun, str):
+            trun = [trun]
+        if isinstance(trun, list):
+            for tr in trun:
+                if tr and tr in txt:
+                    txt = txt[:txt.index(tr)]
+        return txt
+
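+# Usage sketch for the Yuan wrapper above (illustrative only: the engine name,
+# example pair, prompt and stop string are placeholders, and `api_key` must be a
+# key in the format Yuan.set_account expects, mirroring Yuan_Client below):
+#
+#     yuan = Yuan(engine="dialog", temperature=0.9, max_tokens=50,
+#                 topK=1, topP=0.9,
+#                 input_prefix="", input_suffix="",
+#                 output_prefix="", output_suffix="")
+#     yuan.set_account(api_key)
+#     yuan.add_example(Example(inp="你好", out="你好!"))
+#     reply = yuan.submit_API("介绍一下北京", trun=["。"])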
+
+class YuanAPI:
+ ACCOUNT = ''
+ PHONE = ''
+
+ SUBMIT_URL = "http://api.airyuan.cn:32102/v1/interface/api/infer/getRequestId?"
+ REPLY_URL = "http://api.airyuan.cn:32102/v1/interface/api/result?"
+
+ def __init__(self, user, phone):
+ self.ACCOUNT = user
+ self.PHONE = phone
+
+ @staticmethod
+    def code_md5(text):
+        code = text.encode("utf-8")
+ m = hashlib.md5()
+ m.update(code)
+ result = m.hexdigest()
+ return result
+
+ @staticmethod
+ def rest_get(url, header, timeout, show_error=False):
+ '''Call rest get method'''
+ try:
+ response = requests.get(url, headers=header, timeout=timeout, verify=False)
+ return response
+ except Exception as exception:
+ if show_error:
+ print(exception)
+ return None
+
+ def header_generation(self):
+ """Generate header for API request."""
+ t = datetime.now(pytz.timezone("Asia/Shanghai")).strftime("%Y-%m-%d")
+ token = self.code_md5(self.ACCOUNT + self.PHONE + t)
+ headers = {'token': token}
+ return headers
+
+ def submit_request(self, query, temperature, topP, topK, max_tokens, engine, frequencyPenalty, responsePenalty,
+ noRepeatNgramSize):
+ """Submit query to the backend server and get requestID."""
+ headers = self.header_generation()
+ # url=SUBMIT_URL + "account={0}&data={1}&temperature={2}&topP={3}&topK={4}&tokensToGenerate={5}&type={6}".format(ACCOUNT,query,temperature,topP,topK,max_tokens,"api")
+ # url=SUBMIT_URL + "engine={0}&account={1}&data={2}&temperature={3}&topP={4}&topK={5}&tokensToGenerate={6}" \
+ # "&type={7}".format(engine,ACCOUNT,query,temperature,topP,topK, max_tokens,"api")
+ url = self.SUBMIT_URL + "engine={0}&account={1}&data={2}&temperature={3}&topP={4}&topK={5}&tokensToGenerate={6}" \
+ "&type={7}&frequencyPenalty={8}&responsePenalty={9}&noRepeatNgramSize={10}". \
+ format(engine, self.ACCOUNT, query, temperature, topP, topK, max_tokens, "api", frequencyPenalty,
+ responsePenalty, noRepeatNgramSize)
+ response = self.rest_get(url, headers, 30)
+ response_text = json.loads(response.text)
+ if response_text["flag"]:
+ requestId = response_text["resData"]
+ return requestId
+ else:
+ raise RuntimeWarning(response_text)
+
+ def reply_request(self, requestId, cycle_count=5):
+ """Check reply API to get the inference response."""
+ url = self.REPLY_URL + "account={0}&requestId={1}".format(self.ACCOUNT, requestId)
+ headers = self.header_generation()
+ response_text = {"flag": True, "resData": None}
+ for i in range(cycle_count):
+ response = self.rest_get(url, headers, 30, show_error=True)
+ response_text = json.loads(response.text)
+ if response_text["resData"] is not None:
+ return response_text
+ if response_text["flag"] is False and i == cycle_count - 1:
+ raise RuntimeWarning(response_text)
+ time.sleep(3)
+ return response_text
+
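+# Request flow sketch for YuanAPI (illustrative: `user` and `phone` are the
+# credentials registered for the Yuan API, and the engine/sampling values are
+# placeholders):
+#
+#     api = YuanAPI(user, phone)
+#     request_id = api.submit_request(query, temperature=1.0, topP=0.9, topK=1,
+#                                     max_tokens=50, engine="base_10B",
+#                                     frequencyPenalty=1.2, responsePenalty=1.2,
+#                                     noRepeatNgramSize=2)
+#     result = api.reply_request(request_id)  # polls REPLY_URL up to cycle_count times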
+
+class Yuan_Client(BaseLLMModel):
+
+ def __init__(self, model_name, api_key, user_name="", system_prompt=None):
+ super().__init__(model_name=model_name, user=user_name)
+ self.history = []
+ self.api_key = api_key
+ self.system_prompt = system_prompt
+
+ self.input_prefix = ""
+ self.output_prefix = ""
+
+ def set_text_prefix(self, option, value):
+ if option == 'input_prefix':
+ self.input_prefix = value
+ elif option == 'output_prefix':
+ self.output_prefix = value
+
+ def get_answer_at_once(self):
+        # Yuan temperature lies in (0, 1] while the base model's lies in [0, 2];
+        # Yuan 0.9 roughly corresponds to base 1, so map base values above 1 into (0.9, 1.0].
+        temperature = self.temperature if self.temperature <= 1 else 0.9 + (self.temperature - 1) / 10
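+        # e.g. with this mapping, base 1.5 -> Yuan 0.95 and base 2.0 -> Yuan 1.0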
+ topP = self.top_p
+ topK = self.n_choices
+ # max_tokens should be in [1,200]
+ max_tokens = self.max_generation_token if self.max_generation_token is not None else 50
+ if max_tokens > 200:
+ max_tokens = 200
+ stop = self.stop_sequence if self.stop_sequence is not None else []
+ examples = []
+ system_prompt = self.system_prompt
+ if system_prompt is not None:
+ lines = system_prompt.splitlines()
+ # TODO: support prefixes in system prompt or settings
+ """
+ if lines[0].startswith('-'):
+ prefixes = lines.pop()[1:].split('|')
+ self.input_prefix = prefixes[0]
+ if len(prefixes) > 1:
+ self.output_prefix = prefixes[1]
+ if len(prefixes) > 2:
+ stop = prefixes[2].split(',')
+ """
+ for i in range(0, len(lines), 2):
+ in_line = lines[i]
+ out_line = lines[i + 1] if i + 1 < len(lines) else ""
+ examples.append((in_line, out_line))
+ yuan = Yuan(engine=self.model_name.replace('yuanai-1.0-', ''),
+ temperature=temperature,
+ max_tokens=max_tokens,
+ topK=topK,
+ topP=topP,
+ input_prefix=self.input_prefix,
+ input_suffix="",
+ output_prefix=self.output_prefix,
+ output_suffix="".join(stop),
+ )
+ if not self.api_key:
+ return NO_APIKEY_MSG, 0
+ yuan.set_account(self.api_key)
+
+ for in_line, out_line in examples:
+ yuan.add_example(Example(inp=in_line, out=out_line))
+
+ prompt = self.history[-1]["content"]
+ answer = yuan.submit_API(prompt, trun=stop)
+ return answer, len(answer)
diff --git a/modules/models/modeling_moss.py b/modules/models/modeling_moss.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7adea5bca857f7fdd6399dde7ce359f8f8cecfe
--- /dev/null
+++ b/modules/models/modeling_moss.py
@@ -0,0 +1,711 @@
+""" PyTorch Moss model."""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging
+)
+
+from .configuration_moss import MossConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "fnlp/moss-moon-003-base"
+_CONFIG_FOR_DOC = "MossConfig"
+
+
+MOSS_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "fnlp/moss-moon-003-base",
+ "fnlp/moss-moon-003-sft",
+ "fnlp/moss-moon-003-sft-plugin",
+]
+
+
+# Copied from transformers.models.gptj.modeling_gptj.create_sinusoidal_positions
+def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2) / dim))
+ sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.float), inv_freq).float()
+ return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)
+
+
+# Copied from transformers.models.gptj.modeling_gptj.rotate_every_two
+def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
+ x1 = x[:, :, :, ::2]
+ x2 = x[:, :, :, 1::2]
+ x = torch.stack((-x2, x1), dim=-1)
+ return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)')
+
+
+# Copied from transformers.models.gptj.modeling_gptj.apply_rotary_pos_emb
+def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
+ sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
+ cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
+ return (tensor * cos) + (rotate_every_two(tensor) * sin)
+
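+# Shape sanity check for the rotary helpers above (an illustrative sketch with
+# made-up sizes, not part of the model):
+#
+#     pos = create_sinusoidal_positions(16, 8)      # (16, 8); [sin | cos] halves along dim=-1
+#     sincos = pos[None, :4]                        # (1, 4, 8) for positions 0..3
+#     sin, cos = torch.split(sincos, 4, dim=-1)     # each (1, 4, 4)
+#     q = torch.zeros(1, 4, 2, 8)                   # (batch, seq, heads, rotary_dim)
+#     q_rot = apply_rotary_pos_emb(q, sin, cos)     # same shape as q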
+
+class MossAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+
+ max_positions = config.max_position_embeddings
+ self.register_buffer(
+ "causal_mask",
+ torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+ 1, 1, max_positions, max_positions
+ ),
+ )
+
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
+
+ self.embed_dim = config.hidden_size
+ self.num_attention_heads = config.num_attention_heads
+ self.head_dim = self.embed_dim // self.num_attention_heads
+ if self.head_dim * self.num_attention_heads != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
+ f" `num_attention_heads`: {self.num_attention_heads})."
+ )
+ self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
+ self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)
+
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+ self.rotary_dim = config.rotary_dim
+ pos_embd_dim = self.rotary_dim or self.embed_dim
+ self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)
+
+ def _split_heads(self, x, n_head, dim_head, mp_num):
+ reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
+ reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
+ return reshaped
+
+ def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
+ """
+        Merges the attn_head_size dim and num_attn_heads dim into the hidden dim
+ """
+ if len(tensor.shape) == 5:
+ tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
+ elif len(tensor.shape) == 4:
+ tensor = tensor.permute(0, 2, 1, 3).contiguous()
+ else:
+ raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
+ new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
+ return tensor.view(new_shape)
+
+ def _attn(
+ self,
+ query,
+ key,
+ value,
+ attention_mask=None,
+ head_mask=None,
+ ):
+ # compute causal mask from causal mask buffer
+ query_length, key_length = query.size(-2), key.size(-2)
+ causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length]
+
+ # Keep the attention weights computation in fp32 to avoid overflow issues
+ query = query.to(torch.float32)
+ key = key.to(torch.float32)
+
+ attn_weights = torch.matmul(query, key.transpose(-1, -2))
+
+ attn_weights = attn_weights / self.scale_attn
+ mask_value = torch.finfo(attn_weights.dtype).min
+ # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+ mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
+ attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+
+ if attention_mask is not None:
+ # Apply the attention mask
+ attn_weights = attn_weights + attention_mask
+
+ attn_weights = nn.Softmax(dim=-1)(attn_weights)
+ attn_weights = attn_weights.to(value.dtype)
+ attn_weights = self.attn_dropout(attn_weights)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attn_weights = attn_weights * head_mask
+
+ attn_output = torch.matmul(attn_weights, value)
+
+ return attn_output, attn_weights
+
+ def forward(
+ self,
+ hidden_states: Optional[torch.FloatTensor],
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = False,
+ output_attentions: Optional[bool] = False,
+ ) -> Union[
+ Tuple[torch.Tensor, Tuple[torch.Tensor]],
+ Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
+ ]:
+ qkv = self.qkv_proj(hidden_states)
+ # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic
+ mp_num = 4
+ qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))
+
+ local_dim = self.head_dim * self.num_attention_heads // mp_num
+ query, value, key = torch.split(qkv_split, local_dim, dim=-1)
+ query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
+ key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)
+
+ value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
+ value = value.permute(0, 2, 1, 3)
+
+ embed_positions = self.embed_positions
+ if embed_positions.device != position_ids.device:
+ embed_positions = embed_positions.to(position_ids.device)
+ self.embed_positions = embed_positions
+
+ sincos = embed_positions[position_ids]
+ sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
+
+ if self.rotary_dim is not None:
+ k_rot = key[:, :, :, : self.rotary_dim]
+ k_pass = key[:, :, :, self.rotary_dim :]
+
+ q_rot = query[:, :, :, : self.rotary_dim]
+ q_pass = query[:, :, :, self.rotary_dim :]
+
+ k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
+ q_rot = apply_rotary_pos_emb(q_rot, sin, cos)
+
+ key = torch.cat([k_rot, k_pass], dim=-1)
+ query = torch.cat([q_rot, q_pass], dim=-1)
+ else:
+ key = apply_rotary_pos_emb(key, sin, cos)
+ query = apply_rotary_pos_emb(query, sin, cos)
+
+ key = key.permute(0, 2, 1, 3)
+ query = query.permute(0, 2, 1, 3)
+
+ if layer_past is not None:
+ past_key = layer_past[0]
+ past_value = layer_past[1]
+ key = torch.cat((past_key, key), dim=-2)
+ value = torch.cat((past_value, value), dim=-2)
+
+ if use_cache is True:
+ present = (key, value)
+ else:
+ present = None
+
+ # compute self-attention: V x Softmax(QK^T)
+ attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+
+ attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
+ attn_output = self.out_proj(attn_output)
+ attn_output = self.resid_dropout(attn_output)
+
+ outputs = (attn_output, present)
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs # a, present, (attentions)
+
+
+# Copied from transformers.models.gptj.modeling_gptj.GPTJMLP with GPTJ->Moss
+class MossMLP(nn.Module):
+ def __init__(self, intermediate_size, config): # in MLP: intermediate_size= 4 * embed_dim
+ super().__init__()
+ embed_dim = config.n_embd
+
+ self.fc_in = nn.Linear(embed_dim, intermediate_size)
+ self.fc_out = nn.Linear(intermediate_size, embed_dim)
+
+ self.act = ACT2FN[config.activation_function]
+ self.dropout = nn.Dropout(config.resid_pdrop)
+
+ def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
+ hidden_states = self.fc_in(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.fc_out(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ return hidden_states
+
+
+# Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->Moss
+class MossBlock(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
+ self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+ self.attn = MossAttention(config)
+ self.mlp = MossMLP(inner_dim, config)
+
+ def forward(
+ self,
+ hidden_states: Optional[torch.FloatTensor],
+ layer_past: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = False,
+ output_attentions: Optional[bool] = False,
+ ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
+ residual = hidden_states
+ hidden_states = self.ln_1(hidden_states)
+ attn_outputs = self.attn(
+ hidden_states=hidden_states,
+ layer_past=layer_past,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ )
+ attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
+ outputs = attn_outputs[1:]
+
+ feed_forward_hidden_states = self.mlp(hidden_states)
+ hidden_states = attn_output + feed_forward_hidden_states + residual
+
+ if use_cache:
+ outputs = (hidden_states,) + outputs
+ else:
+ outputs = (hidden_states,) + outputs[1:]
+
+ return outputs # hidden_states, present, (attentions)
+
+
+class MossPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = MossConfig
+ base_model_prefix = "transformer"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["MossBlock"]
+
+ def __init__(self, *inputs, **kwargs):
+ super().__init__(*inputs, **kwargs)
+
+ def _init_weights(self, module):
+ """Initialize the weights."""
+ if isinstance(module, (nn.Linear,)):
+ # Slightly different from Mesh Transformer JAX which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, MossModel):
+ module.gradient_checkpointing = value
+
+
+MOSS_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`MossConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+MOSS_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_attention_heads,)` or `(n_layer, num_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_dim)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Moss Model transformer outputting raw hidden-states without any specific head on top.",
+ MOSS_START_DOCSTRING,
+)
+class MossModel(MossPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.embed_dim = config.n_embd
+ self.vocab_size = config.vocab_size
+ self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+ self.drop = nn.Dropout(config.embd_pdrop)
+ self.h = nn.ModuleList([MossBlock(config) for _ in range(config.n_layer)])
+ self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+ self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)
+
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.wte
+
+ def set_input_embeddings(self, new_embeddings):
+ self.wte = new_embeddings
+
+ @add_start_docstrings_to_model_forward(MOSS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPast,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+ batch_size = input_ids.shape[0]
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ batch_size = inputs_embeds.shape[0]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids.view(-1, input_shape[-1])
+
+ if position_ids is not None:
+ position_ids = position_ids.view(-1, input_shape[-1]).long()
+
+ if past_key_values is None:
+ past_length = 0
+ past_key_values = tuple([None] * len(self.h))
+ else:
+ past_length = past_key_values[0][0].size(-2)
+
+ if position_ids is None:
+ position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+ position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
+
+ # Attention mask.
+ if attention_mask is not None:
+ if batch_size <= 0:
+ raise ValueError("batch_size has to be defined and > 0")
+ attention_mask = attention_mask.view(batch_size, -1)
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+ # this attention mask is more simple than the triangular masking of causal attention
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+ attention_mask = attention_mask[:, None, None, :]
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and the dtype's smallest value for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
+ attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x num_attention_heads x N x N
+ # head_mask has shape n_layer x batch x num_attention_heads x N x N
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.wte(input_ids)
+
+ hidden_states = inputs_embeds
+
+ if token_type_ids is not None:
+ token_type_embeds = self.wte(token_type_ids)
+ hidden_states = hidden_states + token_type_embeds
+
+ hidden_states = self.drop(hidden_states)
+
+ output_shape = input_shape + (hidden_states.size(-1),)
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
+ "`use_cache=False`..."
+ )
+ use_cache = False
+
+ presents = () if use_cache else None
+ all_self_attentions = () if output_attentions else None
+ all_hidden_states = () if output_hidden_states else None
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ # None for past_key_value
+ return module(*inputs, use_cache, output_attentions)
+
+ return custom_forward
+
+ outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ hidden_states,
+ None,
+ attention_mask,
+ position_ids,
+ head_mask[i],
+ )
+ else:
+ outputs = block(
+ hidden_states=hidden_states,
+ layer_past=layer_past,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask[i],
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = outputs[0]
+ if use_cache is True:
+ presents = presents + (outputs[1],)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
+
+ hidden_states = self.ln_f(hidden_states)
+
+ hidden_states = hidden_states.view(output_shape)
+ # Add last hidden state
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=presents,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ The Moss Model transformer with a language modeling head on top.
+ """,
+ MOSS_START_DOCSTRING,
+)
+class MossForCausalLM(MossPreTrainedModel):
+ _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.causal_mask"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.transformer = MossModel(config)
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
+ token_type_ids = kwargs.get("token_type_ids", None)
+ # only last token for inputs_ids if past is defined in kwargs
+ if past_key_values:
+ input_ids = input_ids[:, -1].unsqueeze(-1)
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
+
+ attention_mask = kwargs.get("attention_mask", None)
+ position_ids = kwargs.get("position_ids", None)
+
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+
+ return {
+ "input_ids": input_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "position_ids": position_ids,
+ "attention_mask": attention_mask,
+ "token_type_ids": token_type_ids,
+ }
+
+ @add_start_docstrings_to_model_forward(MOSS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=CausalLMOutputWithPast,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
+            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+
+ # make sure sampling in fp16 works correctly and
+ # compute loss in fp32 to match with mesh-tf version
+ # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
+ lm_logits = self.lm_head(hidden_states).to(torch.float32)
+
+ loss = None
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = lm_logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+ loss = loss.to(hidden_states.dtype)
+
+ if not return_dict:
+ output = (lm_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=lm_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ @staticmethod
+ def _reorder_cache(
+ past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
+ ) -> Tuple[Tuple[torch.Tensor]]:
+ """
+ This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
+ [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
+ beam_idx at every generation step.
+ """
+ return tuple(
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
+ for layer_past in past_key_values
+ )
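+
+# Generation sketch (illustrative; it assumes the "fnlp/moss-moon-003-sft"
+# checkpoint listed above has been downloaded and that enough memory is available):
+#
+#     from .tokenization_moss import MossTokenizer
+#     tokenizer = MossTokenizer.from_pretrained("fnlp/moss-moon-003-sft")
+#     model = MossForCausalLM.from_pretrained("fnlp/moss-moon-003-sft")
+#     inputs = tokenizer("你好", return_tensors="pt")
+#     output_ids = model.generate(**inputs, max_new_tokens=32)
+#     print(tokenizer.decode(output_ids[0]))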
diff --git a/modules/models/models.py b/modules/models/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..4105dd3dbcdf7a1dba564c527639787697d2e2eb
--- /dev/null
+++ b/modules/models/models.py
@@ -0,0 +1,651 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, List
+
+import logging
+import json
+import commentjson as cjson
+import os
+import sys
+import requests
+import urllib3
+import platform
+import base64
+from io import BytesIO
+from PIL import Image
+
+from tqdm import tqdm
+import colorama
+from duckduckgo_search import ddg
+import asyncio
+import aiohttp
+from enum import Enum
+import uuid
+
+from ..presets import *
+from ..llama_func import *
+from ..utils import *
+from .. import shared
+from ..config import retrieve_proxy, usage_limit
+from modules import config
+from .base_model import BaseLLMModel, ModelType
+
+
+class OpenAIClient(BaseLLMModel):
+ def __init__(
+ self,
+ model_name,
+ api_key,
+ system_prompt=INITIAL_SYSTEM_PROMPT,
+ temperature=1.0,
+ top_p=1.0,
+ user_name=""
+ ) -> None:
+ super().__init__(
+ model_name=model_name,
+ temperature=temperature,
+ top_p=top_p,
+ system_prompt=system_prompt,
+ user=user_name
+ )
+ self.api_key = api_key
+ self.need_api_key = True
+ self._refresh_header()
+
+ def get_answer_stream_iter(self):
+ response = self._get_response(stream=True)
+ if response is not None:
+            stream_iter = self._decode_chat_response(response)
+            partial_text = ""
+            for i in stream_iter:
+ partial_text += i
+ yield partial_text
+ else:
+ yield STANDARD_ERROR_MSG + GENERAL_ERROR_MSG
+
+ def get_answer_at_once(self):
+ response = self._get_response()
+ response = json.loads(response.text)
+ content = response["choices"][0]["message"]["content"]
+ total_token_count = response["usage"]["total_tokens"]
+ return content, total_token_count
+
+ def count_token(self, user_input):
+ input_token_count = count_token(construct_user(user_input))
+ if self.system_prompt is not None and len(self.all_token_counts) == 0:
+ system_prompt_token_count = count_token(
+ construct_system(self.system_prompt)
+ )
+ return input_token_count + system_prompt_token_count
+ return input_token_count
+
+ def billing_info(self):
+ try:
+ curr_time = datetime.datetime.now()
+ last_day_of_month = get_last_day_of_month(
+ curr_time).strftime("%Y-%m-%d")
+ first_day_of_month = curr_time.replace(day=1).strftime("%Y-%m-%d")
+ usage_url = f"{shared.state.usage_api_url}?start_date={first_day_of_month}&end_date={last_day_of_month}"
+ try:
+ usage_data = self._get_billing_data(usage_url)
+ except Exception as e:
+ logging.error(f"获取API使用情况失败:" + str(e))
+ return i18n("**获取API使用情况失败**")
+ # rounded_usage = "{:.5f}".format(usage_data["total_usage"] / 100)
+ rounded_usage = round(usage_data["total_usage"] / 100, 5)
+ usage_percent = round(usage_data["total_usage"] / usage_limit, 2)
+ # return i18n("**本月使用金额** ") + f"\u3000 ${rounded_usage}"
+            return i18n("本月使用金额") + f": ${rounded_usage} / ${usage_limit} ({usage_percent}%)"
+ except requests.exceptions.ConnectTimeout:
+ status_text = (
+ STANDARD_ERROR_MSG + CONNECTION_TIMEOUT_MSG + ERROR_RETRIEVE_MSG
+ )
+ return status_text
+ except requests.exceptions.ReadTimeout:
+ status_text = STANDARD_ERROR_MSG + READ_TIMEOUT_MSG + ERROR_RETRIEVE_MSG
+ return status_text
+ except Exception as e:
+ import traceback
+ traceback.print_exc()
+ logging.error(i18n("获取API使用情况失败:") + str(e))
+ return STANDARD_ERROR_MSG + ERROR_RETRIEVE_MSG
+
+ def set_token_upper_limit(self, new_upper_limit):
+ pass
+
+    @shared.state.switching_api_key  # this decorator is a no-op unless multi-account (API key switching) mode is enabled
+ def _get_response(self, stream=False):
+ openai_api_key = self.api_key
+ system_prompt = self.system_prompt
+ history = self.history
+ logging.debug(colorama.Fore.YELLOW +
+ f"{history}" + colorama.Fore.RESET)
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {openai_api_key}",
+ }
+
+ if system_prompt is not None:
+ history = [construct_system(system_prompt), *history]
+
+ payload = {
+ "model": self.model_name,
+ "messages": history,
+ "temperature": self.temperature,
+ "top_p": self.top_p,
+ "n": self.n_choices,
+ "stream": stream,
+ "presence_penalty": self.presence_penalty,
+ "frequency_penalty": self.frequency_penalty,
+ }
+
+ if self.max_generation_token is not None:
+ payload["max_tokens"] = self.max_generation_token
+ if self.stop_sequence is not None:
+ payload["stop"] = self.stop_sequence
+ if self.logit_bias is not None:
+ payload["logit_bias"] = self.logit_bias
+ if self.user_identifier:
+ payload["user"] = self.user_identifier
+
+ if stream:
+ timeout = TIMEOUT_STREAMING
+ else:
+ timeout = TIMEOUT_ALL
+
+        # If a custom API host is configured, requests go to it; otherwise the default endpoint is used
+ if shared.state.completion_url != COMPLETION_URL:
+ logging.info(f"使用自定义API URL: {shared.state.completion_url}")
+
+ with retrieve_proxy():
+ try:
+ response = requests.post(
+ shared.state.completion_url,
+ headers=headers,
+ json=payload,
+ stream=stream,
+ timeout=timeout,
+ )
+ except:
+ return None
+ return response
+
+ def _refresh_header(self):
+ self.headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.api_key}",
+ }
+
+ def _get_billing_data(self, billing_url):
+ with retrieve_proxy():
+ response = requests.get(
+ billing_url,
+ headers=self.headers,
+ timeout=TIMEOUT_ALL,
+ )
+
+ if response.status_code == 200:
+ data = response.json()
+ return data
+ else:
+ raise Exception(
+ f"API request failed with status code {response.status_code}: {response.text}"
+ )
+
+ def _decode_chat_response(self, response):
+ error_msg = ""
+ for chunk in response.iter_lines():
+ if chunk:
+ chunk = chunk.decode()
+ chunk_length = len(chunk)
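+                    # Streamed chunks are SSE lines prefixed with "data: " (6 characters); strip the prefix before parsing the JSON payload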
+ try:
+ chunk = json.loads(chunk[6:])
+ except json.JSONDecodeError:
+ print(i18n("JSON解析错误,收到的内容: ") + f"{chunk}")
+ error_msg += chunk
+ continue
+ if chunk_length > 6 and "delta" in chunk["choices"][0]:
+ if chunk["choices"][0]["finish_reason"] == "stop":
+ break
+ try:
+ yield chunk["choices"][0]["delta"]["content"]
+ except Exception as e:
+ # logging.error(f"Error: {e}")
+ continue
+ if error_msg:
+ raise Exception(error_msg)
+
+ def set_key(self, new_access_key):
+ ret = super().set_key(new_access_key)
+ self._refresh_header()
+ return ret
+
+
+class ChatGLM_Client(BaseLLMModel):
+ def __init__(self, model_name, user_name="") -> None:
+ super().__init__(model_name=model_name, user=user_name)
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ global CHATGLM_TOKENIZER, CHATGLM_MODEL
+ if CHATGLM_TOKENIZER is None or CHATGLM_MODEL is None:
+ system_name = platform.system()
+ model_path = None
+ if os.path.exists("models"):
+ model_dirs = os.listdir("models")
+ if model_name in model_dirs:
+ model_path = f"models/{model_name}"
+ if model_path is not None:
+ model_source = model_path
+ else:
+ model_source = f"THUDM/{model_name}"
+ CHATGLM_TOKENIZER = AutoTokenizer.from_pretrained(
+ model_source, trust_remote_code=True
+ )
+ quantified = False
+ if "int4" in model_name:
+ quantified = True
+ model = AutoModel.from_pretrained(
+ model_source, trust_remote_code=True
+ )
+ if torch.cuda.is_available():
+ # run on CUDA
+ logging.info("CUDA is available, using CUDA")
+ model = model.half().cuda()
+            # MPS acceleration still has some issues; it is only used for locally downloaded, non-quantized models on macOS
+ elif system_name == "Darwin" and model_path is not None and not quantified:
+ logging.info("Running on macOS, using MPS")
+ # running on macOS and model already downloaded
+ model = model.half().to("mps")
+ else:
+ logging.info("GPU is not available, using CPU")
+ model = model.float()
+ model = model.eval()
+ CHATGLM_MODEL = model
+
+ def _get_glm_style_input(self):
+ history = [x["content"] for x in self.history]
+ query = history.pop()
+ logging.debug(colorama.Fore.YELLOW +
+ f"{history}" + colorama.Fore.RESET)
+ assert (
+ len(history) % 2 == 0
+        ), f"History should have an even length. Current history: {history}"
+ history = [[history[i], history[i + 1]]
+ for i in range(0, len(history), 2)]
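+        # e.g. contents ["q1", "a1", "q2"] -> history [["q1", "a1"]], query "q2", the pair format chat/stream_chat expect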
+ return history, query
+
+ def get_answer_at_once(self):
+ history, query = self._get_glm_style_input()
+ response, _ = CHATGLM_MODEL.chat(
+ CHATGLM_TOKENIZER, query, history=history)
+ return response, len(response)
+
+ def get_answer_stream_iter(self):
+ history, query = self._get_glm_style_input()
+ for response, history in CHATGLM_MODEL.stream_chat(
+ CHATGLM_TOKENIZER,
+ query,
+ history,
+ max_length=self.token_upper_limit,
+ top_p=self.top_p,
+ temperature=self.temperature,
+ ):
+ yield response
+
+
+class LLaMA_Client(BaseLLMModel):
+ def __init__(
+ self,
+ model_name,
+ lora_path=None,
+ user_name=""
+ ) -> None:
+ super().__init__(model_name=model_name, user=user_name)
+ from lmflow.datasets.dataset import Dataset
+ from lmflow.pipeline.auto_pipeline import AutoPipeline
+ from lmflow.models.auto_model import AutoModel
+ from lmflow.args import ModelArguments, DatasetArguments, InferencerArguments
+
+ self.max_generation_token = 1000
+ self.end_string = "\n\n"
+ # We don't need input data
+ data_args = DatasetArguments(dataset_path=None)
+ self.dataset = Dataset(data_args)
+ self.system_prompt = ""
+
+ global LLAMA_MODEL, LLAMA_INFERENCER
+ if LLAMA_MODEL is None or LLAMA_INFERENCER is None:
+ model_path = None
+ if os.path.exists("models"):
+ model_dirs = os.listdir("models")
+ if model_name in model_dirs:
+ model_path = f"models/{model_name}"
+ if model_path is not None:
+ model_source = model_path
+ else:
+ model_source = f"decapoda-research/{model_name}"
+ # raise Exception(f"models目录下没有这个模型: {model_name}")
+ if lora_path is not None:
+ lora_path = f"lora/{lora_path}"
+ model_args = ModelArguments(model_name_or_path=model_source, lora_model_path=lora_path, model_type=None, config_overrides=None, config_name=None, tokenizer_name=None, cache_dir=None,
+ use_fast_tokenizer=True, model_revision='main', use_auth_token=False, torch_dtype=None, use_lora=False, lora_r=8, lora_alpha=32, lora_dropout=0.1, use_ram_optimized_load=True)
+ pipeline_args = InferencerArguments(
+ local_rank=0, random_seed=1, deepspeed='configs/ds_config_chatbot.json', mixed_precision='bf16')
+
+ with open(pipeline_args.deepspeed, "r") as f:
+ ds_config = json.load(f)
+ LLAMA_MODEL = AutoModel.get_model(
+ model_args,
+ tune_strategy="none",
+ ds_config=ds_config,
+ )
+ LLAMA_INFERENCER = AutoPipeline.get_pipeline(
+ pipeline_name="inferencer",
+ model_args=model_args,
+ data_args=data_args,
+ pipeline_args=pipeline_args,
+ )
+
+ def _get_llama_style_input(self):
+ history = []
+ instruction = ""
+ if self.system_prompt:
+ instruction = (f"Instruction: {self.system_prompt}\n")
+ for x in self.history:
+ if x["role"] == "user":
+ history.append(f"{instruction}Input: {x['content']}")
+ else:
+ history.append(f"Output: {x['content']}")
+ context = "\n\n".join(history)
+ context += "\n\nOutput: "
+ return context
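+        # e.g. with system prompt "Be brief" and one prior exchange, the context is:
+        # "Instruction: Be brief\nInput: q1\n\nOutput: a1\n\nInstruction: Be brief\nInput: q2\n\nOutput: "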
+
+ def get_answer_at_once(self):
+ context = self._get_llama_style_input()
+
+ input_dataset = self.dataset.from_dict(
+ {"type": "text_only", "instances": [{"text": context}]}
+ )
+
+ output_dataset = LLAMA_INFERENCER.inference(
+ model=LLAMA_MODEL,
+ dataset=input_dataset,
+ max_new_tokens=self.max_generation_token,
+ temperature=self.temperature,
+ )
+
+ response = output_dataset.to_dict()["instances"][0]["text"]
+ return response, len(response)
+
+ def get_answer_stream_iter(self):
+ context = self._get_llama_style_input()
+ partial_text = ""
+ step = 1
+ for _ in range(0, self.max_generation_token, step):
+ input_dataset = self.dataset.from_dict(
+ {"type": "text_only", "instances": [
+ {"text": context + partial_text}]}
+ )
+ output_dataset = LLAMA_INFERENCER.inference(
+ model=LLAMA_MODEL,
+ dataset=input_dataset,
+ max_new_tokens=step,
+ temperature=self.temperature,
+ )
+ response = output_dataset.to_dict()["instances"][0]["text"]
+ if response == "" or response == self.end_string:
+ break
+ partial_text += response
+ yield partial_text
+
+
+class XMChat(BaseLLMModel):
+ def __init__(self, api_key, user_name=""):
+ super().__init__(model_name="xmchat", user=user_name)
+ self.api_key = api_key
+ self.session_id = None
+ self.reset()
+ self.image_bytes = None
+ self.image_path = None
+ self.xm_history = []
+ self.url = "https://xmbot.net/web"
+ self.last_conv_id = None
+
+ def reset(self):
+ self.session_id = str(uuid.uuid4())
+ self.last_conv_id = None
+ return [], "已重置"
+
+ def image_to_base64(self, image_path):
+        # Open and load the image
+ img = Image.open(image_path)
+
+        # Get the image width and height
+ width, height = img.size
+
+        # Compute the scale ratio so that the longest side does not exceed 2048 pixels
+ max_dimension = 2048
+ scale_ratio = min(max_dimension / width, max_dimension / height)
+
+ if scale_ratio < 1:
+            # Resize the image by the scale ratio
+ new_width = int(width * scale_ratio)
+ new_height = int(height * scale_ratio)
+            img = img.resize((new_width, new_height), Image.LANCZOS)
+
+        # Convert the image to JPEG-encoded binary data
+ buffer = BytesIO()
+ if img.mode == "RGBA":
+ img = img.convert("RGB")
+ img.save(buffer, format='JPEG')
+ binary_image = buffer.getvalue()
+
+        # Base64-encode the binary data
+ base64_image = base64.b64encode(binary_image).decode('utf-8')
+
+ return base64_image
+
+ def try_read_image(self, filepath):
+ def is_image_file(filepath):
+            # Check whether the file is an image
+ valid_image_extensions = [
+ ".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"]
+ file_extension = os.path.splitext(filepath)[1].lower()
+ return file_extension in valid_image_extensions
+
+ if is_image_file(filepath):
+ logging.info(f"读取图片文件: {filepath}")
+ self.image_bytes = self.image_to_base64(filepath)
+ self.image_path = filepath
+ else:
+ self.image_bytes = None
+ self.image_path = None
+
+ def like(self):
+ if self.last_conv_id is None:
+ return "点赞失败,你还没发送过消息"
+ data = {
+ "uuid": self.last_conv_id,
+ "appraise": "good"
+ }
+ requests.post(self.url, json=data)
+ return "👍点赞成功,感谢反馈~"
+
+ def dislike(self):
+ if self.last_conv_id is None:
+ return "点踩失败,你还没发送过消息"
+ data = {
+ "uuid": self.last_conv_id,
+ "appraise": "bad"
+ }
+ requests.post(self.url, json=data)
+ return "👎点踩成功,感谢反馈~"
+
+ def prepare_inputs(self, real_inputs, use_websearch, files, reply_language, chatbot):
+ fake_inputs = real_inputs
+ display_append = ""
+ limited_context = False
+ return limited_context, fake_inputs, display_append, real_inputs, chatbot
+
+ def handle_file_upload(self, files, chatbot):
+        """If the model accepts multi-modal input, implement this function."""
+ if files:
+ for file in files:
+ if file.name:
+ logging.info(f"尝试读取图像: {file.name}")
+ self.try_read_image(file.name)
+ if self.image_path is not None:
+ chatbot = chatbot + [((self.image_path,), None)]
+ if self.image_bytes is not None:
+ logging.info("使用图片作为输入")
+            # In practice XMChat can only handle one image per conversation, so reset the session first
+ self.reset()
+ conv_id = str(uuid.uuid4())
+ data = {
+ "user_id": self.api_key,
+ "session_id": self.session_id,
+ "uuid": conv_id,
+ "data_type": "imgbase64",
+ "data": self.image_bytes
+ }
+ response = requests.post(self.url, json=data)
+ response = json.loads(response.text)
+ logging.info(f"图片回复: {response['data']}")
+ return None, chatbot, None
+
+ def get_answer_at_once(self):
+ question = self.history[-1]["content"]
+ conv_id = str(uuid.uuid4())
+ self.last_conv_id = conv_id
+ data = {
+ "user_id": self.api_key,
+ "session_id": self.session_id,
+ "uuid": conv_id,
+ "data_type": "text",
+ "data": question
+ }
+ response = requests.post(self.url, json=data)
+ try:
+ response = json.loads(response.text)
+ return response["data"], len(response["data"])
+ except Exception as e:
+ return response.text, len(response.text)
+
+
+def get_model(
+ model_name,
+ lora_model_path=None,
+ access_key=None,
+ temperature=None,
+ top_p=None,
+ system_prompt=None,
+ user_name=""
+) -> BaseLLMModel:
+ msg = i18n("模型设置为了:") + f" {model_name}"
+ model_type = ModelType.get_type(model_name)
+ lora_selector_visibility = False
+ lora_choices = []
+ dont_change_lora_selector = False
+ if model_type != ModelType.OpenAI:
+ config.local_embedding = True
+ # del current_model.model
+ model = None
+ try:
+ if model_type == ModelType.OpenAI:
+ logging.info(f"正在加载OpenAI模型: {model_name}")
+ model = OpenAIClient(
+ model_name=model_name,
+ api_key=access_key,
+ system_prompt=system_prompt,
+ temperature=temperature,
+ top_p=top_p,
+ user_name=user_name,
+ )
+ elif model_type == ModelType.ChatGLM:
+ logging.info(f"正在加载ChatGLM模型: {model_name}")
+ model = ChatGLM_Client(model_name, user_name=user_name)
+ elif model_type == ModelType.LLaMA and lora_model_path == "":
+ msg = f"现在请为 {model_name} 选择LoRA模型"
+ logging.info(msg)
+ lora_selector_visibility = True
+ if os.path.isdir("lora"):
+ lora_choices = get_file_names(
+ "lora", plain=True, filetypes=[""])
+ lora_choices = ["No LoRA"] + lora_choices
+ elif model_type == ModelType.LLaMA and lora_model_path != "":
+ logging.info(f"正在加载LLaMA模型: {model_name} + {lora_model_path}")
+ dont_change_lora_selector = True
+ if lora_model_path == "No LoRA":
+ lora_model_path = None
+ msg += " + No LoRA"
+ else:
+ msg += f" + {lora_model_path}"
+ model = LLaMA_Client(
+ model_name, lora_model_path, user_name=user_name)
+ elif model_type == ModelType.XMChat:
+            if os.environ.get("XMCHAT_API_KEY"):
+ access_key = os.environ.get("XMCHAT_API_KEY")
+ model = XMChat(api_key=access_key, user_name=user_name)
+ elif model_type == ModelType.StableLM:
+ from .StableLM import StableLM_Client
+ model = StableLM_Client(model_name, user_name=user_name)
+ elif model_type == ModelType.MOSS:
+ from .MOSS import MOSS_Client
+ model = MOSS_Client(model_name, user_name=user_name)
+ elif model_type == ModelType.YuanAI:
+ from .inspurai import Yuan_Client
+ model = Yuan_Client(model_name, api_key=access_key, user_name=user_name, system_prompt=system_prompt)
+ elif model_type == ModelType.Unknown:
+ raise ValueError(f"未知模型: {model_name}")
+ logging.info(msg)
+ chatbot = gr.Chatbot.update(label=model_name)
+ except Exception as e:
+ logging.error(e)
+ msg = f"{STANDARD_ERROR_MSG}: {e}"
+ if dont_change_lora_selector:
+ return model, msg, chatbot
+ else:
+ return model, msg, chatbot, gr.Dropdown.update(choices=lora_choices, visible=lora_selector_visibility)
+
+
+if __name__ == "__main__":
+ with open("config.json", "r") as f:
+ openai_api_key = cjson.load(f)["openai_api_key"]
+ # set logging level to debug
+ logging.basicConfig(level=logging.DEBUG)
+ # client = ModelManager(model_name="gpt-3.5-turbo", access_key=openai_api_key)
+    client = get_model(model_name="chatglm-6b-int4")[0]  # get_model returns (model, msg, chatbot, ...)
+ chatbot = []
+ stream = False
+    # Test billing info
+ logging.info(colorama.Back.GREEN + "测试账单功能" + colorama.Back.RESET)
+ logging.info(client.billing_info())
+    # Test Q&A
+ logging.info(colorama.Back.GREEN + "测试问答" + colorama.Back.RESET)
+ question = "巴黎是中国的首都吗?"
+ for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
+ logging.info(i)
+ logging.info(f"测试问答后history : {client.history}")
+    # Test memory
+ logging.info(colorama.Back.GREEN + "测试记忆力" + colorama.Back.RESET)
+ question = "我刚刚问了你什么问题?"
+ for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
+ logging.info(i)
+ logging.info(f"测试记忆力后history : {client.history}")
+    # Test retry
+ logging.info(colorama.Back.GREEN + "测试重试功能" + colorama.Back.RESET)
+ for i in client.retry(chatbot=chatbot, stream=stream):
+ logging.info(i)
+ logging.info(f"重试后history : {client.history}")
+    # # Test summarization
+ # print(colorama.Back.GREEN + "测试总结功能" + colorama.Back.RESET)
+ # chatbot, msg = client.reduce_token_size(chatbot=chatbot)
+ # print(chatbot, msg)
+ # print(f"总结后history: {client.history}")
diff --git a/modules/models/tokenization_moss.py b/modules/models/tokenization_moss.py
new file mode 100644
index 0000000000000000000000000000000000000000..626315eb9e429ada99a15b04b9736c05e6743ffe
--- /dev/null
+++ b/modules/models/tokenization_moss.py
@@ -0,0 +1,368 @@
+"""Tokenization classes for Moss"""
+
+import json
+import os
+import numpy as np
+import regex as re
+
+from functools import lru_cache
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+from transformers.utils import is_tf_available, is_torch_available, logging
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+
+
+if TYPE_CHECKING:
+ if is_torch_available():
+ import torch
+ if is_tf_available():
+ import tensorflow as tf
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "fnlp/moss-moon-003-base": "https://huggingface.co/fnlp/moss-moon-003-base/resolve/main/vocab.json",
+ "fnlp/moss-moon-003-sft": "https://huggingface.co/fnlp/moss-moon-003-sft/resolve/main/vocab.json",
+ "fnlp/moss-moon-003-sft-plugin": "https://huggingface.co/fnlp/moss-moon-003-sft-plugin/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "fnlp/moss-moon-003-base": "https://huggingface.co/fnlp/moss-moon-003-base/resolve/main/merges.txt",
+ "fnlp/moss-moon-003-sft": "https://huggingface.co/fnlp/moss-moon-003-sft/resolve/main/merges.txt",
+ "fnlp/moss-moon-003-sft-plugin": "https://huggingface.co/fnlp/moss-moon-003-sft-plugin/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "fnlp/moss-moon-003-base": 2048,
+ "fnlp/moss-moon-003-sft": 2048,
+ "fnlp/moss-moon-003-sft-plugin": 2048,
+}
+
+
+@lru_cache()
+def bytes_to_unicode():
+ """
+    Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
+    characters that the bpe code barfs on.
+
+ The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+ if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+ decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+ tables between utf-8 bytes and unicode strings.
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
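+# Illustrative values of the mapping above: printable ASCII bytes map to themselves,
+# e.g. bytes_to_unicode()[ord("A")] == "A", while bytes the BPE would choke on
+# (whitespace/control bytes) are shifted to unused code points, e.g. bytes_to_unicode()[0] == chr(256).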
+
+def get_pairs(word):
+ """
+ Return set of symbol pairs in a word.
+
+ Word is represented as tuple of symbols (symbols being variable-length strings).
+ """
+ pairs = set()
+ prev_char = word[0]
+ for char in word[1:]:
+ pairs.add((prev_char, char))
+ prev_char = char
+ return pairs
+
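+# e.g. get_pairs(("h", "e", "l", "l", "o")) == {("h", "e"), ("e", "l"), ("l", "l"), ("l", "o")}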
+
+class MossTokenizer(PreTrainedTokenizer):
+ """
+ Construct a Moss tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece), so a word will
+    be encoded differently depending on whether it is at the beginning of the sentence (without a space) or not:
+
+ You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+ call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
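+    For example (an illustrative sketch; the exact ids depend on the vocab files shipped with the checkpoint):
+
+    ```python
+    tokenizer = MossTokenizer.from_pretrained("fnlp/moss-moon-003-base")
+    # The leading space becomes part of the first token, so these two calls
+    # produce different input_ids.
+    ids_no_space = tokenizer("Hello world")["input_ids"]
+    ids_with_space = tokenizer(" Hello world")["input_ids"]
+    ```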
+
+
+ When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
+
+
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
+ Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (The Moss tokenizer detects the beginning of a word by the preceding space.)
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|endoftext|>",
+ eos_token="",
+ pad_token=None,
+ add_prefix_space=False,
+ add_bos_token=False,
+ **kwargs,
+ ):
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+ super().__init__(
+ errors=errors,
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ add_prefix_space=add_prefix_space,
+ add_bos_token=add_bos_token,
+ **kwargs,
+ )
+ self.add_bos_token = add_bos_token
+
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
+ self.encoder = json.load(vocab_handle)
+ self.decoder = {v: k for k, v in self.encoder.items()}
+ self.errors = errors # how to handle errors in decoding
+ self.byte_encoder = bytes_to_unicode()
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+ with open(merges_file, encoding="utf-8") as merges_handle:
+ bpe_merges = merges_handle.read().split("\n")[1:-1]
+ bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+ self.cache = {}
+ self.add_prefix_space = add_prefix_space
+
+ # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+ self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+ @property
+ def vocab_size(self):
+ return len(self.encoder)
+
+ def get_vocab(self):
+ return dict(self.encoder, **self.added_tokens_encoder)
+
+ def bpe(self, token):
+ if token in self.cache:
+ return self.cache[token]
+ word = tuple(token)
+ pairs = get_pairs(word)
+
+ if not pairs:
+ return token
+
+ while True:
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+ if bigram not in self.bpe_ranks:
+ break
+ first, second = bigram
+ new_word = []
+ i = 0
+ while i < len(word):
+ try:
+ j = word.index(first, i)
+ except ValueError:
+ new_word.extend(word[i:])
+ break
+ else:
+ new_word.extend(word[i:j])
+ i = j
+
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+ new_word.append(first + second)
+ i += 2
+ else:
+ new_word.append(word[i])
+ i += 1
+ new_word = tuple(new_word)
+ word = new_word
+ if len(word) == 1:
+ break
+ else:
+ pairs = get_pairs(word)
+ word = " ".join(word)
+ self.cache[token] = word
+ return word
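+
+ # Example of the merge loop above (illustrative, with a hypothetical two-entry merge table): given
+ # bpe_ranks = {("l", "o"): 0, ("lo", "w"): 1}, bpe("low") rewrites the word "l o w" -> "lo w" -> "low"
+ # and returns the fully merged token "low".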
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ if self.add_bos_token:
+ bos_token_ids = [self.bos_token_id]
+ else:
+ bos_token_ids = []
+
+ output = bos_token_ids + token_ids_0
+
+ if token_ids_1 is None:
+ return output
+
+ return output + bos_token_ids + token_ids_1
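+
+ # For example, with `add_bos_token=True` and a hypothetical `bos_token_id` of 0,
+ # `build_inputs_with_special_tokens([5, 6])` returns `[0, 5, 6]`.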
+
+ def _tokenize(self, text):
+ """Tokenize a string."""
+ bpe_tokens = []
+ for token in re.findall(self.pat, text):
+ token = "".join(
+ self.byte_encoder[b] for b in token.encode("utf-8")
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+ return bpe_tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.decoder.get(index)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ text = "".join(tokens)
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+ return text
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ merge_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+ )
+
+ with open(vocab_file, "w", encoding="utf-8") as f:
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+ index = 0
+ with open(merge_file, "w", encoding="utf-8") as writer:
+ writer.write("#version: 0.2\n")
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+ " Please check that the tokenizer is not corrupted!"
+ )
+ index = token_index
+ writer.write(" ".join(bpe_tokens) + "\n")
+ index += 1
+
+ return vocab_file, merge_file
+
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+ add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+ if is_split_into_words or add_prefix_space:
+ text = " " + text
+ return (text, kwargs)
+
+ def decode(
+ self,
+ token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
+ skip_special_tokens: bool = False,
+ clean_up_tokenization_spaces: bool = None,
+ truncate_before_pattern: Optional[List[str]] = None,
+ **kwargs,
+ ) -> str:
+ """
+ Converts a sequence of ids into a string, using the tokenizer and vocabulary, with options to remove special
+ tokens and clean up tokenization spaces.
+
+ Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
+
+ Args:
+ token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
+ List of tokenized input ids. Can be obtained using the `__call__` method.
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not to remove special tokens in the decoding.
+ clean_up_tokenization_spaces (`bool`, *optional*):
+ Whether or not to clean up the tokenization spaces. If `None`, will default to
+ `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
+ truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
+ A list of regular expression strings that will be used to truncate the returned string. This can be
+ used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
+ of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
+ kwargs (additional keyword arguments, *optional*):
+ Will be passed to the underlying model specific decode method.
+
+ Returns:
+ `str`: The decoded sentence.
+ """
+ decoded_text = super()._decode(
+ token_ids=token_ids,
+ skip_special_tokens=skip_special_tokens,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ **kwargs,
+ )
+
+ if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
+ decoded_text = self.truncate(decoded_text, truncate_before_pattern)
+
+ return decoded_text
+
+ def truncate(self, completion, truncate_before_pattern):
+ def find_re(string, pattern, start_pos):
+ m = pattern.search(string, start_pos)
+ return m.start() if m else -1
+
+ terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]
+
+ prints = list(re.finditer("^print", completion, re.MULTILINE))
+
+ if len(prints) > 1:
+ completion = completion[: prints[1].start()]
+
+ defs = list(re.finditer("^def", completion, re.MULTILINE))
+
+ if len(defs) > 1:
+ completion = completion[: defs[1].start()]
+
+ start_pos = 0
+
+ terminals_pos = [
+ pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
+ ]
+
+ if len(terminals_pos) > 0:
+ return completion[: min(terminals_pos)]
+ else:
+ return completion
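+
+
+ # Usage sketch (not part of the original file; assumes `tokenizer` was built from the Moss files above and
+ # `generated_ids` comes from `model.generate`):
+ #
+ # text = tokenizer.decode(
+ #     generated_ids[0],
+ #     skip_special_tokens=True,
+ #     truncate_before_pattern=["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"],
+ # )
+ #
+ # Everything from the earliest pattern match onward is dropped, and a second top-level `print`/`def`
+ # block is trimmed as well, which keeps a code completion down to a single snippet.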
diff --git a/modules/overwrites.py b/modules/overwrites.py
index 035a4a52722d66ee28af1c05231ad1cea3339ef5..d17f56873c156e9fb883d35b50e2a28740f2cf90 100644
--- a/modules/overwrites.py
+++ b/modules/overwrites.py
@@ -8,7 +8,7 @@ from gradio_client import utils as client_utils
from modules.presets import *
from modules.llama_func import *
-
+from modules.config import render_latex
def compact_text_chunks(self, prompt: Prompt, text_chunks: List[str]) -> List[str]:
logging.debug("Compacting text chunks...🚀🚀🚀")
@@ -76,13 +76,20 @@ def postprocess_chat_messages(
else:
raise ValueError(f"Invalid message for Chatbot component: {chat_message}")
-with open("./assets/custom.js", "r", encoding="utf-8") as f, open("./assets/Kelpy-Codos.js", "r", encoding="utf-8") as f2:
+with open("./assets/custom.js", "r", encoding="utf-8") as f, \
+ open("./assets/external-scripts.js", "r", encoding="utf-8") as f1:
customJS = f.read()
- kelpyCodos = f2.read()
+ externalScripts = f1.read()
+
def reload_javascript():
print("Reloading javascript...")
- js = f'<script>{customJS}</script><script>{kelpyCodos}</script>'
+ js = f'<script>{customJS}</script><script>{externalScripts}</script>'
+ if render_latex:
+ js += """\
+
+
+ """
def template_response(*args, **kwargs):
res = GradioTemplateResponseOriginal(*args, **kwargs)
res.body = res.body.replace(b'