Upload folder using huggingface_hub
- modeling_qwen.py +2 -9
- tokenization_qwen.py +0 -2
modeling_qwen.py
CHANGED
@@ -37,7 +37,7 @@ from torch import nn
 SUPPORT_CUDA = torch.cuda.is_available()
 SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
 SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
-SUPPORT_TORCH2 =
+SUPPORT_TORCH2 = hasattr(torch, '__version__') and int(torch.__version__.split(".")[0]) >= 2


 from .configuration_qwen import QWenConfig
@@ -414,7 +414,6 @@ class QWenAttention(nn.Module):
         use_cache: Optional[bool] = False,
     ):
         mixed_x_layer = self.c_attn(hidden_states)
-        #print("mixed out: ", mixed_x_layer)

         query, key, value = mixed_x_layer.split(self.split_size, dim=2)

@@ -444,7 +443,6 @@ class QWenAttention(nn.Module):
             key_list += [apply_rotary_pos_emb(key[i:i+1, :, :], k_pos_emb)]
         query = torch.cat(query_list, dim=0)
         key = torch.cat(key_list, dim=0)
-        #print("query: ", query, "key:", key)

         if self.use_cache_quantization:
             key = quantize_cache_v(key.permute(0, 2, 1, 3),
@@ -474,7 +472,7 @@ class QWenAttention(nn.Module):
             # present=(key,value)
             key = torch.cat((past_key, key), dim=1)
             value = torch.cat((past_value, value), dim=1)
-
+
         if use_cache:
             present = (key, value)
         else:
@@ -540,10 +538,8 @@ class QWenAttention(nn.Module):
         context_layer = self._merge_heads(
             attn_output, self.num_heads, self.head_dim
         )
-        #print("context: ", context_layer)

         attn_output = self.c_proj(context_layer)
-        #print("attn: ", attn_output)

         outputs = (attn_output, present)
         if output_attentions:
@@ -622,7 +618,6 @@ class QWenBlock(nn.Module):
             use_cache=use_cache,
             output_attentions=output_attentions,
         )
-        #print("attn output: ", attn_outputs[0])
         attn_output = attn_outputs[0]

         outputs = attn_outputs[1:]
@@ -634,7 +629,6 @@ class QWenBlock(nn.Module):

         residual = layernorm_input
         mlp_output = self.mlp(layernorm_output)
-        #print("mlp output: ", mlp_output)
         hidden_states = residual + mlp_output

         if use_cache:
@@ -909,7 +903,6 @@ class QWenModel(QWenPreTrainedModel):
             )

             hidden_states = outputs[0]
-            #print(i, hidden_states)
             if use_cache is True:
                 presents = presents + (outputs[1],)
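The substantive change in this file is the completed SUPPORT_TORCH2 flag; the remaining deletions only strip leftover debug print statements. Below is a minimal sketch, assuming nothing beyond an importable torch, of how the major-version check behaves; it also handles local build strings such as "2.1.0+cu121", because only the first dot-separated component is parsed:

import torch

# "2.1.0+cu121".split(".") -> ["2", "1", "0+cu121"]; element [0] is the major version.
SUPPORT_TORCH2 = hasattr(torch, "__version__") and int(torch.__version__.split(".")[0]) >= 2

print(torch.__version__, "-> torch 2.x detected:", SUPPORT_TORCH2)

A flag like this typically gates PyTorch-2-only code paths such as torch.nn.functional.scaled_dot_product_attention; note that the removed line, as shown in the diff, was an incomplete assignment and would have raised a SyntaxError at import time.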
tokenization_qwen.py
CHANGED
@@ -243,8 +243,6 @@ class QWenTokenizer(PreTrainedTokenizer):
         """Converts an id to a token, special tokens included"""
         if index in self.decoder:
             return self.decoder[index]
-        print("error index", index)
-        return ""
         raise ValueError("unknown ids")

     def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
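This is a behavioral fix, not just cleanup: before the change, an unknown id printed "error index" and returned an empty string, so the raise below it was unreachable and bad ids were silently decoded to "". A self-contained sketch of the post-commit behavior follows; the demo class, its decoder contents, and the method signature are illustrative assumptions, while the method body comes from the diff:

from typing import Union

class DecoderDemo:
    def __init__(self):
        # Stand-in for the tokenizer's id -> token table.
        self.decoder = {0: b"hello"}

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included"""
        if index in self.decoder:
            return self.decoder[index]
        # The old print(...) / return "" fallback made this raise unreachable;
        # unknown ids now surface as an exception callers can handle.
        raise ValueError("unknown ids")

demo = DecoderDemo()
print(demo._convert_id_to_token(0))  # b'hello'
# demo._convert_id_to_token(99)      # raises ValueError("unknown ids")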