ighoshsubho committed
Commit aeb3c43
1 Parent(s): d03d4bb

Update README.md

Files changed (1)
  1. README.md +53 -20
README.md CHANGED
@@ -66,28 +66,61 @@ print(tokenizer.decode(outputs[0]))
  Memory footprint: 269.03 MB
  ```

- #### Quantized Versions through `bitsandbytes`
- * _Using 8-bit precision (int8)_
+ #### Quantized Version: 2-bit (BitNet)

  ```python
- # pip install bitsandbytes accelerate
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
- # to use 4bit use `load_in_4bit=True` instead
- quantization_config = BitsAndBytesConfig(load_in_8bit=True)
- checkpoint = "HuggingFaceTB/SmolLM-135M"
- tokenizer = AutoTokenizer.from_pretrained(checkpoint)
- model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=quantization_config)
- inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to("cuda")
- outputs = model.generate(inputs)
- print(tokenizer.decode(outputs[0]))
- ```
- ```bash
- >>> print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
- # load_in_8bit
- Memory footprint: 162.87 MB
- # load_in_4bit
- >>> print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
- Memory footprint: 109.78 MB
+ # The conversion below uses Llama-architecture internals; it assumes a transformers
+ # version that exposes LlamaSdpaAttention.
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from transformers.models.llama.modeling_llama import (
+     LlamaDecoderLayer, LlamaMLP, LlamaRMSNorm, LlamaSdpaAttention
+ )
+
+ checkpoint = "ighoshsubho/Bitnet-SmolLM-135M"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ model = AutoModelForCausalLM.from_pretrained(checkpoint)
+
+ def activation_quant(x):
+     # Per-token absmax quantization of activations to the int8 range, then back to float
+     scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
+     y = (x * scale).round().clamp_(-128, 127)
+     y = y / scale
+     return y
+
+ def weight_quant(w):
+     # Absmean quantization of weights to the ternary values {-1, 0, 1}, then back to float
+     scale = 1.0 / w.abs().mean().clamp_(min=1e-5)
+     u = (w * scale).round().clamp_(-1, 1)
+     u = u / scale
+     return u
+
+ class BitLinear(nn.Linear):
+     def forward(self, x):
+         w = self.weight  # a weight tensor with shape [d, k]
+         x = x.to(w.device)
+         RMSNorm = LlamaRMSNorm(x.shape[-1]).to(w.device)
+         x_norm = RMSNorm(x)
+         # A trick for implementing the Straight-Through Estimator (STE) using detach()
+         x_quant = x_norm + (activation_quant(x_norm) - x_norm).detach()
+         w_quant = w + (weight_quant(w) - w).detach()
+         y = F.linear(x_quant, w_quant)
+         return y
+
+ def convert_to_bitnet(model, copy_weights):
+     for name, module in model.named_modules():
+         # Replace the linear layers inside attention and MLP blocks with BitLinear
+         if isinstance(module, LlamaSdpaAttention) or isinstance(module, LlamaMLP):
+             for child_name, child_module in module.named_children():
+                 if isinstance(child_module, nn.Linear):
+                     bitlinear = BitLinear(child_module.in_features, child_module.out_features, child_module.bias is not None).to(device="cuda:0")
+                     if copy_weights:
+                         bitlinear.weight = child_module.weight
+                         if child_module.bias is not None:
+                             bitlinear.bias = child_module.bias
+                     setattr(module, child_name, bitlinear)
+         # Remove the now-redundant input_layernorms (BitLinear applies its own RMSNorm)
+         elif isinstance(module, LlamaDecoderLayer):
+             for child_name, child_module in module.named_children():
+                 if isinstance(child_module, LlamaRMSNorm) and child_name == "input_layernorm":
+                     setattr(module, child_name, nn.Identity().to(device="cuda:0"))
+
+ convert_to_bitnet(model, copy_weights=True)
+ model.to(device="cuda:0")
+
+ prompt = "Lovely works as a Senior Software Engineer at Axian Consulting. She has Master’s degree in Software Engineering. She is a full stack developer with 10 years of commercial experience working on web-based applications development, having wide knowledge on end"
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+ generate_ids = model.generate(inputs.input_ids, max_length=200)
+ print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
  ```
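+
+ For a rough comparison with the footprints quoted above, you can print the size of the converted model. Note that `BitLinear` as defined here keeps full-precision weights and only simulates the quantization during the forward pass, so this reports the unpacked size rather than a true 2-bit footprint:
+
+ ```python
+ # Reports the size of the stored (unpacked) weights, not a packed low-bit format
+ print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.2f} MB")
+ ```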

  # Limitations