BucketOfFish
/

simplified_phi2

Text Generation

feature-extraction

Model card Files Files and versions Community

BucketOfFish committed on Jan 6

Commit

c07c430

•

1 Parent(s): 455129a

Passing KV cache through iterations

Files changed (2) hide show

phi2_model.py +3 -2
streaming_inference.py +2 -2

phi2_model.py CHANGED Viewed

@@ -35,10 +35,11 @@ class Phi2PreTrainedModel(PreTrainedModel):
     def prepare_inputs_for_generation(
         self,
         input_ids: torch.LongTensor,  # dim: (batch_size, seq_len)
-        kv_cache: KVCache | None = None,
         key_padding_mask: torch.LongTensor | torch.BoolTensor | None = None,
         **kwargs,  # has to be here
     ) -> dict[str, Any]:
         if not kv_cache:
             kv_cache = KVCache(
                 max_seqlen=self.config.initial_cos_sin_cache_len,
@@ -160,4 +161,4 @@ class Phi2ModelForCausalLM(Phi2PreTrainedModel):
             if labels is not None
             else None
         )
-        return CausalLMOutputWithPast(loss=loss, logits=logits)

     def prepare_inputs_for_generation(
         self,
         input_ids: torch.LongTensor,  # dim: (batch_size, seq_len)
+        past_key_values: KVCache | None = None,  # has to be named this
         key_padding_mask: torch.LongTensor | torch.BoolTensor | None = None,
         **kwargs,  # has to be here
     ) -> dict[str, Any]:
+        kv_cache = past_key_values
         if not kv_cache:
             kv_cache = KVCache(
                 max_seqlen=self.config.initial_cos_sin_cache_len,
             if labels is not None
             else None
         )
+        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=kv_cache)

streaming_inference.py CHANGED Viewed

@@ -43,11 +43,11 @@ if __name__ == "__main__":
     thread = Thread(
         target=model.generate,
         kwargs=dict(
-            tokenizer(  # returns a torch dictionary
                 "Here is an essay on sea monkeys: ",
                 return_tensors="pt",
                 return_attention_mask=False,
-                ).to(device),
             streamer=token_streamer,
             max_new_tokens=500,
             eos_token_id=tokenizer.eos_token_id,

     thread = Thread(
         target=model.generate,
         kwargs=dict(
+            inputs=tokenizer(  # returns a torch dictionary
                 "Here is an essay on sea monkeys: ",
                 return_tensors="pt",
                 return_attention_mask=False,
+            ).to(device),
             streamer=token_streamer,
             max_new_tokens=500,
             eos_token_id=tokenizer.eos_token_id,