philschmid HF staff committed on
Commit
4d500b9
1 Parent(s): a9d9c29

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +3 -3
handler.py CHANGED
@@ -2,15 +2,15 @@ import torch
2
  from typing import Dict, List, Any
3
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
 
5
# Pick the pipeline device index: first CUDA GPU when one exists, CPU otherwise
# (transformers pipelines use -1 to mean "run on CPU").
if torch.cuda.is_available():
    device = 0
else:
    device = -1
 
8
 
9
class EndpointHandler:
    """Inference-endpoint entry point wrapping a text-generation pipeline."""

    def __init__(self, path=""):
        # Load tokenizer and model weights from the checkpoint at `path`.
        tok = AutoTokenizer.from_pretrained(path)
        lm = AutoModelForCausalLM.from_pretrained(path, low_cpu_mem_usage=True)
        # Build the text-generation pipeline on the module-selected `device`
        # (defined at module level: 0 for the first GPU, -1 for CPU).
        self.pipeline = pipeline(
            "text-generation", model=lm, tokenizer=tok, device=device
        )
 
 
2
  from typing import Dict, List, Any
3
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
 
5
# Choose the half-precision compute dtype: bfloat16 on GPUs with compute
# capability >= 8 (Ampere and newer, which support bf16), float16 otherwise.
# Guarded with is_available() because torch.cuda.get_device_capability()
# raises a RuntimeError on a CPU-only host — the unguarded original crashed
# there at import time.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16
7
 
8
 
9
class EndpointHandler:
    """Inference-endpoint entry point wrapping a text-generation pipeline."""

    def __init__(self, path=""):
        # Load the tokenizer and the model from the checkpoint at `path`.
        tokenizer = AutoTokenizer.from_pretrained(path)
        # device_map="auto" lets accelerate place the weights on the available
        # device(s); `dtype` is the module-level half-precision choice.
        model = AutoModelForCausalLM.from_pretrained(
            path, device_map="auto", torch_dtype=dtype
        )
        # Create the inference pipeline.  Do NOT pass `device=` here: this
        # commit deleted the module-level `device` variable, so the original
        # `device=device` raised a NameError — and a model already placed via
        # device_map="auto" must not be handed a `device` argument anyway.
        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)