Encountering KeyError: 'mistral' on GPUs for Inference
#148
opened by cratnoc
I'm trying to run inference with the mistral-7b-instruct-v0.2 model using Ray Serve and FastAPI. My serve script is below, and I'm getting the following error.
_call_func_or_gen
    result = callable(*args, **kwargs)
  File "/serve_app/ray_serve_mistral.py", line 48, in __init__
    self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
  File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 434, in from_pretrained
    config, kwargs = AutoConfig.from_pretrained(
  File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 829, in from_pretrained
    config_class = CONFIG_MAPPING[config_dict["model_type"]]
  File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 536, in __getitem__
    raise KeyError(key)
KeyError: 'mistral'
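The failing lookup is CONFIG_MAPPING[config_dict["model_type"]], so the error means that whichever transformers the Ray replica actually imports has no 'mistral' entry registered. To confirm which installation and version the worker picks up at runtime, something like this can be logged from inside the replica (a minimal diagnostic sketch, not part of the serve script):

```python
# Diagnostic sketch: report which transformers installation this process imports
# and whether it knows about the "mistral" model type.
import transformers
from transformers import CONFIG_MAPPING  # model_type -> config class mapping

print("transformers version:", transformers.__version__)
print("transformers location:", transformers.__file__)
print("'mistral' registered:", "mistral" in CONFIG_MAPPING)
```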
I'm installing the pip dependencies as RUN pip install -U --no-cache-dir requests torch transformers accelerate uvicorn fastapi.
The versions are as below
Python Dependencies:
- `transformers` version: 4.35.0
- Platform: Linux-5.10.219-208.866.amzn2.x86_64-x86_64-with-glibc2.31
- Python version: 3.10.13
- Huggingface_hub version: 0.17.3
- Safetensors version: 0.4.1
- Accelerate version: 0.20.3
Serving Script:
import os
import time
from typing import List

import ray
from ray import serve
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch

app = FastAPI()

MODEL_NAME = os.getenv("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.2")
HF_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")


# Deployment settings for the API ingress using Ray Serve
@serve.deployment(name="mistral-deployment-2", num_replicas=1, route_prefix="/mistral")
@serve.ingress(app)
class APIIngress:
    # Constructor to initialize the API with a model handle
    def __init__(self, mistral_model_handle) -> None:
        self.handle = mistral_model_handle

    # Define a GET endpoint for inference
    @app.get("/infer")
    async def infer(self, request: str):
        # Asynchronously perform inference on the provided sentence and return the result
        result = await self.handle.infer.remote(request)
        return result


@serve.deployment(
    name="mistral-7b",
    ray_actor_options={"num_gpus": 1},
    autoscaling_config={
        "min_replicas": 1,
        "max_replicas": 5,
        "target_num_ongoing_requests_per_replica": 10,
    },
)
class MistralModel:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        login(token=HF_TOKEN)
        # device_map="auto" lets accelerate place the weights, so no explicit .to() is needed
        self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def infer(self, request: str):
        # Wrap the prompt in Mistral's instruction tags
        text = "[INST]" + request + "[/INST]"
        inputs = self.tokenizer.encode(text, return_tensors="pt").to(self.device)
        with torch.inference_mode():
            generated_sequence = self.model.generate(inputs, max_new_tokens=512)
        return [self.tokenizer.decode(seq) for seq in generated_sequence]


# Bind the model to the API ingress to enable endpoint functionality
entrypoint = APIIngress.bind(MistralModel.bind())
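For reference, once the model deployment initializes, the endpoint would be exercised like this (a client sketch; it assumes the app is reachable on Ray Serve's default HTTP port 8000, e.g. after `serve run ray_serve_mistral:entrypoint`, with the module name taken from the path in the traceback):

```python
# Client sketch for the GET /mistral/infer endpoint defined above.
import requests

resp = requests.get(
    "http://127.0.0.1:8000/mistral/infer",
    params={"request": "What is Ray Serve?"},
)
print(resp.status_code, resp.json())
```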
> `transformers` version: 4.35.0

The latest version of transformers is now 4.43.1. Can you try upgrading to a later version?
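For example, changing the install line to `RUN pip install -U --no-cache-dir requests torch "transformers>=4.43.1" accelerate uvicorn fastapi` (the bound is just the release mentioned above; any Mistral-aware version should work) and rebuilding the image should make the diagnostic sketch earlier in this thread report 'mistral' as registered.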