Response
#20 by lalit34 - opened

I am not able to get this to work. All I am getting is empty responses. I have tried playing with the parameters, but still no luck. Any advice on what to modify? Here is my code:

import os
from dataclasses import dataclass, asdict

from flask import Flask, Request, Response, request
from ctransformers import AutoModelForCausalLM

app = Flask(__name__)

@dataclass
class GenerationConfig:
    temperature: float
    top_k: int
    top_p: float
    repetition_penalty: float
    max_new_tokens: int
    seed: int
    reset: bool
    stream: bool
    threads: int
    stop: list

def format_prompt(system_prompt: str, user_prompt: str):
    # ChatML-style prompt format used by mpt-30b-chat
    system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    user_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
    assistant_prompt = "<|im_start|>assistant\n"
    return f"{system_prompt}{user_prompt}{assistant_prompt}"

def generate(
    llm: AutoModelForCausalLM,
    generation_config: GenerationConfig,
    system_prompt: str,
    user_input: str,
):
    model_output = llm(
        format_prompt(system_prompt, user_input.strip()),
        **asdict(generation_config),
    )
    print("Model output:", model_output)
    return model_output

@app.route('/generate', methods=['GET', 'POST'])
def generate_response_endpoint():
    if request.method == 'GET':
        user_input = request.args.get('user_input', '')  # input from query parameter
    else:  # POST: raw request body as text
        user_input = request.data.decode('utf-8')

    # Note: the model is reloaded on every request; loading it once at startup
    # would be much faster.
    print("Loading model...")
    llm = AutoModelForCausalLM.from_pretrained(
        "/home/azureuser/mpt-30B-inference/models/mpt-30b-chat.ggmlv0.q4_1.bin",
        model_type="mpt",
    )
    print("Model loaded")

    system_prompt = "Reply."

    generation_config = GenerationConfig(
        temperature=0.2,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1.0,
        max_new_tokens=512,
        seed=42,
        reset=False,
        stream=False,
        threads=int(os.cpu_count() / 2),  # adjust for your CPU
        stop=["<|im_end|>", "|<"],
    )

    response = generate(llm, generation_config, system_prompt, user_input.strip())
    print(response)
    return Response(response, content_type='text/plain; charset=utf-8')


if __name__ == "__main__":
    app.run(host='0.0.0.0', port=3002)
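For debugging, a minimal sketch that exercises the model outside Flask may help isolate whether the empty responses come from the model and prompt or from the endpoint plumbing. This assumes the same ctransformers model and path as above; the prompt and generation parameters here are illustrative:

from ctransformers import AutoModelForCausalLM

# Hypothetical standalone check: same model file as above, no Flask involved.
llm = AutoModelForCausalLM.from_pretrained(
    "/home/azureuser/mpt-30B-inference/models/mpt-30b-chat.ggmlv0.q4_1.bin",
    model_type="mpt",
)
prompt = format_prompt("Reply.", "Hello!")  # reuse the helper above
output = llm(prompt, max_new_tokens=64, temperature=0.2, stop=["<|im_end|>"])
print(repr(output))  # repr() makes a genuinely empty string visible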
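And once the server is running, a quick smoke test against the endpoint (hypothetical, using the requests library; the port and route come from the code above):

import requests

# POST raw text, matching request.data.decode('utf-8') in the endpoint
r = requests.post("http://localhost:3002/generate", data="Hello, how are you?")
print(repr(r.text))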
