Expand output after deploying the model on SageMaker

by gdcoder

I managed to deploy the model on an ml.g5.12xlarge instance following the instructions, and I get a response back from it, but the response is not complete.
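For reference, the deployment roughly followed the Hugging Face LLM (TGI) container pattern. Below is a minimal sketch of that step; the GPU count and startup timeout here are assumptions, not the exact values used.

import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

role = sagemaker.get_execution_role()

# Deploy the model with the Hugging Face LLM (TGI) container
llm_model = HuggingFaceModel(
    role=role,
    image_uri=get_huggingface_llm_image_uri("huggingface"),
    env={
        "HF_MODEL_ID": "OpenAssistant/falcon-40b-sft-mix-1226",
        "SM_NUM_GPUS": "4",  # assumption: ml.g5.12xlarge has 4 A10G GPUs
    },
)
predictor = llm_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",
    container_startup_health_check_timeout=900,  # assumption: a 40B model needs a long startup window
)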

import json

import boto3
from transformers import AutoTokenizer

model = "OpenAssistant/falcon-40b-sft-mix-1226"
tokenizer = AutoTokenizer.from_pretrained(model)

# Name of the deployed SageMaker endpoint
ENDPOINT_NAME = "huggingface-pytorch-tgi-inference-2023-06-14-22-44-39-458"
runtime = boto3.client("runtime.sagemaker")

prompt = "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>"
input_data = {
    "inputs": prompt,
    "parameters": {
        "do_sample": True,
        "temperature": 0.1,
        "include_prompt_in_result": False,
        "top_k": 10,
        "num_return_sequences": 10,
        "max_length": 10,
        # "eos_token_id": tokenizer.eos_token_id,
        "return_full_text": False,
    },
}

response = runtime.invoke_endpoint(
    EndpointName=ENDPOINT_NAME,
    ContentType="application/json",
    Body=json.dumps(input_data).encode("utf-8"),
)
response_json = json.loads(response["Body"].read().decode("utf-8"))
response_json

By "not complete" do you mean that it cuts off early? If so it's likely because of the "max_length": 10 parameter you pass. That limits the generation to 10 tokens, which is really not a lot. If you want a somewhat detailed answer you should set it to at least 300. Though keep in mind that it is a max length, not an enforced length, so the answer can be shorter than this length.
