"""This simple script shows how to interact with an OpenAI-compatible server from a client."""
import argparse

import modal
from openai import OpenAI


class Colors:
    """ANSI color codes"""

    GREEN = "\033[0;32m"
    RED = "\033[0;31m"
    BLUE = "\033[0;34m"
    GRAY = "\033[0;90m"
    BOLD = "\033[1m"
    END = "\033[0m"


def get_completion(client, model_id, messages, args):
    completion_args = {
        "model": model_id,
        "messages": messages,
        "frequency_penalty": args.frequency_penalty,
        "max_tokens": args.max_tokens,
        "n": args.n,
        "presence_penalty": args.presence_penalty,
        "seed": args.seed,
        "stop": args.stop,
        "stream": args.stream,
        "temperature": args.temperature,
        "top_p": args.top_p,
    }
    # drop unset parameters so the server's own defaults apply
    completion_args = {k: v for k, v in completion_args.items() if v is not None}

    # `top_k` is not part of the OpenAI chat API; vLLM-backed servers accept
    # it via `extra_body` (assumes the serving app runs vLLM, as the default
    # app name suggests)
    if args.top_k:
        completion_args["extra_body"] = {"top_k": args.top_k}

    try:
        return client.chat.completions.create(**completion_args)
    except Exception as e:
        print(Colors.RED, f"Error during API call: {e}", Colors.END, sep="")
        return None


def main():
    parser = argparse.ArgumentParser(description="OpenAI Client CLI")
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="The model to use for completion, defaults to the first available model",
    )
    parser.add_argument(
        "--workspace",
        type=str,
        default=None,
        help="The workspace where the LLM server app is hosted, defaults to your current Modal workspace",
    )
    parser.add_argument(
        "--environment",
        type=str,
        default=None,
        help="The environment in your Modal workspace where the LLM server app is hosted, defaults to your current environment",
    )
    parser.add_argument(
        "--app-name",
        type=str,
        default="example-vllm-openai-compatible",
        help="A Modal App serving an OpenAI-compatible API",
    )
    parser.add_argument(
        "--function-name",
        type=str,
        default="serve",
        help="A Modal Function serving an OpenAI-compatible API. Append `-dev` to use a `modal serve`d Function.",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default="super-secret-key",
        help="The API key to use for authentication, set in your api.py",
    )

    # Completion parameters
    parser.add_argument("--max-tokens", type=int, default=None)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top-p", type=float, default=0.9)
    parser.add_argument(
        "--top-k",
        type=int,
        default=0,
        help="Restrict sampling to the K most likely tokens (vLLM extension, sent via extra_body)",
    )
    parser.add_argument("--frequency-penalty", type=float, default=0)
    parser.add_argument("--presence-penalty", type=float, default=0)
    parser.add_argument(
        "--n",
        type=int,
        default=1,
        help="Number of completions to generate. Streaming and chat mode only support n=1.",
    )
    parser.add_argument("--stop", type=str, default=None)
    parser.add_argument("--seed", type=int, default=None)

    # Prompting
    parser.add_argument(
        "--prompt",
        type=str,
        default="Compose a limerick about baboons and raccoons.",
        help="The user prompt for the chat completion",
    )
    parser.add_argument(
        "--system-prompt",
        type=str,
        default="You are a poetic assistant, skilled in writing satirical doggerel with creative flair.",
        help="The system prompt for the chat completion",
    )

    # UI options
    parser.add_argument(
        "--no-stream",
        dest="stream",
        action="store_false",
        help="Disable streaming of response chunks",
    )
    parser.add_argument(
        "--chat", action="store_true", help="Enable interactive chat mode"
    )

    args = parser.parse_args()

    client = OpenAI(api_key=args.api_key)
    workspace = args.workspace or modal.config._profile
    environment = args.environment or modal.config.config["environment"]
    prefix = workspace + (f"-{environment}" if environment else "")
    client.base_url = (
        f"https://{prefix}--{args.app_name}-{args.function_name}.modal.run/v1"
    )
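    # e.g. https://my-workspace--example-vllm-openai-compatible-serve.modal.run/v1
    # ("my-workspace" is a placeholder; the prefix is derived from your Modal
    # profile and environment above)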

    if args.model:
        model_id = args.model
        print(
            Colors.BOLD,
            f"🧠: Using model {model_id}. This may trigger a model load on first call!",
            Colors.END,
            sep="",
        )
    else:
        print(
            Colors.BOLD,
            f"🔎: Looking up available models on server at {client.base_url}. This may trigger a model load!",
            Colors.END,
            sep="",
        )
        model = client.models.list().data[0]
        model_id = model.id
        print(
            Colors.BOLD,
            f"🧠: Using {model_id}",
            Colors.END,
            sep="",
        )

    messages = [
        {
            "role": "system",
            "content": args.system_prompt,
        }
    ]
    print(Colors.BOLD + "🧠: Using system prompt: " + args.system_prompt + Colors.END)
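
    # `messages` accumulates the conversation as {"role": ..., "content": ...}
    # dicts: the system prompt first, then alternating user/assistant turns
    # appended below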
    if args.chat:
        print(
            Colors.GREEN
            + Colors.BOLD
            + "\nEntering chat mode. Type 'bye' to end the conversation."
            + Colors.END
        )
        while True:
            user_input = input("\nYou: ")
            if user_input.lower() in ["bye"]:
                break

            # keep the system prompt plus the most recent turns so the
            # context sent to the server stays bounded
            MAX_HISTORY = 10
            if len(messages) > MAX_HISTORY:
                messages = messages[:1] + messages[-MAX_HISTORY + 1 :]

            messages.append({"role": "user", "content": user_input})

            response = get_completion(client, model_id, messages, args)
            if response:
                if args.stream:
                    # chunks arrive incrementally; streaming assumes n=1
                    print(Colors.BLUE + "\n🤖: ", end="")
                    assistant_message = ""
                    for chunk in response:
                        if chunk.choices[0].delta.content:
                            content = chunk.choices[0].delta.content
                            print(content, end="")
                            assistant_message += content
                    print(Colors.END)
                else:
                    assistant_message = response.choices[0].message.content
                    print(Colors.BLUE + "\n🤖: " + assistant_message + Colors.END)
                messages.append({"role": "assistant", "content": assistant_message})
    else:
        messages.append({"role": "user", "content": args.prompt})
        print(Colors.GREEN + f"\nYou: {args.prompt}" + Colors.END)
        response = get_completion(client, model_id, messages, args)
        if response:
            if args.stream:
                print(Colors.BLUE + "\n🤖: ", end="")
                for chunk in response:
                    if chunk.choices[0].delta.content:
                        print(chunk.choices[0].delta.content, end="")
                print(Colors.END)
            else:
                # the only case where multiple completions can be returned
                for i, choice in enumerate(response.choices):
                    print(
                        Colors.BLUE
                        + f"\n🤖 Choice {i + 1}: {choice.message.content}"
                        + Colors.END
                    )


if __name__ == "__main__":
    main()