import json
import subprocess
from collections.abc import Iterator
from datetime import datetime
from pathlib import Path

import gradio as gr
import requests
from huggingface_hub import hf_hub_download

from themes.research_monochrome import ResearchMonochrome

today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002

SYS_PROMPT = f"""Today's Date: {today_date}.
You are Granite, developed by IBM. You are a helpful AI assistant."""

TITLE = "IBM Granite 4 Micro served from a local GGUF server"

DESCRIPTION = """
<p>Granite 4 Micro is an open-source LLM supporting a 1M-token context window. This demo is limited to a
2K context window and at most 1K output tokens.
<span class="gr_docs_link">
<a href="https://www.ibm.com/granite/docs/">View Documentation <i class="fa fa-external-link"></i></a>
</span>
</p>
"""
LLAMA_CPP_SERVER = "http://127.0.0.1:8081"
MAX_NEW_TOKENS = 1024
TEMPERATURE = 0.7
TOP_P = 0.85
TOP_K = 50
REPETITION_PENALTY = 1.05

# Determine platform: CUDA if nvidia-smi succeeds, otherwise CPU.
try:
    subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
    platform = "CUDA"
except (subprocess.CalledProcessError, FileNotFoundError):
    platform = "CPU"

print(f"Detected platform {platform}")

# Download the quantized GGUF model weights from the Hugging Face Hub.
gguf_name = "granite-4.0-h-micro-UD-Q2_K_XL.gguf"
gguf_path = hf_hub_download(
    repo_id="unsloth/granite-4.0-h-micro-GGUF",
    filename=gguf_name,
    local_dir=".",
)

# Select the llama-server executable for the detected platform and download it.
exe_name = "llama-server-t3-6266-cuda" if platform == "CUDA" else "llama-server-t3-6268-blas"
exe_path = hf_hub_download(
    repo_id="TobDeBer/Skipper",
    filename=exe_name,
    local_dir=".",
)

# Start llama-server on the downloaded model with a 2K context on port 8081.
subprocess.run(["chmod", "+x", exe_name], check=True)
command = ["./" + exe_name, "-m", gguf_name, "-c", "2048", "--port", "8081"]
process = subprocess.Popen(command)
print(f"Llama-server process started with PID {process.pid}")

custom_theme = ResearchMonochrome()
print("Theme type:", type(custom_theme))


def generate(
    message: str,
    chat_history: list[dict],
    temperature: float = TEMPERATURE,
    repetition_penalty: float = REPETITION_PENALTY,
    top_p: float = TOP_P,
    top_k: int = TOP_K,
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> Iterator[str]:
    """Generate a streamed chat completion from the llama.cpp server."""
    # Build the conversation: system prompt, prior turns, then the new user message.
    conversation = []
    conversation.append({"role": "system", "content": SYS_PROMPT})
    conversation += chat_history
    conversation.append({"role": "user", "content": message})

    # Prepare the plain-text prompt for the llama.cpp server.
    prompt = ""
    for item in conversation:
        if item["role"] == "system":
            prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n"
        elif item["role"] == "user":
            prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n"
        elif item["role"] == "assistant":
            prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n"
    prompt += "<|model|>\n"  # Add the opening token for the assistant's reply

    # Construct the request payload.
    payload = {
        "prompt": prompt,
        "stream": True,  # Enable streaming
        "max_tokens": max_new_tokens,
        "temperature": temperature,
        "repeat_penalty": repetition_penalty,
        "top_p": top_p,
        "top_k": top_k,
        "stop": ["<|file_separator|>"],  # Stop generating once this token appears
    }
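
    # Note: with "stream": True, llama.cpp's /completion endpoint emits server-sent events; each non-empty
    # line arrives as 'data: {...}', and the JSON chunk carries the newly generated text in its "content" field.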
    try:
        # Make the streaming request to the llama.cpp server.
        with requests.post(
            f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=(30, 300)
        ) as response:
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

            # Stream the response from the server, yielding the accumulated text so far.
            outputs = []
            for line in response.iter_lines():
                if not line:
                    continue
                decoded_line = line.decode("utf-8")
                # Remove the SSE 'data: ' prefix if present.
                if decoded_line.startswith("data: "):
                    decoded_line = decoded_line[6:]
                # Handle potential JSON decoding errors.
                try:
                    json_data = json.loads(decoded_line)
                    text = json_data.get("content", "")  # The newly generated text chunk
                    if text:
                        outputs.append(text)
                        yield "".join(outputs)
                except json.JSONDecodeError:
                    # Skip lines that are not valid JSON.
                    print(f"JSONDecodeError: {decoded_line}")
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        yield f"Error: {e}"  # Surface the error to the user
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        yield f"Error: {e}"

css_file_path = Path(__file__).parent / "app.css"

# Advanced settings (displayed in an accordion)
temperature_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"]
)
top_p_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"]
)
top_k_slider = gr.Slider(
    minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"]
)
repetition_penalty_slider = gr.Slider(
    minimum=0,
    maximum=2.0,
    value=REPETITION_PENALTY,
    step=0.05,
    label="Repetition Penalty",
    elem_classes=["gr_accordion_element"],
)
max_new_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2000,
    value=MAX_NEW_TOKENS,
    step=1,
    label="Max New Tokens",
    elem_classes=["gr_accordion_element"],
)

chat_interface_accordion = gr.Accordion(label="Advanced Settings", open=False)

with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=custom_theme, title=TITLE) as demo:
    gr.HTML(f"<h2>{TITLE}</h2>", elem_classes=["gr_title"])
    gr.HTML(DESCRIPTION)
    chat_interface = gr.ChatInterface(
        fn=generate,
        examples=[
            ["Explain the concept of quantum computing to someone with no background in physics or computer science."],
            ["What is OpenShift?"],
            ["What's the importance of low latency inference?"],
            ["Help me boost productivity habits."],
        ],
        example_labels=[
            "Explain quantum computing",
            "What is OpenShift?",
            "Importance of low latency inference",
            "Boosting productivity habits",
        ],
        cache_examples=False,
        type="messages",
        additional_inputs=[
            temperature_slider,
            repetition_penalty_slider,
            top_p_slider,
            top_k_slider,
            max_new_tokens_slider,
        ],
        additional_inputs_accordion=chat_interface_accordion,
    )

if __name__ == "__main__":
    demo.queue().launch()