import gradio as gr
import psutil
import subprocess
import time
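# Minimal Gradio front-end that shells out to a local llama.cpp `main` binary
# and streams its stdout back to the UI, splitting on whitespace to approximate tokens.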
def generate_response(user_message):
    """Stream a llama.cpp completion token by token, with timing info appended."""
    cmd = [
        "/app/llama.cpp/main",  # Path to the llama.cpp executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",  # Maximum number of tokens to generate
        "-e",  # Process escape sequences in the prompt
    ]
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)
    process_monitor = psutil.Process(process.pid)
    start_time = time.time()
    monitor_start_time = time.time()
    alltokens = ""
    token_buffer = ""
    tokencount = 0
    try:
        while True:
            # Read one character at a time so output streams as it arrives
            char = process.stdout.read(1)
            if char == '' and process.poll() is not None:
                break
            if char != '':
                token_buffer += char
                if char == ' ' or char == '\n':  # Treat whitespace as token delimiters
                    elapsed_time = time.time() - start_time  # Calculate elapsed time
                    alltokens += token_buffer
                    tokencount += 1
                    yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds | Tokens: {tokencount}]"
                    token_buffer = ''  # Reset the token buffer
            # Log subprocess resource usage once a minute
            if time.time() - monitor_start_time > 60:
                cpu_usage = process_monitor.cpu_percent()
                memory_usage = process_monitor.memory_info().rss  # in bytes
                print(f"Subprocess CPU Usage: {cpu_usage}%, Memory Usage: {memory_usage / 1024 ** 2:.1f} MB")
                monitor_start_time = time.time()  # Reset the timer
        # Flush any remaining partial token and report the overall rate
        if token_buffer:
            elapsed_time = time.time() - start_time
            alltokens += token_buffer
            tokencount += 1
            yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds | Average tokens per second: {tokencount / elapsed_time:.2f}]"
    finally:
        try:
            # Wait for the process to complete, with a timeout
            process.wait(timeout=60)  # Timeout in seconds
        except subprocess.TimeoutExpired:
            print("Process didn't complete within the timeout. Killing it.")
            process.kill()
            process.wait()  # Ensure proper cleanup
        # Check for errors before closing the pipes (reading a closed pipe raises ValueError)
        if process.returncode != 0:
            error_message = process.stderr.read()
            print(f"Error: {error_message}")
        process.stdout.close()
        process.stderr.close()
# Thin wrappers that prepend a fixed prompt template before delegating to generate_response
def custom_generate_response(cust_user_message):
    cust_user_message = CustomPrompts[0] + '\n\n' + cust_user_message + '\n\n'
    yield from generate_response(cust_user_message)

def custom_generate_response1(cust_user_message):
    cust_user_message = CustomPrompts[1] + '\n\n' + cust_user_message + '\n\n'
    yield from generate_response(cust_user_message)

def custom_generate_response2(cust_user_message):
    cust_user_message = CustomPrompts[2] + '\n\n' + cust_user_message + '\n\n'
    yield from generate_response(cust_user_message)
CustomPrompts = [
    "Write a Class Diagram based on the following text:",
    "Write Pydot code based on the following text:",
    "Describe in great detail how a standard happy scene in any movie would be planned, based on the following text:",
]
with gr.Blocks() as iface:
    gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
        outputs="text",
        title="Stable LM 2 Zephyr (1.6b) llama.cpp Interface Test",
        description="No prompt template used yet (essentially autocomplete), and no message history for now - enter your message and get a response.",
        flagging_dir="/usr/src/app/flagged",
    )

    # gr.Interface(fn=generate_response_token_by_token, inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'), outputs="text", description="More responsive streaming test")

    with gr.Group():
        gr.HTML("Test for wrapping the generator (instead of buttons, tabs, and dropdowns?)")
        MainOutput = gr.TextArea(placeholder='Output will show here')
        CustomButtonInput = gr.TextArea(lines=1, placeholder='Prompt goes here')
        CustomButtonClassDiagram = gr.Button(CustomPrompts[0])
        CustomButtonPydotcode = gr.Button(CustomPrompts[1])
        CustomButtonHappyMovieScene = gr.Button(CustomPrompts[2])
        CustomButtonClassDiagram.click(custom_generate_response, inputs=[CustomButtonInput], outputs=MainOutput)
        CustomButtonPydotcode.click(custom_generate_response1, inputs=[CustomButtonInput], outputs=MainOutput)
        CustomButtonHappyMovieScene.click(custom_generate_response2, inputs=[CustomButtonInput], outputs=MainOutput)

iface.queue().launch(server_name="0.0.0.0", share=True)
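# To run locally (assumed entry-point name; the binary/model paths above come from the container image):
#   python app.py
# Gradio serves on port 7860 by default; share=True also creates a temporary public gradio.live link.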