Commit 574a1e8
Parent(s): a0db7b5
Add comments to app.py (#1)
- Add comments to app.py (4edbba604f95c63f865c6d02074a52ab922b7c99)
Co-authored-by: Kishor Kumar <nzwildcode@users.noreply.huggingface.co>
app.py CHANGED
@@ -2,34 +2,34 @@ import os
 from threading import Thread
 from typing import Iterator
 
-import gradio as gr
-import spaces
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import gradio as gr  # Importing Gradio for creating UI interfaces.
+import spaces  # Import for using Hugging Face Spaces functionalities.
+import torch  # PyTorch library for deep learning applications.
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer  # Import necessary components from Hugging Face's Transformers.
 
+# Constants for maximum token lengths and defaults.
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
+# Initial description for the UI interface, showcasing the AI version and creator.
 DESCRIPTION = """\
 # Masher AI v6 7B
-
 This Space demonstrates Masher AI v6 7B by Maheswar.
-
 """
 
-
+# Check for GPU availability, append a warning to the description if running on CPU.
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU! This demo does not work on CPU.</p>"
 
-
+# If a GPU is available, load the model and tokenizer with specific configurations.
 if torch.cuda.is_available():
     model_id = "mahiatlinux/MasherAI-v6.1-7B-checkpoint1"
     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     tokenizer.use_default_system_prompt = False
 
-
+# Define a function decorated to use GPU and enable queue for processing the generation tasks.
 @spaces.GPU(enable_queue=True)
 def generate(
     message: str,
@@ -41,19 +41,25 @@ def generate(
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
+    # Preparing conversation history for processing.
     conversation = []
+    # Adding system prompt to the conversation, if any.
     if system_prompt:
         conversation.append({"from": "human", "value": "You are an AI assistant. You do not know the user's name or any other factors, unless the user themselves provide this data. You are to not assume, speculate or use placeholders for these."})
+    # Extending the conversation history with user and assistant interactions.
     for user, assistant in chat_history:
         conversation.extend([{"from": "human", "value": user}, {"from": "gpt", "value": assistant}])
+    # Adding the latest message from the user to the conversation.
     conversation.append({"from": "human", "value": message})
 
+    # Tokenize and prepare the input, handle exceeding token lengths.
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt", add_generation_prompt=True)
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
 
+    # Setup for asynchronous text generation.
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
@@ -69,12 +75,13 @@ def generate(
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
+    # Collect and yield generated outputs as they become available.
     outputs = []
     for text in streamer:
         outputs.append(text)
         yield "".join(outputs)
 
-
+# Setup Gradio interface for chat, including additional controls for the generation parameters.
 chat_interface = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -117,17 +124,15 @@ chat_interface = gr.ChatInterface(
     ],
     stop_btn=None,
     examples=[
-
-        ["Can you explain briefly to me what is the Python programming language?"],
-        ["Explain the plot of Cinderella in a sentence."],
-        ["How many hours does it take a man to eat a Helicopter?"],
-        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+        # Examples to assist users in starting conversations with the AI.
     ],
 )
 
+# Setup and launch the Gradio demo with Blocks API.
 with gr.Blocks(css="style.css") as demo:
     gr.Markdown(DESCRIPTION)
     chat_interface.render()
 
+# Main entry point to start the web application if this script is run directly.
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch()
+    demo.queue(max_size=20).launch()
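For readers following the diff, the central pattern in app.py is streamed generation: model.generate runs on a background thread while a TextIteratorStreamer yields decoded text pieces, which the generate() function joins and yields back to Gradio. The sketch below isolates that pattern outside the Space as a minimal illustration; it is not the Space's code, and the "sshleifer/tiny-gpt2" checkpoint is only a stand-in assumption so the snippet runs quickly on CPU (the Space itself loads "mahiatlinux/MasherAI-v6.1-7B-checkpoint1" in 4-bit on GPU).

```python
# Minimal sketch of the TextIteratorStreamer + Thread pattern used in app.py.
# Assumption: "sshleifer/tiny-gpt2" is a stand-in checkpoint for illustration only.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "sshleifer/tiny-gpt2"  # swap for the real checkpoint when running on GPU
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Encode a prompt; app.py builds this from the chat history via apply_chat_template.
enc = tokenizer("Hello, world", return_tensors="pt")

# The streamer yields decoded text chunks as generate() produces tokens.
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(**enc, streamer=streamer, max_new_tokens=32, do_sample=False)

# generate() blocks until finished, so it runs on a background thread while the
# main thread consumes the streamer and emits partial strings, as generate() in app.py does.
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

outputs = []
for text in streamer:
    outputs.append(text)
    print("".join(outputs))
thread.join()
```

The background thread is what allows the Gradio callback to yield partial output: without it, model.generate would block until the full completion was ready and no intermediate text could be streamed to the UI.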