Spaces (Runtime error)
rohitghosh1763 committed on
Update app.py

app.py CHANGED
@@ -1,64 +1,109 @@
+from unsloth import FastLanguageModel
+import torch
+max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+
+# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
+fourbit_models = [
+    "unsloth/mistral-7b-bnb-4bit",
+    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
+    "unsloth/llama-2-7b-bnb-4bit",
+    "unsloth/llama-2-13b-bnb-4bit",
+    "unsloth/codellama-34b-bnb-4bit",
+    "unsloth/tinyllama-bnb-4bit",
+    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
+    "unsloth/gemma-2b-bnb-4bit",
+] # More models at https://huggingface.co/unsloth
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/mistral-7b-v0.3", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+)
+
+
 import gradio as gr
-… (lines 2-59 of the previous app.py were removed; their content is not shown in the diff view)
+
+# Function to handle user query and return response
+def chatbot_response(user_query):
+
+    if True:
+        from unsloth import FastLanguageModel
+        from transformers import TextStreamer
+        model, tokenizer = FastLanguageModel.from_pretrained(
+            model_name = "/content/drive/MyDrive/Colab Notebooks/lora_model", # YOUR MODEL YOU USED FOR TRAINING
+            max_seq_length = max_seq_length,
+            dtype = dtype,
+            load_in_4bit = load_in_4bit,
+        )
+        FastLanguageModel.for_inference(model)
+
+
+    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+
+
+    inputs = tokenizer(
+    [
+        alpaca_prompt.format(
+            "Category,Instruction General Response,'Answer the user’s query thoroughly and accurately, ensuring no details or points are omitted. Always recognize that 'AEC' refers to 'Assam Engineering College,' and vice versa, and use this understanding to provide clear, context-aware responses.' Formatting,'Structure the output to be attractive, engaging, and professional, using proper formatting. Break the response into multiple paragraphs or sections if necessary to improve readability and organization.' Use of Lists,'For queries that involve enumerations, options, or multiple steps, use bullet points or numbered lists to present the information clearly and concisely. For example: - When listing departments or facilities. - When explaining procedures or step-by-step guides. - When summarizing key features or FAQs.' Tone,'Maintain a friendly, informative tone, and deliver complete, standard answers to meet the user's expectations", # instruction
+            user_query, # input
+            "", # output - leave this blank for generation!
+        )
+    ], return_tensors = "pt").to("cuda")
+
+
+    text_streamer = TextStreamer(tokenizer)
+
+    # Generate the response
+    response = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
+
+    # Decode the response
+    decoded_output = tokenizer.batch_decode(response, skip_special_tokens=False)[0]
+
+    # Extract the desired portion
+    # Find the start of the Response section
+    response_start = decoded_output.find("### Response:") + len("### Response:")
+    # Extract only the response part
+    final_response = decoded_output[response_start:].strip()
+
+
+    # Input query
+    print("User Query:", user_query) # Just for debugging, can be removed
+
+    # --- Your model inference logic goes here ---
+    # Example: Replace the following line with your model's response
+    model_response = f"{final_response}"
+    # ---------------------------------------------
+
+    # Output response
+    print("Model Response:", final_response) # Just for debugging, can be removed
+    return model_response
+
+# Gradio Interface
+interface = gr.Interface(
+    fn=chatbot_response, # Function for processing user input
+    inputs=gr.Textbox(
+        label="Enter your query:", # Label for the input box
+        placeholder="Type something...", # Placeholder text
+    ),
+    outputs=gr.Textbox(label="Response:"), # Output box for model response
+    title="Simple Chatbot",
+    description="This is a simple chatbot interface. Type your query and get a response.",
 )
 
-if __name__ == "__main__":
-    demo.launch()
+# Launch the Gradio app
+interface.launch()
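Two points about the committed file are worth noting. The base model loaded at the top (unsloth/mistral-7b-v0.3) is never used, because chatbot_response immediately loads the fine-tuned model again on every request, and that second load points at a Google Drive path (/content/drive/MyDrive/Colab Notebooks/lora_model) that only exists inside a Colab session; together with the .to("cuda") call, which requires GPU hardware on the Space, this is a plausible reason for the "Runtime error" status. The sketch below is one way the same flow could load the fine-tuned weights once at startup and reuse them per request. It keeps the Unsloth and Gradio calls as they appear in the commit; the Hub repo id and the shortened instruction string are hypothetical placeholders, not taken from the commit.

# Sketch, not the committed app.py: load the fine-tuned model once at startup
# and reuse it for every Gradio request.
# "your-username/lora_model" is a hypothetical Hub repo id (the commit points at a Colab Drive path).
from unsloth import FastLanguageModel
import gradio as gr

max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "your-username/lora_model",  # hypothetical: the trained LoRA pushed to the Hub
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's inference mode once

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def chatbot_response(user_query):
    # The model is already loaded; each request only formats a prompt and generates.
    prompt = alpaca_prompt.format(
        "Answer the user's query about Assam Engineering College (AEC).",  # shortened illustrative instruction
        user_query,
        "",  # response slot left empty for generation
    )
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=128)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    return decoded.split("### Response:")[-1].strip()

interface = gr.Interface(
    fn=chatbot_response,
    inputs=gr.Textbox(label="Enter your query:", placeholder="Type something..."),
    outputs=gr.Textbox(label="Response:"),
    title="Simple Chatbot",
)
interface.launch()

Loading once also means a single 4-bit copy of the weights sits on the GPU, instead of a fresh load on every chat turn.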
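The committed extraction step decodes the whole sequence with skip_special_tokens=False and then searches for the literal "### Response:" marker, so the text returned to the Gradio output box can still carry special tokens such as the end-of-sequence marker. Since generate on a causal LM returns the prompt tokens followed by the continuation, a shorter alternative (a sketch, assuming inputs, model, and tokenizer are set up as in the commit) is to decode only the newly generated tokens:

prompt_length = inputs["input_ids"].shape[1]   # number of tokens in the formatted prompt
output_ids = model.generate(**inputs, max_new_tokens=128)
new_tokens = output_ids[0][prompt_length:]     # generate() returns prompt + continuation
final_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

This drops the echoed prompt and the end-of-sequence marker in one step, without any string searching.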