rohitghosh1763 committed on
Commit c360f03 · verified · 1 Parent(s): b50df89

Update app.py

Files changed (1)
  1. app.py +105 -60
app.py CHANGED
@@ -1,64 +1,109 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
  )


- if __name__ == "__main__":
-     demo.launch()
+ from unsloth import FastLanguageModel
+ import torch
+ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+ dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+ load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+
+ # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
+ fourbit_models = [
+     "unsloth/mistral-7b-bnb-4bit",
+     "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
+     "unsloth/llama-2-7b-bnb-4bit",
+     "unsloth/llama-2-13b-bnb-4bit",
+     "unsloth/codellama-34b-bnb-4bit",
+     "unsloth/tinyllama-bnb-4bit",
+     "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
+     "unsloth/gemma-2b-bnb-4bit",
+ ] # More models at https://huggingface.co/unsloth
+
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name = "unsloth/mistral-7b-v0.3", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
+     max_seq_length = max_seq_length,
+     dtype = dtype,
+     load_in_4bit = load_in_4bit,
+     # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+ )
+
+
  import gradio as gr
+
+ # Function to handle user query and return response
+ def chatbot_response(user_query):
+
+     if True:
+         from unsloth import FastLanguageModel
+         from transformers import TextStreamer
+         model, tokenizer = FastLanguageModel.from_pretrained(
+             model_name = "/content/drive/MyDrive/Colab Notebooks/lora_model", # YOUR MODEL YOU USED FOR TRAINING
+             max_seq_length = max_seq_length,
+             dtype = dtype,
+             load_in_4bit = load_in_4bit,
+         )
+         FastLanguageModel.for_inference(model)
+
+
+     alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ ### Instruction:
+ {}
+
+ ### Input:
+ {}
+
+ ### Response:
+ {}"""
+
+
+
+     inputs = tokenizer(
+         [
+             alpaca_prompt.format(
+                 "Category,Instruction General Response,'Answer the user’s query thoroughly and accurately, ensuring no details or points are omitted. Always recognize that 'AEC' refers to 'Assam Engineering College,' and vice versa, and use this understanding to provide clear, context-aware responses.' Formatting,'Structure the output to be attractive, engaging, and professional, using proper formatting. Break the response into multiple paragraphs or sections if necessary to improve readability and organization.' Use of Lists,'For queries that involve enumerations, options, or multiple steps, use bullet points or numbered lists to present the information clearly and concisely. For example: - When listing departments or facilities. - When explaining procedures or step-by-step guides. - When summarizing key features or FAQs.' Tone,'Maintain a friendly, informative tone, and deliver complete, standard answers to meet the user's expectations", # instruction
+                 user_query, # input
+                 "", # output - leave this blank for generation!
+             )
+         ], return_tensors = "pt").to("cuda")
+
+
+     text_streamer = TextStreamer(tokenizer)
+
+     # Generate the response
+     response = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
+
+     # Decode the response
+     decoded_output = tokenizer.batch_decode(response, skip_special_tokens=False)[0]
+
+     # Extract the desired portion
+     # Find the start of the Response section
+     response_start = decoded_output.find("### Response:") + len("### Response:")
+     # Extract only the response part
+     final_response = decoded_output[response_start:].strip()
+
+
+     # Input query
+     print("User Query:", user_query) # Just for debugging, can be removed
+
+     # --- Your model inference logic goes here ---
+     # Example: Replace the following line with your model's response
+     model_response = f"{final_response}"
+     # ---------------------------------------------
+
+     # Output response
+     print("Model Response:", final_response) # Just for debugging, can be removed
+     return model_response
+
+ # Gradio Interface
+ interface = gr.Interface(
+     fn=chatbot_response, # Function for processing user input
+     inputs=gr.Textbox(
+         label="Enter your query:", # Label for the input box
+         placeholder="Type something...", # Placeholder text
+     ),
+     outputs=gr.Textbox(label="Response:"), # Output box for model response
+     title="Simple Chatbot",
+     description="This is a simple chatbot interface. Type your query and get a response.",
  )

+ # Launch the Gradio app
+ interface.launch()
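
Note on the new inference path: chatbot_response reloads the LoRA checkpoint from the Colab Drive path on every request, and the base "unsloth/mistral-7b-v0.3" loaded at module level is effectively unused, since the handler loads its own model. Below is a minimal sketch of the same flow with a single model load at startup. It reuses the checkpoint path, sequence length, and Alpaca prompt from the diff above; the abbreviated instruction text and skip_special_tokens=True are illustrative assumptions, not the committed code.

# Sketch only: load the fine-tuned model once at import time and reuse it per request.
# The checkpoint path and settings mirror the diff above; anything else is assumed.
from unsloth import FastLanguageModel
import gradio as gr

max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/drive/MyDrive/Colab Notebooks/lora_model",  # path taken from the diff above
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def chatbot_response(user_query):
    # Build and tokenize the prompt; the instruction text here is abbreviated for illustration.
    inputs = tokenizer(
        [alpaca_prompt.format("Answer the user's query about Assam Engineering College (AEC).", user_query, "")],
        return_tensors = "pt",
    ).to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens = 128)
    # Drop special tokens so a trailing end-of-sequence marker does not leak into the reply.
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens = True)[0]
    # Keep only the text after the "### Response:" marker.
    return decoded.split("### Response:")[-1].strip()

gr.Interface(fn = chatbot_response, inputs = "text", outputs = "text", title = "Simple Chatbot").launch()

Loading once keeps each request down to tokenization and generation, avoiding the repeated 4-bit load and the associated GPU memory growth across calls.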