DipeshChaudhary committed on
Commit
cdb88fb
1 Parent(s): d0009f7

Create app.py

Files changed (1)
  1. app.py +113 -0
app.py ADDED
@@ -0,0 +1,113 @@
+ # Notebook-style install commands (run in Colab/Jupyter; the "!" shell syntax is not valid in a plain .py script)
+ !pip install torch==2.3.0 torchvision torchaudio -f https://download.pytorch.org/whl/cu121/torch_stable.html
+ # Check Python version
+ import sys
+ print("Python version:", sys.version)
+
+ # Check PyTorch version
+ import torch
+ print("PyTorch version:", torch.__version__)
+
+ !pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
+
+ !pip install triton
+
+ from unsloth import FastLanguageModel
+
+ import torch
+ max_seq_length = 2048  # Choose any; Unsloth supports RoPE scaling internally
+ dtype = None  # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
+ load_in_4bit = True  # Use 4-bit quantization to reduce memory usage; can be False
+
+
+ # Load the fine-tuned model and its tokenizer (FastLanguageModel returns both)
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name="DipeshChaudhary/ShareGPTChatBot-Counselchat1",  # Your fine-tuned model
+     max_seq_length=max_seq_length,
+     dtype=dtype,
+     load_in_4bit=load_in_4bit,
+ )
+
+
+ from unsloth.chat_templates import get_chat_template
+
+ # Wrap the tokenizer with the Llama-3 chat template, mapping ShareGPT-style keys and roles
+ tokenizer = get_chat_template(
+     tokenizer,
+     chat_template="llama-3",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
+     mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
+ )
+
+
+ FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
+
+ # Quick smoke test: a single user turn in ShareGPT format
+ messages = [
+     {"from": "human", "value": "hlo"},
+ ]
+ inputs = tokenizer.apply_chat_template(
+     messages,
+     tokenize=True,
+     add_generation_prompt=True,  # Must add for generation
+     return_tensors="pt",
+ ).to("cuda")
+
+
+ from transformers import TextStreamer
+ text_streamer = TextStreamer(tokenizer)
+
+ # Stream a short reply to the console
+ x = model.generate(input_ids=inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True)
+
+
+ # Generate a response for the full conversation history (a ShareGPT-style list of turns)
+ def generate_response(conversation_history):
+     inputs = tokenizer.apply_chat_template(
+         conversation_history,
+         tokenize=True,
+         add_generation_prompt=True,  # Must add for generation
+         return_tensors="pt",
+     ).to("cuda")
+
+     # Set the pad_token_id to the eos_token_id if it's not set
+     if tokenizer.pad_token_id is None:
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     # Generate the response
+     output = model.generate(
+         inputs,
+         max_new_tokens=10000,
+         use_cache=True,
+         pad_token_id=tokenizer.pad_token_id,
+         attention_mask=inputs.ne(tokenizer.pad_token_id),
+     )
+
+     # Decode the output, skipping special tokens
+     decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
+
+     # Keep only the text after the last "assistant" marker, i.e. the model's reply
+     bot_response = decoded_output.split("assistant")[-1].strip()
+
+     return bot_response
+
+ # Example usage: simple terminal chat loop
+ conversation_history = []
+ while True:
+     user_input = input("User: ")
+     if user_input.lower() == "exit":
+         print("Exiting...")
+         break
+
+     # Append user message to history
+     conversation_history.append({"from": "human", "value": user_input})
+
+     # Generate response
+     response = generate_response(conversation_history)
+
+     # Append bot response to history ("gpt" is the assistant role expected by the ShareGPT mapping above)
+     conversation_history.append({"from": "gpt", "value": response})
+
+     # Print bot's response
+     print("Bot:", response)