junaidbaber committed (verified)
Commit 03b1321 · Parent(s): 0d5774d

Update app.py

Files changed (1):
  1. app.py +55 -58

app.py CHANGED
@@ -12,41 +12,22 @@ def initialize_model():
     if token:
         login(token)
 
-    # Use a smaller model that's more CPU-friendly
-    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Much smaller model
+    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # Configure 4-bit quantization for CPU
     try:
-        # First try with bitsandbytes 4-bit quantization
-        from transformers import AutoModelForCausalLM, BitsAndBytesConfig
-
-        compute_dtype = getattr(torch, "float16")
-
-        bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=compute_dtype,
-            bnb_4bit_use_double_quant=False,
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=bnb_config,
-            device_map="auto",
-            trust_remote_code=True
-        )
-    except:
-        # Fallback to CPU without quantization
-        print("Falling back to CPU without quantization")
+        # Try with regular CPU mode first (simpler and more reliable)
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             device_map="cpu",
             trust_remote_code=True,
             low_cpu_mem_usage=True
         )
+    except Exception as e:
+        print(f"Error loading model: {str(e)}")
+        raise e
 
     # Ensure padding token is defined
     if tokenizer.pad_token is None:
@@ -54,54 +35,70 @@ def initialize_model():
 
     return model, tokenizer
 
-def format_conversation(conversation_history):
-    """Format the conversation history into a single string."""
-    formatted = ""
+def format_prompt(user_input, conversation_history=[]):
+    """Format the prompt according to TinyLlama's expected chat format"""
+    messages = []
+
+    # Add conversation history
     for turn in conversation_history:
-        formatted += f"Human: {turn['user']}\nAssistant: {turn['assistant']}\n"
-    return formatted.strip()
+        messages.append({"role": "user", "content": turn["user"]})
+        messages.append({"role": "assistant", "content": turn["assistant"]})
+
+    # Add current user input
+    messages.append({"role": "user", "content": user_input})
+
+    # Format into TinyLlama chat format
+    formatted_prompt = "<|system|>You are a helpful AI assistant.</s>"
+
+    for message in messages:
+        if message["role"] == "user":
+            formatted_prompt += f"<|user|>{message['content']}</s>"
+        else:
+            formatted_prompt += f"<|assistant|>{message['content']}</s>"
+
+    formatted_prompt += "<|assistant|>"
+    return formatted_prompt
 
 def generate_response(model, tokenizer, prompt, conversation_history):
     """Generate model response"""
-    # Format the entire conversation context
-    context = format_conversation(conversation_history[:-1])
-    if context:
-        full_prompt = f"{context}\nHuman: {prompt}"
-    else:
-        full_prompt = f"Human: {prompt}"
-
-    # Tokenize input
-    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
-
-    # Move inputs to the same device as the model
-    device = next(model.parameters()).device
-    inputs = {k: v.to(device) for k, v in inputs.items()}
+    try:
+        # Format prompt using TinyLlama's chat template
+        formatted_prompt = format_prompt(prompt, conversation_history[:-1])
+
+        # Tokenize input
+        inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True, truncation=True)
+
+        # Move inputs to the same device as the model
+        device = next(model.parameters()).device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
-    # Calculate max new tokens
-    input_length = inputs["input_ids"].shape[1]
-    max_model_length = 1024  # Reduced context window for memory efficiency
-    max_new_tokens = min(150, max_model_length - input_length)
+        # Calculate max new tokens
+        input_length = inputs["input_ids"].shape[1]
+        max_model_length = 1024
+        max_new_tokens = min(150, max_model_length - input_length)
 
-    try:
-        # Generate response with lower temperature for faster generation
+        # Generate response
         outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
-            temperature=0.5,  # Lower temperature for faster, more focused responses
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
-            min_length=10,  # Reduced minimum length
-            no_repeat_ngram_size=3
+            temperature=0.7,
+            min_length=10,
+            no_repeat_ngram_size=3,
+            eos_token_id=tokenizer.encode("</s>")[0]  # Set end token
         )
 
-        # Decode response
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        response_parts = response.split("Human: ")
-        model_response = response_parts[-1].split("Assistant: ")[-1].strip()
+        # Decode response and extract only the assistant's message
+        full_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
 
-        return model_response
+        # Extract only the last assistant response
+        assistant_response = full_response.split("<|assistant|>")[-1].split("</s>")[0].strip()
+
+        return assistant_response if assistant_response else "I apologize, but I couldn't generate a proper response."
+
     except RuntimeError as e:
         if "out of memory" in str(e):
             torch.cuda.empty_cache()
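The new format_prompt builds TinyLlama's Zephyr-style <|system|>/<|user|>/<|assistant|> prompt by hand. A hedged alternative sketch, assuming a transformers release recent enough to expose chat templates: the tokenizer can render an equivalent prompt itself, though its official template may differ slightly in whitespace from the hand-built string above.

# Sketch: build the chat prompt from the tokenizer's bundled chat template
# instead of manual string concatenation. Alternative approach, not the
# committed code; message contents are placeholders.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What can you do?"},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return the formatted string, not token ids
    add_generation_prompt=True,  # append the trailing assistant tag
)
print(prompt)

Either way, the key point is that the prompt ends with the assistant tag so generation continues as the assistant's turn, which is what the trailing <|assistant|> in format_prompt accomplishes.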
@@ -128,7 +125,7 @@ def main():
     </style>
     """, unsafe_allow_html=True)
 
-    st.title("Welcome to LowCode No Code Demo")
+    st.title("Chat with TinyLlama 🤖")
 
     # Initialize session state for chat history
     if "chat_history" not in st.session_state:
@@ -190,7 +187,7 @@ def main():
         st.markdown("""
         ### Model Info
         - Using TinyLlama 1.1B Chat
-        - Optimized for CPU usage
+        - CPU optimized
         - Context window: 1024 tokens
         """)
 
 
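One small caveat on the generate() call above: with Llama-family tokenizers, encode("</s>") usually prepends the BOS token, so index [0] may not be the EOS id the comment intends. If stopping at end-of-sequence is the goal, tokenizer.eos_token_id is the more direct spelling; a quick hedged check follows (actual behavior depends on the tokenizer's add_bos_token default).

# Quick check of what encode("</s>") returns versus the tokenizer's EOS id.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
print(tok.encode("</s>"))   # often [bos_id, eos_id] when BOS is added by default
print(tok.eos_token_id)     # the id to pass as generate(eos_token_id=...)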