Soumik555 committed on
Commit d25d5e9
1 Parent(s): ef34958
Files changed (1)
  1. main.py +120 -29
main.py CHANGED
@@ -10,6 +10,17 @@ import threading
 import uvicorn
 from pathlib import Path
 import time
+import multiprocessing
+
+# CPU Performance Optimization
+os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
+os.environ["MKL_NUM_THREADS"] = str(multiprocessing.cpu_count())
+os.environ["OPENBLAS_NUM_THREADS"] = str(multiprocessing.cpu_count())
+os.environ["VECLIB_MAXIMUM_THREADS"] = str(multiprocessing.cpu_count())
+os.environ["NUMEXPR_NUM_THREADS"] = str(multiprocessing.cpu_count())
+
+# Set PyTorch to use all CPU cores
+torch.set_num_threads(multiprocessing.cpu_count())
 
 # Configure logging
 logging.basicConfig(
@@ -75,6 +86,11 @@ CACHE_DIR = os.getenv("TRANSFORMERS_CACHE", "/app/model_cache")
 MAX_LENGTH = int(os.getenv("MAX_LENGTH", "100"))
 DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
 
+# CPU Optimization settings
+CPU_CORES = multiprocessing.cpu_count()
+INTRAOP_THREADS = CPU_CORES
+INTEROP_THREADS = max(1, CPU_CORES // 2)  # Use half cores for inter-op parallelism
+
 def ensure_cache_dir():
     """Ensure cache directory exists"""
     Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
@@ -92,14 +108,21 @@ def is_model_cached(model_name: str) -> bool:
     return False
 
 def load_model():
-    """Load the Hugging Face model with caching"""
+    """Load the Hugging Face model with caching and CPU optimization"""
     global tokenizer, model, generator, model_loaded
 
     try:
         ensure_cache_dir()
 
+        # Set PyTorch threading for optimal CPU performance
+        torch.set_num_interop_threads(INTEROP_THREADS)
+        torch.set_num_threads(INTRAOP_THREADS)
+
         logger.info(f"Loading model: {MODEL_NAME}")
         logger.info(f"Cache dir: {CACHE_DIR}")
+        logger.info(f"CPU cores: {CPU_CORES}")
+        logger.info(f"Intra-op threads: {INTRAOP_THREADS}")
+        logger.info(f"Inter-op threads: {INTEROP_THREADS}")
         logger.info(f"CUDA available: {torch.cuda.is_available()}")
 
         start_time = time.time()
@@ -116,32 +139,58 @@ def load_model():
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        # Load model
+        # Load model with CPU optimization
         logger.info("Loading model...")
+        device_map = "auto" if torch.cuda.is_available() else "cpu"
+
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             cache_dir=CACHE_DIR,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
+            device_map=device_map,
             low_cpu_mem_usage=True,
-            local_files_only=False
+            local_files_only=False,
+            # CPU-specific optimizations
+            use_cache=True,  # Enable KV cache for faster generation
         )
 
-        # Create text generation pipeline
+        # Move model to CPU if CUDA is not available and optimize
+        if not torch.cuda.is_available():
+            model = model.to('cpu')
+            # Enable CPU-specific optimizations
+            model.eval()  # Set to evaluation mode
+
+            # Enable torch.jit optimization for CPU (optional, can improve performance)
+            try:
+                # This is experimental and might not work with all models
+                # model = torch.jit.script(model)
+                logger.info("Model loaded in CPU mode with optimizations")
+            except Exception as e:
+                logger.warning(f"JIT compilation not available: {e}")
+
+        # Create text generation pipeline with optimized settings
        logger.info("Creating pipeline...")
         device = 0 if torch.cuda.is_available() else -1
+
         generator = pipeline(
             "text-generation",
             model=model,
             tokenizer=tokenizer,
             device=device,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            # CPU optimization: batch processing
+            batch_size=1,  # Optimal for single requests
+            model_kwargs={
+                "use_cache": True,  # Enable KV caching
+            }
         )
 
         load_time = time.time() - start_time
         model_loaded = True
         logger.info(f"✅ Model loaded successfully in {load_time:.2f} seconds!")
-        logger.info(f"Model device: {model.device}")
+
+        if hasattr(model, 'device'):
+            logger.info(f"Model device: {model.device}")
 
         return True
 
@@ -149,42 +198,68 @@ def load_model():
         logger.error(f"❌ Error loading model: {str(e)}", exc_info=True)
         return False
 
-def generate_response(message: str, max_length: int = 100, temperature: float = 0.7, top_p: float = 0.9) -> str:
-    """Generate response using the loaded model"""
+def generate_response(message: str, max_length: int = 100, temperature: float = 0.7, top_p: float = 0.9) -> tuple[str, float]:
+    """Generate response using the loaded model with CPU optimizations"""
     if not generator:
         return "❌ Model not loaded. Please wait for initialization...", 0.0
 
     try:
         start_time = time.time()
 
-        # Generate response with parameters
-        response = generator(
-            message,
-            max_length=max_length,
-            temperature=temperature,
-            top_p=top_p,
-            num_return_sequences=1,
-            pad_token_id=tokenizer.eos_token_id,
-            do_sample=True,
-            truncation=True,
-            repetition_penalty=1.1
-        )
+        # Optimize input length to prevent excessive computation
+        max_input_length = 512  # Reasonable limit for DialoGPT
+        if len(message) > max_input_length:
+            message = message[:max_input_length]
+            logger.info(f"Input truncated to {max_input_length} characters")
+
+        # Calculate total max length (input + generation)
+        input_length = len(tokenizer.encode(message))
+        total_max_length = min(input_length + max_length, 1024)  # DialoGPT max context
+
+        # Generate response with optimized parameters for CPU
+        with torch.no_grad():  # Disable gradient computation for inference
+            response = generator(
+                message,
+                max_length=total_max_length,
+                min_length=input_length + 10,  # Ensure some generation
+                temperature=temperature,
+                top_p=top_p,
+                num_return_sequences=1,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                do_sample=True,
+                repetition_penalty=1.1,
+                length_penalty=1.0,
+                early_stopping=True,  # Stop when EOS is generated
+                # Remove unsupported parameters
+                # truncation=True  # This was causing the error
+            )
 
         # Extract generated text
         generated_text = response[0]['generated_text']
 
-        # Clean up response
+        # Clean up response - remove input prompt
         if generated_text.startswith(message):
             bot_response = generated_text[len(message):].strip()
         else:
             bot_response = generated_text.strip()
 
-        # Fallback if empty response
-        if not bot_response:
+        # Post-process response
+        if bot_response:
+            # Remove any repetitive patterns
+            sentences = bot_response.split('.')
+            if len(sentences) > 1:
+                # Take only the first complete sentence to avoid repetition
+                bot_response = sentences[0].strip() + '.'
+
+            # Ensure response isn't too short or just punctuation
+            if len(bot_response.replace('.', '').replace('!', '').replace('?', '').strip()) < 3:
+                bot_response = "I understand. Could you tell me more about that?"
+        else:
             bot_response = "I'm not sure how to respond to that. Could you try rephrasing?"
 
         response_time = time.time() - start_time
-        logger.info(f"Generated response in {response_time:.2f}s")
+        logger.info(f"Generated response in {response_time:.2f}s (length: {len(bot_response)} chars)")
 
         return bot_response, response_time
 
@@ -241,7 +316,7 @@ async def chat_endpoint(request: ChatRequest):
 
 @app.get("/model-info")
 async def get_model_info():
-    """Get detailed model information"""
+    """Get detailed model information including CPU optimization details"""
     device = "cuda" if torch.cuda.is_available() else "cpu"
     if model and hasattr(model, 'device'):
         device = str(model.device)
@@ -252,6 +327,12 @@ async def get_model_info():
         "device": device,
         "cache_directory": CACHE_DIR,
         "model_cached": is_model_cached(MODEL_NAME),
+        "cpu_optimization": {
+            "cpu_cores": CPU_CORES,
+            "intra_op_threads": INTRAOP_THREADS,
+            "inter_op_threads": INTEROP_THREADS,
+            "torch_threads": torch.get_num_threads(),
+        },
         "parameters": {
            "max_length": MAX_LENGTH,
             "default_temperature": DEFAULT_TEMPERATURE
@@ -277,14 +358,24 @@ async def startup_event():
     threading.Thread(target=load_model_background, daemon=True).start()
 
 def run_fastapi():
-    """Run FastAPI server"""
-    uvicorn.run(
+    """Run FastAPI server with CPU optimization"""
+    # Additional CPU optimization for uvicorn
+    config = uvicorn.Config(
         app,
         host="0.0.0.0",
-        port=7860,  # Changed to 7860 for HuggingFace
+        port=7860,
         log_level="info",
-        access_log=True
+        access_log=True,
+        workers=1,  # Single worker to avoid model loading multiple times
+        loop="asyncio",  # Use asyncio loop for better performance
+        http="httptools",  # Use httptools for faster HTTP parsing
    )
+
+    server = uvicorn.Server(config)
+    server.run()
 
 if __name__ == "__main__":
+    logger.info(f"🚀 Starting FastAPI Chatbot with CPU optimization...")
+    logger.info(f"💻 CPU cores available: {CPU_CORES}")
+    logger.info(f"🧵 Thread configuration - Intra-op: {INTRAOP_THREADS}, Inter-op: {INTEROP_THREADS}")
     run_fastapi()
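
As a quick sanity check on this revision, the sketch below polls the /model-info endpoint and prints the new cpu_optimization block. It is only an illustration and not part of the commit: it assumes the app is running locally on the port configured above (7860) and that the requests package is installed; the field names are the ones added in this diff, everything else is hypothetical.

# Hypothetical check script, not part of this commit.
# Assumes the server from main.py is running on http://localhost:7860
# and that the `requests` package is installed.
import requests

def check_cpu_optimization(base_url: str = "http://localhost:7860") -> None:
    # /model-info is the GET endpoint defined in main.py above.
    info = requests.get(f"{base_url}/model-info", timeout=10).json()

    # Fields introduced by this commit (see the "cpu_optimization" block in the diff).
    cpu = info.get("cpu_optimization", {})
    print("device:          ", info.get("device"))
    print("model_cached:    ", info.get("model_cached"))
    print("cpu_cores:       ", cpu.get("cpu_cores"))
    print("intra_op_threads:", cpu.get("intra_op_threads"))
    print("inter_op_threads:", cpu.get("inter_op_threads"))
    print("torch_threads:   ", cpu.get("torch_threads"))

if __name__ == "__main__":
    check_cpu_optimization()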
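
For local debugging without the HTTP layer, here is a minimal sketch of calling the revised functions directly. It assumes main.py is importable as main from the working directory; importing it also applies the module-level thread environment variables and logging configuration shown in the diff.

# Hypothetical local smoke test, not part of this commit.
# Assumes main.py sits on the import path; importing it runs the module-level
# CPU/thread setup and logging configuration shown above.
import main

if main.load_model():  # returns True on success, per the diff
    # generate_response() returns (text, elapsed_seconds) in this revision.
    reply, seconds = main.generate_response(
        "Hello there!",
        max_length=60,      # generation budget on top of the prompt tokens
        temperature=0.7,
        top_p=0.9,
    )
    print(f"bot: {reply!r} ({seconds:.2f}s)")
else:
    print("model failed to load; check the logs")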