Soumik555 committed on
Commit d25d5e9
1 Parent(s): ef34958
Files changed (1)
  1. main.py +120 -29
main.py CHANGED
@@ -10,6 +10,17 @@ import threading
 import uvicorn
 from pathlib import Path
 import time
+import multiprocessing
+
+# CPU Performance Optimization
+os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
+os.environ["MKL_NUM_THREADS"] = str(multiprocessing.cpu_count())
+os.environ["OPENBLAS_NUM_THREADS"] = str(multiprocessing.cpu_count())
+os.environ["VECLIB_MAXIMUM_THREADS"] = str(multiprocessing.cpu_count())
+os.environ["NUMEXPR_NUM_THREADS"] = str(multiprocessing.cpu_count())
+
+# Set PyTorch to use all CPU cores
+torch.set_num_threads(multiprocessing.cpu_count())
 
 # Configure logging
 logging.basicConfig(
@@ -75,6 +86,11 @@ CACHE_DIR = os.getenv("TRANSFORMERS_CACHE", "/app/model_cache")
 MAX_LENGTH = int(os.getenv("MAX_LENGTH", "100"))
 DEFAULT_TEMPERATURE = float(os.getenv("DEFAULT_TEMPERATURE", "0.7"))
 
+# CPU Optimization settings
+CPU_CORES = multiprocessing.cpu_count()
+INTRAOP_THREADS = CPU_CORES
+INTEROP_THREADS = max(1, CPU_CORES // 2)  # Use half cores for inter-op parallelism
+
 def ensure_cache_dir():
     """Ensure cache directory exists"""
     Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
@@ -92,14 +108,21 @@ def is_model_cached(model_name: str) -> bool:
     return False
 
 def load_model():
-    """Load the Hugging Face model with caching"""
+    """Load the Hugging Face model with caching and CPU optimization"""
     global tokenizer, model, generator, model_loaded
 
     try:
         ensure_cache_dir()
 
+        # Set PyTorch threading for optimal CPU performance
+        torch.set_num_interop_threads(INTEROP_THREADS)
+        torch.set_num_threads(INTRAOP_THREADS)
+
         logger.info(f"Loading model: {MODEL_NAME}")
         logger.info(f"Cache dir: {CACHE_DIR}")
+        logger.info(f"CPU cores: {CPU_CORES}")
+        logger.info(f"Intra-op threads: {INTRAOP_THREADS}")
+        logger.info(f"Inter-op threads: {INTEROP_THREADS}")
         logger.info(f"CUDA available: {torch.cuda.is_available()}")
 
         start_time = time.time()
@@ -116,32 +139,58 @@ def load_model():
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        # Load model
+        # Load model with CPU optimization
         logger.info("Loading model...")
+        device_map = "auto" if torch.cuda.is_available() else "cpu"
+
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             cache_dir=CACHE_DIR,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
+            device_map=device_map,
             low_cpu_mem_usage=True,
-            local_files_only=False
+            local_files_only=False,
+            # CPU-specific optimizations
+            use_cache=True,  # Enable KV cache for faster generation
         )
 
-        # Create text generation pipeline
+        # Move model to CPU if CUDA is not available and optimize
+        if not torch.cuda.is_available():
+            model = model.to('cpu')
+            # Enable CPU-specific optimizations
+            model.eval()  # Set to evaluation mode
+
+            # Enable torch.jit optimization for CPU (optional, can improve performance)
+            try:
+                # This is experimental and might not work with all models
+                # model = torch.jit.script(model)
+                logger.info("Model loaded in CPU mode with optimizations")
+            except Exception as e:
+                logger.warning(f"JIT compilation not available: {e}")
+
+        # Create text generation pipeline with optimized settings
        logger.info("Creating pipeline...")
         device = 0 if torch.cuda.is_available() else -1
+
         generator = pipeline(
             "text-generation",
             model=model,
             tokenizer=tokenizer,
             device=device,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            # CPU optimization: batch processing
+            batch_size=1,  # Optimal for single requests
+            model_kwargs={
+                "use_cache": True,  # Enable KV caching
+            }
         )
 
         load_time = time.time() - start_time
         model_loaded = True
         logger.info(f"✅ Model loaded successfully in {load_time:.2f} seconds!")
-        logger.info(f"Model device: {model.device}")
+
+        if hasattr(model, 'device'):
+            logger.info(f"Model device: {model.device}")
 
         return True
 
@@ -149,42 +198,68 @@ def load_model():
         logger.error(f"❌ Error loading model: {str(e)}", exc_info=True)
         return False
 
-def generate_response(message: str, max_length: int = 100, temperature: float = 0.7, top_p: float = 0.9) -> str:
-    """Generate response using the loaded model"""
+def generate_response(message: str, max_length: int = 100, temperature: float = 0.7, top_p: float = 0.9) -> tuple[str, float]:
+    """Generate response using the loaded model with CPU optimizations"""
     if not generator:
         return "❌ Model not loaded. Please wait for initialization...", 0.0
 
     try:
         start_time = time.time()
 
-        # Generate response with parameters
-        response = generator(
-            message,
-            max_length=max_length,
-            temperature=temperature,
-            top_p=top_p,
-            num_return_sequences=1,
-            pad_token_id=tokenizer.eos_token_id,
-            do_sample=True,
-            truncation=True,
-            repetition_penalty=1.1
-        )
+        # Optimize input length to prevent excessive computation
+        max_input_length = 512  # Reasonable limit for DialoGPT
+        if len(message) > max_input_length:
+            message = message[:max_input_length]
+            logger.info(f"Input truncated to {max_input_length} characters")
+
+        # Calculate total max length (input + generation)
+        input_length = len(tokenizer.encode(message))
+        total_max_length = min(input_length + max_length, 1024)  # DialoGPT max context
+
+        # Generate response with optimized parameters for CPU
+        with torch.no_grad():  # Disable gradient computation for inference
+            response = generator(
+                message,
+                max_length=total_max_length,
+                min_length=input_length + 10,  # Ensure some generation
+                temperature=temperature,
+                top_p=top_p,
+                num_return_sequences=1,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                do_sample=True,
+                repetition_penalty=1.1,
+                length_penalty=1.0,
+                early_stopping=True,  # Stop when EOS is generated
+                # Remove unsupported parameters
+                # truncation=True  # This was causing the error
+            )
 
         # Extract generated text
         generated_text = response[0]['generated_text']
 
-        # Clean up response
+        # Clean up response - remove input prompt
         if generated_text.startswith(message):
             bot_response = generated_text[len(message):].strip()
         else:
             bot_response = generated_text.strip()
 
-        # Fallback if empty response
-        if not bot_response:
+        # Post-process response
+        if bot_response:
+            # Remove any repetitive patterns
+            sentences = bot_response.split('.')
+            if len(sentences) > 1:
+                # Take only the first complete sentence to avoid repetition
+                bot_response = sentences[0].strip() + '.'
+
+            # Ensure response isn't too short or just punctuation
+            if len(bot_response.replace('.', '').replace('!', '').replace('?', '').strip()) < 3:
+                bot_response = "I understand. Could you tell me more about that?"
+        else:
             bot_response = "I'm not sure how to respond to that. Could you try rephrasing?"
 
         response_time = time.time() - start_time
-        logger.info(f"Generated response in {response_time:.2f}s")
+        logger.info(f"Generated response in {response_time:.2f}s (length: {len(bot_response)} chars)")
 
         return bot_response, response_time
 
@@ -241,7 +316,7 @@ async def chat_endpoint(request: ChatRequest):
 
 @app.get("/model-info")
 async def get_model_info():
-    """Get detailed model information"""
+    """Get detailed model information including CPU optimization details"""
     device = "cuda" if torch.cuda.is_available() else "cpu"
     if model and hasattr(model, 'device'):
         device = str(model.device)
@@ -252,6 +327,12 @@ async def get_model_info():
         "device": device,
         "cache_directory": CACHE_DIR,
         "model_cached": is_model_cached(MODEL_NAME),
+        "cpu_optimization": {
+            "cpu_cores": CPU_CORES,
+            "intra_op_threads": INTRAOP_THREADS,
+            "inter_op_threads": INTEROP_THREADS,
+            "torch_threads": torch.get_num_threads(),
+        },
         "parameters": {
            "max_length": MAX_LENGTH,
             "default_temperature": DEFAULT_TEMPERATURE
@@ -277,14 +358,24 @@ async def startup_event():
     threading.Thread(target=load_model_background, daemon=True).start()
 
 def run_fastapi():
-    """Run FastAPI server"""
-    uvicorn.run(
+    """Run FastAPI server with CPU optimization"""
+    # Additional CPU optimization for uvicorn
+    config = uvicorn.Config(
         app,
         host="0.0.0.0",
-        port=7860,  # Changed to 7860 for HuggingFace
+        port=7860,
         log_level="info",
-        access_log=True
+        access_log=True,
+        workers=1,  # Single worker to avoid model loading multiple times
+        loop="asyncio",  # Use asyncio loop for better performance
+        http="httptools",  # Use httptools for faster HTTP parsing
    )
+
+    server = uvicorn.Server(config)
+    server.run()
 
 if __name__ == "__main__":
+    logger.info(f"🚀 Starting FastAPI Chatbot with CPU optimization...")
+    logger.info(f"💻 CPU cores available: {CPU_CORES}")
+    logger.info(f"🧵 Thread configuration - Intra-op: {INTRAOP_THREADS}, Inter-op: {INTEROP_THREADS}")
     run_fastapi()
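
As a quick sanity check on this revision, the sketch below polls the /model-info endpoint and prints the new cpu_optimization block. It is only an illustration and not part of the commit: it assumes the app is running locally on the port configured above (7860) and that the requests package is installed; the field names are the ones added in this diff, everything else is hypothetical.

# Hypothetical check script, not part of this commit.
# Assumes the server from main.py is running on http://localhost:7860
# and that the `requests` package is installed.
import requests

def check_cpu_optimization(base_url: str = "http://localhost:7860") -> None:
    # /model-info is the GET endpoint defined in main.py above.
    info = requests.get(f"{base_url}/model-info", timeout=10).json()

    # Fields introduced by this commit (see the "cpu_optimization" block in the diff).
    cpu = info.get("cpu_optimization", {})
    print("device:          ", info.get("device"))
    print("model_cached:    ", info.get("model_cached"))
    print("cpu_cores:       ", cpu.get("cpu_cores"))
    print("intra_op_threads:", cpu.get("intra_op_threads"))
    print("inter_op_threads:", cpu.get("inter_op_threads"))
    print("torch_threads:   ", cpu.get("torch_threads"))

if __name__ == "__main__":
    check_cpu_optimization()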
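
For local debugging without the HTTP layer, here is a minimal sketch of calling the revised functions directly. It assumes main.py is importable as main from the working directory; importing it also applies the module-level thread environment variables and logging configuration shown in the diff.

# Hypothetical local smoke test, not part of this commit.
# Assumes main.py sits on the import path; importing it runs the module-level
# CPU/thread setup and logging configuration shown above.
import main

if main.load_model():  # returns True on success, per the diff
    # generate_response() returns (text, elapsed_seconds) in this revision.
    reply, seconds = main.generate_response(
        "Hello there!",
        max_length=60,      # generation budget on top of the prompt tokens
        temperature=0.7,
        top_p=0.9,
    )
    print(f"bot: {reply!r} ({seconds:.2f}s)")
else:
    print("model failed to load; check the logs")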