Update app.py
app.py
CHANGED
@@ -36,18 +36,12 @@ CONFIG = {
     "max_tokens": 350,
 }

-#
-#
-#
-
-#
-#
-# PHI models are excellent lightweight instruction-following models:
-# - microsoft/phi-2 (2.7B parameters, free inference)
-# - microsoft/Phi-3-mini-4k-instruct (3.8B parameters, recommended)
-# - microsoft/Phi-3-mini-128k-instruct (3.8B with longer context)
+# Local PHI model configuration for Hugging Face Spaces
+# PHI-2 is optimal for CPU deployment: 2.7B parameters, excellent quality
+# Can be swapped with Phi-3-mini-4k-instruct if more memory is available
+LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
+USE_8BIT_QUANTIZATION = True  # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False
-REMOTE_LLM_MODEL = os.environ.get("HF_INFERENCE_MODEL", "microsoft/Phi-3-mini-4k-instruct")

 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
@@ -71,44 +65,92 @@ if HF_INFERENCE_API_KEY:
 # ============================================================================

 def initialize_llm():
-
-
-
-
-
-
-
-
-
-
-
-
-    # Final fallback: attempt to initialize the free local T5 model (as before)
-    logger.info("🚀 Initializing FREE local language model (fallback to T5)...")
-    model_name = "google/flan-t5-large"
-
+    """Initialize PHI model locally with CPU optimizations for Hugging Face Spaces.
+
+    Uses efficient techniques:
+    - 8-bit quantization to reduce memory by ~50%
+    - CPU-optimized loading with device_map
+    - Lazy loading and minimal memory footprint
+    """
+    global LOCAL_PHI_MODEL, USE_8BIT_QUANTIZATION
+
+    logger.info(f"🚀 Initializing local PHI model: {LOCAL_PHI_MODEL}")
+    logger.info(" Using CPU-optimized configuration for Hugging Face Spaces")
+
     try:
-
-
-
-
-
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        # Check if we have GPU (unlikely on free Spaces, but check anyway)
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f" Target device: {device}")
+
+        # Load tokenizer (lightweight)
+        logger.info(" Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            LOCAL_PHI_MODEL,
+            trust_remote_code=True,
+            use_fast=True
+        )
+
+        # Set padding token if not present (PHI models need this)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        # Configure model loading for CPU efficiency
+        model_kwargs = {
+            "trust_remote_code": True,
+            "low_cpu_mem_usage": True,
+            "torch_dtype": torch.float32, # CPU works best with float32
+        }
+
+        # Try to use 8-bit quantization if available (requires bitsandbytes)
+        if USE_8BIT_QUANTIZATION and device == "cpu":
+            try:
+                logger.info(" Attempting 8-bit quantization for memory efficiency...")
+                model_kwargs["load_in_8bit"] = True
+            except Exception as quant_error:
+                logger.warning(f" 8-bit quantization unavailable: {quant_error}")
+                logger.info(" Falling back to float32 (will use more memory)")
+
+        # Load the model
+        logger.info(" Loading PHI model (this may take 30-60 seconds)...")
+        model = AutoModelForCausalLM.from_pretrained(
+            LOCAL_PHI_MODEL,
+            **model_kwargs
+        )
+
+        # Move to eval mode to disable dropout and save memory
+        model.eval()
+
+        # Create pipeline for generation
+        logger.info(" Creating text-generation pipeline...")
         llm_client = pipeline(
-            "
-            model=
-
-
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device=0 if device == "cuda" else -1,
+            max_new_tokens=512,
+            pad_token_id=tokenizer.eos_token_id
         )
-
-        CONFIG["llm_model"] =
-        CONFIG["model_type"] = "
-
-        logger.info(f"
+
+        CONFIG["llm_model"] = LOCAL_PHI_MODEL
+        CONFIG["model_type"] = "phi_local"
+
+        logger.info(f"✅ PHI model initialized successfully: {LOCAL_PHI_MODEL}")
+        logger.info(f" Model size: ~2.7B parameters (PHI-2) or ~3.8B (PHI-3)")
+        logger.info(f" Memory optimization: {'8-bit quantization' if USE_8BIT_QUANTIZATION else 'float32'}")
+
         return llm_client
-
+
+    except ImportError as ie:
+        logger.error(f"❌ Missing required library: {ie}")
+        logger.info(" Install with: pip install transformers accelerate bitsandbytes")
+        raise
     except Exception as e:
-        logger.error(f"❌ Failed to load model: {str(e)}")
-
+        logger.error(f"❌ Failed to load PHI model: {str(e)}")
+        logger.info(" This may be due to insufficient memory on the Space")
+        logger.info(" Try using a smaller model or enabling 8-bit quantization")
+        raise Exception(f"Failed to initialize PHI LLM: {str(e)}")


 def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9) -> str:
@@ -472,14 +514,30 @@ Draft:
 Answer:
 """

-    logger.info(" → Polishing scaffold with
+    logger.info(" → Polishing scaffold with PHI model")
     try:
-
-
+        out = llm_client(
+            polish_prompt,
+            max_new_tokens=600,
+            temperature=0.72,
+            top_p=0.92,
+            do_sample=True,
+            repetition_penalty=1.1,
+            pad_token_id=llm_client.tokenizer.eos_token_id
+        )
+
+        # Extract and clean the polished text
+        if isinstance(out, list) and out:
+            polished = out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
+        else:
+            polished = str(out)
+
+        # Remove prompt echo if present
+        if polish_prompt in polished:
+            polished = polished[len(polish_prompt):].strip()
         else:
-            out = llm_client(polish_prompt, max_new_tokens=600, temperature=0.72, top_p=0.92, do_sample=True, num_beams=1)
-            polished = out[0].get('generated_text', '') if isinstance(out, list) and out else str(out)
             polished = polished.strip()
+
     except Exception as e:
         logger.error(f" ❌ Polishing error: {e}")
         return None
@@ -555,9 +613,9 @@ def generate_llm_answer(
     llm_client,
     attempt: int = 1
 ) -> Optional[str]:
-    #
-    if not llm_client
-        logger.error(" ❌
+    # Ensure we have a local PHI model loaded
+    if not llm_client:
+        logger.error(" ❌ PHI model not initialized")
         return None

     query_lower = query.lower()
@@ -600,27 +658,36 @@ def generate_llm_answer(
     max_iterations = 4

     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
-        logger.info(f" →
+        logger.info(f" → PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
         try:
-
-            # Use remote Hugging Face Inference API
-            return remote_generate(prompt, max_new_tokens, temperature, top_p)
-
+            # Call local PHI model (causal LM)
             out = llm_client(
                 prompt,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
                 top_p=top_p,
                 do_sample=True,
-                num_beams=1,
                 repetition_penalty=repetition_penalty,
-
+                num_return_sequences=1,
+                pad_token_id=llm_client.tokenizer.eos_token_id,
+                eos_token_id=llm_client.tokenizer.eos_token_id
             )
+
+            # Extract generated text from pipeline output
             if isinstance(out, list) and out:
-
-
+                generated = out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
+            else:
+                generated = str(out)
+
+            # PHI models return prompt + completion, extract only new text
+            if prompt in generated:
+                # Remove the prompt from the output
+                generated = generated[len(prompt):].strip()
+
+            return generated
+
         except Exception as e:
-            logger.error(f" ❌
+            logger.error(f" ❌ PHI model call error: {e}")
             return ''

     # Build initial prompt
@@ -771,18 +838,15 @@ def generate_answer_langchain(

     if not llm_answer:
         logger.error(f" ❌ All 2 LLM attempts failed")
-        #
-
-
-
-
-            logger.info("
-            polished
-
-
-                return polished
-        except Exception as e:
-            logger.error(f" ❌ Scaffold-and-polish error: {e}")
+        # Try scaffold-and-polish as a fallback strategy
+        try:
+            logger.info(" → Attempting scaffold-and-polish using PHI model")
+            polished = scaffold_and_polish(query, retrieved_docs, llm_client)
+            if polished:
+                logger.info(" ✅ Scaffold-and-polish produced an answer")
+                return polished
+        except Exception as e:
+            logger.error(f" ❌ Scaffold-and-polish error: {e}")

     # Final fallback: extractive templated answer (guaranteed deterministic)
     try:
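
Note on the prompt-echo handling in this commit: by default the transformers text-generation pipeline returns the prompt followed by the completion, which is why both call_model() and the polishing step slice the prompt back off. A minimal standalone sketch of that behavior (not part of app.py; the model name is just the same default the diff configures, and the prompt text is illustrative):

    from transformers import pipeline

    generator = pipeline("text-generation", model="microsoft/phi-2",
                         trust_remote_code=True, device=-1)  # -1 = CPU

    prompt = "Question: What is retrieval-augmented generation?\nAnswer:"
    out = generator(prompt, max_new_tokens=60, do_sample=False)

    full_text = out[0]["generated_text"]  # prompt + completion
    completion = full_text[len(prompt):].strip() if full_text.startswith(prompt) else full_text.strip()
    print(completion)

The same pipeline also accepts return_full_text=False, which returns only the completion and would make the manual stripping unnecessary.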
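
Usage sketch (not part of this commit, and assuming app.py can be imported without heavy side effects): exercising the returned pipeline the same way generate_llm_answer()'s call_model() does. The repetition_penalty value here is an arbitrary example, not a value taken from the code.

    from app import initialize_llm

    # Loads the PHI model once and returns a transformers pipeline
    llm_client = initialize_llm()

    prompt = "Question: What does initialize_llm() return?\nAnswer:"
    out = llm_client(
        prompt,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.15,
        pad_token_id=llm_client.tokenizer.eos_token_id,
    )

    # Strip the echoed prompt, mirroring the logic added in the diff
    text = out[0]["generated_text"]
    print(text[len(prompt):].strip() if text.startswith(prompt) else text.strip())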