# app/core/llm.py
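"""LLM, embedding, and chat-chain factories for the Hugging Face Space deployment.

Each factory tries progressively smaller models and backends so the app can
still start on a CPU-only Space, falling back to mock objects as a last resort.
"""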
from langchain_community.llms import HuggingFaceEndpoint, HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import sys
import os
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Add project root to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from app.config import HF_API_KEY, LLM_MODEL, EMBEDDING_MODEL, DEFAULT_TEMPERATURE, MAX_TOKENS


def get_llm():
    """Initialize and return the language model."""
    # Set up cache directories with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Don't require an API key in the Spaces environment; use one if it's set
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")
    logger.info(f"Using model: {LLM_MODEL}")
    # Always try local pipeline first (most reliable in Spaces)
    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

        logger.info(f"Loading model {LLM_MODEL} as local pipeline")

        # Try multiple fallbacks with increasingly simpler models
        models_to_try = [
            LLM_MODEL,
            "distilgpt2",               # Smaller fallback
            "gpt2",                     # Standard fallback
            "EleutherAI/gpt-neo-125M",  # Another option
        ]

        last_error = None
        for model_name in models_to_try:
            try:
                logger.info(f"Attempting to load model: {model_name}")
                # Try with explicit loading first
                try:
                    # Set trust_remote_code to False to avoid security issues
                    tokenizer = AutoTokenizer.from_pretrained(
                        model_name,
                        use_auth_token=api_key if api_key else None,
                        trust_remote_code=False,
                    )
                    model = AutoModelForCausalLM.from_pretrained(
                        model_name,
                        use_auth_token=api_key if api_key else None,
                        trust_remote_code=False,
                        low_cpu_mem_usage=True,  # Help with memory issues
                    )
                    # Create pipeline with loaded components
                    pipe = pipeline(
                        "text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_length=MAX_TOKENS,
                        temperature=DEFAULT_TEMPERATURE,
                        device=-1,  # Use CPU
                    )
                    logger.info(f"Successfully loaded model: {model_name}")
                    return HuggingFacePipeline(pipeline=pipe)
                except Exception as e:
                    logger.warning(f"Error loading {model_name} with explicit model/tokenizer: {e}")
                    last_error = e

                    # Try direct pipeline loading
                    pipe = pipeline(
                        "text-generation",
                        model=model_name,
                        max_length=MAX_TOKENS,
                        temperature=DEFAULT_TEMPERATURE,
                        use_auth_token=api_key if api_key else None,
                        device=-1,  # Use CPU
                    )
                    logger.info(f"Successfully loaded model: {model_name} via direct pipeline")
                    return HuggingFacePipeline(pipeline=pipe)
            except Exception as e:
                logger.warning(f"Error loading model {model_name}: {e}")
                last_error = e
                # Continue to the next model
                continue

        # If we get here, all models failed
        logger.error(f"All models failed to load. Last error: {last_error}")
        raise last_error
    except Exception as e:
        logger.warning(f"Error creating local pipeline: {e}")

        # Try the HuggingFaceEndpoint as fallback
        try:
            logger.info("Attempting to use HuggingFaceEndpoint")
            return HuggingFaceEndpoint(
                repo_id="gpt2",
                max_new_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE,
                huggingfacehub_api_token=api_key,
            )
        except Exception as endpoint_error:
            logger.warning(f"HuggingFaceEndpoint failed: {endpoint_error}")
            # Last resort - mock LLM for fallback
            from langchain_community.llms.fake import FakeListLLM

            logger.warning("Using mock LLM as fallback")
            return FakeListLLM(
                responses=[
                    "I'm running in fallback mode due to model loading issues. I have limited capabilities right now.",
                    "I can't access the language model currently. Please check the Space logs for more information.",
                    "I'm operating with a simplified model. For better performance, try running this app locally with proper models configured.",
                ]
            )
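
# Rough usage sketch (hypothetical call site): every branch above returns a
# standard LangChain LLM, so a caller can simply do:
#   llm = get_llm()
#   text = llm.invoke("Write one sentence about Hugging Face Spaces.")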


def get_embeddings():
    """Initialize and return the embeddings model."""
    # Set up cache directories with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Try multiple models with fallbacks
    embedding_models_to_try = [
        EMBEDDING_MODEL,
        "sentence-transformers/all-MiniLM-L6-v2",  # Standard model
        "sentence-transformers/paraphrase-MiniLM-L3-v2",  # Smaller model
        "sentence-transformers/paraphrase-albert-small-v2",  # Even smaller model
    ]

    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")

    for model_name in embedding_models_to_try:
        # Try to use local embeddings
        try:
            logger.info(f"Loading embeddings model: {model_name}")
            return HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder=cache_dir,
                encode_kwargs={"normalize_embeddings": True},
                model_kwargs={"device": "cpu"},  # Ensure using CPU
            )
        except Exception as e:
            logger.warning(f"Error initializing embeddings with {model_name}: {e}")
            # Continue to the next model
    # If all models fail, try with direct transformers access
    try:
        from sentence_transformers import SentenceTransformer

        logger.info("Loading embeddings with SentenceTransformer directly")
        model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

        # Create a custom embeddings class
        class DirectEmbeddings:
            def embed_documents(self, texts):
                return model.encode(texts, normalize_embeddings=True).tolist()

            def embed_query(self, text):
                return model.encode(text, normalize_embeddings=True).tolist()

        return DirectEmbeddings()
    except Exception as e:
        logger.warning(f"Error with direct SentenceTransformer: {e}")

        # Create mock embeddings as last resort
        from langchain_community.embeddings.fake import FakeEmbeddings

        logger.warning("Using mock embeddings as fallback")
        return FakeEmbeddings(size=384)  # Standard size for small embedding models
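
# Rough usage sketch (hypothetical call site): each return path exposes
# embed_documents()/embed_query(), so the result can back a LangChain vector store:
#   embeddings = get_embeddings()
#   vectors = embeddings.embed_documents(["first chunk", "second chunk"])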


def get_chat_model():
    """
    Create a chat-like interface using a regular LLM.

    This is necessary because many free HF models don't have chat interfaces.
    """
    llm = get_llm()

    # Create a chat-like prompt template
    chat_template = """
Context: {context}
Chat History:
{chat_history}
User: {question}
AI Assistant:
"""

    prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=chat_template,
    )

    # Create a chain
    return LLMChain(llm=llm, prompt=prompt)
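

# Minimal smoke test; a sketch assuming this module is run directly inside the
# Space (e.g. `python -m app.core.llm`) and that app.config provides the values
# imported above. The input keys mirror the PromptTemplate in get_chat_model().
if __name__ == "__main__":
    chain = get_chat_model()
    result = chain.invoke(
        {
            "context": "The assistant answers questions about this deployed Space.",
            "chat_history": "",
            "question": "Which model are you running on?",
        }
    )
    # LLMChain returns a dict; the generated completion is under the "text" key
    print(result["text"])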