# app/core/llm.py
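"""LLM, embedding, and chat-chain factories for the Hugging Face Space deployment.

Each factory tries progressively smaller models and backends so the app can
still start on a CPU-only Space, falling back to mock objects as a last resort.
"""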
from langchain_community.llms import HuggingFaceEndpoint, HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import sys
import os
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Add project root to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from app.config import HF_API_KEY, LLM_MODEL, EMBEDDING_MODEL, DEFAULT_TEMPERATURE, MAX_TOKENS


def get_llm():
    """Initialize and return the language model."""
    # Set up cache directories with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Don't require an API key in the Spaces environment; use one if it's set
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")
    logger.info(f"Using model: {LLM_MODEL}")
    # Always try local pipeline first (most reliable in Spaces)
    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

        logger.info(f"Loading model {LLM_MODEL} as local pipeline")

        # Try multiple fallbacks with increasingly simpler models
        models_to_try = [
            LLM_MODEL,
            "distilgpt2",               # Smaller fallback
            "gpt2",                     # Standard fallback
            "EleutherAI/gpt-neo-125M",  # Another option
        ]

        last_error = None
        for model_name in models_to_try:
            try:
                logger.info(f"Attempting to load model: {model_name}")
                # Try with explicit loading first
                try:
                    # Set trust_remote_code to False to avoid security issues
                    tokenizer = AutoTokenizer.from_pretrained(
                        model_name,
                        use_auth_token=api_key if api_key else None,
                        trust_remote_code=False,
                    )
                    model = AutoModelForCausalLM.from_pretrained(
                        model_name,
                        use_auth_token=api_key if api_key else None,
                        trust_remote_code=False,
                        low_cpu_mem_usage=True,  # Help with memory issues
                    )
                    # Create pipeline with loaded components
                    pipe = pipeline(
                        "text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_length=MAX_TOKENS,
                        temperature=DEFAULT_TEMPERATURE,
                        device=-1,  # Use CPU
                    )
                    logger.info(f"Successfully loaded model: {model_name}")
                    return HuggingFacePipeline(pipeline=pipe)
                except Exception as e:
                    logger.warning(f"Error loading {model_name} with explicit model/tokenizer: {e}")
                    last_error = e

                    # Try direct pipeline loading
                    pipe = pipeline(
                        "text-generation",
                        model=model_name,
                        max_length=MAX_TOKENS,
                        temperature=DEFAULT_TEMPERATURE,
                        use_auth_token=api_key if api_key else None,
                        device=-1,  # Use CPU
                    )
                    logger.info(f"Successfully loaded model: {model_name} via direct pipeline")
                    return HuggingFacePipeline(pipeline=pipe)
            except Exception as e:
                logger.warning(f"Error loading model {model_name}: {e}")
                last_error = e
                # Continue to the next model
                continue

        # If we get here, all models failed
        logger.error(f"All models failed to load. Last error: {last_error}")
        raise last_error
    except Exception as e:
        logger.warning(f"Error creating local pipeline: {e}")

        # Try the HuggingFaceEndpoint as fallback
        try:
            logger.info("Attempting to use HuggingFaceEndpoint")
            return HuggingFaceEndpoint(
                repo_id="gpt2",
                max_new_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE,
                huggingfacehub_api_token=api_key,
            )
        except Exception as endpoint_error:
            logger.warning(f"HuggingFaceEndpoint failed: {endpoint_error}")
            # Last resort - mock LLM for fallback
            from langchain_community.llms.fake import FakeListLLM

            logger.warning("Using mock LLM as fallback")
            return FakeListLLM(
                responses=[
                    "I'm running in fallback mode due to model loading issues. I have limited capabilities right now.",
                    "I can't access the language model currently. Please check the Space logs for more information.",
                    "I'm operating with a simplified model. For better performance, try running this app locally with proper models configured.",
                ]
            )
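
# Rough usage sketch (hypothetical call site): every branch above returns a
# standard LangChain LLM, so a caller can simply do:
#   llm = get_llm()
#   text = llm.invoke("Write one sentence about Hugging Face Spaces.")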


def get_embeddings():
    """Initialize and return the embeddings model."""
    # Set up cache directories with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Try multiple models with fallbacks
    embedding_models_to_try = [
        EMBEDDING_MODEL,
        "sentence-transformers/all-MiniLM-L6-v2",  # Standard model
        "sentence-transformers/paraphrase-MiniLM-L3-v2",  # Smaller model
        "sentence-transformers/paraphrase-albert-small-v2",  # Even smaller model
    ]

    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")

    for model_name in embedding_models_to_try:
        # Try to use local embeddings
        try:
            logger.info(f"Loading embeddings model: {model_name}")
            return HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder=cache_dir,
                encode_kwargs={"normalize_embeddings": True},
                model_kwargs={"device": "cpu"},  # Ensure using CPU
            )
        except Exception as e:
            logger.warning(f"Error initializing embeddings with {model_name}: {e}")
            # Continue to the next model
    # If all models fail, try with direct transformers access
    try:
        from sentence_transformers import SentenceTransformer

        logger.info("Loading embeddings with SentenceTransformer directly")
        model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

        # Create a custom embeddings class
        class DirectEmbeddings:
            def embed_documents(self, texts):
                return model.encode(texts, normalize_embeddings=True).tolist()

            def embed_query(self, text):
                return model.encode(text, normalize_embeddings=True).tolist()

        return DirectEmbeddings()
    except Exception as e:
        logger.warning(f"Error with direct SentenceTransformer: {e}")

        # Create mock embeddings as last resort
        from langchain_community.embeddings.fake import FakeEmbeddings

        logger.warning("Using mock embeddings as fallback")
        return FakeEmbeddings(size=384)  # Standard size for small embedding models
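
# Rough usage sketch (hypothetical call site): each return path exposes
# embed_documents()/embed_query(), so the result can back a LangChain vector store:
#   embeddings = get_embeddings()
#   vectors = embeddings.embed_documents(["first chunk", "second chunk"])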


def get_chat_model():
    """
    Create a chat-like interface using a regular LLM.

    This is necessary because many free HF models don't have chat interfaces.
    """
    llm = get_llm()

    # Create a chat-like prompt template
    chat_template = """
Context: {context}
Chat History:
{chat_history}
User: {question}
AI Assistant:
"""

    prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=chat_template,
    )

    # Create a chain
    return LLMChain(llm=llm, prompt=prompt)
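

# Minimal smoke test; a sketch assuming this module is run directly inside the
# Space (e.g. `python -m app.core.llm`) and that app.config provides the values
# imported above. The input keys mirror the PromptTemplate in get_chat_model().
if __name__ == "__main__":
    chain = get_chat_model()
    result = chain.invoke(
        {
            "context": "The assistant answers questions about this deployed Space.",
            "chat_history": "",
            "question": "Which model are you running on?",
        }
    )
    # LLMChain returns a dict; the generated completion is under the "text" key
    print(result["text"])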