"""Model-loading helpers: LLM, embeddings, and a chat-style LLMChain, each with layered fallbacks."""

from langchain_community.llms import HuggingFaceHub
from langchain_community.llms import HuggingFaceEndpoint, HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import sys
import os
import logging


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Make the project root importable so that app.config resolves no matter where
# this module is executed from.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from app.config import HF_API_KEY, LLM_MODEL, EMBEDDING_MODEL, DEFAULT_TEMPERATURE, MAX_TOKENS


def get_llm():
    """Initialize and return the language model, preferring a local pipeline and
    falling back to the hosted endpoint or a mock LLM if loading fails."""
    # Writable cache directory for downloaded model weights.
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Prefer the standard Hugging Face env var; fall back to HF_API_KEY.
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")
    logger.info(f"Using model: {LLM_MODEL}")

    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

        logger.info(f"Loading model {LLM_MODEL} as local pipeline")

        # Candidate models, tried in order: the configured model first, then
        # progressively smaller CPU-friendly fallbacks.
        models_to_try = [
            LLM_MODEL,
            "distilgpt2",
            "gpt2",
            "EleutherAI/gpt-neo-125M"
        ]

        last_error = None

        for model_name in models_to_try:
            try:
                logger.info(f"Attempting to load model: {model_name}")

                # First attempt: load tokenizer and model explicitly, then wrap
                # them in a text-generation pipeline.
                try:
                    tokenizer = AutoTokenizer.from_pretrained(
                        model_name,
                        cache_dir=cache_dir,
                        use_auth_token=api_key if api_key else None,
                        trust_remote_code=False
                    )
                    model = AutoModelForCausalLM.from_pretrained(
                        model_name,
                        cache_dir=cache_dir,
                        use_auth_token=api_key if api_key else None,
                        trust_remote_code=False,
                        low_cpu_mem_usage=True
                    )

                    pipe = pipeline(
                        "text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_length=MAX_TOKENS,
                        temperature=DEFAULT_TEMPERATURE,
                        device=-1  # CPU
                    )

                    logger.info(f"Successfully loaded model: {model_name}")
                    return HuggingFacePipeline(pipeline=pipe)
                except Exception as e:
                    logger.warning(f"Error loading {model_name} with explicit model/tokenizer: {e}")
                    last_error = e

                # Second attempt: let pipeline() download and wire up the model
                # and tokenizer itself.
                pipe = pipeline(
                    "text-generation",
                    model=model_name,
                    max_length=MAX_TOKENS,
                    temperature=DEFAULT_TEMPERATURE,
                    use_auth_token=api_key if api_key else None,
                    device=-1
                )

                logger.info(f"Successfully loaded model: {model_name} via direct pipeline")
                return HuggingFacePipeline(pipeline=pipe)

            except Exception as e:
                logger.warning(f"Error loading model {model_name}: {e}")
                last_error = e
                # Move on to the next candidate model.
                continue

        # Every candidate failed; surface the last error to the outer handler.
        logger.error(f"All models failed to load. Last error: {last_error}")
        raise last_error

    except Exception as e:
        logger.warning(f"Error creating local pipeline: {e}")

        # Local loading failed entirely; fall back to the hosted inference endpoint.
        try:
            logger.info("Attempting to use HuggingFaceEndpoint")
            return HuggingFaceEndpoint(
                repo_id="gpt2",
                max_length=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE,
                huggingfacehub_api_token=api_key
            )
        except Exception as endpoint_error:
            logger.warning(f"HuggingFaceEndpoint failed: {endpoint_error}")

        # Last resort: a canned-response mock so the app can still start.
        from langchain_community.llms.fake import FakeListLLM
        logger.warning("Using mock LLM as fallback")
        return FakeListLLM(
            responses=[
                "I'm running in fallback mode due to model loading issues. I have limited capabilities right now.",
                "I can't access the language model currently. Please check the Space logs for more information.",
                "I'm operating with a simplified model. For better performance, try running this app locally with proper models configured."
            ]
        )


def get_embeddings():
    """Initialize and return the embeddings model, trying progressively smaller
    models and falling back to mock embeddings."""
    # Writable cache directory for downloaded embedding models.
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Candidate embedding models, tried in order: the configured model first,
    # then small sentence-transformers fallbacks.
    embedding_models_to_try = [
        EMBEDDING_MODEL,
        "sentence-transformers/all-MiniLM-L6-v2",
        "sentence-transformers/paraphrase-MiniLM-L3-v2",
        "sentence-transformers/paraphrase-albert-small-v2"
    ]

    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")

    for model_name in embedding_models_to_try:
        try:
            logger.info(f"Loading embeddings model: {model_name}")
            return HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder=cache_dir,
                encode_kwargs={"normalize_embeddings": True},
                model_kwargs={"device": "cpu"}
            )
        except Exception as e:
            logger.warning(f"Error initializing embeddings with {model_name}: {e}")

    # All LangChain wrappers failed; try sentence-transformers directly.
    try:
        from sentence_transformers import SentenceTransformer
        logger.info("Loading embeddings with SentenceTransformer directly")
        model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

        # Minimal adapter exposing the embed_documents/embed_query interface
        # that LangChain vector stores expect.
        class DirectEmbeddings:
            def embed_documents(self, texts):
                return model.encode(texts, normalize_embeddings=True).tolist()

            def embed_query(self, text):
                return model.encode(text, normalize_embeddings=True).tolist()

        return DirectEmbeddings()
    except Exception as e:
        logger.warning(f"Error with direct SentenceTransformer: {e}")

    # Last resort: random embeddings so the rest of the pipeline keeps working.
    from langchain_community.embeddings.fake import FakeEmbeddings
    logger.warning("Using mock embeddings as fallback")
    return FakeEmbeddings(size=384)
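
# Illustrative usage of get_embeddings() (values below are only examples): every
# return path exposes embed_query()/embed_documents(), e.g.
#   vector = get_embeddings().embed_query("hello")
# which yields a list of floats (384 of them for all-MiniLM-L6-v2 or the mock fallback).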


def get_chat_model():
    """
    Create a chat-like interface using a regular LLM.
    This is necessary because many free HF models don't have chat interfaces.
    """
    llm = get_llm()

    chat_template = """
Context: {context}

Chat History:
{chat_history}

User: {question}
AI Assistant:
"""

    prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=chat_template
    )

    return LLMChain(llm=llm, prompt=prompt)
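

if __name__ == "__main__":
    # Quick manual smoke test; purely illustrative, the inputs below are made up.
    # Chain.invoke() takes a dict of the prompt's input variables and returns a
    # dict whose generated text is stored under the "text" key.
    chat_chain = get_chat_model()
    result = chat_chain.invoke({
        "context": "No documents have been retrieved yet.",
        "chat_history": "",
        "question": "Hello, are you working?",
    })
    print(result.get("text", result))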