import os
import subprocess
from typing import Any, List, Mapping, Optional

import pinecone
import torch
from dotenv import load_dotenv
from langchain import HuggingFacePipeline, LLMChain, PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms.base import LLM
from langchain.vectorstores import Pinecone
from transformers import pipeline

# Load credentials from a local .env file if present.
load_dotenv()
try:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
except TypeError:
    # os.environ rejects None when the token is not set locally; in that case
    # read the Pinecone key from the CI secret instead.
    PINECONE_API_KEY = subprocess.check_output(
        ["bash", "-c", "echo ${{ secrets.PINECONE_API_KEY }}"]
    ).decode("utf-8").strip()


class CustomLLM(LLM):
    """LangChain wrapper around a local Dolly text-generation pipeline."""

    model_name = "databricks/dolly-v2-3b"
    num_output = 128
    # The pipeline is built once, when the class is defined, so the model is
    # loaded a single time and shared by every call.
    pipeline = pipeline(
        model=model_name,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        return_full_text=True,
        do_sample=False,
        max_new_tokens=128,
    )

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        prompt_length = len(prompt)
        response = self.pipeline(prompt, max_new_tokens=self.num_output)[0]["generated_text"]
        # Only return the newly generated tokens, not the echoed prompt.
        return response[prompt_length:]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"name_of_model": self.model_name}

    @property
    def _llm_type(self) -> str:
        return "custom"


def get_llm(model_name, pinecone_index, llm_model):
    # Embedding model (e.g. "bert-large-uncased") used for the Pinecone queries.
    model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
    embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

    # Connect to the existing Pinecone index and wrap it as a LangChain vector store.
    pinecone.init(api_key=PINECONE_API_KEY, environment="us-east-1-aws")
    index = pinecone.Index(pinecone_index)
    docsearch = Pinecone(index, embeddings.embed_query, "text")

    # Load the instruction-following model and wrap it for LangChain.
    instruct_pipeline = pipeline(
        model=llm_model,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        return_full_text=True,
        do_sample=False,
        max_new_tokens=128,
    )
    llm = HuggingFacePipeline(pipeline=instruct_pipeline)

    template = """Context: {context}

Question: {question}

Answer: Let's go step by step."""
    prompt = PromptTemplate(template=template, input_variables=["question", "context"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    return llm_chain, docsearch


if __name__ == "__main__":
    model_name = "bert-large-uncased"       # embedding model
    pinecone_index = "bert-large-uncased"   # Pinecone index name
    llm_model = "databricks/dolly-v2-3b"    # instruction-following LLM
    llm_chain, docsearch = get_llm(model_name, pinecone_index, llm_model)
    print(":" * 40)

    questions = [
        "what is the name of the first Hindi newspaper published in Bihar?",
        "what is the capital of Bihar?",
        "Brief about the Gupta Dynasty",
    ]
    for question in questions:
        # Retrieve the three most similar chunks and concatenate them as the context.
        context_docs = docsearch.similarity_search(question, k=3)
        content = "".join(doc.page_content for doc in context_docs)
        print(question)
        response = llm_chain.predict(question=question, context=content)
        print(f"{response}\n{'--' * 25}")