"""Inference-endpoint handler for a Richard Feynman persona chatbot.

Wires up a RAG (retrieval-augmented generation) chain:
Chroma retriever -> persona prompt -> LoRA-adapted BLOOM-7B text-generation
pipeline -> post-processing that strips the echoed prompt.
"""

from typing import Any, Dict, List

import torch
from langchain.chains import LLMChain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import Document
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from peft import PeftConfig, PeftModel
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)


def format_docs(docs):
    """Concatenate retrieved documents' text into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


def convert_to_string(prompt_value):
    """Extract the plain prompt string from a LangChain StringPromptValue."""
    return prompt_value.to_string()


def outputParser(lst):
    """Return only the text generated after the last 'ASSISTANT:' marker.

    The generation pipeline runs with return_full_text=True, so the prompt
    (which ends in 'ASSISTANT:') is echoed back in `generated_text`;
    splitting on the marker isolates the model's reply.
    """
    return (lst[0]['generated_text']).split('ASSISTANT:')[-1]


class EndpointHandler():
    """HuggingFace Inference Endpoint entry point.

    Loads the base model, LoRA adapter, tokenizer, embeddings and the
    Chroma vector store once at startup, then serves requests through
    `__call__` by invoking the assembled RAG chain.
    """

    def __init__(self, path=""):
        # `path` is part of the Inference Endpoints handler contract; the
        # model/adapter locations below are hard-coded hub ids instead.
        DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
        model_name = "moka-ai/m3e-base"  # embedding model for retrieval

        # NOTE(review): PeftConfig/AutoTokenizer don't obviously consume
        # `device_map`; kept as-is to preserve behavior — confirm and drop.
        self.config = PeftConfig.from_pretrained(
            "jliang15/5epoch", device_map=DEVICE
        )
        self.base_model = AutoModelForCausalLM.from_pretrained(
            "bigscience/bloom-7b1", device_map="auto"
        )
        # Wrap the base model with the fine-tuned LoRA adapter.
        self.newmodel = PeftModel.from_pretrained(
            self.base_model, "jliang15/5epoch", device_map=DEVICE
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            "jliang15/5epoch", device_map=DEVICE
        )

        # return_full_text=True means the prompt is echoed in the output;
        # outputParser relies on that to split on 'ASSISTANT:'.
        text_generation_pipeline = pipeline(
            model=self.newmodel,
            tokenizer=self.tokenizer,
            task="text-generation",
            do_sample=True,
            temperature=0.6,
            repetition_penalty=1.2,
            return_full_text=True,
            max_new_tokens=400,
            top_k=30,
            top_p=0.8,
            no_repeat_ngram_size=2,
        )

        self.prompt_template = '''Forget you are an AI model. Now you are physicist Richard Feynman. Generate a response that sounds as close to what Richard Feynman would say. 
You can also use this auxiliary knowledge to help: - You are well-known for your significant contributions to particle physics, quantum electrodynamics, and quantum mechanics. - You have a ability for making difficult ideas understandable to others without diminishing their depth. - You are known for your insatiable curiosity, sense of humor, and unorthodox teaching style. - You enjoy solving problems and have a talent for telling stories. Context: {context} USER: {question} ASSISTANT:'''
        self.prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=self.prompt_template,
        )

        embeddings = HuggingFaceEmbeddings(
            model_name=model_name, model_kwargs={'device': DEVICE}
        )
        # Replace the lazily-created client so the embedder runs on DEVICE.
        embeddings.client = SentenceTransformer(model_name, device=DEVICE)
        ragdb = Chroma(persist_directory="/feynman", embedding_function=embeddings)
        retriever = ragdb.as_retriever(search_kwargs={'k': 3})

        # question -> (retrieved context + question) -> prompt -> generate
        # -> strip echoed prompt.
        self.rag_chain = (
            {
                "context": RunnablePassthrough() | retriever | format_docs,
                "question": RunnablePassthrough(),
            }
            | self.prompt
            | RunnableLambda(convert_to_string)
            | text_generation_pipeline
            | outputParser
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Handle one inference request.

        Args:
            data: request payload; the question is read from ``data["inputs"]``.
                If the key is absent, the whole payload is passed to the chain.
                Note: ``pop`` mutates the caller's dict.

        Return:
            A one-element list with the generated reply under both
            ``"raw_result"`` and ``"result"`` (serialized by the endpoint).
        """
        message = data.pop("inputs", data)
        res = self.rag_chain.invoke(message)
        return [{"raw_result": res, "result": res}]