import fastapi
from fastapi.responses import JSONResponse
from fastapi_users import schemas
from time import time
#from fastapi.middleware.cors import CORSMiddleware
#MODEL_PATH = "./qwen1_5-0_5b-chat-q4_0.gguf" #"./qwen1_5-0_5b-chat-q4_0.gguf"
import logging
from langchain_community.llms import LlamaCpp
import llama_cpp
import llama_cpp.llama_tokenizer
from pydantic import BaseModel
from fastapi import APIRouter
from app.users import current_active_user
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnablePick
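# `format_docs` is referenced by the RAG chain below but never defined in this
# module; this minimal helper (an assumption on our part) joins the retrieved
# documents' page contents into a single context string for the prompt.
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)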
rag_prompt_llama = hub.pull("rlm/rag-prompt-llama")
# rag_prompt_llama.messages  # uncomment to inspect the pulled prompt's messages
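# Qwen1.5-0.5B-Chat in 4-bit GGUF form, pulled from the Hugging Face Hub and run
# on CPU (n_gpu_layers=0) with a 512-token context; the HF tokenizer is reused so
# tokenization matches the original model. This instance backs the RAG chain below.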
llm = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q4_0.gguf",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat"),
verbose=False,
n_ctx=512,
n_gpu_layers=0,
#chat_format="llama-2"
)
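# RagChat wires a small retrieval-augmented pipeline: fetch and split a web page,
# embed the chunks into a Chroma vector store, then answer questions by retrieving
# relevant chunks and prompting the local model with them.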
class RagChat:
def agent(self):
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)
return all_splits
def download_embedding(self):
        vectorstore = Chroma.from_documents(documents=self.agent(), embedding=GPT4AllEmbeddings())
return vectorstore
    def chat(self, question):
        retriever = self.download_embedding().as_retriever()
        chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | rag_prompt_llama
            # llama_cpp.Llama is not a LangChain runnable, so render the prompt
            # to a string and call the model directly.
            | (lambda prompt_value: llm(prompt_value.to_string(), max_tokens=256)["choices"][0]["text"])
            | StrOutputParser()
        )
        # The first chain step builds the context itself, so invoke with the bare question.
        return chain.invoke(question)
def search(self, question):
docs = self.download_embedding().similarity_search(question)
return docs
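# Request schemas: GenModel drives the /generate completion endpoint and ChatModel
# drives the /chat/ and /rag/ endpoints. The mirostat_* fields mirror llama.cpp's
# Mirostat sampling parameters.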
class GenModel(BaseModel):
question: str
system: str = "You are a helpful medical AI chat assistant. Help as much as you can.Also continuously ask for possible symptoms in order to atat a conclusive ailment or sickness and possible solutions.Remember, response in English."
temperature: float = 0.8
seed: int = 101
mirostat_mode: int=2
mirostat_tau: float=4.0
mirostat_eta: float=1.1
class ChatModel(BaseModel):
question: list
system: str = "You are chatDoctor, a helpful health and medical assistant. You are chatting with a human. Help as much as you can. Also continuously ask for possible symptoms in order to a conclusive ailment or sickness and possible solutions.Remember, response in English."
temperature: float = 0.8
seed: int = 101
mirostat_mode: int=2
mirostat_tau: float=4.0
mirostat_eta: float=1.1
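# Separate model instance for the multi-turn /chat/ endpoint; configured the same
# way as `llm` above (CPU only, 512-token context, HF tokenizer for chat templating).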
llm_chat = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q4_0.gguf",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat"),
verbose=False,
n_ctx=512,
n_gpu_layers=0,
#chat_format="llama-2"
)
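# Model instance backing the /generate text-completion endpoint, loaded with a
# larger 4096-token context; the mirostat_* arguments mirror the GenModel defaults.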
llm_generate = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q4_0.gguf",
#tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat"),
verbose=False,
n_ctx=4096,
n_gpu_layers=0,
mirostat_mode=2,
mirostat_tau=4.0,
mirostat_eta=1.1,
#chat_format="llama-2"
)
# Logger setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#app = fastapi.FastAPI(
#title="OpenGenAI",
#description="Your Excellect AI Physician")
"""
app.add_middleware(
CORSMiddleware,
allow_origins = ["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"]
)
"""
llm_router = APIRouter(prefix="/llm")
@llm_router.get("/health", tags=["llm"])
def health():
return {"status": "ok"}
@llm_router.post("/rag/", tags=["llm"])
async def ragchat(chatm: ChatModel):  # , user: schemas.BaseUser = fastapi.Depends(current_active_user)):
    r = RagChat().chat(chatm.question)
    print(r)
    return r
# Chat Completion API
@llm_router.post("/chat/", tags=["llm"])
async def chat(chatm:ChatModel):#, user: schemas.BaseUser = fastapi.Depends(current_active_user)):
#chatm.system = chatm.system.format("")#user.email)
try:
st = time()
output = llm_chat.create_chat_completion(
messages = chatm.question,
temperature = chatm.temperature,
seed = chatm.seed,
#stream=True
)
print(output)
#print(output)
et = time()
output["time"] = et - st
#messages.append({'role': "assistant", "content": output['choices'][0]['message']['content']})
#print(messages)
return output
except Exception as e:
logger.error(f"Error in /complete endpoint: {e}")
return JSONResponse(
status_code=500, content={"message": "Internal Server Error"}
)
# Text Completion API
@llm_router.post("/generate", tags=["llm"])
async def generate(gen:GenModel):#, user: schemas.BaseUser = fastapi.Depends(current_active_user)):
gen.system = "You are an helpful medical AI assistant."
gen.temperature = 0.5
gen.seed = 42
try:
#st = time()
output = llm_generate.create_completion(
#messages=[
# {"role": "system", "content": gen.system},
# {"role": "user", "content": gen.question},
# ],
gen.question,
temperature = gen.temperature,
seed= gen.seed,
#chat_format="llama-2",
stream=True,
echo = True
)
        # create_completion streams plain-text chunks: each one carries the next
        # piece of generated text under choices[0]["text"] (with echo=True the
        # prompt is streamed back first), not chat-style role/content deltas.
        generated = ""
        for chunk in output:
            text = chunk["choices"][0]["text"]
            print(text, end="")
            generated += text
        #et = time()
        #output["time"] = et - st
        return {"text": generated}
except Exception as e:
logger.error(f"Error in /generate endpoint: {e}")
return JSONResponse(
status_code=500, content={"message": "Internal Server Error"}
)
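# Example wiring (a sketch only; assumes this module is importable as app.llm and
# that the FastAPI app is created elsewhere, e.g. in app/main.py):
#
#   from fastapi import FastAPI
#   from app.llm import llm_router  # hypothetical import path for this file
#
#   app = FastAPI(title="OpenGenAI", description="Your Excellent AI Physician")
#   app.include_router(llm_router)
#
# GET /llm/health should then return {"status": "ok"}.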