"""Flask chat app that answers questions with a retrieval-augmented LLM chain."""

import time

from flask import Flask, redirect, render_template, request, url_for
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.llms import CTransformers, LlamaCpp
from langchain_community.vectorstores import Pinecone

app = Flask(__name__)

# Setup Flask-Limiter: default limits for the app, keyed by the client's
# remote address.
limiter = Limiter(
    app=app,
    key_func=get_remote_address,
    default_limits=["200 per day", "20 per hour"],
)

# Initialize embeddings directly; the same object is handed to the vector
# store further down.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Prompt for the QA chain. {context} and {question} are the two template
# variables filled in for every query.
# NOTE(review): the line breaks inside this template were reconstructed from a
# whitespace-collapsed source -- confirm against the intended prompt layout.
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer say that you don't know it, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the correct answer in human readable text and avoide printing programming code!
Make it short with no more text than needed and do not repeat your answers or the question!
"""

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="model/phi-2.Q2_K.gguf",
    temperature=0.1,
    max_tokens=128,
    # NOTE(review): LangChain's LlamaCpp documents this option as
    # `repeat_penalty`; confirm `repetition_penalty` is honoured by the
    # installed version before relying on it.
    repetition_penalty=1,
    top_p=1,
    verbose=True,  # Verbose is required to pass to the callback manager
)

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Vector store over the existing "medicalbot" Pinecone index; similarity
# search returns the 2 closest documents per query.
docsearch = Pinecone.from_existing_index("medicalbot", embeddings)
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 2})

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

# Chat history.
# NOTE(review): this list lives at module level, so every client of this
# process reads and writes the same conversation -- confirm that is intended.
MAX_HISTORY = 10
messages = []


@app.route("/", methods=["GET"])
@limiter.limit("10/minute")
def home():
    """Render the chat page with the current message history."""
    return render_template("home.html", messages=messages)


@app.route("/post_message", methods=["POST"])
def post_message():
    """Answer the submitted chat message and redirect back to the chat page.

    Reads the ``message`` form field, records it in ``messages``, runs it
    through the QA chain and records the answer together with the elapsed
    time. If the chain fails, the error is logged and an apology is recorded
    instead. A missing or blank ``message`` is ignored.

    Returns:
        A redirect response to the ``home`` view.
    """
    msg = request.form.get("message", "")
    if not msg.strip():
        # Nothing to ask: skip inference entirely instead of querying the
        # model with empty text.
        return redirect(url_for("home"))

    # Reset a full history *before* recording the new question, so the
    # current exchange is never wiped together with the old ones.
    if len(messages) >= MAX_HISTORY:
        messages.clear()
    messages.append({"sender": "user", "text": msg})

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    try:
        bot_response = qa.invoke({"query": msg})
        elapsed = time.perf_counter() - start_time
        reply = f"{bot_response['result']} (Response time: {elapsed:.2f} seconds)"
        messages.append({"sender": "bot", "text": reply})
    except Exception:
        # Request boundary: keep the page usable whatever the chain raised,
        # but log the full traceback rather than only the message.
        app.logger.exception("Error processing the message")
        messages.append({"sender": "bot", "text": "Sorry, I couldn't process your request."})
    return redirect(url_for("home"))


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)