import json
import os
import shutil
import warnings
from datetime import datetime

from dotenv import load_dotenv

# Load the .env file first so that every later environment lookup
# (notably OPENAI_API_KEY below) can see the values it defines.
load_dotenv()

# import nltk
# nltk.download("punkt")
import nltk
import openai
import requests
from flask import Flask, jsonify, render_template, request
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename

from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chains import VectorDBQA
from langchain.document_loaders import SeleniumURLLoader, PyPDFLoader
from langchain.document_loaders import UnstructuredFileLoader, TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferWindowMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

warnings.filterwarnings("ignore")

# Fail fast (KeyError) at start-up when the key is not configured.
openai.api_key = os.environ["OPENAI_API_KEY"]

# app = Flask(__name__)
app = Flask(__name__, template_folder="./")

# Create a directory in a known location to save files to.
uploads_dir = os.path.join(app.root_path, 'static', 'searchUploads')
os.makedirs(uploads_dir, exist_ok=True)

# Vector store queried by /post_json. It stays None until the /file_upload
# route assigns it through ``global vectordb``.
vectordb = None


def pretty_print_docs(docs):
    """Print every document in *docs* to stdout, separated by a dashed rule.

    For each document the 1-based position, the length of ``page_content``,
    ``metadata['source']`` and the content itself are printed.

    Raises:
        KeyError: if a document's metadata has no ``'source'`` entry.
    """
    separator = f"\n{'-' * 100}\n"
    rendered = []
    for position, doc in enumerate(docs, start=1):
        rendered.append(
            f"Document {position}:\n\n"
            + "Document Length>>>" + str(len(doc.page_content))
            + "\n\nDocument Source>>> " + doc.metadata['source']
            + "\n\nContent>>> " + doc.page_content
        )
    print(separator.join(rendered))


def getEmbeddingModel(embeddingId):
    """Return the embedding model used by the application.

    *embeddingId* is accepted but currently ignored: an ``OpenAIEmbeddings``
    instance is always returned. (A SentenceTransformer alternative for
    ``embeddingId == 1`` used to exist here as commented-out code.)
    """
    return OpenAIEmbeddings()


def clearKBUploadDirectory(uploads_dir):
    """Delete every entry found directly inside *uploads_dir*.

    Regular files and symlinks are unlinked; sub-directories are removed
    recursively. Deletion is best effort: a failure is printed and the loop
    continues with the next entry.
    """
    for filename in os.listdir(uploads_dir):
        file_path = os.path.join(uploads_dir, filename)
        print("Clearing Doc Directory. Trying to delete" + file_path)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))


@app.route('/', methods=['GET'])
def test():
    """Return a fixed greeting string."""
    return "Docker hello"


@app.route('/KBUploader')
def KBUpload():
    """Render the ``KBTrain.html`` template."""
    return render_template("KBTrain.html")


@app.route('/aiassist')
def aiassist():
    """Render the ``index.html`` template."""
    return render_template("index.html")


@app.route('/post_json', methods=['POST'])
def post_json():
    """Run a similarity search for the JSON ``query`` and return the top hits.

    Expects a JSON body of the form ``{"query": "<text>"}``.

    Returns:
        ``{"botMessage": [...]}`` where each entry has ``documentSource``,
        ``pageContent`` and ``similarityScore`` (as a string); the plain
        text ``'Content-Type not supported!'`` for non-JSON requests; or a
        400 response with an ``error`` field when the body has no string
        ``query`` or when no vector store has been built yet.
    """
    print(f"\n{'*' * 100}\n")
    print("Request Received >>>>>>>>>>>>>>>>>>", datetime.now().strftime("%H:%M:%S"))

    # is_json also accepts "application/json; charset=utf-8", which an exact
    # string comparison against the header would reject.
    if not request.is_json:
        return 'Content-Type not supported!'

    payload = request.get_json(silent=True)
    query = payload.get('query') if isinstance(payload, dict) else None
    if not isinstance(query, str):
        return jsonify(botMessage=[], error="Request body must be JSON with a string 'query'."), 400

    # The store only exists after a successful upload; answer explicitly
    # instead of failing with a server error.
    if vectordb is None:
        return jsonify(botMessage=[], error="Knowledge base is empty; upload documents first."), 400

    print("Search Query:" + query)
    # NOTE(review): ``distance_metric`` is kept as in the original call;
    # whether the vector store honours, ignores or rejects this keyword
    # depends on the installed LangChain version - confirm.
    relevantDoc = vectordb.similarity_search_with_score(query, distance_metric="cos", k=3)

    searchResultArray = []
    for hit in relevantDoc:
        # Each hit is indexed as (document, score) by the original code.
        document, score = hit[0], hit[1]
        searchResult = {
            'documentSource': document.metadata['source'],
            'pageContent': document.page_content,
            'similarityScore': str(score),
        }
        print(f"\n{'-' * 100}\n")
        print(hit)
        print("Document Source>>>>>> " + searchResult['documentSource'] + "\n\n")
        print("Page Content>>>>>> " + searchResult['pageContent'] + "\n\n")
        print("Similarity Score>>>> " + searchResult['similarityScore'])
        print(f"\n{'-' * 100}\n")
        searchResultArray.append(searchResult)

    print(f"\n{'*' * 100}\n")
    return jsonify(botMessage=searchResultArray)
@app.route('/file_upload', methods=['POST'])
def file_Upload():
    """Rebuild the global vector store from uploaded PDFs and/or web URLs.

    Form fields:
        ``files[]``: zero or more uploaded files, each loaded with
            ``PyPDFLoader``.
        ``weburl``: optional ``;``-separated list of URLs, loaded with
            ``SeleniumURLLoader``.

    When at least one file is provided the upload directory is emptied
    first. All loaded documents are split into 500-character chunks with a
    150-character overlap, embedded with ``OpenAIEmbeddings`` and stored in
    a new Chroma collection that is assigned to the module-level
    ``vectordb``.

    Returns:
        The rendered ``index.html`` template on success, or a plain-text
        400 response when nothing could be loaded. In that case the
        existing ``vectordb`` is left untouched.
    """
    # Missing form fields yield empty lists here rather than an IndexError.
    uploaded_files = [f for f in request.files.getlist('files[]') if f.filename]
    raw_urls = request.form.get('weburl', '')
    # Drop empty fragments such as those produced by a trailing ';'.
    urlList = [url.strip() for url in raw_urls.split(';') if url.strip()]

    fileprovided = bool(uploaded_files)
    urlProvided = bool(urlList)
    print("*******")
    print("File Provided:" + str(fileprovided))
    print("URL Provided:" + str(urlProvided))
    print("*******")

    documents = []

    if fileprovided:
        # Delete files left over from previous uploads.
        clearKBUploadDirectory(uploads_dir)

        # Read and embed the new files provided.
        for file in uploaded_files:
            print("File Received>>>" + file.filename)
            safe_name = secure_filename(file.filename)
            if not safe_name:
                # secure_filename can reduce a name to '', which would make
                # the save target the upload directory itself.
                print("Skipping file with unusable name>>>" + file.filename)
                continue
            saved_path = os.path.join(uploads_dir, safe_name)
            file.save(saved_path)
            # loader = UnstructuredFileLoader(saved_path, mode='elements')
            loader = PyPDFLoader(saved_path)
            documents.extend(loader.load())

    if urlProvided:
        print(urlList)
        print("Selenium Started", datetime.now().strftime("%H:%M:%S"))
        # urlLoader = RecursiveUrlLoader(urlList[0])
        urlLoader = SeleniumURLLoader(urlList)
        documents.extend(urlLoader.load())
        # Logged after load() so the timestamp reflects the actual fetch.
        print("Selenium Completed", datetime.now().strftime("%H:%M:%S"))

    print(uploads_dir)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
    # text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=150, separator="")
    texts = text_splitter.split_documents(documents)

    if not texts:
        # Nothing to index: keep the current store instead of replacing it
        # with an empty (or failing) rebuild.
        return "No file or URL content was provided to index.", 400

    print("All chunk List START ***********************\n\n")
    pretty_print_docs(texts)
    print("All chunk List END ***********************\n\n")

    # A SentenceTransformer / HuggingFace embedding model was previously
    # sketched here as commented-out code; OpenAI embeddings are what is used.
    embeddings = OpenAIEmbeddings()

    global vectordb
    # vectordb = Chroma.from_documents(texts, embeddings)
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        collection_metadata={"hnsw:space": "cosine"},
    )
    return render_template("index.html")


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))