import os import gradio as gr import time from langchain.document_loaders import PDFMinerLoader,CSVLoader ,UnstructuredWordDocumentLoader,TextLoader,OnlinePDFLoader from langchain.text_splitter import CharacterTextSplitter from langchain.embeddings import SentenceTransformerEmbeddings from langchain.vectorstores import FAISS from langchain import HuggingFaceHub from langchain.chains import RetrievalQA from langchain.prompts import PromptTemplate DEVICE = 'cpu' FILE_EXT = ['pdf','text','csv','word','wav'] DEFAULT_SYSTEM_PROMPT = "As a chatbot you are answering set of questions being requested ." MAX_NEW_TOKENS = 4096 DEFAULT_TEMPERATURE = 0.1 DEFAULT_MAX_NEW_TOKENS = 2048 MAX_INPUT_TOKEN_LENGTH = 4000 def loading_file(): return "Loading..." def get_openai_chat_model(API_key): try: from langchain.llms import OpenAI except ImportError as err: raise "{}, unable to load openAI. Please install openai and add OPENAIAPI_KEY" os.environ["OPENAI_API_KEY"] = API_key llm = OpenAI() return llm def process_documents(documents,data_chunk=1500,chunk_overlap=100): text_splitter = CharacterTextSplitter(chunk_size=data_chunk, chunk_overlap=chunk_overlap,separator='\n') texts = text_splitter.split_documents(documents) return texts def get_hugging_face_model(model_id,API_key,temperature=0.1,max_tokens=4096): chat_llm = HuggingFaceHub(huggingfacehub_api_token=API_key, repo_id=model_id, model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens}) return chat_llm def chat_application(llm_service,key,temperature=0.1,max_tokens=1024): if llm_service == 'HuggingFace': llm = get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct',API_key=key) else: llm = get_openai_chat_model(API_key=key) return llm def document_loader(file_path,api_key,doc_type='pdf',llm='Huggingface',temperature=0.1,max_tokens=4096): document = None if doc_type == 'pdf': document = process_pdf_document(document_file=file_path) elif doc_type == 'text': document = process_text_document(document_file=file_path) elif doc_type == 'csv': document = process_csv_document(document_file=file_path) elif doc_type == 'word': document = process_word_document(document_file=file_path) print("Document :",document) embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-base',model_kwargs={"device": DEVICE}) texts = process_documents(documents=document) global vector_db vector_db = FAISS.from_documents(documents=texts, embedding= embedding_model) global qa qa = RetrievalQA.from_chain_type(llm=chat_application(llm_service=llm,key=api_key, temperature=temperature, max_tokens=max_tokens ), chain_type='stuff', retriever=vector_db.as_retriever(), # chain_type_kwargs=chain_type_kwargs, return_source_documents=True ) return "Document Processing completed ..." def process_text_document(document_file): loader = TextLoader(document_file.name) document = loader.load() return document def process_csv_document(document_file): loader = CSVLoader(file_path=document_file.name) document = loader.load() return document def process_word_document(document_file): loader = UnstructuredWordDocumentLoader(file_path=document_file.name) document = loader.load() return document def process_pdf_document(document_file): print("Document File Name :",document_file.name) loader = PDFMinerLoader(document_file.name) document = loader.load() return document def clear_chat(): return [] def infer(question, history): # res = [] # # for human, ai in history[:-1]: # # pair = (human, ai) # # res.append(pair) # chat_history = res print("Question in infer :",question) result = qa({"query": question}) matching_docs_score = vector_db.similarity_search_with_score(question) print(" Matching_doc ",matching_docs_score) return result["result"] def bot(history): response = infer(history[-1][0], history) history[-1][1] = "" for character in response: history[-1][1] += character time.sleep(0.05) yield history def add_text(history, text): history = history + [(text, None)] return history, "" css=""" #col-container {max-width: 700px; margin-left: auto; margin-right: auto;} """ title = """
Upload a file from system,UpLoad file and generate embeddings,
once status is ready, you can start asking questions about the data you uploaded without chat history
and gives you option to use HuggingFace/OpenAI as LLM's, make sure to add your key.