"""Gradio front end for asking questions about an uploaded PDF.

The uploaded file is split into chunks, embedded into a Chroma vector store
and queried through a retrieval-augmented LangChain pipeline.
"""

import os
import uuid
from pathlib import Path
from typing import List, Optional

import gradio as gr
import openai
from langchain import hub
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pypdf import PdfReader, PdfWriter


def _format_docs(docs) -> str:
    """Join the text of the retrieved documents, separated by blank lines."""
    return '\n\n'.join(doc.page_content for doc in docs)


def build_rag_chain(pdf_paths: List[str], chunk_size: int, chunk_overlap: int, model_name: str):
    """Build a retrieval-augmented question-answering chain over the given PDFs.

    Args:
        pdf_paths: Paths of the PDF files to index.
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Chunk overlap handed to ``RecursiveCharacterTextSplitter``.
        model_name: Chat model passed to ``ChatOpenAI`` (models tried during
            development: 'gpt-3.5-turbo-0125', 'gpt-4-1106-preview',
            'gpt-4-0125-preview').

    Returns:
        A runnable chain; ``chain.invoke(question)`` yields the answer text
        produced by ``StrOutputParser``.
    """
    # Every page is kept. The original code sliced with `[0:]` under a
    # "skip first page" note, which drops nothing; switch to `[1:]` here if
    # the first page really should be excluded.
    pages = []
    for path in pdf_paths:
        pages.extend(PyPDFLoader(path).load())

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    splits = splitter.split_documents(pages)

    # Use a fresh collection for every build. Without an explicit name all
    # builds in this process write to the same default collection, so
    # retrieval could return chunks from previously indexed PDFs.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=OpenAIEmbeddings(),
        collection_name=f"rag-{uuid.uuid4().hex}",
    )
    retriever = vectorstore.as_retriever()

    prompt = hub.pull("rlm/rag-prompt")
    llm = ChatOpenAI(model_name=model_name, temperature=0)

    return (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )


def predict(
    query: str,
    pdf_id: Optional[str] = None,
    user_id: Optional[str] = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    model_name: str = 'gpt-4-0125-preview',
) -> str:
    """Answer ``query`` using the uploaded PDF as the knowledge source.

    Args:
        query: The user's question.
        pdf_id: The uploaded PDF as delivered by the upload component.
        user_id: Currently unused; kept for interface compatibility.
        chunk_size: Forwarded to ``build_rag_chain``.
        chunk_overlap: Forwarded to ``build_rag_chain``.
        model_name: Forwarded to ``build_rag_chain``.

    Returns:
        The chain's answer, or a prompt to upload a file when none was given.
    """
    if not pdf_id:
        return "Please upload PDF file"

    # Depending on the Gradio version the upload arrives either as a path
    # string or as a temp-file object exposing the path via `.name`.
    pdf_path = getattr(pdf_id, "name", pdf_id)

    rag_chain = build_rag_chain(
        [pdf_path],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        model_name=model_name,
    )
    return rag_chain.invoke(query)


# Sample questions (in Arabic in the original notes) used while developing
# against a VAT document, translated:
#   - Is there a fine for failing to pay value-added tax?
#   - What is value-added tax?
#   - What are the obligations of those subject to value-added tax?
#   - Who is a person subject to value-added tax?
#   - When must a person register for value-added tax?
#   - I want to sell a house; is that subject to value-added tax?

# The textbox label is Arabic for "Write your question here".
textbox = gr.Textbox(label="اكتب سؤالك هنا", placeholder="", lines=4)
upload_btn = gr.UploadButton(label='Upload a PDF file.')

iface = gr.Interface(fn=predict, inputs=[textbox, upload_btn], outputs="text")
iface.launch(share=True)