elshehawy committed on
Commit
c1fc26e
1 Parent(s): 344fa1c

add application files

Files changed (3)
  1. app.py +97 -0
  2. data/pdf/4.pdf +0 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,97 @@
+ import os
+ import gradio as gr
+ import openai
+ from langchain import hub
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.vectorstores import Chroma
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from pypdf import PdfReader, PdfWriter
+ from pathlib import Path
+
+
+ data_root = './data/pdf/'
+
+ def load_pdf_paths(data_root):
+     return [data_root + path for path in os.listdir(data_root)]
+
+
+ def build_rag_chain(pdf_paths):
+     # Load every stored PDF into LangChain documents.
+     loaders = [PyPDFLoader(path) for path in pdf_paths]
+
+     docs = []
+     for loader in loaders:
+         docs.extend(
+             loader.load()[0:]  # keeps all pages; use [1:] to skip the first page
+         )
+
+     chunk_size = 1000
+     chunk_overlap = 200
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
+                                                    chunk_overlap=chunk_overlap)
+
+     splits = text_splitter.split_documents(docs)
+
+     # Embed the chunks and index them in a Chroma vector store.
+     vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
+     retriever = vectorstore.as_retriever()
+     prompt = hub.pull("rlm/rag-prompt")
+
+     # model_name = 'gpt-3.5-turbo-0125'
+     # model_name = 'gpt-4-1106-preview'
+     model_name = 'gpt-4-0125-preview'
+     llm = ChatOpenAI(model_name=model_name, temperature=0)
+
+     def format_docs(docs):
+         return '\n\n'.join(doc.page_content for doc in docs)
+
+     rag_chain = (
+         {"context": retriever | format_docs, "question": RunnablePassthrough()}
+         | prompt
+         | llm
+         | StrOutputParser()
+     )
+
+     return rag_chain
+
+
+ def predict(query, pdf_file):
+     print(type(pdf_file))  # debug: inspect what Gradio passes for the upload
+     if pdf_file:
+         pdf_reader = PdfReader(Path(pdf_file))
+         pdf_writer = PdfWriter()
+
+         pdf_name = pdf_file.split('/')[-1]
+         pdf_path = data_root + pdf_name
+
+         # Copy the uploaded PDF into data_root if it is not already stored there.
+         if pdf_path not in load_pdf_paths(data_root):
+             print('Saving file...')
+             for page in pdf_reader.pages:
+                 pdf_writer.add_page(page)
+
+             with open(pdf_path, 'wb') as f:
+                 pdf_writer.write(f)
+         os.system("ls data/pdf")  # debug: list the stored PDFs
+
+     # Rebuild the chain over all stored PDFs and answer the query.
+     pdf_paths = load_pdf_paths(data_root)
+     rag_chain = build_rag_chain(pdf_paths)
+     return rag_chain.invoke(query)
+
+ # Commented-out example questions (Arabic) about value-added tax, e.g.
+ # "Is there a penalty for failing to pay VAT?", "What is value-added tax?"
+ # examples = [
+ #     "هل هناك غرامة للتخلف عن سداد ضريبة القيمة المضافة؟",
+ #     "ما هي ضريبة القيمة المضافة؟",
+ #     "ما الواجب على الخاضعين لضريبة القيمة المضافة؟",
+ #     "من هو الشخص الخاضع لضريبة القيمة المضافة؟",
+ #     "متى يجب على الشخص التسجيل لضريبة القيمة المضافة؟",
+ #     "أريد بيع منزل, هل يخضع ذلك لضريبة القيمة المضافة؟"
+ # ]
+
+ textbox = gr.Textbox(label="اكتب سؤالك هنا", placeholder="", lines=4)  # label: "Write your question here"
+ upload_btn = gr.UploadButton(label='Upload a PDF file.')
+
+ iface = gr.Interface(fn=predict, inputs=[textbox, upload_btn], outputs="text")
+ iface.launch(share=True)
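
A minimal, self-contained sketch (not part of the commit) of how the same retrieval pipeline could be exercised from a plain Python script, without launching the Gradio interface. It mirrors the steps in build_rag_chain; it assumes OPENAI_API_KEY is set in the environment and that ./data/pdf/ already contains at least one PDF (the bundled 4.pdf would do), and the sample question is purely illustrative.

import os

from langchain import hub
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Assumption: OPENAI_API_KEY is exported and ./data/pdf/ holds the PDFs to index.
data_root = './data/pdf/'

docs = []
for name in os.listdir(data_root):
    docs.extend(PyPDFLoader(data_root + name).load())

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = splitter.split_documents(docs)

retriever = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings()).as_retriever()

def format_docs(docs):
    return '\n\n'.join(doc.page_content for doc in docs)

chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | hub.pull("rlm/rag-prompt")
    | ChatOpenAI(model_name='gpt-4-0125-preview', temperature=0)
    | StrOutputParser()
)

# Illustrative question: "What is value-added tax?"
print(chain.invoke("ما هي ضريبة القيمة المضافة؟"))
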
data/pdf/4.pdf ADDED
Binary file (741 kB).
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ langchain
+ langchain-community
+ langchainhub
+ langchain-openai
+ chromadb
+ bs4
+ pypdf