ShiwenNi committed
Commit b4bac0d
1 Parent(s): 6c47ad7

Upload 3 files

Files changed (3)
  1. app.py +74 -0
  2. pdfquery.py +35 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,74 @@
+ import gradio as gr
+ import os
+ import time
+ import shutil
+ import base64
+ from pdfquery import PDFQuery
+
+ pquery = PDFQuery()
+
+
+ def openai_create(s):
+     global pquery
+     return pquery.ask(s)
+
+ def chatgpt_clone(input, history, chatbot):
+     if input == "":
+         return chatbot, history, ""
+     history = history or []
+     s = list(sum(history, ()))
+     s.append(input)
+     inp = ' '.join(s)
+     output = openai_create(input)
+     history.append((inp, output))
+     chatbot.append((input, output))
+     return chatbot, history, ""
+
+
+ title_html = f"<h1 align=\"center\">Chat With Pdf</h1>"
+
+ gr_L1 = lambda: gr.Row().style()
+ gr_L2 = lambda scale, elem_id: gr.Column(scale=scale, elem_id=elem_id)
+
+
+ def pdf_to_markdown(file_obj):
+     try:
+         shutil.rmtree('./private_upload/')
+     except:
+         pass
+     time_tag = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
+     os.makedirs(f'private_upload/{time_tag}', exist_ok=True)
+     file_name = os.path.basename(file_obj.name)
+     destination = f'private_upload/{time_tag}/{file_name}'
+     shutil.copy(file_obj.name, destination)
+     global pquery
+     pquery.ingest(destination)
+     with open(destination, "rb") as f:
+         pdf = base64.b64encode(f.read()).decode('utf-8')
+     pdf_display = f'<embed src="data:application/pdf;base64,{pdf}" ' \
+                   f'width="700" height="800" type="application/pdf">'
+     return [pdf_display, gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True),
+             gr.update(visible=True), gr.update(visible=True)]
+
+ # clear the input box (清空)
+ cle = lambda: ""
+
+ with gr.Blocks(title="Chat With Pdf") as demo:
+     gr.HTML(title_html)
+     file = gr.File()
+     with gr_L1():
+         with gr_L2(scale=1.5, elem_id="gpt-chat"):
+             out = gr.Markdown()
+         with gr_L2(scale=1, elem_id="gpt-chat"):
+             title = gr.Markdown("""<h1><center><strong>文档问答 </strong></center></h1>
+             """, visible=False)  # "文档问答" = "Document Q&A"
+             chatbot = gr.Chatbot(scale=3, height=600, visible=False)
+     with gr_L1():
+         message = gr.Textbox(placeholder="Input question here.", scale=10, visible=False)
+         state = gr.State([])
+         submit = gr.Button("发送", scale=1, visible=False)  # "发送" = "Send"
+
+     file.upload(pdf_to_markdown, file, [out, file, out, title, chatbot, message, submit])
+     submit.click(chatgpt_clone, inputs=[message, state, chatbot], outputs=[chatbot, state, message])
+
+ demo.launch(share=True)
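
The upload handler drives the interface by returning one value per component in the outputs list of file.upload: the base64 embed HTML fills the out Markdown, gr.update(visible=False) hides the upload widget, and the remaining gr.update(visible=True) values reveal the title, chatbot, question box, and send button. Below is a minimal sketch of that same show/hide pattern, assuming Gradio 3.x as used in this Space; the component names are illustrative and not part of the commit.

import gradio as gr

# Sketch only: the handler returns one value per output component,
# in the order the outputs list declares them.
def reveal(_uploaded_file):
    # hide the File widget, show the previously hidden question box
    return gr.update(visible=False), gr.update(visible=True)

with gr.Blocks() as sketch:
    upload = gr.File()
    question = gr.Textbox(visible=False)
    upload.upload(reveal, upload, [upload, question])

sketch.launch()
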
pdfquery.py ADDED
@@ -0,0 +1,35 @@
+ import os
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import Chroma
+ from langchain.document_loaders import PyPDFium2Loader
+ from langchain.chains.question_answering import load_qa_chain
+ # from langchain.llms import OpenAI
+ from langchain.chat_models import ChatOpenAI
+
+
+ class PDFQuery:
+     def __init__(self):
+         os.environ["OPENAI_API_KEY"] = "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"
+         self.embeddings = OpenAIEmbeddings()
+         self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
+         # self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
+         self.llm = ChatOpenAI(temperature=0)
+         self.chain = None
+         self.db = None
+
+     def ask(self, question: str) -> str:
+         if self.chain is None:
+             response = "Please, add a document."
+         else:
+             docs = self.db.get_relevant_documents(question)
+             response = self.chain.run(input_documents=docs, question=question)
+         return response
+
+     def ingest(self, file_path: os.PathLike) -> None:
+         loader = PyPDFium2Loader(file_path)
+         documents = loader.load()
+         splitted_documents = self.text_splitter.split_documents(documents)
+         self.db = Chroma.from_documents(splitted_documents, self.embeddings).as_retriever()
+         # self.chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
+         self.chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
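
A hedged usage sketch of PDFQuery outside the Gradio app follows; the PDF path is a placeholder, and in practice the key would be read from the OPENAI_API_KEY environment variable rather than hard-coded in __init__.

from pdfquery import PDFQuery

pq = PDFQuery()
pq.ingest("example.pdf")  # placeholder path: load with PyPDFium2, split, embed, index in Chroma
answer = pq.ask("What is the main topic of this document?")
print(answer)  # the "stuff" QA chain answers from the retrieved chunks
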
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio
+ langchain
+ openai
+ pypdfium2
+ chromadb
+ tiktoken
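
As used here, gradio powers the Blocks UI in app.py; langchain supplies the text splitter, PyPDFium2Loader, Chroma wrapper, and QA chain in pdfquery.py; openai backs OpenAIEmbeddings and ChatOpenAI; pypdfium2 is the PDF parser behind PyPDFium2Loader; chromadb is the vector store; and tiktoken handles token counting for the OpenAI models.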