Volko committed on
Commit
d98144d
1 Parent(s): c7b2ed6

Version 1.0

Browse files
Files changed (4) hide show
  1. app.py +138 -0
  2. pdf2vectorstore.py +72 -0
  3. requirements.txt +11 -0
  4. template.py +18 -0
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ from typing import Optional, Tuple
4
+ import gradio as gr
5
+ from threading import Lock
6
+
7
+ from langchain.llms import OpenAI
8
+ from langchain.chains import ChatVectorDBChain
9
+ from template import QA_PROMPT, CONDENSE_QUESTION_PROMPT
10
+ from pdf2vectorstore import convert_to_vectorstore
11
+
12
def get_chain(api_key, vectorstore, model_name):
    """Build the conversational QA chain over ``vectorstore``.

    Args:
        api_key: OpenAI API key passed to the LLM wrapper.
        vectorstore: Vector store the chain retrieves context from.
        model_name: Name of the OpenAI model to query.

    Returns:
        A ``ChatVectorDBChain`` configured with this project's QA and
        question-condensing prompts.
    """
    llm_settings = {
        "model_name": model_name,
        "temperature": 0,
        "openai_api_key": api_key,
    }
    return ChatVectorDBChain.from_llm(
        OpenAI(**llm_settings),
        vectorstore,
        qa_prompt=QA_PROMPT,
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    )
21
+
22
def set_openai_api_key(api_key: str, vectorstore, model_name: str):
    """Return a QA chain for ``vectorstore``, or ``None`` without an API key.

    Args:
        api_key: OpenAI API key; a falsy value short-circuits to ``None``.
        vectorstore: Vector store handed through to ``get_chain``.
        model_name: Model name handed through to ``get_chain``.
    """
    if not api_key:
        return None
    return get_chain(api_key, vectorstore, model_name)
26
+
27
class ChatWrapper:
    """Callable Gradio handler that answers questions about one arXiv paper.

    The instance caches the vector store and QA chain built for the most
    recently requested arXiv URL and rebuilds both when the URL changes.
    """

    def __init__(self):
        # Guards the cached attributes below as well as the chain invocation.
        self.lock = Lock()
        self.previous_url = ""
        self.vectorstore_state = None
        self.chain = None

    def __call__(
        self,
        api_key: str,
        arxiv_url: str,
        inp: str,
        history: Optional[list],
        model_name: str,
    ):
        """Answer ``inp`` about the paper at ``arxiv_url``.

        Args:
            api_key: OpenAI API key supplied by the user.
            arxiv_url: Link to the paper (abstract page or direct PDF).
            inp: The user's question.
            history: Previous ``(question, answer)`` tuples, or ``None``.
            model_name: OpenAI model name used when a chain is (re)built.

        Returns:
            ``(history, history)`` - the same list twice, once for the chatbot
            component and once for the session state.
        """
        if not arxiv_url or not api_key:
            history = history or []
            history.append((inp, "Please provide both arXiv URL and API key to begin"))
            return history, history

        # Everything that reads or writes the cached state happens under the
        # lock; ``with`` releases it on every exit path, including exceptions.
        with self.lock:
            if arxiv_url != self.previous_url:
                # A different paper invalidates the running conversation.
                history = []
                vectorstore = convert_to_vectorstore(arxiv_url, api_key)
                chain = set_openai_api_key(api_key, vectorstore, model_name)
                # Commit only after both steps succeeded, so a failure cannot
                # leave the previous paper's chain associated with the new URL.
                self.vectorstore_state = vectorstore
                self.chain = chain
                self.previous_url = arxiv_url
            elif self.chain is None:
                self.chain = set_openai_api_key(
                    api_key, self.vectorstore_state, model_name
                )

            history = history or []
            if self.chain is None:
                history.append((inp, "Please paste your OpenAI key to use"))
                return history, history

            # NOTE(review): this sets the key on the process-wide ``openai``
            # module, so it is shared by every user of this instance.
            import openai
            openai.api_key = api_key
            output = self.chain({"question": inp, "chat_history": history})["answer"]
            history.append((inp, output))
        return history, history
74
+
75
# Single shared handler instance: its cached paper/chain is process-wide.
chat = ChatWrapper()

block = gr.Blocks(css=".gradio-container {background-color: #f8f8f8; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif}")

with block:
    gr.HTML("<h1 style='text-align: center;'>ArxivGPT</h1>")
    gr.HTML("<h3 style='text-align: center;'>Ask questions about research papers</h3>")

    with gr.Row():
        with gr.Column(width="auto"):
            openai_api_key_textbox = gr.Textbox(
                label="OpenAI API Key",
                placeholder="Paste your OpenAI API key (sk-...)",
                show_label=True,
                lines=1,
                type="password",
            )
        with gr.Column(width="auto"):
            arxiv_url_textbox = gr.Textbox(
                label="Arxiv URL",
                placeholder="Enter the arXiv URL",
                show_label=True,
                lines=1,
            )
        with gr.Column(width="auto"):
            model_dropdown = gr.Dropdown(
                label="Choose a model (GPT-4 coming soon!)",
                choices=["gpt-3.5-turbo"],
                # Preselect the only choice; otherwise the handler receives
                # None as the model name until the user opens the dropdown.
                value="gpt-3.5-turbo",
            )

    chatbot = gr.Chatbot()

    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="Ask questions about the paper you just linked",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)

    gr.Examples(
        examples=[
            "Please give me a brief summary about this paper",
            "Are there any interesting correlations in the given paper?",
            "How can this paper be applied in the real world?",
            "What are the limitations of this paper?",
        ],
        inputs=message,
    )

    gr.HTML(
        "<center style='margin-top: 20px;'>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
    )

    state = gr.State()

    # Both triggers (button click and Enter in the textbox) share one wiring.
    chat_inputs = [openai_api_key_textbox, arxiv_url_textbox, message, state, model_dropdown]
    chat_outputs = [chatbot, state]

    submit.click(chat, inputs=chat_inputs, outputs=chat_outputs)
    message.submit(chat, inputs=chat_inputs, outputs=chat_outputs)

block.launch(share=True, debug=True, width=800)
pdf2vectorstore.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ from pdf2image import convert_from_path
6
+ import pytesseract
7
+ import pickle
8
+
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.document_loaders import UnstructuredFileLoader
11
+ from langchain.vectorstores.faiss import FAISS
12
+ from langchain.embeddings import OpenAIEmbeddings
13
+
14
def download_pdf(url, filename, timeout=60):
    """Stream the resource at ``url`` into the local file ``filename``.

    Args:
        url: Address of the PDF to fetch.
        filename: Destination path; overwritten if it exists.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot hang the caller indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            HTML error page is never saved as if it were the PDF.
    """
    print("Downloading pdf...")
    # The context manager returns the connection to the pool on every path.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
20
+
21
def extract_pdf_text(filename):
    """OCR every page of the PDF at ``filename`` and return the joined text.

    Pages are rendered to images with ``convert_from_path`` and each image is
    run through ``pytesseract.image_to_string``; the per-page strings are
    concatenated in page order without a separator.
    """
    print("Extracting text from pdf...")
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
    pages = convert_from_path(filename)
    return "".join(pytesseract.image_to_string(page) for page in pages)
30
+
31
def get_arxiv_pdf_url(paper_link, timeout=30):
    """Resolve ``paper_link`` to the URL of the paper's PDF.

    A link that already ends in ``.pdf`` is returned unchanged. Otherwise the
    page is fetched and the ``href`` of the anchor with class
    ``mobile-submission-download`` is resolved against ``https://arxiv.org``.

    Args:
        paper_link: Direct PDF link or an arXiv page containing the download
            anchor.
        timeout: Seconds passed to ``requests.get``.

    Returns:
        The absolute PDF URL.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the page has no usable download link.
    """
    from urllib.parse import urljoin

    if paper_link.endswith('.pdf'):
        return paper_link

    print("Getting pdf url...")
    response = requests.get(paper_link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    anchor = soup.find('a', {'class': 'mobile-submission-download'})
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # Fail with an actionable message instead of subscripting None.
        raise ValueError(f"No PDF download link found on page: {paper_link}")
    # urljoin handles root-relative, relative and already-absolute hrefs.
    return urljoin('https://arxiv.org', href)
41
+
42
def read_paper(paper_link):
    """Download the paper behind ``paper_link`` and return its OCR'd text.

    The PDF is stored in a private temporary directory, so concurrent calls
    do not overwrite each other's file and the download is removed even when
    the download or text extraction raises.

    Args:
        paper_link: Direct PDF link or arXiv page link.

    Returns:
        The text produced by ``extract_pdf_text`` for the downloaded PDF.
    """
    import tempfile

    print("Reading paper...")
    pdf_url = get_arxiv_pdf_url(paper_link)
    with tempfile.TemporaryDirectory() as workdir:
        pdf_filename = os.path.join(workdir, 'paper.pdf')
        download_pdf(pdf_url, pdf_filename)
        return extract_pdf_text(pdf_filename)
51
+
52
def convert_to_vectorstore(arxiv_url, api_key):
    """Build a FAISS vector store from the text of an arXiv paper.

    Args:
        arxiv_url: Link to the paper (arXiv page or direct PDF).
        api_key: OpenAI API key used to compute the embeddings.

    Returns:
        The FAISS vector store, or ``None`` when either argument is falsy.
    """
    import tempfile

    if not arxiv_url or not api_key:
        return None
    print("Converting to vectorstore...")
    paper_text = read_paper(arxiv_url)

    # A private temporary directory avoids collisions between concurrent
    # requests and guarantees cleanup if loading fails.
    with tempfile.TemporaryDirectory() as workdir:
        txtfile = os.path.join(workdir, "paper.txt")
        # OCR output may contain non-ASCII characters; do not depend on the
        # platform's default encoding.
        with open(txtfile, 'w', encoding='utf-8') as f:
            f.write(paper_text)
        raw_documents = UnstructuredFileLoader(txtfile).load()
    print("Loaded document")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    documents = text_splitter.split_documents(raw_documents)
    # Hand the key to the embeddings object directly rather than setting and
    # then blanking the process-wide OPENAI_API_KEY environment variable.
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    return FAISS.from_documents(documents, embeddings)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ requests
2
+ beautifulsoup4
3
+ pdfminer.six
4
+ PyMuPDF
5
+ pdf2image
6
+ pytesseract
7
+ unstructured
8
+ gradio
9
+ faiss-cpu
10
+ langchain
11
+ tiktoken
template.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.prompts.prompt import PromptTemplate
2
+
3
# Prompt that rewrites a follow-up question into a standalone question using
# the chat history; placeholders are {chat_history} and {question}.
_template = (
    "Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.\n"
    "Chat History:\n"
    "{chat_history}\n"
    "Follow Up Input: {question}\n"
    "Standalone question:"
)
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
9
+
10
# Prompt used to answer a question from retrieved excerpts of the paper;
# placeholders are {question} and {context}.
template = (
    "You are an AI assistant for answering questions about the contents of the research paper in Arxiv.\n"
    "You are given the following extracted parts of a long document and a question. Provide a conversational answer.\n"
    "If you don't know the answer, just say \"Hmm, I'm not sure.\" Don't try to make up an answer.\n"
    "Question: {question}\n"
    "=========\n"
    "{context}\n"
    "=========\n"
    "Answer in Markdown:"
)
QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])