Volko committed · Commit 9fe2c04 · Parent(s): 5d7fd94
Optimised parsing

Files changed:
- app.py +1 -1
- pdf2vectorstore.py +16 -5
app.py
CHANGED
@@ -137,7 +137,7 @@ with block:
     <p>Powered by <a href='https://openai.com/'>OpenAI</a>, <a href='https://arxiv.org/'>arXiv</a> and <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></p>
     <p>ArxivGPT is a chatbot that answers questions about research papers. It uses a pretrained GPT-3.5 model to generate answers.</p>
     <p>Currently, it can answer questions about the paper you just linked and can also answer questions about the paper's contents.</p>
-    <p>It's still in development, so please report any bugs you find
+    <p>It's still in development, so please report any bugs you find.</p>
     <p>The answers can be quite limited as there is a 4097 token limit for GPT-3.5, hence waiting for GPT-4 access to upgrade.</p>
     <p>Possible upgrades coming up: GPT-4, faster parsing, status messages, other research paper hubs.</p>
     </div>
pdf2vectorstore.py
CHANGED
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
 from pdf2image import convert_from_path
 import pytesseract
 import pickle
+import concurrent.futures
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import UnstructuredFileLoader
@@ -23,9 +24,13 @@ def extract_pdf_text(filename):
     pytesseract.pytesseract.tesseract_cmd = 'tesseract'
     images = convert_from_path(filename)
     text = ""
-    for image in images:
-        text += pytesseract.image_to_string(image)
 
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        extracted_texts = executor.map(pytesseract.image_to_string, images)
+
+    for extracted_text in extracted_texts:
+        text += extracted_text
+
     return text
 
 def get_arxiv_pdf_url(paper_link):
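Review note: executor.map submits one pytesseract.image_to_string call per page to a thread pool and yields the results in input order, so the concatenated text still reads front to back. A minimal, self-contained sketch of the same pattern, assuming poppler and tesseract are installed; the file name "sample.pdf" is only a placeholder:

import concurrent.futures

import pytesseract
from pdf2image import convert_from_path

def ocr_pdf(filename):
    # Render each PDF page to a PIL image (requires poppler).
    images = convert_from_path(filename)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # map() fans the OCR calls out over worker threads but returns
        # results in input order, so page order is preserved.
        page_texts = list(executor.map(pytesseract.image_to_string, images))
    return "".join(page_texts)

if __name__ == "__main__":
    print(ocr_pdf("sample.pdf")[:500])

Threads help here because pdf2image and pytesseract mostly wait on external processes, so the GIL is not the bottleneck.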
@@ -43,8 +48,14 @@ def read_paper(paper_link):
     print("Reading paper...")
     pdf_filename = 'paper.pdf'
     pdf_url = get_arxiv_pdf_url(paper_link)
-    download_pdf(pdf_url, pdf_filename)
-    text = extract_pdf_text(pdf_filename)
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        pdf_future = executor.submit(download_pdf, pdf_url, pdf_filename)
+        pdf_future.result()
+
+        text_future = executor.submit(extract_pdf_text, pdf_filename)
+        text = text_future.result()
+
     os.remove(pdf_filename)
 
     return text
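Note that each submit() in this hunk is followed immediately by .result(), which blocks until that future finishes, so the download and the OCR still run strictly one after the other (the OCR needs the downloaded file in any case). The executor here adds no overlap by itself; the speed-up in this commit comes from the per-page OCR parallelism above. A toy illustration of the submit/result behaviour, using a hypothetical slow_step helper with demonstration-only timings:

import concurrent.futures
import time

def slow_step(name, seconds):
    time.sleep(seconds)
    return name

with concurrent.futures.ThreadPoolExecutor() as executor:
    # submit() returns a Future right away, but result() blocks until it is done,
    # so the second task only starts after the first has finished (~2 s total).
    first = executor.submit(slow_step, "download", 1)
    first.result()
    second = executor.submit(slow_step, "ocr", 1)
    print(second.result())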
@@ -66,7 +77,7 @@ def convert_to_vectorstore(arxiv_url, api_key):
     documents = text_splitter.split_documents(raw_documents)
     os.environ["OPENAI_API_KEY"] = api_key
     embeddings = OpenAIEmbeddings()
-    os.environ["OPENAI_API_KEY"] = ""
     vectorstore = FAISS.from_documents(documents, embeddings)
+    os.environ["OPENAI_API_KEY"] = ""
 
     return vectorstore
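Moving os.environ["OPENAI_API_KEY"] = "" below FAISS.from_documents(...) matters because the embedding requests are only issued inside that call; clearing the variable beforehand can leave the OpenAI client without a key, depending on whether it reads the key at construction or at request time. A minimal sketch of the resulting order, assuming LangChain imports along the lines of those used in this file; texts and api_key are placeholders:

import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

def build_index(texts, api_key):
    os.environ["OPENAI_API_KEY"] = api_key             # key must be set before the API calls
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_texts(texts, embeddings)  # embedding requests happen here
    os.environ["OPENAI_API_KEY"] = ""                  # safe to clear once the calls are done
    return vectorstore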