Volko committed
Commit 9fe2c04 • 1 Parent(s): 5d7fd94

Optimised parsing

Files changed (2):
  1. app.py +1 -1
  2. pdf2vectorstore.py +16 -5
app.py CHANGED
@@ -137,7 +137,7 @@ with block:
     <p>Powered by <a href='https://openai.com/'>OpenAI</a>, <a href='https://arxiv.org/'>arXiv</a> and <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></p>
     <p>ArxivGPT is a chatbot that answers questions about research papers. It uses a pretrained GPT-3.5 model to generate answers.</p>
     <p>Currently, it can answer questions about the paper you just linked and can also answer questions about the paper's contents.</p>
-    <p>It's still in development, so please report any bugs you find. It can take up to a minute to start a conversation for every new paper as there is a parsing delay.</p>
+    <p>It's still in development, so please report any bugs you find.</p>
     <p>The answers can be quite limited as there is a 4097 token limit for GPT-3.5, hence waiting for GPT-4 access to upgrade.</p>
     <p>Possible upgrades coming up: GPT-4, faster parsing, status messages, other research paper hubs.</p>
     </div>
pdf2vectorstore.py CHANGED
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
 from pdf2image import convert_from_path
 import pytesseract
 import pickle
+import concurrent.futures
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import UnstructuredFileLoader
@@ -23,9 +24,13 @@ def extract_pdf_text(filename):
     pytesseract.pytesseract.tesseract_cmd = 'tesseract'
     images = convert_from_path(filename)
     text = ""
-    for image in images:
-        text += pytesseract.image_to_string(image)
 
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        extracted_texts = executor.map(pytesseract.image_to_string, images)
+
+    for extracted_text in extracted_texts:
+        text += extracted_text
+
     return text
 
 def get_arxiv_pdf_url(paper_link):
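For readability, this is how extract_pdf_text reads after the hunk above: a straight reconstruction from the diff with comments added, where the per-page OCR calls are dispatched through executor.map instead of a sequential loop.

import concurrent.futures

from pdf2image import convert_from_path
import pytesseract


def extract_pdf_text(filename):
    # Point pytesseract at the tesseract binary, as in the original file.
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'

    # Render every page of the PDF to an image.
    images = convert_from_path(filename)
    text = ""

    # OCR all pages concurrently; every task has completed by the time the
    # with-block exits, because the executor's shutdown waits by default.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        extracted_texts = executor.map(pytesseract.image_to_string, images)

    # executor.map yields results in page order, so the concatenation
    # matches the old sequential loop.
    for extracted_text in extracted_texts:
        text += extracted_text

    return text

Threads are enough to get real overlap here because pytesseract shells out to the tesseract binary for each call, so the per-page OCR work is not serialised on the GIL.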
 
@@ -43,8 +48,14 @@ def read_paper(paper_link):
     print("Reading paper...")
     pdf_filename = 'paper.pdf'
     pdf_url = get_arxiv_pdf_url(paper_link)
-    download_pdf(pdf_url, pdf_filename)
-    text = extract_pdf_text(pdf_filename)
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        pdf_future = executor.submit(download_pdf, pdf_url, pdf_filename)
+        pdf_future.result()
+
+        text_future = executor.submit(extract_pdf_text, pdf_filename)
+        text = text_future.result()
+
     os.remove(pdf_filename)
 
     return text
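And the reconstructed read_paper after this hunk, with comments added (download_pdf and get_arxiv_pdf_url are the helpers defined elsewhere in this file). Since pdf_future.result() is awaited before the OCR task is submitted, the download and the extraction still run one after the other; the actual speed-up comes from the parallel OCR inside extract_pdf_text above.

import concurrent.futures
import os


def read_paper(paper_link):
    print("Reading paper...")
    pdf_filename = 'paper.pdf'
    pdf_url = get_arxiv_pdf_url(paper_link)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Fetch the PDF and block until it is on disk, since the OCR step
        # below needs the finished file.
        pdf_future = executor.submit(download_pdf, pdf_url, pdf_filename)
        pdf_future.result()

        # OCR the downloaded file (itself parallelised per page).
        text_future = executor.submit(extract_pdf_text, pdf_filename)
        text = text_future.result()

    # Remove the temporary PDF once its text has been captured.
    os.remove(pdf_filename)

    return text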
 
@@ -66,7 +77,7 @@ def convert_to_vectorstore(arxiv_url, api_key):
     documents = text_splitter.split_documents(raw_documents)
     os.environ["OPENAI_API_KEY"] = api_key
     embeddings = OpenAIEmbeddings()
-    os.environ["OPENAI_API_KEY"] = ""
     vectorstore = FAISS.from_documents(documents, embeddings)
+    os.environ["OPENAI_API_KEY"] = ""
 
     return vectorstore
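The last hunk only moves the key reset: OPENAI_API_KEY is now cleared after FAISS.from_documents rather than before it, presumably so the key is still available while the embedding requests run. A minimal sketch of that tail end, using a hypothetical build_vectorstore helper (not in the original file) and the langchain import paths of that era:

import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS


def build_vectorstore(documents, api_key):
    # Hypothetical helper mirroring the tail of convert_to_vectorstore.
    os.environ["OPENAI_API_KEY"] = api_key
    embeddings = OpenAIEmbeddings()

    # The embedding requests happen inside from_documents, which appears to
    # be why the reset was moved below this call.
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Only now is the key blanked again, as in the new version of the file.
    os.environ["OPENAI_API_KEY"] = ""

    return vectorstore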