Volko committed on
Commit
ccc9ab3
1 Parent(s): 0344383

Optimised parsing

Browse files
Files changed (1) hide show
  1. pdf2vectorstore.py +8 -2
pdf2vectorstore.py CHANGED
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
5
  from pdf2image import convert_from_path
6
  import pytesseract
7
  import pickle
 
8
 
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  from langchain.document_loaders import UnstructuredFileLoader
@@ -18,14 +19,19 @@ def download_pdf(url, filename):
18
  for chunk in response.iter_content(chunk_size=8192):
19
  f.write(chunk)
20
 
 
 
 
21
  def extract_pdf_text(filename):
22
  print("Extracting text from pdf...")
23
  pytesseract.pytesseract.tesseract_cmd = 'tesseract'
24
  images = convert_from_path(filename)
25
  text = ""
26
- for image in images:
27
- text += pytesseract.image_to_string(image)
 
28
 
 
29
  return text
30
 
31
  def get_arxiv_pdf_url(paper_link):
 
5
  from pdf2image import convert_from_path
6
  import pytesseract
7
  import pickle
8
+ from concurrent.futures import ThreadPoolExecutor
9
 
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from langchain.document_loaders import UnstructuredFileLoader
 
19
  for chunk in response.iter_content(chunk_size=8192):
20
  f.write(chunk)
21
 
22
+ def extract_image_text(image):
23
+ return pytesseract.image_to_string(image)
24
+
25
  def extract_pdf_text(filename):
26
  print("Extracting text from pdf...")
27
  pytesseract.pytesseract.tesseract_cmd = 'tesseract'
28
  images = convert_from_path(filename)
29
  text = ""
30
+
31
+ with ThreadPoolExecutor() as executor:
32
+ text_parts = list(executor.map(extract_image_text, images))
33
 
34
+ text = "".join(text_parts)
35
  return text
36
 
37
  def get_arxiv_pdf_url(paper_link):