Volko committed on
Commit
0344383
1 Parent(s): a27f388
Files changed (1) hide show
  1. pdf2vectorstore.py +5 -16
pdf2vectorstore.py CHANGED
@@ -5,7 +5,6 @@ from bs4 import BeautifulSoup
5
  from pdf2image import convert_from_path
6
  import pytesseract
7
  import pickle
8
- import concurrent.futures
9
 
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from langchain.document_loaders import UnstructuredFileLoader
@@ -24,13 +23,9 @@ def extract_pdf_text(filename):
24
  pytesseract.pytesseract.tesseract_cmd = 'tesseract'
25
  images = convert_from_path(filename)
26
  text = ""
 
 
27
 
28
- with concurrent.futures.ThreadPoolExecutor() as executor:
29
- extracted_texts = executor.map(pytesseract.image_to_string, images)
30
-
31
- for extracted_text in extracted_texts:
32
- text += extracted_text
33
-
34
  return text
35
 
36
  def get_arxiv_pdf_url(paper_link):
@@ -48,14 +43,8 @@ def read_paper(paper_link):
48
  print("Reading paper...")
49
  pdf_filename = 'paper.pdf'
50
  pdf_url = get_arxiv_pdf_url(paper_link)
51
-
52
- with concurrent.futures.ThreadPoolExecutor() as executor:
53
- pdf_future = executor.submit(download_pdf, pdf_url, pdf_filename)
54
- pdf_future.result()
55
-
56
- text_future = executor.submit(extract_pdf_text, pdf_filename)
57
- text = text_future.result()
58
-
59
  os.remove(pdf_filename)
60
 
61
  return text
@@ -77,7 +66,7 @@ def convert_to_vectorstore(arxiv_url, api_key):
77
  documents = text_splitter.split_documents(raw_documents)
78
  os.environ["OPENAI_API_KEY"] = api_key
79
  embeddings = OpenAIEmbeddings()
80
- vectorstore = FAISS.from_documents(documents, embeddings)
81
  os.environ["OPENAI_API_KEY"] = ""
 
82
 
83
  return vectorstore
 
5
  from pdf2image import convert_from_path
6
  import pytesseract
7
  import pickle
 
8
 
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  from langchain.document_loaders import UnstructuredFileLoader
 
23
  pytesseract.pytesseract.tesseract_cmd = 'tesseract'
24
  images = convert_from_path(filename)
25
  text = ""
26
+ for image in images:
27
+ text += pytesseract.image_to_string(image)
28
 
 
 
 
 
 
 
29
  return text
30
 
31
  def get_arxiv_pdf_url(paper_link):
 
43
  print("Reading paper...")
44
  pdf_filename = 'paper.pdf'
45
  pdf_url = get_arxiv_pdf_url(paper_link)
46
+ download_pdf(pdf_url, pdf_filename)
47
+ text = extract_pdf_text(pdf_filename)
 
 
 
 
 
 
48
  os.remove(pdf_filename)
49
 
50
  return text
 
66
  documents = text_splitter.split_documents(raw_documents)
67
  os.environ["OPENAI_API_KEY"] = api_key
68
  embeddings = OpenAIEmbeddings()
 
69
  os.environ["OPENAI_API_KEY"] = ""
70
+ vectorstore = FAISS.from_documents(documents, embeddings)
71
 
72
  return vectorstore