Spaces:

Volkopat
/

arXivGPT

Runtime error

arXivGPT / pdf2vectorstore.py

Volko

Reverted

a58f539 about 2 years ago

2.27 kB


	import os
	import requests
	from bs4 import BeautifulSoup
	from pdf2image import convert_from_path
	import pytesseract
	import pickle

	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.document_loaders import UnstructuredFileLoader
	from langchain.vectorstores.faiss import FAISS
	from langchain.embeddings import OpenAIEmbeddings

	def download_pdf(url, filename, timeout=30):
	    """Stream the resource at *url* to the local file *filename*.

	    Args:
	        url: Address to fetch with an HTTP GET.
	        filename: Path of the file to create (overwritten if it exists).
	        timeout: Seconds passed to ``requests.get`` as its ``timeout``
	            argument so a stalled server cannot hang the call forever.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	        requests.RequestException: On connection problems or timeouts.
	    """
	    print("Downloading pdf...")
	    # The context manager releases the streamed connection even on error.
	    with requests.get(url, stream=True, timeout=timeout) as response:
	        # Fail before creating the file so an HTML error page is never
	        # saved to disk and later fed to the OCR step as a "PDF".
	        response.raise_for_status()
	        with open(filename, 'wb') as f:
	            for chunk in response.iter_content(chunk_size=8192):
	                f.write(chunk)

	def extract_pdf_text(filename):
	    """Return the OCR text of the PDF stored at *filename*.

	    The PDF is turned into images with ``convert_from_path`` and each image
	    is passed to ``pytesseract.image_to_string``; the per-image strings are
	    concatenated in order with no separator.

	    Args:
	        filename: Path of the PDF file to read.

	    Returns:
	        A single string holding the recognised text of all images.
	    """
	    print("Extracting text from pdf...")
	    # Module-level pytesseract setting: use the executable named
	    # 'tesseract' (presumably resolved through PATH).
	    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
	    pages = convert_from_path(filename)
	    return "".join(pytesseract.image_to_string(page) for page in pages)

	def get_arxiv_pdf_url(paper_link):
	    """Resolve *paper_link* to the URL of the paper's PDF.

	    A link that already ends in ``.pdf`` is returned unchanged. Otherwise
	    the page is fetched and the ``href`` of the first
	    ``<a class="mobile-submission-download">`` element is resolved against
	    ``https://arxiv.org``.

	    Args:
	        paper_link: URL of a PDF or of a page containing the download link.

	    Returns:
	        The absolute PDF URL as a string.

	    Raises:
	        requests.HTTPError: If the page request returns a 4xx/5xx status.
	        ValueError: If the page contains no usable download link.
	    """
	    from urllib.parse import urljoin

	    if paper_link.endswith('.pdf'):
	        return paper_link

	    print("Getting pdf url...")
	    response = requests.get(paper_link, timeout=30)
	    response.raise_for_status()
	    soup = BeautifulSoup(response.text, 'html.parser')
	    anchor = soup.find('a', {'class': 'mobile-submission-download'})
	    href = anchor.get('href') if anchor is not None else None
	    if not href:
	        # Previously this surfaced as "'NoneType' object is not
	        # subscriptable"; report what actually went wrong instead.
	        raise ValueError(
	            "No PDF download link found on page: {}".format(paper_link)
	        )
	    # urljoin gives the same result as plain concatenation for hrefs such
	    # as '/pdf/1234.5678', and also copes with absolute or slash-less hrefs.
	    return urljoin('https://arxiv.org', href)

	def read_paper(paper_link):
	    """Download the paper behind *paper_link* and return its OCR text.

	    The PDF URL is obtained from ``get_arxiv_pdf_url``, saved with
	    ``download_pdf`` and read back with ``extract_pdf_text``.

	    Args:
	        paper_link: URL of the paper page or of the PDF itself.

	    Returns:
	        The text extracted from the downloaded PDF.
	    """
	    import tempfile

	    print("Reading paper...")
	    pdf_url = get_arxiv_pdf_url(paper_link)
	    # A per-call temporary directory replaces the fixed 'paper.pdf' in the
	    # working directory: concurrent calls no longer overwrite each other's
	    # download, and the file is removed even when download or OCR raises.
	    with tempfile.TemporaryDirectory() as workdir:
	        pdf_filename = os.path.join(workdir, 'paper.pdf')
	        download_pdf(pdf_url, pdf_filename)
	        return extract_pdf_text(pdf_filename)

	def convert_to_vectorstore(arxiv_url, api_key):
	    """Build a FAISS vector store from the text of a paper.

	    The paper text comes from ``read_paper``, is written to ``paper.txt``,
	    loaded through ``UnstructuredFileLoader``, split into chunks of 2000
	    characters with an overlap of 200, and embedded with
	    ``OpenAIEmbeddings``.

	    Args:
	        arxiv_url: Link to the paper; passed to ``read_paper``.
	        api_key: OpenAI API key, exposed to ``OpenAIEmbeddings`` through the
	            ``OPENAI_API_KEY`` environment variable.

	    Returns:
	        The store returned by ``FAISS.from_documents``, or ``None`` when
	        either argument is empty or ``None``.
	    """
	    if not arxiv_url or not api_key:
	        return None
	    print("Converting to vectorstore...")
	    txtfile = "paper.txt"
	    # Fetch the text before touching the file system so a failed download
	    # or OCR run does not leave an empty paper.txt behind.
	    paper_text = read_paper(arxiv_url)
	    try:
	        # OCR output may contain non-ASCII characters; do not depend on the
	        # platform's default encoding when writing it out.
	        with open(txtfile, 'w', encoding='utf-8') as f:
	            f.write(paper_text)
	        loader = UnstructuredFileLoader(txtfile)
	        raw_documents = loader.load()
	    finally:
	        # Remove the scratch file whether or not loading succeeded.
	        if os.path.exists(txtfile):
	            os.remove(txtfile)
	    print("Loaded document")

	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
	    documents = text_splitter.split_documents(raw_documents)

	    # Expose the key only while the embeddings object is constructed, then
	    # restore whatever was set before rather than leaving "" behind.
	    # NOTE(review): mutating os.environ is process-wide and not safe under
	    # concurrent calls; passing the key directly to OpenAIEmbeddings would
	    # be preferable if the installed langchain version supports it.
	    previous_key = os.environ.get("OPENAI_API_KEY")
	    os.environ["OPENAI_API_KEY"] = api_key
	    try:
	        embeddings = OpenAIEmbeddings()
	    finally:
	        if previous_key is None:
	            os.environ.pop("OPENAI_API_KEY", None)
	        else:
	            os.environ["OPENAI_API_KEY"] = previous_key
	    vectorstore = FAISS.from_documents(documents, embeddings)

	    return vectorstore