import os |
import requests |
from bs4 import BeautifulSoup |
from pdf2image import convert_from_path |
import pytesseract |
import pickle |
from langchain.text_splitter import RecursiveCharacterTextSplitter |
from langchain.document_loaders import UnstructuredFileLoader |
from langchain.vectorstores.faiss import FAISS |
from langchain.embeddings import OpenAIEmbeddings |
def download_pdf(url, filename, timeout=30):
    """Download the resource at ``url`` and write it to ``filename``.

    The body is streamed to disk in 8 KiB chunks so the whole file never has
    to be held in memory.

    Args:
        url: Address of the file to download.
        filename: Path of the local file to create (overwritten if present).
        timeout: Seconds to wait for the server to connect / send data before
            giving up. Passed straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    print("Downloading pdf...")
    # Use the response as a context manager so the streamed connection is
    # always released, even if writing to disk fails half-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before creating the file: otherwise an HTML error page would
        # be saved and later handed to the PDF renderer as if it were a PDF.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
def extract_pdf_text(filename):
    """Run OCR over the PDF at ``filename`` and return the recognised text.

    The PDF is converted to a list of images with ``convert_from_path`` and
    each image is passed to ``pytesseract.image_to_string``; the per-image
    results are concatenated in order with no separator.

    Args:
        filename: Path to the PDF file to read.

    Returns:
        A single string containing the OCR output of every image.
    """
    print("Extracting text from pdf...")
    # Point pytesseract at the ``tesseract`` executable found on PATH.
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
    page_images = convert_from_path(filename)
    return "".join(pytesseract.image_to_string(page) for page in page_images)
def get_arxiv_pdf_url(paper_link):
    """Resolve ``paper_link`` to the URL of the paper's PDF.

    A link that already ends in ``.pdf`` is returned unchanged. Otherwise the
    page is fetched and the ``href`` of the first ``<a>`` element with class
    ``mobile-submission-download`` is resolved against ``https://arxiv.org``.

    Args:
        paper_link: URL of a PDF, or of a page containing the download link.

    Returns:
        The PDF URL as a string.

    Raises:
        ValueError: If the page contains no usable download link.
        requests.HTTPError: If the page request returns a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    from urllib.parse import urljoin

    if paper_link.endswith('.pdf'):
        return paper_link

    print("Getting pdf url...")
    response = requests.get(paper_link, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    download_anchor = soup.find('a', {'class': 'mobile-submission-download'})
    # ``find`` returns None when nothing matches; report that explicitly
    # instead of failing with "'NoneType' object is not subscriptable".
    if download_anchor is None or not download_anchor.get('href'):
        raise ValueError(
            f"Could not find a PDF download link on {paper_link!r}"
        )
    # urljoin handles root-relative hrefs ("/pdf/...") exactly like the old
    # string concatenation, and also copes with absolute or slash-less hrefs.
    return urljoin('https://arxiv.org', download_anchor['href'])
def read_paper(paper_link):
    """Download the paper behind ``paper_link`` and return its OCR'd text.

    The link is resolved with ``get_arxiv_pdf_url``, the PDF is downloaded
    with ``download_pdf`` into a private temporary directory, and the text is
    extracted with ``extract_pdf_text``.

    Args:
        paper_link: URL accepted by ``get_arxiv_pdf_url``.

    Returns:
        The extracted text of the paper as a string.

    Raises:
        Whatever the resolving, download or extraction helpers raise; the
        temporary PDF is removed in every case.
    """
    import tempfile

    print("Reading paper...")
    pdf_url = get_arxiv_pdf_url(paper_link)
    # A throw-away directory keeps the scratch PDF from clobbering (and then
    # deleting) an unrelated ``paper.pdf`` in the working directory, avoids
    # collisions between concurrent calls, and is cleaned up even when the
    # download or OCR step raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        pdf_filename = os.path.join(scratch_dir, 'paper.pdf')
        download_pdf(pdf_url, pdf_filename)
        return extract_pdf_text(pdf_filename)
def convert_to_vectorstore(arxiv_url, api_key):
    """Build a FAISS vector store from the text of a paper.

    The paper text is obtained with ``read_paper``, written to a scratch
    ``paper.txt`` file, loaded through ``UnstructuredFileLoader``, split into
    chunks of 2000 characters with 200 characters of overlap, and embedded
    with ``OpenAIEmbeddings``.

    Args:
        arxiv_url: Link to the paper, as accepted by ``read_paper``.
        api_key: OpenAI API key used to create the embeddings object.

    Returns:
        The vector store produced by ``FAISS.from_documents``, or ``None``
        when ``arxiv_url`` or ``api_key`` is empty/``None``.
    """
    if not arxiv_url or not api_key:
        return None
    print("Converting to vectorstore...")

    txtfile = "paper.txt"
    # Fetch the text before touching the filesystem so a failed download or
    # OCR run does not leave an empty scratch file behind.
    paper_text = read_paper(arxiv_url)
    try:
        # Explicit encoding: OCR output may contain non-ASCII characters and
        # the platform default encoding is not guaranteed to handle them.
        with open(txtfile, 'w', encoding='utf-8') as f:
            f.write(paper_text)
        raw_documents = UnstructuredFileLoader(txtfile).load()
    finally:
        # Always remove the scratch file, including when loading fails.
        if os.path.exists(txtfile):
            os.remove(txtfile)
    print("Loaded document")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    documents = text_splitter.split_documents(raw_documents)

    # Expose the caller's key only while the embeddings object is created,
    # then put the environment back exactly as it was so an existing
    # OPENAI_API_KEY is not wiped and the caller's key does not linger.
    # NOTE(review): as before, this relies on OpenAIEmbeddings capturing the
    # key at construction time — the key is no longer in the environment when
    # FAISS.from_documents runs.
    previous_key = os.environ.get("OPENAI_API_KEY")
    os.environ["OPENAI_API_KEY"] = api_key
    try:
        embeddings = OpenAIEmbeddings()
    finally:
        if previous_key is None:
            os.environ.pop("OPENAI_API_KEY", None)
        else:
            os.environ["OPENAI_API_KEY"] = previous_key

    return FAISS.from_documents(documents, embeddings)