import os
import pickle
import concurrent.futures

import requests
from bs4 import BeautifulSoup
from pdf2image import convert_from_path
import pytesseract

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings


def download_pdf(url, filename):
    """Stream a PDF from `url` to a local file in 8 KiB chunks."""
    print("Downloading pdf...")
    response = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)


def extract_pdf_text(filename):
    """Render each PDF page to an image, OCR the pages in parallel with
    Tesseract, and return the concatenated text."""
    print("Extracting text from pdf...")
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
    images = convert_from_path(filename)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # map() preserves page order, so the joined text reads front to back.
        text = "".join(executor.map(pytesseract.image_to_string, images))
    return text


def get_arxiv_pdf_url(paper_link):
    """Resolve an arXiv abstract link to its PDF URL; pass PDF links through."""
    if paper_link.endswith('.pdf'):
        return paper_link
    print("Getting pdf url...")
    response = requests.get(paper_link)
    soup = BeautifulSoup(response.text, 'html.parser')
    pdf_url = soup.find('a', {'class': 'mobile-submission-download'})['href']
    return 'https://arxiv.org' + pdf_url


def read_paper(paper_link):
    """Download the paper, OCR it, and return the extracted text."""
    print("Reading paper...")
    pdf_filename = 'paper.pdf'
    pdf_url = get_arxiv_pdf_url(paper_link)
    # Download and extraction are strictly sequential (extraction needs the
    # finished file), so plain calls replace the original executor round-trip.
    download_pdf(pdf_url, pdf_filename)
    text = extract_pdf_text(pdf_filename)
    os.remove(pdf_filename)
    return text


def convert_to_vectorstore(arxiv_url, api_key):
    """Build a FAISS vectorstore of OpenAI embeddings for an arXiv paper."""
    if not arxiv_url or not api_key:
        return None
    print("Converting to vectorstore...")
    # Round-trip the text through a temp file so UnstructuredFileLoader can
    # wrap it in LangChain Document objects.
    txtfile = "paper.txt"
    with open(txtfile, 'w') as f:
        f.write(read_paper(arxiv_url))
    loader = UnstructuredFileLoader(txtfile)
    raw_documents = loader.load()
    os.remove(txtfile)
    print("Loaded document")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    documents = text_splitter.split_documents(raw_documents)
    os.environ["OPENAI_API_KEY"] = api_key
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)
    os.environ["OPENAI_API_KEY"] = ""  # clear the key once embeddings are built
    return vectorstore
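

# A minimal usage sketch (not part of the original script): builds the index
# for one paper and runs a similarity search against it. The paper URL and
# query string below are placeholders; supply your own arXiv link and set
# OPENAI_API_KEY in the environment before running.
if __name__ == "__main__":
    paper_url = "https://arxiv.org/abs/1706.03762"   # placeholder arXiv link
    api_key = os.environ.get("OPENAI_API_KEY", "")   # supply your own key
    store = convert_to_vectorstore(paper_url, api_key)
    if store is not None:
        # similarity_search returns the k chunks closest to the query.
        for doc in store.similarity_search("What problem does the paper solve?", k=3):
            print(doc.page_content[:200])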