|
|
|
import os |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from pdf2image import convert_from_path |
|
import pytesseract |
|
import pickle |
|
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.document_loaders import UnstructuredFileLoader |
|
from langchain.vectorstores.faiss import FAISS |
|
from langchain.embeddings import OpenAIEmbeddings |
|
|
|
def download_pdf(url, filename):
    """Download ``url`` to the local path ``filename``, streaming in 8 KiB chunks.

    Raises requests.HTTPError on a non-2xx response instead of silently
    saving an HTML error page as the "PDF".
    """
    print("Downloading pdf...")
    # `with` ensures the streamed connection is released even on error.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()  # fail fast on 404/500 etc.
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
|
|
|
def extract_pdf_text(filename):
    """OCR every page of the PDF at ``filename`` and return the combined text.

    Renders each page to an image with pdf2image, then runs Tesseract OCR
    on each page via pytesseract. Requires the `tesseract` binary and
    poppler to be installed on the system.
    """
    print("Extracting text from pdf...")
    # Point pytesseract at the `tesseract` binary on PATH.
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
    images = convert_from_path(filename)
    # join() instead of repeated `+=` — avoids quadratic string rebuilding
    # on long papers with many pages.
    return "".join(pytesseract.image_to_string(image) for image in images)
|
|
|
def get_arxiv_pdf_url(paper_link):
    """Resolve an arXiv paper link to a direct PDF URL.

    A link that already ends in ``.pdf`` is returned unchanged; otherwise the
    abstract page is fetched and its "Download PDF" anchor is scraped.

    Raises:
        requests.HTTPError: if the abstract page request fails.
        ValueError: if no PDF download link can be found on the page
            (previously this crashed with an opaque TypeError when
            ``soup.find`` returned None).
    """
    if paper_link.endswith('.pdf'):
        return paper_link
    print("Getting pdf url...")
    response = requests.get(paper_link)
    response.raise_for_status()  # don't try to scrape an error page
    soup = BeautifulSoup(response.text, 'html.parser')
    anchor = soup.find('a', {'class': 'mobile-submission-download'})
    if anchor is None or not anchor.get('href'):
        raise ValueError(f"Could not find a PDF download link on {paper_link}")
    # The scraped href is site-relative, e.g. "/pdf/1234.5678".
    return 'https://arxiv.org' + anchor['href']
|
|
|
def read_paper(paper_link):
    """Download the paper behind ``paper_link``, OCR it, and return its text.

    Uses a fixed temporary filename ``paper.pdf`` in the working directory,
    which is always removed afterwards — including when OCR fails (the
    original leaked the file on any extract_pdf_text exception).
    """
    print("Reading paper...")
    pdf_filename = 'paper.pdf'
    pdf_url = get_arxiv_pdf_url(paper_link)
    download_pdf(pdf_url, pdf_filename)
    try:
        text = extract_pdf_text(pdf_filename)
    finally:
        os.remove(pdf_filename)  # clean up the temp PDF even on failure
    return text
|
|
|
def convert_to_vectorstore(arxiv_url, api_key):
    """Build a FAISS vectorstore from the text of an arXiv paper.

    Args:
        arxiv_url: Link to the paper (abstract page or direct .pdf URL).
        api_key: OpenAI API key used for the embeddings.

    Returns:
        A FAISS vectorstore over ~2000-char chunks of the paper text, or
        None if either argument is falsy.
    """
    if not arxiv_url or not api_key:
        return None
    print("Converting to vectorstore...")
    txtfile = "paper.txt"
    with open(txtfile, 'w', encoding='utf-8') as f:
        f.write(read_paper(arxiv_url))

    try:
        loader = UnstructuredFileLoader(txtfile)
        raw_documents = loader.load()
    finally:
        os.remove(txtfile)  # don't leak the temp file if loading fails
    print("Loaded document")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    documents = text_splitter.split_documents(raw_documents)

    # Keep the key set through FAISS.from_documents — that is the call that
    # actually issues the embedding API requests. The original blanked the
    # key before it, and also clobbered any pre-existing key with "".
    previous_key = os.environ.get("OPENAI_API_KEY")
    os.environ["OPENAI_API_KEY"] = api_key
    try:
        embeddings = OpenAIEmbeddings()
        vectorstore = FAISS.from_documents(documents, embeddings)
    finally:
        # Restore the caller's environment instead of wiping it.
        if previous_key is None:
            os.environ.pop("OPENAI_API_KEY", None)
        else:
            os.environ["OPENAI_API_KEY"] = previous_key

    return vectorstore