# Spaces: Runtime error
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
from langchain_community.vectorstores import FAISS | |
from langchain_community.document_loaders import TextLoader | |
from langchain_openai import OpenAIEmbeddings | |
from langchain_community.embeddings import HuggingFaceEmbeddings | |
import PyPDF2 | |
from PyPDF2 import PdfReader | |
import pdfplumber | |
from PIL import Image | |
import pytesseract | |
from pdf2image import convert_from_path | |
from pdfminer.high_level import extract_pages, extract_text | |
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure | |
import os | |
from dotenv import load_dotenv | |
# Load variables from a .env file (if one is found) into the process
# environment so the lookup below can see them.
load_dotenv()

# os.getenv returns None when the variable is not set; the value is passed
# to OpenAIEmbeddings in save_to_vector_store().
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Pages are read in document order and their text is concatenated
    directly, with no separator inserted between pages.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages ('' for a PDF with no pages).

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must finish while the file is still open, because the
        # reader pulls page data from the underlying stream on demand.
        # `or ''` keeps the join safe if a page yields a falsy result
        # instead of a string.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
# Sentence-transformers model name for the HuggingFace embedding backend.
model = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"

# Built once at import time. In this file it is referenced only by the
# commented-out alternative in save_to_vector_store(); the active code path
# there uses OpenAIEmbeddings instead.
embeddings = HuggingFaceEmbeddings(model_name=model)
def save_to_vector_store(text, db_path=None, index_name="njmvc_Index"):
    """Split text into chunks, embed them, and save a FAISS index to disk.

    Args:
        text: The full text to index.
        db_path: Folder the index is written to. When omitted, the
            module-level DB_FAISS_PATH is used if it has been defined,
            otherwise './vectorstore/db_faiss/'.
        index_name: Base name of the saved index files.

    Raises:
        ValueError: If text is empty or contains only whitespace.
    """
    # Fail early with a clear message: empty input would produce no chunks,
    # and there would be nothing to embed or index.
    if not text or not text.strip():
        raise ValueError("No text to index: input is empty or whitespace-only.")

    # DB_FAISS_PATH is only assigned when this file runs as a script, so it
    # is looked up defensively rather than referenced as a bare global
    # (which raises NameError when the function is called from an import).
    if db_path is None:
        db_path = globals().get("DB_FAISS_PATH", "./vectorstore/db_faiss/")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    docs = text_splitter.create_documents([text])

    # To embed locally instead of calling OpenAI, pass the module-level
    # HuggingFace `embeddings` object as `embedding=` here.
    vectorstore = FAISS.from_documents(
        documents=docs,
        embedding=OpenAIEmbeddings(
            model="text-embedding-ada-002",
            api_key=OPENAI_API_KEY,
        ),
    )

    # Writes the index files under db_path (the vectorstore folder).
    vectorstore.save_local(db_path, index_name=index_name)
if __name__ == "__main__":
    # Assigned at module level on purpose: save_to_vector_store() reads this
    # name to decide where the index is written, so it must exist before
    # the call below.
    DB_FAISS_PATH = './vectorstore/db_faiss/'
    file_name = "./data/drivermanual-2-small.pdf"

    # Extract the manual's text, then chunk, embed, and persist it.
    text = extract_text_from_pdf(file_name)
    save_to_vector_store(text)