Spaces:
Runtime error
Runtime error
File size: 2,761 Bytes
ef5b171 4341309 ef5b171 4341309 ef5b171 4341309 ef5b171 4341309 ef5b171 4341309 ef5b171 4341309 5d52ad8 ef5b171 4341309 ef5b171 4341309 ef5b171 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
import PyPDF2
from PyPDF2 import PdfReader
import pdfplumber
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import os
from dotenv import load_dotenv
# Load settings from a .env file (if one is present) into the process
# environment before any of them are read below.
load_dotenv()

# API key handed to OpenAIEmbeddings; None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        One string holding the text extracted from each page, in page order,
        with nothing inserted between pages. A PDF without pages gives ''.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction stays inside the `with` so the reader's underlying file
        # is still open. A single join replaces per-page `+=`, and `or ''`
        # keeps the join safe should a page yield None instead of a string.
        return ''.join(
            page.extract_text() or '' for page in pdf_reader.pages
        )
# Sentence-transformers model name used for the HuggingFace embeddings below.
model = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"

# Constructed at import time. No live code visible in this file uses it; it
# is only mentioned as an alternative to the OpenAI embeddings.
embeddings = HuggingFaceEmbeddings(model_name=model)
def save_to_vector_store(text, db_path=None, index_name="njmvc_Index"):
    """Chunk text, embed it with OpenAI and persist the result as a FAISS index.

    Args:
        text: Raw text to split and index.
        db_path: Folder the index is written to. When omitted, the
            module-level ``DB_FAISS_PATH`` is used if it is defined,
            otherwise ``'./vectorstore/db_faiss/'``.
        index_name: Index name forwarded to ``save_local``.
    """
    if db_path is None:
        # DB_FAISS_PATH may only be assigned when the file runs as a script;
        # a bare global lookup would raise NameError for importers, so fall
        # back to the same default location instead.
        db_path = globals().get("DB_FAISS_PATH", './vectorstore/db_faiss/')

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    docs = text_splitter.create_documents([text])

    # NOTE: the module-level HuggingFace `embeddings` object can be passed as
    # `embedding=` instead if OpenAI should not be used.
    embedding = OpenAIEmbeddings(
        model="text-embedding-ada-002",
        api_key=OPENAI_API_KEY,
    )
    vectorstore = FAISS.from_documents(documents=docs, embedding=embedding)

    # Writes the index files under `db_path`.
    vectorstore.save_local(db_path, index_name=index_name)
# Kept at module scope so save_to_vector_store can resolve it even when this
# file is imported rather than executed as a script.
DB_FAISS_PATH = './vectorstore/db_faiss/'

if __name__ == "__main__":
    # Extract the text of the PDF and index it.
    file_name = "./data/drivermanual-2-small.pdf"
    text = extract_text_from_pdf(file_name)
    save_to_vector_store(text)
|