File size: 2,761 Bytes
ef5b171
4341309
ef5b171
4341309
ef5b171
4341309
 
ef5b171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4341309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef5b171
 
4341309
 
 
 
 
 
 
ef5b171
 
 
4341309
5d52ad8
ef5b171
4341309
 
ef5b171
 
 
4341309
ef5b171
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

import PyPDF2
from PyPDF2 import PdfReader
import pdfplumber
from PIL import Image
import pytesseract
from pdf2image import convert_from_path

from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure

import os
from dotenv import load_dotenv

# Call python-dotenv's load_dotenv() first so that the os.getenv lookup below
# runs after it (presumably it loads variables from a local .env file —
# confirm against the project setup).
load_dotenv()
# API key handed to OpenAIEmbeddings in save_to_vector_store.
# os.getenv returns None when the variable is not set.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        One string containing the extracted text of all pages, joined in
        page order with no separator inserted between pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file handle is still open.
        # `or ''` treats a falsy/None extraction result as empty text so the
        # join below cannot fail on a page that yields no text.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    # A single join avoids building the result with repeated `+=`.
    return ''.join(page_texts)

# Name of the sentence-transformers model passed to HuggingFaceEmbeddings.
model = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
# NOTE(review): this object is constructed at import time, and no active
# (uncommented) code visible in this file references `embeddings` — confirm
# whether another module imports it before removing or deferring it.
embeddings = HuggingFaceEmbeddings(model_name = model)

def save_to_vector_store(text, db_path=None, index_name="njmvc_Index"):
    """Split text into chunks, embed them with OpenAI, and save a FAISS index.

    Args:
        text: The full text to index.
        db_path: Folder the FAISS index is saved to. When omitted, the
            module-level ``DB_FAISS_PATH`` is used if it has been defined
            (it is set when this file runs as a script); otherwise the
            script's default ``'./vectorstore/db_faiss/'`` is used.
        index_name: Name passed to ``save_local`` for the stored index.

    Raises:
        ValueError: If splitting ``text`` produces no chunks.
    """
    if db_path is None:
        # DB_FAISS_PATH is only bound inside the ``__main__`` block, so look
        # it up defensively: a bare reference raises NameError when this
        # function is called after importing the module.
        db_path = globals().get("DB_FAISS_PATH", './vectorstore/db_faiss/')

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    docs = text_splitter.create_documents([text])
    if not docs:
        # Fail with a clear message instead of passing an empty document
        # list on to the embedding / index construction step.
        raise ValueError("No text chunks to index: the input text is empty.")

    embedding = OpenAIEmbeddings(
        model="text-embedding-ada-002",
        api_key=OPENAI_API_KEY,
    )
    vectorstore = FAISS.from_documents(documents=docs, embedding=embedding)
    vectorstore.save_local(db_path, index_name=index_name)
# Create a new directory named vectorstore in your current directory.
if __name__ == "__main__":
    # Script entry point: extract the text of the sample PDF and index it.
    # DB_FAISS_PATH stays a module-level name because save_to_vector_store
    # looks it up to decide where the index is written.
    DB_FAISS_PATH = './vectorstore/db_faiss/'
    file_name = "./data/drivermanual-2-small.pdf"
    save_to_vector_store(extract_text_from_pdf(file_name))