In [3]:
import os
import json

# Settings are read from this JSON file when it exists; otherwise the manual
# placeholder values in the else-branch below are used.
filePathToSettingsFile = '../../env/ai.json'

# Is there a settings file?
if os.path.exists(filePathToSettingsFile):
    # Yes there is so load settings from there
    print(f'Loading settings from {filePathToSettingsFile}')

    # The context manager closes the handle even when json.load raises.
    # The encoding is pinned to UTF-8 so parsing does not depend on the
    # platform's default text encoding.
    with open(filePathToSettingsFile, encoding='utf-8') as settingsFile:
        settingsJson = json.load(settingsFile)

    # Each top-level key becomes an environment variable. os.environ accepts
    # only str values, so the settings file is expected to hold strings.
    for key in settingsJson:
        os.environ[key] = settingsJson[key]

    # The parsed settings are not kept around once they have been exported.
    del settingsJson
else:
    # Set variables manually
    print('Setting variables manually as there is not ai.json settings file')

    # Update the variables below with your own settings
    os.environ['REQUESTS_CA_BUNDLE'] = '../../env/ZCert.pem'
    os.environ['HUGGING_FACE_API_KEY'] = 'Get here: https://huggingface.co/settings/tokens'
    os.environ['OPENAI_API_KEY'] = 'Get here: https://platform.openai.com/account/api-keys'
    os.environ["SERPAPI_API_KEY"] = 'serpapi KEY, Get here: https://serpapi.com/manage-api-key'

Loading settings from ../../env/ai.json


# Load data

In [4]:
# https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/Ask%20A%20Book%20Questions.ipynb
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [57]:
import glob
from pdfminer.high_level import extract_text

# Folder layout of the demo data. Every value keeps its trailing slash
# because the paths are assembled by plain string concatenation.
rootFolder = '../rag-demo-1-data/'
literatureFolder = 'literature/'
historyOfRomeFolder = 'history-roman/'

# Folder the cells below read from and write to.
currentFolder = rootFolder + literatureFolder


In [58]:
# Convert every PDF directly inside currentFolder to a text file written next
# to it ("<name>.pdf.txt"), then rename the PDF to "<name>.pdf.done" so it no
# longer matches the *.pdf pattern on a later run.
for filename in glob.glob(f"{currentFolder}*.pdf"):

    print(f'About to extract {filename}')
    try:
        text = extract_text(filename)
        # Drop every character that cannot be represented in ASCII.
        text = text.encode('ascii', errors='ignore').decode('ascii')

        textFilename = f'{filename}.txt'
        print(textFilename)
        # The text is pure ASCII at this point; naming the encoding keeps the
        # write independent of the platform's default text encoding.
        with open(textFilename, 'w', encoding='ascii') as f:
            f.write(text)

        # os.replace overwrites an existing "<name>.pdf.done" on every
        # platform. os.rename raises on Windows when the target exists, which
        # would leave the PDF in place to be extracted again on each run.
        os.replace(filename, f"{filename}.done")
    except Exception as err:
        # Best effort: report the failing file and carry on with the next one.
        print(f"Error with file {filename} {err}")

In [59]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

# Load the files under currentFolder that match the "**/*.txt" pattern,
# each one through TextLoader.
loader = DirectoryLoader(
    currentFolder,
    glob="**/*.txt",
    loader_cls=TextLoader,
)
docs = loader.load()

# Quick sanity check: number of loaded documents, then the length of the
# first document's text.
print(len(docs))
first_doc = docs[0]
print(len(first_doc.page_content))

In [61]:
# Split the loaded documents into smaller, overlapping chunks and report how
# many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(docs)
print(len(texts))

16596


In [62]:
# Spot-check the metadata of one chunk. Index 8000 is kept as the preferred
# sample, but it is clamped to the last chunk so that a smaller corpus does
# not raise IndexError.
if texts:
    sample_index = min(8000, len(texts) - 1)
    # print(texts[sample_index].page_content)  # uncomment to see the chunk text
    print(texts[sample_index].metadata)
else:
    print('No chunks available to inspect')

{'source': '..\\rag-demo-1-data\\literature\\moby-dick.pdf.txt'}


# Create embeddings

In [63]:
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import pinecone

# Embedding client. The OpenAI key is taken from the environment, so a
# missing OPENAI_API_KEY variable raises KeyError here.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

In [64]:
# Pinecone connection settings. Values are read from the environment so that
# the API key never has to be pasted into the notebook. When a variable is
# unset, the fallback is the same literal this cell previously hard-coded.
#   PINECONE_INDEX_NAME  - name of your pinecone index
#   PINECONE_API_KEY     - API key for the Pinecone project
#   PINECONE_ENVIRONMENT - Pinecone environment (default 'gcp-starter')
index_name = os.environ.get('PINECONE_INDEX_NAME', "")
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY', ''),
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter'),
)

In [65]:
# Build the Pinecone-backed vector store from the chunks in `texts`, using
# `embeddings` as the embedding client and `index_name` as the target index.
docsearch = Pinecone.from_documents(
    texts,
    embeddings,
    index_name=index_name,
)

In [66]:
# Ask the vector store for the five entries most similar to the question.
query = "What is moby dick?"
searchResult = docsearch.similarity_search(
    query,
    k=5,
)

In [67]:

# Show the first search result: the file it came from, then its text.
first_hit = searchResult[0]
first_hit_source = first_hit.metadata['source']
print(first_hit_source)
print(first_hit.page_content)

..\rag-demo-1-data\literature\moby-dick.pdf.txt
Moby Dick By Herman MelvilleDownload free eBooks of classic literature, books and 
novels at Planet eBook. Subscribe to our free eBooks blog 
and email newsletter.ETYMOLOGY.(Supplied by a Late Consumptive Usher to a Grammar 
School)The pale Usherthreadbare in coat, heart, body, and 
brain; I see him now. He was ever dusting his old lexicons 
and grammars, with a queer handkerchief, mockingly em-
bellished with all the gay flags of all the known nations of 
the world. He loved to dust his old grammars; it somehow 
mildly reminded him of his mortality.While you take in hand to school others, and to teach them 
by what name a whale-fish is to be called in our tongue 
leaving out, through ignorance, the letter H, which almost 
alone maketh the signification of the word, you deliver that 
which is not true. HACKLUYTWHALE. Sw. and Dan. HVAL. This animal is named from 
roundness or rolling; for in Dan. HVALT is arched or vaulted.
