File size: 1,492 Bytes
8b091a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from pymongo import MongoClient
# Old import path, raises an error since Jan 2024: from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
# Old import path, raises an error since Jan 2024: from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
# Old import path, raises an error since Jan 2024: from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Ingestion script: load a PDF, split it into chunks, embed each chunk with
# OpenAI and store the chunks in a MongoDB collection through
# MongoDBAtlasVectorSearch.

# Connection settings are read from the environment so that no credentials
# live in the source file.
mongo_uri = os.getenv("MONGO_URI")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with a clear message. Without this check MongoClient(None) falls
# back to its default host instead of the intended cluster, and a missing API
# key could not be noticed before the embedding step, i.e. only after the PDF
# has already been loaded and split.
_missing = [
    name
    for name, value in (
        ("MONGO_URI", mongo_uri),
        ("OPENAI_API_KEY", openai_api_key),
    )
    if not value
]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )

client = MongoClient(mongo_uri)
dbName = "langchain_demo"
collectionName = "collection_of_text_blobs"
collection = client[dbName][collectionName]

# Load the PDF from its URL and split it into chunks (chunk_size=500, no
# overlap between consecutive chunks).
loader = PyPDFLoader("https://arxiv.org/pdf/2303.08774.pdf")
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
docs = text_splitter.split_documents(data)

# Insert the documents in MongoDB Atlas Vector Search. Note that the split
# chunks (`docs`), not the raw loader output (`data`), are what get stored.
# NOTE(review): disallowed_special=() presumably stops the tokenizer from
# rejecting special-token text found in the PDF - confirm against the
# langchain_openai documentation.
# NOTE(review): index_name="default" is assumed to match the search index
# defined on the Atlas collection - verify in the Atlas UI.
x = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key, disallowed_special=()),
    collection=collection,
    index_name="default",
)