# Brunarize / ingestion.py
# Uploaded by lemesdaniel (commit e00b837, verified): "Upload folder using huggingface_hub"
# File size: 1.38 kB
import os
from pinecone import Pinecone as PineconeClient, ServerlessSpec
from langchain_community.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv, find_dotenv
# Ingestion script: load a PDF, split it into chunks, embed the chunks with
# OpenAI embeddings, store them in a Pinecone serverless index, and finally run
# a single similarity query as a smoke test.

# Populate the process environment from the .env file located by find_dotenv().
load_dotenv(find_dotenv(), override=True)

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
# NOTE(review): PINECONE_ENV is read here but never used below; it is kept so
# the module-level name remains available.
PINECONE_ENV = os.environ.get('PINECONE_ENV')

embeddings = OpenAIEmbeddings()

# Read the source PDF and split the loaded documents with chunk_size=1000 and
# no overlap between consecutive chunks.
loader = PyPDFLoader("docs/M92TB4_2023-24_online.pdf")
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

pinecone = PineconeClient(api_key=PINECONE_API_KEY)

index_name = "linuxtips"

# Create the index only when it is not already listed, so re-running the script
# does not attempt to create it twice.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        # NOTE(review): presumably the vector size produced by the default
        # OpenAIEmbeddings model -- confirm if the embedding model changes.
        dimension=1536,
        metric='euclidean',
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# NOTE(review): this handle is not used below; kept so the module-level name
# `index` remains available.
index = pinecone.Index(index_name)

# Pass the Document objects themselves (not just their page_content) so each
# chunk's metadata is stored next to its vector instead of being discarded.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

# Test
query = "Assistant, please tell me what are the main functions of an autarchy department"
docs = docsearch.similarity_search(query)
# The search may return no hits; avoid an IndexError on docs[0] in that case.
if docs:
    print(docs[0].page_content)
else:
    print("No matching documents were returned for the test query.")