Ganesh43 committed on
Commit
5e06873
1 Parent(s): fc6435f

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +74 -0
utils.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ #The below import has been replaced by the later one
3
+ #from langchain.vectorstores import Pinecone
4
+ from langchain_community.vectorstores import Pinecone
5
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
6
+ # Pinecone has made some changes recently, so its client now has to be imported as shown below.
7
+ from pinecone import Pinecone as PineconeClient
8
+ import asyncio
9
+ from langchain.document_loaders.sitemap import SitemapLoader
10
+
11
+
12
+ #Function to fetch data from website
13
+ #https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/sitemap
14
# URL that the module passes to get_website_data().
# NOTE(review): this looks like a job-search results page rather than a
# sitemap XML document - confirm it is what SitemapLoader expects.
sitemap_url = "https://www.accenture.com/in-en/careers/jobsearch?jk=Data&sb=0&vw=1&is_rj=0&pg=1&jt=entry-level%20job"
15
def get_website_data(sitemap_url):
    """Load documents for *sitemap_url* using ``SitemapLoader``.

    A fresh asyncio event loop is created and installed as the current loop
    before the loader is constructed.
    NOTE(review): presumably this lets the loader's async fetching work when
    the calling thread has no event loop of its own - confirm.

    Args:
        sitemap_url: URL passed unchanged to ``SitemapLoader``.

    Returns:
        The result of ``SitemapLoader(sitemap_url).load()``.
    """
    # Install a brand-new loop first; the loader is only built afterwards,
    # matching the original statement order.
    asyncio.set_event_loop(asyncio.new_event_loop())
    return SitemapLoader(sitemap_url).load()
24
# Module-level run: load the documents for ``sitemap_url`` and print how many
# came back.
# NOTE(review): this executes (including the fetch) every time the module is
# imported - confirm that is intended for a utilities module.
d = get_website_data(sitemap_url)
print(len(d))
26
+ #Function to split data into smaller chunks
27
def split_data(docs, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller, overlapping chunks.

    Args:
        docs: Documents to split; handed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Target maximum size of each chunk, measured with ``len``.
            Defaults to 1000, the value that used to be hard-coded.
        chunk_overlap: Overlap between consecutive chunks, measured with
            ``len``. Defaults to 200, the value that used to be hard-coded.

    Returns:
        The chunked documents produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(docs)
37
+
38
+ #Function to create embeddings instance
39
def create_embeddings(model_name="all-MiniLM-L6-v2"):
    """Create a ``SentenceTransformerEmbeddings`` instance.

    Args:
        model_name: Name of the sentence-transformers model to use. Defaults
            to ``"all-MiniLM-L6-v2"``, the value that used to be hard-coded,
            so existing zero-argument callers are unaffected.

    Returns:
        The constructed ``SentenceTransformerEmbeddings`` object.
    """
    return SentenceTransformerEmbeddings(model_name=model_name)
43
+
44
+ #Function to push data to Pinecone
45
def push_to_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, docs):
    """Embed *docs* and store them in the Pinecone index *pinecone_index_name*.

    Args:
        pinecone_apikey: Pinecone API key.
        pinecone_environment: Pinecone environment name, forwarded to the
            client constructor.
        pinecone_index_name: Name of the target index.
        embeddings: Embeddings object used to vectorise the documents.
        docs: Documents to store.

    Returns:
        The LangChain ``Pinecone`` vector store returned by
        ``Pinecone.from_documents``.
    """
    import os  # local import so this function does not depend on file-level changes

    # The client constructed below is never handed to the LangChain vector
    # store; per the LangChain docs the store builds its own client and takes
    # the key from the PINECONE_API_KEY environment variable. Export the
    # caller's key so it is the one actually used. An empty/None key leaves
    # any existing environment value untouched.
    if pinecone_apikey:
        os.environ["PINECONE_API_KEY"] = pinecone_apikey

    # Kept from the original code; the resulting client object is discarded.
    PineconeClient(
        api_key=pinecone_apikey,
        environment=pinecone_environment,
    )

    # ``Pinecone`` here is the LangChain vector store class, while
    # ``PineconeClient`` is the alias for the pinecone package's client.
    return Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)
56
+
57
+ #Function to pull index data from Pinecone
58
def pull_from_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings):
    """Open the Pinecone index *pinecone_index_name* as a vector store.

    Args:
        pinecone_apikey: Pinecone API key.
        pinecone_environment: Pinecone environment name, forwarded to the
            client constructor.
        pinecone_index_name: Name of the index to open.
        embeddings: Embeddings object the vector store uses for queries.

    Returns:
        The LangChain ``Pinecone`` vector store returned by
        ``Pinecone.from_existing_index``.
    """
    import os  # local import so this function does not depend on file-level changes

    # The client constructed below is never handed to the LangChain vector
    # store; per the LangChain docs the store builds its own client and takes
    # the key from the PINECONE_API_KEY environment variable. Export the
    # caller's key so it is the one actually used. An empty/None key leaves
    # any existing environment value untouched.
    if pinecone_apikey:
        os.environ["PINECONE_API_KEY"] = pinecone_apikey

    # Kept from the original code; the resulting client object is discarded.
    PineconeClient(
        api_key=pinecone_apikey,
        environment=pinecone_environment,
    )

    # ``Pinecone`` here is the LangChain vector store class, while
    # ``PineconeClient`` is the alias for the pinecone package's client.
    return Pinecone.from_existing_index(pinecone_index_name, embeddings)
69
+
70
+ #This function fetches the top relevant documents from our vector store - the Pinecone index
71
def get_similar_docs(index, query, k=2):
    """Return the *k* documents in *index* most similar to *query*.

    Args:
        index: Object exposing ``similarity_search(query, k=...)``.
        query: Query passed through to the search.
        k: Number of results requested (default 2).

    Returns:
        Whatever ``index.similarity_search`` returns for the query.
    """
    return index.similarity_search(query, k=k)