satvikjain commited on
Commit
618bce1
1 Parent(s): cbe00ab

Optimized Pinecone indexing and added OOP structure

Browse files
Files changed (3) hide show
  1. app.py +2 -118
  2. chatbot.py +112 -0
  3. dependencies.py +13 -0
app.py CHANGED
@@ -1,124 +1,8 @@
1
- from langchain.text_splitter import CharacterTextSplitter
2
- from langchain_community.document_loaders import TextLoader
3
- from langchain_community.document_loaders import UnstructuredURLLoader
4
- from langchain_huggingface import HuggingFaceEmbeddings
5
- from langchain_groq import ChatGroq
6
- import langchain_community.vectorstores
7
- from pinecone import Pinecone, ServerlessSpec
8
- from dotenv import load_dotenv
9
- import os
10
- from langchain_core.prompts import PromptTemplate
11
- from langchain.schema.runnable import RunnablePassthrough
12
- from langchain.schema.output_parser import StrOutputParser
13
- import gradio as gr
14
-
15
- class ChatBot():
16
- load_dotenv()
17
- # loader = DirectoryLoader('data', glob="*.md")
18
- urls = [
19
- 'https://noqs.in/faqs/',
20
- 'https://noqs.in/',
21
- 'https://noqs.in/internships/'
22
- ]
23
-
24
- url_loader = UnstructuredURLLoader(urls=urls)
25
- url_data = url_loader.load()
26
-
27
- text_loader = TextLoader('data.txt', encoding = 'UTF-8')
28
- text_data = text_loader.load()
29
-
30
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=600)
31
-
32
- url_docs = text_splitter.split_documents(url_data)
33
- text_docs = text_splitter.split_documents(text_data)
34
- docs = url_docs + text_docs
35
-
36
- embeddings = HuggingFaceEmbeddings()
37
-
38
- load_dotenv()
39
- # Initialize Pinecone client
40
- pc = Pinecone(
41
- api_key=os.environ.get("PINECONE_API_KEY")
42
- )
43
-
44
- # Define Index Name
45
- index_name = "noqs-chatbot-with-web-content-dynamic"
46
-
47
- # Checking Index
48
- if index_name not in pc.list_indexes().names():
49
- # Create new Index
50
- pc.create_index(name=index_name,
51
- metric="cosine",
52
- dimension=768,
53
- spec=ServerlessSpec(
54
- cloud="aws",
55
- region="us-east-1"
56
- ))
57
-
58
- docsearch = langchain_community.vectorstores.Pinecone.from_documents(docs, embeddings, index_name=index_name)
59
- else:
60
- # Embed the documents
61
- pc.delete_index(index_name)
62
- pc.create_index(name=index_name,
63
- metric="cosine",
64
- dimension=768,
65
- spec=ServerlessSpec(
66
- cloud="aws",
67
- region="us-east-1"
68
- ))
69
-
70
- docsearch = langchain_community.vectorstores.Pinecone.from_documents(docs, embeddings, index_name=index_name)
71
-
72
- # Define the repo ID and connect to a model on Groq API
73
- model_id = "llama3-8b-8192"
74
- llm = ChatGroq(
75
- model=model_id,
76
- temperature=0.5,
77
- max_tokens=1200
78
- )
79
-
80
- # Creating prompt response template
81
- template = """
82
- You are a knowledgeable assistant for NoQs Digital.
83
- The Manager of our company is Mr. Adit Agarwal.
84
- Users will ask you questions about our company, and you must use the given context to answer their questions accurately.
85
- Follow these guidelines:
86
- Always base your answers on the provided context. Do not make up information.
87
- If the context does not contain the answer, simply say, "I don't know based on the provided information."
88
- Offer detailed and thorough responses, but stay relevant to the user's question.
89
- Maintain a professional tone in your responses.
90
- You have to answer like a Customer Care Specialist.
91
- If someone answer about program A just answer politelly that due to high demand we are currently not offering program A.
92
- Don't tell user that you are fetching the information from the context
93
-
94
- Context: {context}
95
-
96
- User Question: {question}
97
-
98
- Answer:
99
- """
100
-
101
-
102
- prompt = PromptTemplate(
103
- template=template,
104
- input_variables=["context", "question"]
105
- )
106
-
107
- # Chaining llm and prompt
108
- rag_chain = (
109
- {"context": docsearch.as_retriever(), "question": RunnablePassthrough()}
110
- | prompt
111
- | llm
112
- | StrOutputParser()
113
- )
114
 
115
  bot = ChatBot()
116
 
117
- # def chat_function(prompt):
118
- # user_input = prompt
119
- # result = bot.rag_chain.invoke(user_input)
120
- # return result
121
-
122
  def chat_function(prompts,history):
123
  user_input = prompts
124
  result = bot.rag_chain.invoke(user_input)
 
1
+ from dependencies import *
2
+ from chatbot import ChatBot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  bot = ChatBot()
5
 
 
 
 
 
 
6
  def chat_function(prompts,history):
7
  user_input = prompts
8
  result = bot.rag_chain.invoke(user_input)
chatbot.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dependencies import *
2
+
3
+ class ChatBot():
4
+ def __init__(self, data_change = False):
5
+ self.execute = data_change
6
+ self.start_loader()
7
+ self.start_embeddings()
8
+ self.init_model()
9
+
10
+ def start_loader(self):
11
+ load_dotenv()
12
+ # loader = DirectoryLoader('data', glob="*.md")
13
+ urls = [
14
+ 'https://noqs.in/faqs/',
15
+ 'https://noqs.in/',
16
+ 'https://noqs.in/internships/'
17
+ ]
18
+
19
+ url_loader = UnstructuredURLLoader(urls=urls)
20
+ url_data = url_loader.load()
21
+
22
+ text_loader = TextLoader('data.txt', encoding = 'UTF-8')
23
+ text_data = text_loader.load()
24
+
25
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=600)
26
+
27
+ url_docs = text_splitter.split_documents(url_data)
28
+ text_docs = text_splitter.split_documents(text_data)
29
+ self.docs = url_docs + text_docs
30
+
31
+ def start_embeddings(self):
32
+ embeddings = HuggingFaceEmbeddings()
33
+ load_dotenv()
34
+ # Initialize Pinecone client
35
+ pc = Pinecone(
36
+ api_key=os.environ.get("PINECONE_API_KEY")
37
+ )
38
+
39
+ # Define Index Name
40
+ index_name = "noqs-chatbot-with-web-content-dynamic"
41
+
42
+ # Checking Index
43
+ if index_name not in pc.list_indexes().names():
44
+ # Create new Index
45
+ pc.create_index(name=index_name,
46
+ metric="cosine",
47
+ dimension=768,
48
+ spec=ServerlessSpec(
49
+ cloud="aws",
50
+ region="us-east-1"
51
+ ))
52
+
53
+ docsearch = langchain_community.vectorstores.Pinecone.from_documents(self.docs, embeddings, index_name=index_name)
54
+ else:
55
+ # Embed the documents
56
+ if self.execute:
57
+ pc.delete_index(index_name)
58
+ pc.create_index(name=index_name,
59
+ metric="cosine",
60
+ dimension=768,
61
+ spec=ServerlessSpec(
62
+ cloud="aws",
63
+ region="us-east-1"
64
+ ))
65
+
66
+ self.docsearch = langchain_community.vectorstores.Pinecone.from_documents(self.docs, embeddings, index_name=index_name)
67
+ else:
68
+ self.docsearch = langchain_community.vectorstores.Pinecone.from_existing_index(embedding=embeddings, index_name=index_name)
69
+
70
+ def init_model(self):
71
+ # Define the repo ID and connect to a model on Groq API
72
+ model_id = "llama3-8b-8192"
73
+ llm = ChatGroq(
74
+ model=model_id,
75
+ temperature=0.5,
76
+ max_tokens=1200
77
+ )
78
+
79
+ # Creating prompt response template
80
+ template = """
81
+ You are a knowledgeable assistant for NoQs Digital.
82
+ The Manager of our company is Mr. Adit Agarwal.
83
+ Users will ask you questions about our company, and you must use the given context to answer their questions accurately.
84
+ Follow these guidelines:
85
+ Always base your answers on the provided context. Do not make up information.
86
+ If the context does not contain the answer, simply say, "I don't know based on the provided information."
87
+ Offer detailed and thorough responses, but stay relevant to the user's question.
88
+ Maintain a professional tone in your responses.
89
+ You have to answer like a Customer Care Specialist.
90
+ If someone answer about program A just answer politelly that due to high demand we are currently not offering program A.
91
+ Don't tell user that you are fetching the information from the context
92
+
93
+ Context: {context}
94
+
95
+ User Question: {question}
96
+
97
+ Answer:
98
+ """
99
+
100
+
101
+ prompt = PromptTemplate(
102
+ template=template,
103
+ input_variables=["context", "question"]
104
+ )
105
+
106
+ # Chaining llm and prompt
107
+ self.rag_chain = (
108
+ {"context": self.docsearch.as_retriever(), "question": RunnablePassthrough()}
109
+ | prompt
110
+ | llm
111
+ | StrOutputParser()
112
+ )
dependencies.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import CharacterTextSplitter
2
+ from langchain_community.document_loaders import TextLoader
3
+ from langchain_community.document_loaders import UnstructuredURLLoader
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ from langchain_groq import ChatGroq
6
+ import langchain_community.vectorstores
7
+ from pinecone import Pinecone, ServerlessSpec
8
+ from dotenv import load_dotenv
9
+ import os
10
+ from langchain_core.prompts import PromptTemplate
11
+ from langchain.schema.runnable import RunnablePassthrough
12
+ from langchain.schema.output_parser import StrOutputParser
13
+ import gradio as gr