Asaad Almutareb commited on
Commit
9237552
1 Parent(s): fa99d8f

corrected streaming callback handler

Browse files

replaced sentence-transformers with Embed4All from GPT4All
Updated requirements.txt, example.env and README to reflect this repo's settings

README.md CHANGED
@@ -1,37 +1,17 @@
1
- ---
2
- title: Innovation Pathfinder AI
3
- emoji: 🚀
4
- colorFrom: gray
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 4.2.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- # InnovationPathfinderAI
13
  GenAI Research Assistant for Innovation Labs
14
 
15
- ## Problem Statement
16
- In the age of the internet there is more information available than ever before. This is amazing,
17
- however it is difficult to manage all of this information in a central location. With out tool we
18
- want to enable people with the capable to discover and manage knowledge bases.
19
-
20
- ## Vector Store
21
- Documents are embedded and store inside of a Chroma vector store
22
-
23
- ## Agents
24
-
25
- with agents our application is able to discover and refine the information it collects based on
26
- the needs and sentiment of the user.
27
-
28
- ## Agent Tools
29
- The tools our agents have access to. More is being created
30
 
31
- - `embed_arvix_paper` This tool is able to add [arvix papers](https://arxiv.org/) to the Chroma Vector Store
 
32
 
33
- - `knowledgeBase_search` This tool is able to search the knowledge base generated by the user
34
 
35
- - `wikipedia_search` search wikipedia
 
36
 
37
- - `google_search` search google
 
 
 
1
+ # FastAPI Backend for InnovationPathfinderAI
 
 
 
 
 
 
 
 
 
 
 
2
  GenAI Research Assistant for Innovation Labs
3
 
4
+ ## Getting Started
5
+ To get started, follow these steps:
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ 1. Install the requirements:
8
+ pip install -r requirements.txt
9
 
10
+ 2. copy example.env to .env and add your API keys and variables
11
 
12
+ 3. Run uvicorn:
13
+ uvicorn app.main:app
14
 
15
+ ## ToDo
16
+ We are testing replacing sentence-transformers with GPT4All's Embed4All.
17
+ The code still needs to be cleaned up.
app/api/v1/agents/hf_mixtral_agent.py CHANGED
@@ -74,7 +74,7 @@ async def websocket_endpoint(websocket: WebSocket):
74
 
75
  await websocket.send_json(resp.model_dump())
76
  message_id: str = utils.generate_uuid()
77
- custom_handler = CustomFinalStreamingStdOutCallbackHandler(
78
  websocket, message_id=message_id
79
  )
80
 
 
74
 
75
  await websocket.send_json(resp.model_dump())
76
  message_id: str = utils.generate_uuid()
77
+ custom_handler = CustomAsyncCallbackHandler(
78
  websocket, message_id=message_id
79
  )
80
 
app/structured_tools/structured_tools.py CHANGED
@@ -8,6 +8,7 @@ from langchain_community.utilities import GoogleSearchAPIWrapper
8
  from langchain_community.embeddings.sentence_transformer import (
9
  SentenceTransformerEmbeddings,
10
  )
 
11
  from app.core.config import settings
12
  from langchain_community.vectorstores import Chroma
13
  import arxiv
@@ -51,10 +52,11 @@ def memory_search(query:str) -> str:
51
  collection_name = settings.CONVERSATION_COLLECTION_NAME
52
  #store using envar
53
 
54
- embedding_function = SentenceTransformerEmbeddings(
55
- model_name=settings.EMBEDDING_MODEL
56
- #model_name=os.getenv("EMBEDDING_MODEL"),
57
- )
 
58
 
59
  vector_db = Chroma(
60
  client=client, # client for Chroma
@@ -78,15 +80,16 @@ def knowledgeBase_search(query:str) -> str:
78
  collection_name="ArxivPapers"
79
  #store using envar
80
 
81
- embedding_function = SentenceTransformerEmbeddings(
82
- #model_name=os.getenv("EMBEDDING_MODEL"),
83
- model_name=settings.EMBEDDING_MODEL
84
- )
 
85
 
86
  vector_db = Chroma(
87
- client=client, # client for Chroma
88
- collection_name=collection_name,
89
- embedding_function=embedding_function,
90
  )
91
 
92
  retriever = vector_db.as_retriever()
@@ -153,11 +156,6 @@ def embed_arvix_paper(paper_id:str) -> None:
153
  collection_name="ArxivPapers"
154
  #store using envar
155
 
156
- embedding_function = SentenceTransformerEmbeddings(
157
- #model_name=os.getenv("EMBEDDING_MODEL"),
158
- model_name=settings.EMBEDDING_MODEL
159
- )
160
-
161
  full_path = os.path.join(pdf_directory, pdf_file_name)
162
 
163
  add_pdf_to_vector_store(
 
8
  from langchain_community.embeddings.sentence_transformer import (
9
  SentenceTransformerEmbeddings,
10
  )
11
+ from langchain_community.embeddings import GPT4AllEmbeddings
12
  from app.core.config import settings
13
  from langchain_community.vectorstores import Chroma
14
  import arxiv
 
52
  collection_name = settings.CONVERSATION_COLLECTION_NAME
53
  #store using envar
54
 
55
+ # embedding_function = SentenceTransformerEmbeddings(
56
+ # model_name=settings.EMBEDDING_MODEL
57
+ # #model_name=os.getenv("EMBEDDING_MODEL"),
58
+ # )
59
+ embedding_function = GPT4AllEmbeddings()
60
 
61
  vector_db = Chroma(
62
  client=client, # client for Chroma
 
80
  collection_name="ArxivPapers"
81
  #store using envar
82
 
83
+ # embedding_function = SentenceTransformerEmbeddings(
84
+ # #model_name=os.getenv("EMBEDDING_MODEL"),
85
+ # model_name=settings.EMBEDDING_MODEL
86
+ # )
87
+ embedding_function = GPT4AllEmbeddings()
88
 
89
  vector_db = Chroma(
90
+ client=client, # client for Chroma
91
+ collection_name=collection_name,
92
+ embedding_function=embedding_function,
93
  )
94
 
95
  retriever = vector_db.as_retriever()
 
156
  collection_name="ArxivPapers"
157
  #store using envar
158
 
 
 
 
 
 
159
  full_path = os.path.join(pdf_directory, pdf_file_name)
160
 
161
  add_pdf_to_vector_store(
app/vector_store/chroma_vector_store.py CHANGED
@@ -20,6 +20,7 @@ from langchain_community.vectorstores import Chroma
20
  from langchain_community.embeddings.sentence_transformer import (
21
  SentenceTransformerEmbeddings,
22
  )
 
23
  from app.utils.utils import (
24
  generate_uuid
25
  )
@@ -97,10 +98,11 @@ def add_markdown_to_collection(
97
  name=collection_name,
98
  )
99
 
100
- embedding_function = SentenceTransformerEmbeddings(
101
- #model_name=os.getenv("EMBEDDING_MODEL"),
102
- model_name=settings.EMBEDDING_MODEL
103
- )
 
104
 
105
  documents_page_content:list = [i.page_content for i in splits]
106
 
@@ -180,10 +182,11 @@ def add_pdf_to_vector_store(
180
  name=collection_name,
181
  )
182
 
183
- embedding_function = SentenceTransformerEmbeddings(
184
- #model_name=os.getenv("EMBEDDING_MODEL"),
185
- model_name=settings.EMBEDDING_MODEL
186
- )
 
187
 
188
  documents_page_content:list = [i.page_content for i in split_docs]
189
 
@@ -239,10 +242,11 @@ if __name__ == "__main__":
239
  collection_name="ArxivPapers"
240
 
241
  # create the open-source embedding function
242
- embedding_function = SentenceTransformerEmbeddings(
243
- #model_name=os.getenv("EMBEDDING_MODEL"),
244
- model_name=settings.EMBEDDING_MODEL
245
- )
 
246
 
247
  #method of integrating Chroma and Langchain
248
  vector_db = Chroma(
 
20
  from langchain_community.embeddings.sentence_transformer import (
21
  SentenceTransformerEmbeddings,
22
  )
23
+ from langchain_community.embeddings import GPT4AllEmbeddings
24
  from app.utils.utils import (
25
  generate_uuid
26
  )
 
98
  name=collection_name,
99
  )
100
 
101
+ # embedding_function = SentenceTransformerEmbeddings(
102
+ # #model_name=os.getenv("EMBEDDING_MODEL"),
103
+ # model_name=settings.EMBEDDING_MODEL
104
+ # )
105
+ embedding_function = GPT4AllEmbeddings()
106
 
107
  documents_page_content:list = [i.page_content for i in splits]
108
 
 
182
  name=collection_name,
183
  )
184
 
185
+ # embedding_function = SentenceTransformerEmbeddings(
186
+ # #model_name=os.getenv("EMBEDDING_MODEL"),
187
+ # model_name=settings.EMBEDDING_MODEL
188
+ # )
189
+ embedding_function = GPT4AllEmbeddings()
190
 
191
  documents_page_content:list = [i.page_content for i in split_docs]
192
 
 
242
  collection_name="ArxivPapers"
243
 
244
  # create the open-source embedding function
245
+ # embedding_function = SentenceTransformerEmbeddings(
246
+ # #model_name=os.getenv("EMBEDDING_MODEL"),
247
+ # model_name=settings.EMBEDDING_MODEL
248
+ # )
249
+ embedding_function = GPT4AllEmbeddings()
250
 
251
  #method of integrating Chroma and Langchain
252
  vector_db = Chroma(
example.env CHANGED
@@ -5,14 +5,27 @@ HUGGINGFACEHUB_API_TOKEN=
5
  OLLMA_BASE_URL=
6
 
7
  # environment variables needed to use tools
8
- SERPAPI_API_KEY=
 
 
 
 
9
 
10
  # for chromadb
11
- VECTOR_DATABASE_LOCATION="innovation_pathfinder_ai/knowledge_base/"
12
 
13
  # Name for the Conversation Memory Collection
14
  CONVERSATION_COLLECTION_NAME="ConversationMemory"
15
 
16
  EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
17
 
18
- SOURCES_CACHE="innovation_pathfinder_ai/database/sources_cache.sqlite3"
 
 
 
 
 
 
 
 
 
 
5
  OLLMA_BASE_URL=
6
 
7
  # environment variables needed to use tools
8
+ #SERPAPI_API_KEY=
9
+
10
+ # we are using Google Custom Search Engine now
11
+ GOOGLE_CSE_ID=
12
+ GOOGLE_API_KEY=
13
 
14
  # for chromadb
15
+ VECTOR_DATABASE_LOCATION="app/knowledge_base/"
16
 
17
  # Name for the Conversation Memory Collection
18
  CONVERSATION_COLLECTION_NAME="ConversationMemory"
19
 
20
  EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
21
 
22
+ SOURCES_CACHE="app/database/sources_cache.sqlite3"
23
+
24
+ # local cache
25
+ LOCAL_CACHE=".cache.db"
26
+
27
+ # project name
28
+ PROJECT_NAME=innovation_pathfinder_ai
29
+
30
+ # restricting access to the backend resources, for development it's set to * ('all')
31
+ BACKEND_CORS_ORIGINS=["*"]
requirements.txt CHANGED
@@ -2,10 +2,8 @@ langchain-community
2
  langchain
3
  google-search-results
4
  langchainhub
5
- text_generation
6
  arxiv
7
  wikipedia
8
- gradio==3.48.0
9
  chromadb
10
  google_api_python_client
11
  pypdf2
@@ -13,6 +11,6 @@ sqlmodel
13
  rich
14
  fastapi
15
  uvicorn
16
- sentence-transformers
17
- fastapi-pagination
18
- adaptive-cards-py
 
2
  langchain
3
  google-search-results
4
  langchainhub
 
5
  arxiv
6
  wikipedia
 
7
  chromadb
8
  google_api_python_client
9
  pypdf2
 
11
  rich
12
  fastapi
13
  uvicorn
14
+ adaptive-cards-py
15
+ pydantic_settings
16
+ gpt4all