momegas commited on
Commit
847491b
•
1 Parent(s): 4978dd5

💾 Adding a vectorstore (first implementation)

Browse files
Files changed (4) hide show
  1. .gitignore +3 -1
  2. example.ipynb +93 -21
  3. megabots/__init__.py +21 -14
  4. megabots/vectorstores.py +33 -0
.gitignore CHANGED
@@ -6,4 +6,6 @@ dist
6
  build
7
  **.pickle
8
  **.pkl
9
- .env
 
 
 
6
  build
7
  **.pickle
8
  **.pkl
9
+ .env
10
+ volumes
11
+ docker-compose.yml
example.ipynb CHANGED
@@ -1,31 +1,40 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
- "execution_count": 8,
6
  "metadata": {},
7
- "outputs": [
8
- {
9
- "data": {
10
- "text/plain": [
11
- "True"
12
- ]
13
- },
14
- "execution_count": 8,
15
- "metadata": {},
16
- "output_type": "execute_result"
17
- }
18
- ],
19
  "source": [
20
  "from megabots import bot\n",
21
- "from dotenv import load_dotenv\n",
 
 
 
 
 
 
 
 
 
 
22
  "\n",
23
- "load_dotenv()"
24
  ]
25
  },
26
  {
27
  "cell_type": "code",
28
- "execution_count": 9,
29
  "metadata": {},
30
  "outputs": [
31
  {
@@ -33,7 +42,7 @@
33
  "output_type": "stream",
34
  "text": [
35
  "Using model: gpt-3.5-turbo\n",
36
- "Loading path from disk...\n"
37
  ]
38
  },
39
  {
@@ -42,7 +51,7 @@
42
  "'The first roster of the Avengers included Iron Man, Thor, Hulk, Ant-Man, and the Wasp.'"
43
  ]
44
  },
45
- "execution_count": 9,
46
  "metadata": {},
47
  "output_type": "execute_result"
48
  }
@@ -52,9 +61,19 @@
52
  "qnabot.ask(\"what was the first roster of the avengers?\")"
53
  ]
54
  },
 
 
 
 
 
 
 
 
 
 
55
  {
56
  "cell_type": "code",
57
- "execution_count": 10,
58
  "metadata": {},
59
  "outputs": [
60
  {
@@ -62,7 +81,7 @@
62
  "output_type": "stream",
63
  "text": [
64
  "Using model: gpt-3.5-turbo\n",
65
- "Loading path from disk...\n"
66
  ]
67
  },
68
  {
@@ -71,7 +90,7 @@
71
  "\"Hmmm! Let me think about that... Ah yes, the original Avengers lineup included Iron Man, Thor, Hulk, Ant-Man, and the Wasp. They were like the ultimate superhero squad, except for maybe the Teenage Mutant Ninja Turtles. But let's be real, they were just a bunch of turtles who liked pizza.\""
72
  ]
73
  },
74
- "execution_count": 10,
75
  "metadata": {},
76
  "output_type": "execute_result"
77
  }
@@ -97,6 +116,59 @@
97
  ")\n",
98
  "qnabot.ask(\"what was the first roster of the avengers?\")\n"
99
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  }
101
  ],
102
  "metadata": {
 
1
  {
2
  "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Examples\n",
9
+ "\n",
10
+ "Below you can find some examples of how to use the 🤖 `Megabots` library."
11
+ ]
12
+ },
13
  {
14
  "cell_type": "code",
15
+ "execution_count": 13,
16
  "metadata": {},
17
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
18
  "source": [
19
  "from megabots import bot\n",
20
+ "from dotenv import load_dotenv"
21
+ ]
22
+ },
23
+ {
24
+ "attachments": {},
25
+ "cell_type": "markdown",
26
+ "metadata": {},
27
+ "source": [
28
+ "### Creating a bot\n",
29
+ "\n",
30
+ "The `bot` object is the main object of the library. It is used to create a bot and to interact with it.\n",
31
  "\n",
32
+ "The `index` argument specifies the index to use for the bot. It can either be a saved index file (e.g., `index.pkl`) or a directory of documents (e.g., `./index`). In the case of the directory the index will be automatically created. If no index is specified `bot` will look for `index.pkl` or `./index`"
33
  ]
34
  },
35
  {
36
  "cell_type": "code",
37
+ "execution_count": 14,
38
  "metadata": {},
39
  "outputs": [
40
  {
 
42
  "output_type": "stream",
43
  "text": [
44
  "Using model: gpt-3.5-turbo\n",
45
+ "Loading path from pickle file: ./index.pkl ...\n"
46
  ]
47
  },
48
  {
 
51
  "'The first roster of the Avengers included Iron Man, Thor, Hulk, Ant-Man, and the Wasp.'"
52
  ]
53
  },
54
+ "execution_count": 14,
55
  "metadata": {},
56
  "output_type": "execute_result"
57
  }
 
61
  "qnabot.ask(\"what was the first roster of the avengers?\")"
62
  ]
63
  },
64
+ {
65
+ "attachments": {},
66
+ "cell_type": "markdown",
67
+ "metadata": {},
68
+ "source": [
69
+ "### Changing the bot's prompt\n",
70
+ "\n",
71
+ "You can change the bots promnpt to customize it to your needs."
72
+ ]
73
+ },
74
  {
75
  "cell_type": "code",
76
+ "execution_count": 15,
77
  "metadata": {},
78
  "outputs": [
79
  {
 
81
  "output_type": "stream",
82
  "text": [
83
  "Using model: gpt-3.5-turbo\n",
84
+ "Loading path from pickle file: ./index.pkl ...\n"
85
  ]
86
  },
87
  {
 
90
  "\"Hmmm! Let me think about that... Ah yes, the original Avengers lineup included Iron Man, Thor, Hulk, Ant-Man, and the Wasp. They were like the ultimate superhero squad, except for maybe the Teenage Mutant Ninja Turtles. But let's be real, they were just a bunch of turtles who liked pizza.\""
91
  ]
92
  },
93
+ "execution_count": 15,
94
  "metadata": {},
95
  "output_type": "execute_result"
96
  }
 
116
  ")\n",
117
  "qnabot.ask(\"what was the first roster of the avengers?\")\n"
118
  ]
119
+ },
120
+ {
121
+ "attachments": {},
122
+ "cell_type": "markdown",
123
+ "metadata": {},
124
+ "source": [
125
+ "### Using Megabots with Milvus\n",
126
+ "\n",
127
+ "Megabots `bot` can also use Milvus as a backend for its search engine. You can find an example of how to do it below.\n",
128
+ "\n",
129
+ "In order to run Milvus you need to follow [this guide](https://milvus.io/docs/example_code.md) to download a docker compose file and run it.\n",
130
+ "The command is:\n",
131
+ " \n",
132
+ "```bash\n",
133
+ "wget https://raw.githubusercontent.com/milvus-io/pymilvus/v2.2.7/examples/hello_milvus.py\n",
134
+ "```\n",
135
+ "You can then [install Attu](https://milvus.io/docs/attu_install-docker.md) as a management tool for Milvus"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 11,
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "name": "stdout",
145
+ "output_type": "stream",
146
+ "text": [
147
+ "Using model: gpt-3.5-turbo\n"
148
+ ]
149
+ },
150
+ {
151
+ "data": {
152
+ "text/plain": [
153
+ "'The first roster of the Avengers included Iron Man, Thor, Hulk, Ant-Man, and the Wasp.'"
154
+ ]
155
+ },
156
+ "execution_count": 11,
157
+ "metadata": {},
158
+ "output_type": "execute_result"
159
+ }
160
+ ],
161
+ "source": [
162
+ "from megabots import bot, vectorstore\n",
163
+ "\n",
164
+ "# Create a vectorstore object. Default port is 19530 and default host is localhost\n",
165
+ "milvus = vectorstore(\"milvus\")\n",
166
+ "\n",
167
+ "# Point it to your files directory so that it can index the files and add them to the vectorstore\n",
168
+ "bot = bot(\"qna-over-docs\", index=\"./examples/files/\", vectorstore=milvus)\n",
169
+ "\n",
170
+ "bot.ask(\"what was the first roster of the avengers?\")\n"
171
+ ]
172
  }
173
  ],
174
  "metadata": {
megabots/__init__.py CHANGED
@@ -1,7 +1,7 @@
 
1
  from langchain.llms import OpenAI
2
  from langchain.chat_models import ChatOpenAI
3
  from langchain.embeddings import OpenAIEmbeddings
4
- from langchain.document_loaders import DirectoryLoader, S3DirectoryLoader
5
  from langchain.chains.qa_with_sources import load_qa_with_sources_chain
6
  from langchain.vectorstores.faiss import FAISS
7
  import gradio as gr
@@ -11,10 +11,9 @@ import os
11
  from dotenv import load_dotenv
12
  from langchain.prompts import PromptTemplate
13
  from langchain.chains.question_answering import load_qa_chain
14
- from langchain.chains.conversational_retrieval.prompts import (
15
- CONDENSE_QUESTION_PROMPT,
16
- QA_PROMPT,
17
- )
18
 
19
  load_dotenv()
20
 
@@ -25,15 +24,17 @@ class Bot:
25
  model: str | None = None,
26
  prompt_template: str | None = None,
27
  prompt_variables: list[str] | None = None,
28
- memory: str | None = None,
29
  index: str | None = None,
30
  sources: bool | None = False,
 
 
 
31
  verbose: bool = False,
32
  temperature: int = 0,
33
  ):
34
  self.select_model(model, temperature)
35
  self.create_loader(index)
36
- self.load_or_create_index(index)
37
 
38
  # Load the question-answering chain for the selected model
39
  self.chain = self.create_chain(
@@ -83,18 +84,25 @@ class Bot:
83
  )
84
  self.loader = DirectoryLoader(index, recursive=True)
85
 
86
- def load_or_create_index(self, index_path: str):
87
  # Load an existing index from disk or create a new one if not available
 
 
 
 
 
 
 
88
 
89
  # Is pickle
90
- if index_path is not None and "pkl" in index_path or "pickle" in index_path:
91
- print("Loading path from disk...")
92
- with open(index_path, "rb") as f:
93
  self.search_index = pickle.load(f)
94
  return
95
 
96
  # Is directory
97
- if index_path is not None and os.path.isdir(index_path):
98
  print("Creating index...")
99
  self.search_index = FAISS.from_documents(
100
  self.loader.load_and_split(), OpenAIEmbeddings()
@@ -125,9 +133,8 @@ SUPPORTED_TASKS = {
125
  "impl": Bot,
126
  "default": {
127
  "model": "gpt-3.5-turbo",
128
- "prompt": "",
129
  "temperature": 0,
130
- "index": "./files",
131
  },
132
  }
133
  }
 
1
+ from typing import Any
2
  from langchain.llms import OpenAI
3
  from langchain.chat_models import ChatOpenAI
4
  from langchain.embeddings import OpenAIEmbeddings
 
5
  from langchain.chains.qa_with_sources import load_qa_with_sources_chain
6
  from langchain.vectorstores.faiss import FAISS
7
  import gradio as gr
 
11
  from dotenv import load_dotenv
12
  from langchain.prompts import PromptTemplate
13
  from langchain.chains.question_answering import load_qa_chain
14
+ from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
15
+ from langchain.document_loaders import DirectoryLoader
16
+ from megabots.vectorstores import vectorstore
 
17
 
18
  load_dotenv()
19
 
 
24
  model: str | None = None,
25
  prompt_template: str | None = None,
26
  prompt_variables: list[str] | None = None,
 
27
  index: str | None = None,
28
  sources: bool | None = False,
29
+ # TODO: Fix this typing
30
+ vectorstore: Any | None = None,
31
+ memory: str | None = None,
32
  verbose: bool = False,
33
  temperature: int = 0,
34
  ):
35
  self.select_model(model, temperature)
36
  self.create_loader(index)
37
+ self.load_or_create_index(index, vectorstore)
38
 
39
  # Load the question-answering chain for the selected model
40
  self.chain = self.create_chain(
 
84
  )
85
  self.loader = DirectoryLoader(index, recursive=True)
86
 
87
+ def load_or_create_index(self, index: str, vectorstore=None):
88
  # Load an existing index from disk or create a new one if not available
89
+ if vectorstore is not None:
90
+ self.search_index = vectorstore.client.from_documents(
91
+ self.loader.load_and_split(),
92
+ OpenAIEmbeddings(),
93
+ connection_args={"host": vectorstore.host, "port": vectorstore.port},
94
+ )
95
+ return
96
 
97
  # Is pickle
98
+ if index is not None and "pkl" in index or "pickle" in index:
99
+ print("Loading path from pickle file: ", index, "...")
100
+ with open(index, "rb") as f:
101
  self.search_index = pickle.load(f)
102
  return
103
 
104
  # Is directory
105
+ if index is not None and os.path.isdir(index):
106
  print("Creating index...")
107
  self.search_index = FAISS.from_documents(
108
  self.loader.load_and_split(), OpenAIEmbeddings()
 
133
  "impl": Bot,
134
  "default": {
135
  "model": "gpt-3.5-turbo",
 
136
  "temperature": 0,
137
+ "index": "./index",
138
  },
139
  }
140
  }
megabots/vectorstores.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Type, TypeVar
2
+ from langchain.vectorstores import Milvus, Qdrant
3
+ from abc import ABC
4
+
5
+
6
class MilvusVectorStore:
    """Thin wrapper bundling Milvus connection settings with the client class."""

    def __init__(self, host: str, port: int):
        """Record the Milvus server location.

        Args:
            host: Hostname the Milvus server is reachable at.
            port: Port the Milvus server listens on.
        """
        # NOTE: this stores the langchain `Milvus` *class*, not an instance;
        # callers use it via classmethods (e.g. `client.from_documents(...)`,
        # passing `host`/`port` as connection args).
        self.client = Milvus
        self.host = host
        self.port = port
11
+
12
+
13
# Registry of vectorstore backends understood by `vectorstore()`.
# Each entry maps a backend name to its implementation class ("impl")
# and the default connection settings used when none are supplied.
SUPPORTED_VECTORSTORES = {
    "milvus": {
        "impl": MilvusVectorStore,
        "default": {"host": "localhost", "port": 19530},
    }
}
19
+
20
+
21
def vectorstore(
    name: str,
    host: str | None = None,
    port: int | None = None,
) -> MilvusVectorStore:
    """Return a vectorstore object for the given backend name.

    Args:
        name: Key into ``SUPPORTED_VECTORSTORES`` (e.g. ``"milvus"``).
        host: Optional host override; falls back to the backend's default.
        port: Optional port override; falls back to the backend's default.

    Returns:
        An instance of the backend's implementation class.

    Raises:
        RuntimeError: If ``name`` is ``None``.
        ValueError: If ``name`` is not a supported vectorstore.
    """
    if name is None:
        raise RuntimeError("Impossible to instantiate a vectorstore without a name.")

    if name not in SUPPORTED_VECTORSTORES:
        raise ValueError(f"Vectorstore {name} is not supported.")

    # Hoist the registry entry so we only index SUPPORTED_VECTORSTORES once.
    entry = SUPPORTED_VECTORSTORES[name]
    defaults = entry["default"]
    return entry["impl"](
        host=host if host is not None else defaults["host"],
        port=port if port is not None else defaults["port"],
    )