Asankhaya Sharma commited on
Commit
5cbd50f
β€’
1 Parent(s): 4e00df7
Files changed (2) hide show
  1. README.md +174 -13
  2. app.py +135 -0
README.md CHANGED
@@ -1,13 +1,174 @@
1
- ---
2
- title: MeraKB
3
- emoji: πŸ“š
4
- colorFrom: purple
5
- colorTo: red
6
- sdk: streamlit
7
- sdk_version: 1.27.1
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Quivr
2
+
3
+ <p align="center">
4
+ <img src="../logo.png" alt="Quivr-logo" width="30%">
5
+ <p align="center">
6
+
7
+ <a href="https://discord.gg/HUpRgp2HG8">
8
+ <img src="https://img.shields.io/badge/discord-join%20chat-blue.svg" alt="Join our Discord" height="40">
9
+ </a>
10
+
11
+ Quivr is your second brain in the cloud, designed to easily store and retrieve unstructured information. It's like Obsidian but powered by generative AI.
12
+
13
+ ## Features
14
+
15
+ - **Store Anything**: Quivr can handle almost any type of data you throw at it. Text, images, code snippets, you name it.
16
+ - **Generative AI**: Quivr uses advanced AI to help you generate and retrieve information.
17
+ - **Fast and Efficient**: Designed with speed and efficiency in mind. Quivr makes sure you can access your data as quickly as possible.
18
+ - **Secure**: Your data is stored securely in the cloud and is always under your control.
19
+ - **Compatible Files**:
20
+ - **Text**
21
+ - **Markdown**
22
+ - **PDF**
23
+ - **Audio**
24
+ - **Video**
25
+ - **Open Source**: Quivr is open source and free to use.
26
+ ## Demo
27
+
28
+
29
+ ### Demo with GPT3.5
30
+ https://github.com/StanGirard/quivr/assets/19614572/80721777-2313-468f-b75e-09379f694653
31
+
32
+
33
+ ### Demo with Claude 100k context
34
+ https://github.com/StanGirard/quivr/assets/5101573/9dba918c-9032-4c8d-9eea-94336d2c8bd4
35
+
36
+ ## Getting Started
37
+
38
+ These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
39
+
40
+ ### Prerequisites
41
+
42
+ Make sure you have the following installed before continuing:
43
+
44
+ - Python 3.10 or higher
45
+ - Pip
46
+ - Virtualenv
47
+
48
+ You'll also need a [Supabase](https://supabase.com/) account for:
49
+
50
+ - A new Supabase project
51
+ - Supabase Project API key
52
+ - Supabase Project URL
53
+
54
+ ### Installing
55
+
56
+ - Clone the repository
57
+
58
+ ```bash
59
+ git clone git@github.com:StanGirard/Quivr.git && cd Quivr
60
+ ```
61
+
62
+ - Create a virtual environment
63
+
64
+ ```bash
65
+ virtualenv venv
66
+ ```
67
+
68
+ - Activate the virtual environment
69
+
70
+ ```bash
71
+ source venv/bin/activate
72
+ ```
73
+
74
+ - Install the dependencies
75
+
76
+ ```bash
77
+ pip install -r requirements.txt
78
+ ```
79
+
80
+ - Copy the streamlit secrets.toml example file
81
+
82
+ ```bash
83
+ cp .streamlit/secrets.toml.example .streamlit/secrets.toml
84
+ ```
85
+
86
+ - Add your credentials to .streamlit/secrets.toml file
87
+
88
+ ```toml
89
+ supabase_url = "SUPABASE_URL"
90
+ supabase_service_key = "SUPABASE_SERVICE_KEY"
91
+ openai_api_key = "OPENAI_API_KEY"
92
+ anthropic_api_key = "ANTHROPIC_API_KEY" # Optional
93
+ ```
94
+
95
+ _Note that the `supabase_service_key` is found in your Supabase dashboard under Project Settings -> API. Use the `service_role` `secret` key found in the `Project API keys` section._
96
+
97
+ - Run the following migration scripts on the Supabase database via the web interface (SQL Editor -> `New query`)
98
+
99
+ ```sql
100
+ -- Enable the pgvector extension to work with embedding vectors
101
+ create extension vector;
102
+
103
+ -- Create a table to store your documents
104
+ create table documents (
105
+ id bigserial primary key,
106
+ content text, -- corresponds to Document.pageContent
107
+ metadata jsonb, -- corresponds to Document.metadata
108
+ embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed
109
+ );
110
+
111
+ CREATE FUNCTION match_documents(query_embedding vector(1536), match_count int)
112
+ RETURNS TABLE(
113
+ id bigint,
114
+ content text,
115
+ metadata jsonb,
116
+ -- we return matched vectors to enable maximal marginal relevance searches
117
+ embedding vector(1536),
118
+ similarity float)
119
+ LANGUAGE plpgsql
120
+ AS $$
121
+ # variable_conflict use_column
122
+ BEGIN
123
+ RETURN query
124
+ SELECT
125
+ id,
126
+ content,
127
+ metadata,
128
+ embedding,
129
+ 1 -(documents.embedding <=> query_embedding) AS similarity
130
+ FROM
131
+ documents
132
+ ORDER BY
133
+ documents.embedding <=> query_embedding
134
+ LIMIT match_count;
135
+ END;
136
+ $$;
137
+ ```
138
+
139
+ and
140
+
141
+ ```sql
142
+ create table
143
+ stats (
144
+ -- A column called "time" with data type "timestamp"
145
+ time timestamp,
146
+ -- A column called "details" with data type "text"
147
+ chat boolean,
148
+ embedding boolean,
149
+ details text,
150
+ metadata jsonb,
151
+ -- An "integer" primary key column called "id" that is generated always as identity
152
+ id integer primary key generated always as identity
153
+ );
154
+ ```
155
+
156
+ - Run the app
157
+
158
+ ```bash
159
+ streamlit run app.py
160
+ ```
161
+
162
+ ## Built With
163
+
164
+ * [Streamlit](https://streamlit.io/) - The web app framework used.
165
+ * [FastAPI](https://fastapi.tiangolo.com/) - The API framework used.
166
+ * [Supabase](https://supabase.io/) - The open source Firebase alternative.
167
+
168
+ ## Contributing
169
+
170
+ Open a pull request and we'll review it as soon as possible.
171
+
172
+ ## Star History
173
+
174
+ [![Star History Chart](https://api.star-history.com/svg?repos=StanGirard/quivr&type=Date)](https://star-history.com/#StanGirard/quivr&Date)
app.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+ import os
3
+ import tempfile
4
+
5
+ import streamlit as st
6
+ from files import file_uploader, url_uploader
7
+ from question import chat_with_doc
8
+ from brain import brain
9
+ from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
10
+ from langchain.vectorstores import SupabaseVectorStore
11
+ from supabase import Client, create_client
12
+ from explorer import view_document
13
+ from stats import get_usage_today
14
+
15
+ supabase_url = st.secrets.supabase_url
16
+ supabase_key = st.secrets.supabase_service_key
17
+ openai_api_key = st.secrets.openai_api_key
18
+ anthropic_api_key = st.secrets.anthropic_api_key
19
+ hf_api_key = st.secrets.hf_api_key
20
+ supabase: Client = create_client(supabase_url, supabase_key)
21
+ self_hosted = st.secrets.self_hosted
22
+
23
+ # embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
24
+
25
+
26
+ embeddings = HuggingFaceInferenceAPIEmbeddings(
27
+ api_key=hf_api_key,
28
+ model_name="BAAI/bge-large-en-v1.5"
29
+ )
30
+
31
+ vector_store = SupabaseVectorStore(supabase, embeddings, query_name='match_documents', table_name="documents")
32
+
33
+ models = ["llama-2"]
34
+
35
+ if openai_api_key:
36
+ models += ["gpt-3.5-turbo", "gpt-4"]
37
+
38
+ if anthropic_api_key:
39
+ models += ["claude-v1", "claude-v1.3",
40
+ "claude-instant-v1-100k", "claude-instant-v1.1-100k"]
41
+
42
+ # Set the theme
43
+ st.set_page_config(
44
+ page_title="meraKB",
45
+ layout="wide",
46
+ initial_sidebar_state="expanded",
47
+ )
48
+
49
+
50
+ st.title("🧠 meraKB - Your digital brain 🧠")
51
+ st.markdown("Store your knowledge in a vector store and chat with it.")
52
+ if self_hosted == "false":
53
+ st.markdown('**πŸ“’ Note: In the public demo, access to functionality is restricted. You can only use the GPT-3.5-turbo model and upload files up to 1Mb. To use more models and upload larger files, consider self-hosting meraKB.**')
54
+
55
+ st.markdown("---\n\n")
56
+
57
+ st.session_state["overused"] = False
58
+ if self_hosted == "false":
59
+ usage = get_usage_today(supabase)
60
+ if usage > st.secrets.usage_limit:
61
+ st.markdown(
62
+ f"<span style='color:red'>You have used {usage} tokens today, which is more than your daily limit of {st.secrets.usage_limit} tokens. Please come back later or consider self-hosting.</span>", unsafe_allow_html=True)
63
+ st.session_state["overused"] = True
64
+ else:
65
+ st.markdown(f"<span style='color:blue'>Usage today: {usage} tokens out of {st.secrets.usage_limit}</span>", unsafe_allow_html=True)
66
+ st.write("---")
67
+
68
+
69
+
70
+
71
+ # Initialize session state variables
72
+ if 'model' not in st.session_state:
73
+ st.session_state['model'] = "llama-2"
74
+ if 'temperature' not in st.session_state:
75
+ st.session_state['temperature'] = 0.1
76
+ if 'chunk_size' not in st.session_state:
77
+ st.session_state['chunk_size'] = 500
78
+ if 'chunk_overlap' not in st.session_state:
79
+ st.session_state['chunk_overlap'] = 0
80
+ if 'max_tokens' not in st.session_state:
81
+ st.session_state['max_tokens'] = 500
82
+
83
+ # Create a radio button for user to choose between adding knowledge or asking a question
84
+ user_choice = st.radio(
85
+ "Choose an action", ('Add Knowledge', 'Chat with your Brain', 'Forget', "Explore"))
86
+
87
+ st.markdown("---\n\n")
88
+
89
+ if user_choice == 'Add Knowledge':
90
+ # Display chunk size and overlap selection only when adding knowledge
91
+ st.sidebar.title("Configuration")
92
+ st.sidebar.markdown(
93
+ "Choose your chunk size and overlap for adding knowledge.")
94
+ st.session_state['chunk_size'] = st.sidebar.slider(
95
+ "Select Chunk Size", 100, 1000, st.session_state['chunk_size'], 50)
96
+ st.session_state['chunk_overlap'] = st.sidebar.slider(
97
+ "Select Chunk Overlap", 0, 100, st.session_state['chunk_overlap'], 10)
98
+
99
+ # Create two columns for the file uploader and URL uploader
100
+ col1, col2 = st.columns(2)
101
+
102
+ with col1:
103
+ file_uploader(supabase, vector_store)
104
+ with col2:
105
+ url_uploader(supabase, vector_store)
106
+ elif user_choice == 'Chat with your Brain':
107
+ # Display model and temperature selection only when asking questions
108
+ st.sidebar.title("Configuration")
109
+ st.sidebar.markdown(
110
+ "Choose your model and temperature for asking questions.")
111
+ if self_hosted != "false":
112
+ st.session_state['model'] = st.sidebar.selectbox(
113
+ "Select Model", models, index=(models).index(st.session_state['model']))
114
+ else:
115
+ st.sidebar.write("**Model**: gpt-3.5-turbo")
116
+ st.sidebar.write("**Self Host to unlock more models such as claude-v1 and GPT4**")
117
+ st.session_state['model'] = "gpt-3.5-turbo"
118
+ st.session_state['temperature'] = st.sidebar.slider(
119
+ "Select Temperature", 0.1, 1.0, st.session_state['temperature'], 0.1)
120
+ if st.secrets.self_hosted != "false":
121
+ st.session_state['max_tokens'] = st.sidebar.slider(
122
+ "Select Max Tokens", 500, 4000, st.session_state['max_tokens'], 500)
123
+ else:
124
+ st.session_state['max_tokens'] = 500
125
+
126
+ chat_with_doc(st.session_state['model'], vector_store, stats_db=supabase)
127
+ elif user_choice == 'Forget':
128
+ st.sidebar.title("Configuration")
129
+
130
+ brain(supabase)
131
+ elif user_choice == 'Explore':
132
+ st.sidebar.title("Configuration")
133
+ view_document(supabase)
134
+
135
+ st.markdown("---\n\n")