Spaces:
Running
Running
marcellopoliti
commited on
Commit
•
8e018ae
1
Parent(s):
611f226
refactor
Browse files- app.py +36 -44
- generate_kb.py +0 -1
- pages/create_knowledge_box.py +2 -1
- pages/delete_knowledge_box⚠️.py +2 -1
- pages/manage_knowledge_box.py +190 -145
- requirements.txt +2 -1
- retrieve_kb.py +5 -2
- utils.py +3 -3
app.py
CHANGED
@@ -1,27 +1,47 @@
|
|
1 |
-
|
2 |
-
from utils import get_chroma_client, get_embedding_function
|
3 |
|
4 |
-
# streamlit_app.py
|
5 |
-
|
6 |
-
import hmac
|
7 |
import streamlit as st
|
|
|
8 |
import os
|
9 |
-
import streamlit.components.v1 as components
|
10 |
-
from retrieve_kb import get_current_knowledge_bases, get_knowledge_base_information
|
11 |
-
import streamlit as st
|
12 |
import requests
|
13 |
-
import os
|
14 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
15 |
|
16 |
__import__("pysqlite3")
|
17 |
import sys
|
18 |
|
|
|
19 |
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
|
20 |
-
|
21 |
st.set_page_config(page_title="Hello", page_icon="👋", layout="wide")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
|
24 |
def show_sidebar():
|
|
|
25 |
# Sidebar
|
26 |
st.sidebar.header(("About"))
|
27 |
st.sidebar.markdown(
|
@@ -73,20 +93,10 @@ if not check_password():
|
|
73 |
|
74 |
# Main Streamlit app starts here
|
75 |
client = get_chroma_client()
|
76 |
-
default_embedding_function = get_embedding_function()
|
77 |
-
|
78 |
-
|
79 |
-
# Function to load a page
|
80 |
-
def load_page(page_name):
|
81 |
-
with open(f"pages/{page_name}", "r") as file:
|
82 |
-
exec(file.read(), globals())
|
83 |
-
|
84 |
-
|
85 |
-
client = get_chroma_client()
|
86 |
-
default_embedding_function = get_embedding_function()
|
87 |
-
|
88 |
show_sidebar()
|
89 |
|
|
|
90 |
col1, col2, col3 = st.columns((1, 4, 1))
|
91 |
with col2:
|
92 |
st.image("https://brianknows.org/brian_logo.png", width=300)
|
@@ -94,39 +104,21 @@ st.write("# Brian Knowledge Base System! 👋")
|
|
94 |
|
95 |
|
96 |
tab1, tab2 = st.tabs(["AskBrian", "BrianApp"])
|
|
|
|
|
97 |
with tab1:
|
98 |
st.markdown("## Ask Brian Anything")
|
99 |
kb_name = "public-knowledge-box"
|
100 |
|
101 |
-
load_dotenv()
|
102 |
-
api_key = os.getenv("BRIAN_API_KEY")
|
103 |
-
|
104 |
-
def send_post_request(prompt, kb):
|
105 |
-
url = " https://api.brianknows.org/api/v0/agent/knowledge"
|
106 |
-
data = {"prompt": prompt, "kb": kb}
|
107 |
-
headers = {
|
108 |
-
"Content-Type": "application/json",
|
109 |
-
"X-Brian-Api-Key": api_key, # Include the API key in the headers
|
110 |
-
}
|
111 |
-
|
112 |
-
response = requests.post(url, json=data, headers=headers)
|
113 |
-
|
114 |
-
if response.status_code == 200:
|
115 |
-
return response.json() # Returns the JSON response if successful
|
116 |
-
else:
|
117 |
-
return (
|
118 |
-
response.status_code,
|
119 |
-
response.text,
|
120 |
-
) # Returns the status code and error if not successful
|
121 |
-
|
122 |
# Example usage:
|
123 |
kbs = get_current_knowledge_bases(client=client)
|
124 |
kbs = (kb.name for kb in kbs)
|
125 |
kb_name = st.selectbox("Select knowledge box", kbs)
|
126 |
query = st.text_input(label="query")
|
127 |
if st.button("askbrian"):
|
128 |
-
result =
|
129 |
st.json(result)
|
130 |
|
|
|
131 |
with tab2:
|
132 |
components.iframe("https://www.brianknows.org/", height=650, scrolling=True)
|
|
|
1 |
+
"""Entry point of the Streamlit app"""
|
|
|
2 |
|
|
|
|
|
|
|
3 |
import streamlit as st
|
4 |
+
import hmac
|
5 |
import os
|
|
|
|
|
|
|
6 |
import requests
|
|
|
7 |
from dotenv import load_dotenv
|
8 |
+
import streamlit.components.v1 as components
|
9 |
+
|
10 |
+
from utils import get_chroma_client, get_embedding_function
|
11 |
+
from retrieve_kb import get_current_knowledge_bases
|
12 |
|
13 |
__import__("pysqlite3")
|
14 |
import sys
|
15 |
|
16 |
+
# settings
|
17 |
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
|
|
|
18 |
st.set_page_config(page_title="Hello", page_icon="👋", layout="wide")
|
19 |
+
load_dotenv()
|
20 |
+
brian_api_key = os.getenv("BRIAN_API_KEY")
|
21 |
+
openai_key = os.getenv("OPENAI_API_KEY")
|
22 |
+
|
23 |
+
|
24 |
+
def askbrian_request(prompt, kb, api_key, timeout=30):
    """Send a prompt to the Brian knowledge endpoint and return its answer.

    Args:
        prompt: Question text, sent as the ``prompt`` field of the JSON body.
        kb: Knowledge box name, sent as the ``kb`` field of the JSON body.
        api_key: Value placed in the ``X-Brian-Api-Key`` request header.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Defaults to 30 so a stalled endpoint cannot block the app forever.

    Returns:
        The decoded JSON body when the server replies with HTTP 200,
        otherwise a ``(status_code, response_text)`` tuple describing the
        failure.

    Raises:
        requests.exceptions.RequestException: Connection errors and timeouts
            raised by ``requests`` are not caught here and propagate to the
            caller.
    """
    url = "https://api.brianknows.org/api/v0/agent/knowledge"
    payload = {"prompt": prompt, "kb": kb}
    headers = {
        "Content-Type": "application/json",
        "X-Brian-Api-Key": api_key,  # the endpoint authenticates via this header
    }

    # requests has no default timeout; always pass one explicitly.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.json()
    # Non-200: hand back the status and raw body so the caller can show them.
    return (response.status_code, response.text)
|
41 |
|
42 |
|
43 |
def show_sidebar():
|
44 |
+
"""Shows sidebar with Brian info"""
|
45 |
# Sidebar
|
46 |
st.sidebar.header(("About"))
|
47 |
st.sidebar.markdown(
|
|
|
93 |
|
94 |
# Main Streamlit app starts here
|
95 |
client = get_chroma_client()
|
96 |
+
default_embedding_function = get_embedding_function(openai_key=openai_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
show_sidebar()
|
98 |
|
99 |
+
|
100 |
col1, col2, col3 = st.columns((1, 4, 1))
|
101 |
with col2:
|
102 |
st.image("https://brianknows.org/brian_logo.png", width=300)
|
|
|
104 |
|
105 |
|
106 |
tab1, tab2 = st.tabs(["AskBrian", "BrianApp"])
|
107 |
+
|
108 |
+
# Ask Brian Tab
|
109 |
with tab1:
|
110 |
st.markdown("## Ask Brian Anything")
|
111 |
kb_name = "public-knowledge-box"
|
112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
# Example usage:
|
114 |
kbs = get_current_knowledge_bases(client=client)
|
115 |
kbs = (kb.name for kb in kbs)
|
116 |
kb_name = st.selectbox("Select knowledge box", kbs)
|
117 |
query = st.text_input(label="query")
|
118 |
if st.button("askbrian"):
|
119 |
+
result = askbrian_request(query, kb_name, brian_api_key)
|
120 |
st.json(result)
|
121 |
|
122 |
+
# Brian App embedded Tab
|
123 |
with tab2:
|
124 |
components.iframe("https://www.brianknows.org/", height=650, scrolling=True)
|
generate_kb.py
CHANGED
@@ -13,7 +13,6 @@ from spellchecker import SpellChecker
|
|
13 |
|
14 |
load_dotenv()
|
15 |
openai_key = os.getenv("OPENAI_API_KEY")
|
16 |
-
openai_key = st.secrets["OPENAI_API_KEY"]
|
17 |
|
18 |
|
19 |
def clean_text(text):
|
|
|
13 |
|
14 |
load_dotenv()
|
15 |
openai_key = os.getenv("OPENAI_API_KEY")
|
|
|
16 |
|
17 |
|
18 |
def clean_text(text):
|
pages/create_knowledge_box.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
import streamlit as st
|
2 |
-
from app import client, default_embedding_function
|
3 |
import pandas as pd
|
4 |
from generate_kb import generate_knowledge_box_from_url
|
5 |
from utils import get_chroma_client
|
6 |
|
7 |
# Title of the app
|
8 |
st.title("Create a knowledge box from CSV file")
|
|
|
9 |
|
10 |
# File uploader widget
|
11 |
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
|
|
|
1 |
import streamlit as st
|
2 |
+
from app import client, default_embedding_function, show_sidebar
|
3 |
import pandas as pd
|
4 |
from generate_kb import generate_knowledge_box_from_url
|
5 |
from utils import get_chroma_client
|
6 |
|
7 |
# Title of the app
|
8 |
st.title("Create a knowledge box from CSV file")
|
9 |
+
show_sidebar()
|
10 |
|
11 |
# File uploader widget
|
12 |
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
|
pages/delete_knowledge_box⚠️.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1 |
import streamlit as st
|
2 |
from retrieve_kb import get_current_knowledge_bases
|
3 |
-
from app import client
|
4 |
|
5 |
|
6 |
st.title("Delete knowledge Base ☠️")
|
|
|
7 |
|
8 |
st.title("Get knowledge boxes")
|
9 |
if st.button("Get current knowledge bases"):
|
|
|
1 |
import streamlit as st
|
2 |
from retrieve_kb import get_current_knowledge_bases
|
3 |
+
from app import client, show_sidebar
|
4 |
|
5 |
|
6 |
st.title("Delete knowledge Base ☠️")
|
7 |
+
show_sidebar()
|
8 |
|
9 |
st.title("Get knowledge boxes")
|
10 |
if st.button("Get current knowledge bases"):
|
pages/manage_knowledge_box.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1 |
-
|
|
|
|
|
2 |
import streamlit as st
|
3 |
-
from retrieve_kb import get_current_knowledge_bases, get_knowledge_base_information
|
4 |
-
from generate_kb import add_links_to_knowledge_base
|
5 |
-
from app import client, default_embedding_function
|
6 |
import pandas as pd
|
7 |
from tempfile import NamedTemporaryFile
|
8 |
import os
|
@@ -11,136 +10,17 @@ from openai import OpenAI
|
|
11 |
import wave
|
12 |
from dotenv import load_dotenv
|
13 |
|
|
|
|
|
|
|
|
|
14 |
|
15 |
load_dotenv()
|
16 |
openai_key = os.getenv("OPENAI_API_KEY")
|
17 |
-
|
18 |
-
st.title("Manage collections")
|
19 |
-
kbs = get_current_knowledge_bases(client=client)
|
20 |
-
kbs = (kb.name for kb in kbs)
|
21 |
-
collection_name = st.selectbox("Select knowledge box", kbs)
|
22 |
-
info = {}
|
23 |
-
collection = None
|
24 |
-
|
25 |
-
if "df" not in st.session_state:
|
26 |
-
st.session_state["df"] = pd.DataFrame()
|
27 |
-
|
28 |
-
col1, col2 = st.columns(2)
|
29 |
-
|
30 |
-
if st.button("Get All"):
|
31 |
-
collection_info, coll, client = get_knowledge_base_information(
|
32 |
-
client=client,
|
33 |
-
embedding_function=default_embedding_function,
|
34 |
-
kb_name=collection_name,
|
35 |
-
)
|
36 |
-
st.session_state["collection"] = coll
|
37 |
-
st.session_state["client"] = client
|
38 |
-
collection = coll
|
39 |
-
# st.write(collection_info)
|
40 |
-
df = pd.DataFrame.from_records(collection_info)
|
41 |
-
df["source"] = df["metadatas"].apply(lambda x: x.get("source", "unkown"))
|
42 |
-
df["title"] = df["metadatas"].apply(lambda x: x.get("title", "unkown"))
|
43 |
-
df = df[["documents", "source", "title", "ids"]]
|
44 |
-
st.session_state["df"] = df
|
45 |
|
46 |
|
47 |
-
|
48 |
-
st.dataframe(st.session_state["df"], width=3_000)
|
49 |
-
unique_df = st.session_state["df"]["source"].unique()
|
50 |
-
st.text(f"unique urls: {len(unique_df)}")
|
51 |
-
st.dataframe(unique_df)
|
52 |
-
|
53 |
-
#############################
|
54 |
-
#### REMOVE A SPLIT #########
|
55 |
-
#############################
|
56 |
-
st.header("Remove a split")
|
57 |
-
id = st.text_input("Insert a split id")
|
58 |
-
if st.button("Remove Id from collection"):
|
59 |
-
if id in st.session_state["df"]["ids"].values.tolist():
|
60 |
-
res = st.session_state["collection"].delete(ids=[f"{id}"])
|
61 |
-
st.success(f"id {id} deleted")
|
62 |
-
else:
|
63 |
-
st.error(f"id {id} not in kb")
|
64 |
-
|
65 |
-
|
66 |
-
#############################
|
67 |
-
#### REMOVE URL ############
|
68 |
-
#############################
|
69 |
-
st.header("Remove url from collection")
|
70 |
-
url = st.text_input("remove url")
|
71 |
-
if st.button("Remove url from collection"):
|
72 |
-
try:
|
73 |
-
ids = st.session_state["collection"].get(where={"source": url})["ids"]
|
74 |
-
st.session_state["collection"].delete(ids=ids)
|
75 |
-
st.success("deleted")
|
76 |
-
except Exception as e:
|
77 |
-
st.error(str(e))
|
78 |
-
|
79 |
-
|
80 |
-
#############################
|
81 |
-
########### ADD URL #########
|
82 |
-
#############################
|
83 |
-
st.header("Add url to existing collection")
|
84 |
-
url_text = st.text_input("Insert a url link")
|
85 |
-
if st.button("add url to collection"):
|
86 |
-
urls = [url_text] # put in a list even if only one
|
87 |
-
res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
|
88 |
-
st.write(res)
|
89 |
-
|
90 |
-
st.header("Add pdf to existing collection")
|
91 |
-
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
|
92 |
-
pdf_optional_link = st.text_input(
|
93 |
-
"Insert a URL link you want to associate with the pdf"
|
94 |
-
)
|
95 |
-
pdf_title = st.text_input("This title will be displayed as a resource in ask brian")
|
96 |
-
if st.button("add pdf"):
|
97 |
-
# Create a temporary file
|
98 |
-
with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
|
99 |
-
# Write the uploaded PDF to the temporary file
|
100 |
-
tmp_file.write(uploaded_file.getvalue())
|
101 |
-
tmp_path = tmp_file.name
|
102 |
-
print("PATH: ", tmp_path)
|
103 |
-
urls = [tmp_path]
|
104 |
-
res = add_links_to_knowledge_base(
|
105 |
-
client=client,
|
106 |
-
kb_name=collection_name,
|
107 |
-
urls=urls,
|
108 |
-
pdf_optional_link=pdf_optional_link,
|
109 |
-
pdf_title=pdf_title,
|
110 |
-
)
|
111 |
-
st.write(res)
|
112 |
-
# Clean up: delete the temporary file
|
113 |
-
os.remove(tmp_path)
|
114 |
-
|
115 |
-
#############################
|
116 |
-
########### ADD CSV #########
|
117 |
-
#############################
|
118 |
-
st.header("Add csv to existing collection")
|
119 |
-
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
|
120 |
-
df = None
|
121 |
-
|
122 |
-
if uploaded_file is not None:
|
123 |
-
try:
|
124 |
-
new_df = pd.read_csv(uploaded_file)
|
125 |
-
st.write("DataFrame:")
|
126 |
-
st.write(new_df)
|
127 |
-
except Exception as e:
|
128 |
-
st.error(str(e))
|
129 |
-
if st.button("add csv urls to collection"):
|
130 |
-
urls = new_df.values.tolist()
|
131 |
-
st.write(urls)
|
132 |
-
res = add_links_to_knowledge_base(
|
133 |
-
client=client, kb_name=collection_name, urls=urls
|
134 |
-
)
|
135 |
-
st.write(res)
|
136 |
-
|
137 |
-
|
138 |
-
#############################
|
139 |
-
########## YOUTUBE ##########
|
140 |
-
#############################
|
141 |
-
|
142 |
-
|
143 |
-
def transcribe_audio(audio_path, chunk_length=10000):
|
144 |
"""
|
145 |
Transcribe audio by breaking it into chunks using wave and numpy.
|
146 |
:param audio_path: Path to the audio file (e.g., "video.wav").
|
@@ -148,7 +28,7 @@ def transcribe_audio(audio_path, chunk_length=10000):
|
|
148 |
:return: Full transcription of the audio file.
|
149 |
"""
|
150 |
# Open the wave file
|
151 |
-
client = OpenAI(api_key=
|
152 |
|
153 |
with wave.open(audio_path, "rb") as audio:
|
154 |
frame_rate = audio.getframerate()
|
@@ -218,7 +98,7 @@ def download_and_transcribe_youtube(youtube_url):
|
|
218 |
video_title = info_dict.get("title", None)
|
219 |
|
220 |
# audio_file = open("video.wav", "rb")
|
221 |
-
text = transcribe_audio("video.wav")
|
222 |
f_out_path = f"{video_title}.txt"
|
223 |
with open(f"{video_title}.txt", "w") as f_out:
|
224 |
f_out.write(text)
|
@@ -235,19 +115,184 @@ def download_and_transcribe_youtube(youtube_url):
|
|
235 |
os.remove("temp_chunk.wav")
|
236 |
|
237 |
|
238 |
-
|
239 |
-
st.
|
240 |
-
|
241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
)
|
243 |
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Page to manage kbs"""
|
2 |
+
|
3 |
+
from __future__ import unicode_literals # this should always be the first import
|
4 |
import streamlit as st
|
|
|
|
|
|
|
5 |
import pandas as pd
|
6 |
from tempfile import NamedTemporaryFile
|
7 |
import os
|
|
|
10 |
import wave
|
11 |
from dotenv import load_dotenv
|
12 |
|
13 |
+
from retrieve_kb import get_current_knowledge_bases, get_knowledge_base_information
|
14 |
+
from generate_kb import add_links_to_knowledge_base
|
15 |
+
from app import client, default_embedding_function, show_sidebar
|
16 |
+
|
17 |
|
18 |
load_dotenv()
|
19 |
openai_key = os.getenv("OPENAI_API_KEY")
|
20 |
+
show_sidebar()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
|
23 |
+
def transcribe_audio(audio_path, openai_key, chunk_length=10000):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
"""
|
25 |
Transcribe audio by breaking it into chunks using wave and numpy.
|
26 |
:param audio_path: Path to the audio file (e.g., "video.wav").
|
|
|
28 |
:return: Full transcription of the audio file.
|
29 |
"""
|
30 |
# Open the wave file
|
31 |
+
client = OpenAI(api_key=openai_key)
|
32 |
|
33 |
with wave.open(audio_path, "rb") as audio:
|
34 |
frame_rate = audio.getframerate()
|
|
|
98 |
video_title = info_dict.get("title", None)
|
99 |
|
100 |
# audio_file = open("video.wav", "rb")
|
101 |
+
text = transcribe_audio(audio_path="video.wav", openai_key=openai_key)
|
102 |
f_out_path = f"{video_title}.txt"
|
103 |
with open(f"{video_title}.txt", "w") as f_out:
|
104 |
f_out.write(text)
|
|
|
115 |
os.remove("temp_chunk.wav")
|
116 |
|
117 |
|
118 |
+
if "url_list" not in st.session_state:
|
119 |
+
st.session_state["url_list"] = []
|
120 |
+
|
121 |
+
|
122 |
+
def list_manager():
    """Render the widgets used to build up the list of URLs kept in session state.

    Reads and updates ``st.session_state["url_list"]`` (the key must already
    exist when this runs) and returns nothing.
    """
    # NOTE(review): the nesting of the statements below was reconstructed from a
    # flattened view of the file — confirm it against the original indentation.

    def add_element():
        # Append the current text-input value to the stored list; warn the
        # user instead when the input is empty.
        if len(user_input) > 0:
            st.session_state["url_list"] += [user_input]
        else:
            st.warning("Enter text")

    # User-facing notice (Italian): "There is a bug!!! Click add twice!"
    st.text("C'è un bug!!! Cliccare su add due volte!")
    with st.expander("Add urls"):
        user_input = st.text_input("Enter a url")
        add_button = st.button("Add", key="add_button")
        col1, col2 = st.columns((2))
        with col1:
            if add_button:
                add_element()
        with col2:
            # Empty the stored list when "reset" is clicked.
            if st.button("reset"):
                st.session_state["url_list"] = []
        # Show the URLs collected so far.
        st.write(st.session_state["url_list"])
|
141 |
+
|
142 |
+
|
143 |
+
st.title("Manage collections")
|
144 |
+
kbs = get_current_knowledge_bases(client=client)
|
145 |
+
kbs = (kb.name for kb in kbs)
|
146 |
+
collection_name = st.selectbox("Select knowledge box", kbs)
|
147 |
+
info = {}
|
148 |
+
collection = None
|
149 |
+
|
150 |
+
|
151 |
+
if "df" not in st.session_state:
|
152 |
+
st.session_state["df"] = pd.DataFrame()
|
153 |
+
|
154 |
+
col1, col2 = st.columns(2)
|
155 |
+
|
156 |
+
if st.button("Get All"):
|
157 |
+
collection_info, coll, client = get_knowledge_base_information(
|
158 |
+
client=client,
|
159 |
+
embedding_function=default_embedding_function,
|
160 |
+
kb_name=collection_name,
|
161 |
+
)
|
162 |
+
st.session_state["collection"] = coll
|
163 |
+
st.session_state["client"] = client
|
164 |
+
collection = coll
|
165 |
+
|
166 |
+
df = pd.DataFrame.from_records(collection_info)
|
167 |
+
df["source"] = df["metadatas"].apply(lambda x: x.get("source", "unkown"))
|
168 |
+
df["title"] = df["metadatas"].apply(lambda x: x.get("title", "unkown"))
|
169 |
+
df = df[["documents", "source", "title", "ids"]]
|
170 |
+
st.session_state["df"] = df
|
171 |
+
|
172 |
+
if len(st.session_state["df"]) != 0:
|
173 |
+
st.dataframe(st.session_state["df"], width=3_000)
|
174 |
+
unique_df = st.session_state["df"]["source"].unique()
|
175 |
+
st.text(f"unique urls: {len(unique_df)}")
|
176 |
+
st.dataframe(unique_df)
|
177 |
+
else:
|
178 |
+
st.warning(f"{collection_name} KB is empty")
|
179 |
+
|
180 |
+
|
181 |
+
tab1, tab2, tab3, tab4, tab5 = st.tabs(
|
182 |
+
["Remove", "Add URL", "Add CSV", "Add PDF", "Add Youtube"]
|
183 |
)
|
184 |
|
185 |
+
# remove stuff tab
|
186 |
+
with tab1:
|
187 |
+
# remove a split
|
188 |
+
st.header("Remove a split")
|
189 |
+
id = st.text_input("Insert a split id")
|
190 |
+
if st.button("Remove Id from collection"):
|
191 |
+
try:
|
192 |
+
if id in st.session_state["df"]["ids"].values.tolist():
|
193 |
+
res = st.session_state["collection"].delete(ids=[f"{id}"])
|
194 |
+
st.success(f"id {id} deleted")
|
195 |
+
else:
|
196 |
+
st.error(f"id {id} not in kb")
|
197 |
+
except Exception as e:
|
198 |
+
st.error(f"{str(e)}")
|
199 |
+
|
200 |
+
# REMOVE URL
|
201 |
+
st.header("Remove url from collection")
|
202 |
+
url = st.text_input("remove url")
|
203 |
+
if st.button("Remove url from collection"):
|
204 |
+
try:
|
205 |
+
ids = st.session_state["collection"].get(where={"source": url})["ids"]
|
206 |
+
st.session_state["collection"].delete(ids=ids)
|
207 |
+
st.success("deleted")
|
208 |
+
except Exception as e:
|
209 |
+
st.error(str(e))
|
210 |
+
|
211 |
+
|
212 |
+
# ADD URL
|
213 |
+
with tab2:
|
214 |
+
st.header("Add url to existing collection")
|
215 |
+
url_text = st.text_input(
|
216 |
+
"Insert a url link",
|
217 |
+
help="This should be text stored in a webpage like wikipedia. NB notion pages are not supported yet!",
|
218 |
+
)
|
219 |
+
if st.button("add url to collection"):
|
220 |
+
urls = [url_text] # put in a list even if only one
|
221 |
+
res = add_links_to_knowledge_base(
|
222 |
+
client=client, kb_name=collection_name, urls=urls
|
223 |
+
)
|
224 |
+
st.write(res)
|
225 |
+
|
226 |
+
|
227 |
+
# ADD CSV
|
228 |
+
with tab3:
|
229 |
+
list_manager()
|
230 |
+
|
231 |
+
# st.header("Add csv to existing collection")
|
232 |
+
# uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
|
233 |
+
# df = None
|
234 |
+
# if uploaded_file is not None:
|
235 |
+
# try:
|
236 |
+
# new_df = pd.read_csv(uploaded_file)
|
237 |
+
# st.write("DataFrame:")
|
238 |
+
# st.write(new_df)
|
239 |
+
# except Exception as e:
|
240 |
+
# st.error(str(e))
|
241 |
+
# if st.button("add csv urls to collection"):
|
242 |
+
# urls = new_df.values.tolist()
|
243 |
+
# st.write(urls)
|
244 |
+
if st.button("add csv urls to collection"):
|
245 |
+
res = add_links_to_knowledge_base(
|
246 |
+
client=client, kb_name=collection_name, urls=st.session_state["url_list"]
|
247 |
+
)
|
248 |
+
st.write(res)
|
249 |
+
|
250 |
+
|
251 |
+
# Add PDF
|
252 |
+
with tab4:
|
253 |
+
st.header("Add pdf to existing collection")
|
254 |
+
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
|
255 |
+
pdf_optional_link = st.text_input(
|
256 |
+
"Insert a URL link you want to associate with the pdf"
|
257 |
+
)
|
258 |
+
pdf_title = st.text_input("This title will be displayed as a resource in ask brian")
|
259 |
+
if st.button("add pdf"):
|
260 |
+
# Create a temporary file
|
261 |
+
with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
|
262 |
+
# Write the uploaded PDF to the temporary file
|
263 |
+
tmp_file.write(uploaded_file.getvalue())
|
264 |
+
tmp_path = tmp_file.name
|
265 |
+
print("PATH: ", tmp_path)
|
266 |
+
urls = [tmp_path]
|
267 |
+
res = add_links_to_knowledge_base(
|
268 |
+
client=client,
|
269 |
+
kb_name=collection_name,
|
270 |
+
urls=urls,
|
271 |
+
pdf_optional_link=pdf_optional_link,
|
272 |
+
pdf_title=pdf_title,
|
273 |
+
)
|
274 |
+
st.write(res)
|
275 |
+
# Clean up: delete the temporary file
|
276 |
+
os.remove(tmp_path)
|
277 |
+
|
278 |
+
|
279 |
+
# Add YOUTUBE
|
280 |
+
with tab5:
|
281 |
+
st.header("Add youtube video to collection")
|
282 |
+
st.image(
|
283 |
+
"",
|
284 |
+
width=200, # Manually Adjust the width of the image as per requirement
|
285 |
+
)
|
286 |
+
|
287 |
+
video_url = st.text_input("Youtube video url")
|
288 |
+
st.text(
|
289 |
+
"Aggiungere il video puo impiegare un bel pò. Avvia e vatti a fare una canna"
|
290 |
+
)
|
291 |
+
if st.button("Add video"):
|
292 |
+
# Download and transcribe the video from the given URL
|
293 |
+
# Any failure is reported to the user in the except branch below
|
294 |
+
try:
|
295 |
+
download_and_transcribe_youtube(video_url)
|
296 |
+
st.success("Video Added")
|
297 |
+
except Exception as e:
|
298 |
+
st.error(f"{str(e)}")
|
requirements.txt
CHANGED
@@ -14,4 +14,5 @@ librosa
|
|
14 |
future
|
15 |
yt-dlp
|
16 |
pysqlite3>=0.5.2
|
17 |
-
pyspellchecker>=0.8.1
|
|
|
|
14 |
future
|
15 |
yt-dlp
|
16 |
pysqlite3>=0.5.2
|
17 |
+
pyspellchecker>=0.8.1
|
18 |
+
beautifulsoup4>=4.12.2
|
retrieve_kb.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
from fastapi import APIRouter
|
2 |
from utils import get_chroma_client, get_embedding_function
|
|
|
|
|
3 |
|
4 |
-
|
|
|
5 |
router = APIRouter()
|
6 |
-
default_embedding_function = get_embedding_function()
|
7 |
|
8 |
|
9 |
def get_current_knowledge_bases(client):
|
|
|
1 |
from fastapi import APIRouter
|
2 |
from utils import get_chroma_client, get_embedding_function
|
3 |
+
import os
|
4 |
+
from dotenv import load_dotenv
|
5 |
|
6 |
+
load_dotenv()
|
7 |
+
openai_key = os.getenv("OPENAI_API_KEY")
|
8 |
router = APIRouter()
|
9 |
+
default_embedding_function = get_embedding_function(openai_key=openai_key)
|
10 |
|
11 |
|
12 |
def get_current_knowledge_bases(client):
|
utils.py
CHANGED
@@ -3,10 +3,10 @@ from chromadb.config import Settings
|
|
3 |
import chromadb.utils.embedding_functions as embedding_functions
|
4 |
from dotenv import load_dotenv
|
5 |
import streamlit as st
|
|
|
6 |
|
7 |
load_dotenv()
|
8 |
-
|
9 |
-
openai_key = st.secrets["OPENAI_API_KEY"]
|
10 |
|
11 |
|
12 |
def get_chroma_client(
|
@@ -25,7 +25,7 @@ def get_chroma_client(
|
|
25 |
return chroma_client
|
26 |
|
27 |
|
28 |
-
def get_embedding_function(model_name="text-embedding-ada-002"):
|
29 |
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
|
30 |
api_key=openai_key, model_name=model_name
|
31 |
)
|
|
|
3 |
import chromadb.utils.embedding_functions as embedding_functions
|
4 |
from dotenv import load_dotenv
|
5 |
import streamlit as st
|
6 |
+
import os
|
7 |
|
8 |
load_dotenv()
|
9 |
+
openai_key = os.getenv("OPENAI_API_KEY")
|
|
|
10 |
|
11 |
|
12 |
def get_chroma_client(
|
|
|
25 |
return chroma_client
|
26 |
|
27 |
|
28 |
+
def get_embedding_function(openai_key, model_name="text-embedding-ada-002"):
|
29 |
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
|
30 |
api_key=openai_key, model_name=model_name
|
31 |
)
|