Louis-François Bouchard and Omar Solano committed
Commit 0f06abd • 1 parent: 5c3db13
Advanced rag course update (#44)
* Openai activeloop data (#37)
* adding openai and activeloop data
* fixing issues with names
* concurrency
* black
* black
* revert to gradio3.50 for concurrency (see the queue sketch after this message)
---------
Co-authored-by: Omar Solano <omar@designstripe.com>
* ensure gradio version for HF
* Updates to files
* Push to advanced rag course
* Formatting
* formatting
---------
Co-authored-by: Omar Solano <omar@designstripe.com>
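The concurrency items above refer to how the Space passes CONCURRENCY_COUNT (visible in the app.py hunk further down) into Gradio's request queue. A minimal sketch of that wiring under Gradio 3.50, with a placeholder handler standing in for the real app code:

import os
import gradio as gr

# Hypothetical stand-in for the actual app; only the queue wiring below reflects
# what the commit is about.
CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))

def answer(question):
    return f"echo: {question}"  # placeholder for the real RAG pipeline

with gr.Blocks() as demo:
    question = gr.Textbox(label="Question")
    reply = gr.Textbox(label="Answer")
    question.submit(answer, question, reply)

# Gradio 3.x accepts concurrency_count directly on the queue; Gradio 4 moved to
# per-event concurrency limits, which is presumably why gradio==3.50.2 stays pinned.
demo.queue(concurrency_count=CONCURRENCY_COUNT)
demo.launch()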
- .gitignore +3 -1
- .vscode/launch.json +18 -0
- app.py +2 -0
- data/markdown_parser.py +8 -7
- data/process_csvs_store.py +106 -76
- data/scrapper_to_csv.py +82 -0
- data/tmp.py +121 -0
- requirements.txt +1 -1
.gitignore
CHANGED
@@ -162,7 +162,9 @@ cython_debug/
 *.zip
 deeplake_store/
 .DS_Store
-
+__pycache__/
+.env
+env/
 .vscode/
 evals/
 local_dataset/
.vscode/launch.json
ADDED
@@ -0,0 +1,18 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: App",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/data/process_csvs_store.py",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "python": "/Users/louis/miniconda3/envs/buster/bin/python",
+            "env": {
+                "ACTIVELOOP_TOKEN": "eyJhbGciOiJIUzUxMiIsImlhdCI6MTY5Njc4MjIyMiwiZXhwIjoxNzI4NDA0NTk1fQ.eyJpZCI6Im9tYXJzb2xhbm8ifQ.BlvUc350-boJv4hnN67ksMgGSy7x4nAWcBO7R5RZ22Cw0ifR2AOmmM-RJutBWIspQDiegs03rJxXCCfprc6O_A",
+                "OPENAI_API_KEY": "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"
+            }
+        }
+    ]
+}
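The launch configuration above hard-codes an Activeloop token and an OpenAI key. Since the same commit adds .env to .gitignore, a hypothetical alternative is to keep those values in a git-ignored .env file and load them at startup; the python-dotenv dependency below is an assumption, not something listed in requirements.txt.

import os
from dotenv import load_dotenv  # assumed extra dependency (python-dotenv)

load_dotenv()  # reads key=value pairs from a local .env file

ACTIVELOOP_TOKEN = os.environ["ACTIVELOOP_TOKEN"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]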
app.py
CHANGED
@@ -23,6 +23,7 @@ CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
 
 AVAILABLE_SOURCES_UI = [
     "Gen AI 360: LLMs",
+    "Gen AU 360: Advanced RAG",
     "Gen AI 360: LangChain",
     "Towards AI Blog",
     "Activeloop Docs",
@@ -35,6 +36,7 @@ AVAILABLE_SOURCES_UI = [
 AVAILABLE_SOURCES = [
     "llm_course",
     "langchain_course",
+    "advanced_rag_course",
     "towards_ai",
     "activeloop",
     "hf_transformers",
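AVAILABLE_SOURCES_UI and AVAILABLE_SOURCES appear to be parallel lists, so the new course entry is inserted at the same index in both. An illustrative sketch of the implied label-to-id mapping (not code from app.py):

# Illustrative only: the n-th UI label is assumed to correspond to the n-th source id.
AVAILABLE_SOURCES_UI = ["Gen AI 360: LLMs", "Gen AU 360: Advanced RAG", "Gen AI 360: LangChain"]
AVAILABLE_SOURCES = ["llm_course", "advanced_rag_course", "langchain_course"]

ui_to_source = dict(zip(AVAILABLE_SOURCES_UI, AVAILABLE_SOURCES))
print(ui_to_source["Gen AU 360: Advanced RAG"])  # -> advanced_rag_course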
data/markdown_parser.py
CHANGED
@@ -50,12 +50,13 @@ def get_title_link_from_md_title(md_title: str, title_link_data: dict):
         return data["title"], data["link"]
     # default back to course link if not found...
     print("\nNot found: ", md_title)
-    return md_title, "https://learn.activeloop.ai/courses/
+    return md_title, "https://learn.activeloop.ai/courses/rag/"
 
 
 if __name__ == "__main__":
-    folder_path = "/path/to/folder/with/md_content/"
-
+    # folder_path = "/path/to/folder/with/md_content/"
+    # This path is the link to the course folder with all md files
+    folder_path = "/Users/louis/Downloads/rag_course_advanced"
     # folder_path = "/Users/louis/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024/LangChain & Vector Databases in Production 92657e0d65da4201bfdd6db915a4eb9f"
     md_files = find_md_files(folder_path)
 
@@ -76,8 +77,8 @@ if __name__ == "__main__":
 
     # with open("data/title_link_langchaincourse.json", "r") as f:
    #     title_link_data = json.load(f)
-
-    with open("/Users/louis/Downloads/
+    # This file contains a json with only two column, "title, link", to fit the title of the md files and link on the course platform.
+    with open("/Users/louis/Downloads/output.json", "r") as f:
         title_link_data = json.load(f)
 
     for md_file in tqdm(md_files):
@@ -102,7 +103,7 @@ if __name__ == "__main__":
         chunk = {
             "title": title,
             "content": headers + "\n" + substring,
-            "source": "
+            "source": "advanced_rag_course",
             "url": link,
         }
         chunks.append(chunk)
@@ -112,4 +113,4 @@ if __name__ == "__main__":
     df = drop_outlier_chunks(df, max_tokens_by_chunk=2000)
 
     print(f"Exported {len(df)} chunks from {len(md_files)} articles.")
-    df.to_csv("
+    df.to_csv("advanced_rag_course.csv")
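The output.json read above only needs to map each markdown title to its page on the course platform. A hypothetical example of the shape the parser appears to expect, inferred from get_title_link_from_md_title(md_title, title_link_data: dict) returning data["title"] and data["link"]; the entries are invented:

# Invented example data; only the "title"/"link" field names come from the parser above.
title_link_data = {
    "Query Expansion": {
        "title": "Query Expansion",
        "link": "https://learn.activeloop.ai/courses/rag/query-expansion",
    },
}

data = title_link_data["Query Expansion"]
print(data["title"], data["link"])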
data/process_csvs_store.py
CHANGED
@@ -2,26 +2,35 @@ import pandas as pd
 import time
 import os
 from buster.documents_manager import DeepLakeDocumentsManager
+from deeplake.core.vectorstore import VectorStore
+from langchain.embeddings.openai import OpenAIEmbeddings
+import numpy as np
 
-
+# from openai import OpenAI
+
+DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
 DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
 
-df1 = pd.read_csv("./data/
-df2 = pd.read_csv("./data/hf_transformers.csv")
-df3 = pd.read_csv("./data/langchain_course.csv")
-df4 = pd.read_csv("./data/filtered_tai_v2.csv")
-df5 = pd.read_csv("./data/wiki.csv") # , encoding="ISO-8859-1")
-df6 = pd.read_csv("./data/openai.csv")
-
-df8 = pd.read_csv("./data/langchain_docs.csv")
+# df1 = pd.read_csv("./data/jobs.csv", encoding='ISO-8859-1') # or 'latin1' or 'cp1252'
+# df2 = pd.read_csv("./data/hf_transformers.csv")
+# df3 = pd.read_csv("./data/langchain_course.csv")
+# df4 = pd.read_csv("./data/filtered_tai_v2.csv")
+# df5 = pd.read_csv("./data/wiki.csv") # , encoding="ISO-8859-1")
+# df6 = pd.read_csv("./data/openai.csv")
+df1 = pd.read_csv("./advanced_rag_course.csv")
 
-print(len(df1), len(df2), len(df3), len(df4), len(df5), len(df6)
+# print(len(df1), len(df2), len(df3), len(df4), len(df5), len(df6))
+print(len(df1))
 
 dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
+# dataset_path = f"{DEEPLAKE_DATASET}"
+# because wrong name
+# df1['content'] = df1['cleaned_description']
+# print(np.sum(df1.content.isna()), len(df1) )
 
 dm = DeepLakeDocumentsManager(
     vector_store_path=dataset_path,
-    overwrite=
+    overwrite=False,
     required_columns=["url", "content", "source", "title"],
 )
 
@@ -30,77 +39,98 @@ dm.batch_add(
     batch_size=3000,
     min_time_interval=60,
     num_workers=32,
-    csv_embeddings_filename="embeddings.csv",
-    csv_errors_filename="tmp.csv",
     csv_overwrite=False,
 )
 
-dm.batch_add(
-    df=df2,
-    batch_size=3000,
-    min_time_interval=60,
-    num_workers=32,
-    csv_embeddings_filename="embeddings.csv",
-    csv_errors_filename="tmp.csv",
-    csv_overwrite=False,
-)
+# dm.batch_add(
+#     df=df2,
+#     batch_size=3000,
+#     min_time_interval=60,
+#     num_workers=32,
+#     csv_embeddings_filename="embeddings.csv",
+#     csv_errors_filename="tmp.csv",
+#     csv_overwrite=False,
+# )
 
-dm.batch_add(
-    df=df3,
-    batch_size=3000,
-    min_time_interval=60,
-    num_workers=32,
-    csv_embeddings_filename="embeddings.csv",
-    csv_errors_filename="tmp.csv",
-    csv_overwrite=False,
-)
+# dm.batch_add(
+#     df=df3,
+#     batch_size=3000,
+#     min_time_interval=60,
+#     num_workers=32,
+#     csv_embeddings_filename="embeddings.csv",
+#     csv_errors_filename="tmp.csv",
+#     csv_overwrite=False,
+# )
 
-dm.batch_add(
-    df=df4,
-    batch_size=3000,
-    min_time_interval=60,
-    num_workers=32,
-    csv_embeddings_filename="embeddings.csv",
-    csv_errors_filename="tmp.csv",
-    csv_overwrite=False,
-)
+# dm.batch_add(
+#     df=df4,
+#     batch_size=3000,
+#     min_time_interval=60,
+#     num_workers=32,
+#     csv_embeddings_filename="embeddings.csv",
+#     csv_errors_filename="tmp.csv",
+#     csv_overwrite=False,
+# )
 
-dm.batch_add(
-    df=df5,
-    batch_size=3000,
-    min_time_interval=60,
-    num_workers=32,
-    csv_embeddings_filename="embeddings.csv",
-    csv_errors_filename="tmp.csv",
-    csv_overwrite=False,
-)
+# dm.batch_add(
+#     df=df5,
+#     batch_size=3000,
+#     min_time_interval=60,
+#     num_workers=32,
+#     csv_embeddings_filename="embeddings.csv",
+#     csv_errors_filename="tmp.csv",
+#     csv_overwrite=False,
+# )
 
-dm.batch_add(
-    df=df6,
-    batch_size=3000,
-    min_time_interval=60,
-    num_workers=32,
-    csv_embeddings_filename="embeddings.csv",
-    csv_overwrite=False,
-    csv_errors_filename="tmp.csv",
-)
+# dm.batch_add(
+#     df=df6,
+#     batch_size=3000,
+#     min_time_interval=60,
+#     num_workers=32,
+#     csv_embeddings_filename="embeddings.csv",
+#     csv_overwrite=False,
+#     csv_errors_filename="tmp.csv",
+# )
 
-dm.batch_add(
-    df=df7,
-    batch_size=3000,
-    min_time_interval=60,
-    num_workers=32,
-    csv_embeddings_filename="embeddings.csv",
-    csv_errors_filename="tmp.csv",
-    csv_overwrite=False,
-)
+# dm.batch_add(
+#     df=df7,
+#     batch_size=3000,
+#     min_time_interval=60,
+#     num_workers=32,
+#     csv_embeddings_filename="embeddings.csv",
+#     csv_errors_filename="tmp.csv",
+#     csv_overwrite=False,
+# )
 
-
-
-
-
-
-
-
-
-
+
+# client = OpenAI()
+
+# openai_embeddings = OpenAIEmbeddings()
+# def get_embedding(text, model="text-embedding-ada-002"):
+#     # Call to OpenAI's API to create the embedding
+#     response = client.embeddings.create(input=[text], model=model)
+
+#     # Extract the embedding data from the response
+#     embedding = response.data[0].embedding
+
+#     # Convert the ndarray to a list
+#     if isinstance(embedding, np.ndarray):
+#         embedding = embedding.tolist()
+
+#     return embedding
+
+
+# vs = VectorStore(
+#     dataset_path,
+#     runtime='compute_engine',
+#     token=os.environ['ACTIVELOOP_TOKEN']
+# )
+
+# data = vs.search(query = "select * where shape(embedding)[0] == 0")
+
+# vs.update_embedding(embedding_source_tensor = "text",
+#                     query = "select * where shape(embedding)[0] == 0",
+#                     exec_option = "compute_engine",
+#                     embedding_function=get_embedding)
+
+# data2 = vs.search(query = "select * where shape(embedding)[0] == 0")
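With overwrite=False, the manager above appends the new advanced_rag_course rows to the existing hub://towards_ai/ai-tutor-dataset store rather than recreating it; data/tmp.py below keeps an overwrite=True path for a full rebuild from every source CSV. A hypothetical pre-flight check before running either script, using the variable names from launch.json and the os.getenv calls above:

# Hypothetical helper, not part of the repo: fail fast if credentials are missing
# before starting a long DeepLake upload.
import os

for name in ("ACTIVELOOP_TOKEN", "OPENAI_API_KEY"):
    if not os.getenv(name):
        raise SystemExit(f"{name} is not set; export it or put it in your .env first.")

org = os.getenv("DEEPLAKE_ORG", "towards_ai")
dataset = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
print(f"Will write to hub://{org}/{dataset}")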
data/scrapper_to_csv.py
ADDED
@@ -0,0 +1,82 @@
+import pandas as pd
+import re
+
+
+def parse_markdown_file(file_path):
+    entries = []
+    with open(file_path, "r", encoding="utf-8") as file:
+        current_url, current_title, current_content = "", "", ""
+        inside_page = False
+
+        for line in file:
+            if line.strip() == "--":  # Check for page separator
+                if inside_page:
+                    # Process the previous page
+                    process_content(
+                        entries, current_url, current_title, current_content
+                    )
+                    current_content = ""
+
+                inside_page = True
+                # Read URL and title
+                current_url = next(file).strip().split(" ", 1)[1]
+                current_title = (
+                    next(file).strip().split(" ", 1)[1].replace("\n", " ")
+                )  # Replace new lines in title
+                # Skip the next two lines (description and keywords)
+                next(file)
+                next(file)
+                # print(f"Detected Page: Title - {current_title}, URL - {current_url}") # Debugging
+
+            elif inside_page:
+                current_content += line
+
+        if inside_page:
+            process_content(entries, current_url, current_title, current_content)
+
+    df = pd.DataFrame(entries)
+    return df
+
+
+def process_content(entries, url, title, content):
+    # Regular expression to match markdown headers
+    header_pattern = re.compile(r"^## (.+)$", re.MULTILINE)
+
+    # Split the content into sections based on headers
+    sections = re.split(header_pattern, content)
+    section_title = "Main"  # Default section title for content before the first header
+
+    # Initial content before the first header (if any)
+    if not sections[0].startswith("##") and sections[0].strip():
+        add_content_section(entries, title, url, "Main", sections[0])
+
+    # Process each section
+    for i in range(1, len(sections), 2):
+        section_header = sections[i].strip()
+        section_text = (
+            sections[i + 1].strip().replace("\n", " ")
+        )  # Replace new lines in content
+
+        add_content_section(entries, title, url, section_header, section_text)
+
+
+def add_content_section(entries, title, url, section_title, section_text):
+    full_section = f"{section_title}: {section_text}".replace(
+        "\n", " "
+    )  # Replace new lines in content
+    for j in range(0, len(full_section), 6000):
+        entries.append(
+            {
+                "title": title,
+                "url": url,
+                "source": "langchain",
+                "content": full_section[j : j + 6000],
+            }
+        )
+
+
+markdown_file_path = "data/langchain_scrape.md"
+df = parse_markdown_file(markdown_file_path)
+print("Final DataFrame:")
+print(df.head())  # Print the first few rows for verification
+df.to_csv("data/langchain.csv", index=False)
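parse_markdown_file above expects a scrape dump in which each page starts with a line containing only --, followed by a URL line, a title line, and two skipped lines (description and keywords). A hypothetical fragment in that layout, useful for a quick local test; the URL and text are invented and the real scrape format may differ:

# Assumes this runs in the same module as the functions above.
sample = """--
url: https://python.langchain.com/docs/get_started/introduction
title: Introduction
description: placeholder line, skipped by the parser
keywords: placeholder line, skipped by the parser
Some intro text before the first header.
## Installation
Instructions for installing the package.
"""

with open("langchain_scrape_sample.md", "w", encoding="utf-8") as f:
    f.write(sample)

print(parse_markdown_file("langchain_scrape_sample.md")[["title", "content"]])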
data/tmp.py
ADDED
@@ -0,0 +1,121 @@
+import pandas as pd
+import time
+import os
+from deeplake.core.vectorstore import VectorStore
+from langchain.embeddings.openai import OpenAIEmbeddings
+import logging
+
+from buster.documents_manager import DeepLakeDocumentsManager
+from buster.llm_utils import get_openai_embedding_constructor
+
+# Set the logging level of `httpx` to WARNING or higher to suppress annoying INFO logs
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
+openai_embedding_fn = get_openai_embedding_constructor(
+    client_kwargs={"max_retries": 10}
+)
+
+# from openai import OpenAI
+
+DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
+DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
+
+df1 = pd.read_csv("./data/langchain.csv") # or 'latin1' or 'cp1252'
+df2 = pd.read_csv("./data/hf_transformers.csv")
+df3 = pd.read_csv("./data/langchain_course.csv")
+df4 = pd.read_csv("./data/filtered_tai_v2.csv")
+df5 = pd.read_csv("./data/wiki.csv") # , encoding="ISO-8859-1")
+df6 = pd.read_csv("./data/openai.csv")
+df7 = pd.read_csv("./data/activeloop.csv")
+df8 = pd.read_csv("./data/llm_course.csv")
+
+print(
+    f"Number of samples: {len(df1)},{len(df2)},{len(df3)},{len(df4)},{len(df5)},{len(df6)},{len(df7)},{len(df8)}"
+)
+
+dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
+
+dm = DeepLakeDocumentsManager(
+    vector_store_path=dataset_path,
+    overwrite=True,
+)
+
+dm.batch_add(
+    df=df1,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    embedding_fn=openai_embedding_fn,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
+
+dm.batch_add(
+    df=df2,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    embedding_fn=openai_embedding_fn,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
+
+dm.batch_add(
+    df=df3,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    embedding_fn=openai_embedding_fn,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
+
+dm.batch_add(
+    df=df4,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    embedding_fn=openai_embedding_fn,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
+
+dm.batch_add(
+    df=df5,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    embedding_fn=openai_embedding_fn,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
+
+dm.batch_add(
+    df=df6,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    embedding_fn=openai_embedding_fn,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
+
+dm.batch_add(
+    df=df7,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    embedding_fn=openai_embedding_fn,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
+
+dm.batch_add(
+    df=df8,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    embedding_fn=openai_embedding_fn,
+    csv_filename="embeddings.csv",
+    csv_overwrite=False,
+)
requirements.txt
CHANGED
@@ -1,3 +1,3 @@
 git+https://github.com/jerpint/buster@better-fallback
 gradio==3.50.2
-deeplake==3.8.9
+deeplake==3.8.9