Louis-François Bouchard and Omar Solano committed
Commit a5371c1
1 Parent(s): c622d88

Openai activeloop data (#37)


* adding openai and activeloop data

* fixing issues with names

* concurrency

* black

* black

* revert to gradio3.50 for concurrency

---------

Co-authored-by: Omar Solano <omar@designstripe.com>

Files changed (6)
  1. .gitignore +3 -0
  2. app.py +4 -0
  3. cfg.py +2 -2
  4. data/process_csvs_store.py +70 -6
  5. data/tmp.py +21 -0
  6. requirements.txt +2 -2
.gitignore CHANGED
@@ -3,3 +3,6 @@
 deeplake_store/
 .DS_Store
 __pycache__/
+.env
+env/
+.vscode/
app.py CHANGED
@@ -34,6 +34,8 @@ AVAILABLE_SOURCES_UI = [
     "Wikipedia",
     "Gen AI 360: LangChain",
     "Gen AI 360: LLMs",
+    "Activeloop",
+    "Open AI",
 ]

 AVAILABLE_SOURCES = [
@@ -42,6 +44,8 @@ AVAILABLE_SOURCES = [
     "wikipedia",
     "langchain_course",
     "llm_course",
+    "activeloop",
+    "openai",
 ]
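The two lists are kept in lockstep: AVAILABLE_SOURCES_UI holds the display labels and AVAILABLE_SOURCES the matching internal source ids at the same index. A minimal sketch of how such a pairing is typically consumed when translating UI selections back to dataset ids (the dict construction and helper are illustrative assumptions, not code from this repo):

import collections

# Mirrors the two lists in app.py; pairing is positional.
AVAILABLE_SOURCES_UI = ["Wikipedia", "Gen AI 360: LangChain", "Gen AI 360: LLMs", "Activeloop", "Open AI"]
AVAILABLE_SOURCES = ["wikipedia", "langchain_course", "llm_course", "activeloop", "openai"]

UI_TO_SOURCE = dict(zip(AVAILABLE_SOURCES_UI, AVAILABLE_SOURCES))

def selected_sources(ui_choices):
    """Translate checkbox labels from the UI into the ids stored with each document."""
    return [UI_TO_SOURCE[label] for label in ui_choices]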
cfg.py CHANGED
@@ -23,7 +23,7 @@ ACTIVELOOP_TOKEN = os.getenv("ACTIVELOOP_TOKEN")
 if ACTIVELOOP_TOKEN is None:
     logger.warning("No activeloop token found, you will not be able to fetch data.")

-DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "dev_vector_store")
+DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
 DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")

 # if you want to use a local dataset, set the env. variable, it overrides all others
@@ -85,7 +85,7 @@ Q:
         "embedding_model": "text-embedding-ada-002",
         "exec_option": "compute_engine",
         "use_tql": True,
-        "deep_memory": True,
+        "deep_memory": False,
         "activeloop_token": ACTIVELOOP_TOKEN,
     },
     documents_answerer_cfg={
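These retriever settings map onto Deep Lake's VectorStore search options: deep_memory=False skips Activeloop's trained Deep Memory reranker and falls back to plain vector similarity. A minimal sketch of how the settings reach a search call, assuming the OpenAIEmbeddings wrapper already imported in process_csvs_store.py; the query string is illustrative:

import os
from deeplake.core.vectorstore import VectorStore
from langchain.embeddings.openai import OpenAIEmbeddings

# Defaults mirror cfg.py; both can be overridden via environment variables.
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"

vs = VectorStore(dataset_path, token=os.getenv("ACTIVELOOP_TOKEN"))
embedder = OpenAIEmbeddings(model="text-embedding-ada-002")

# With deep_memory=False the query is answered by plain similarity search;
# True would route it through the Deep Memory reranker instead.
results = vs.search(
    embedding_data="What is a vector store?",  # illustrative query
    embedding_function=embedder.embed_query,   # one text in -> one vector out
    exec_option="compute_engine",
    deep_memory=False,
    k=4,
)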
data/process_csvs_store.py CHANGED
@@ -3,8 +3,11 @@ import time
 import os
 from buster.documents_manager import DeepLakeDocumentsManager
 from deeplake.core.vectorstore import VectorStore
+from langchain.embeddings.openai import OpenAIEmbeddings

-DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "dev_vector_store")
+# from openai import OpenAI
+
+DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
 DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")

 df1 = pd.read_csv("./data/llm_course.csv")
@@ -12,7 +15,10 @@ df2 = pd.read_csv("./data/hf_transformers.csv")
 df3 = pd.read_csv("./data/langchain_course.csv")
 df4 = pd.read_csv("./data/filtered_tai_v2.csv")
 df5 = pd.read_csv("./data/wiki.csv")  # , encoding="ISO-8859-1")
+df6 = pd.read_csv("./data/openai.csv")
+df7 = pd.read_csv("./data/activeloop.csv")

+print(len(df1), len(df2), len(df3), len(df4), len(df5), len(df6), len(df7))

 dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"

@@ -27,7 +33,8 @@ dm.batch_add(
     batch_size=3000,
     min_time_interval=60,
     num_workers=32,
-    csv_filename="embeddings.csv",
+    csv_embeddings_filename="embeddings.csv",
+    csv_errors_filename="tmp.csv",
     csv_overwrite=False,
 )

@@ -36,7 +43,8 @@ dm.batch_add(
     batch_size=3000,
     min_time_interval=60,
     num_workers=32,
-    csv_filename="embeddings.csv",
+    csv_embeddings_filename="embeddings.csv",
+    csv_errors_filename="tmp.csv",
     csv_overwrite=False,
 )

@@ -45,7 +53,8 @@ dm.batch_add(
     batch_size=3000,
     min_time_interval=60,
     num_workers=32,
-    csv_filename="embeddings.csv",
+    csv_embeddings_filename="embeddings.csv",
+    csv_errors_filename="tmp.csv",
     csv_overwrite=False,
 )

@@ -54,7 +63,8 @@ dm.batch_add(
     batch_size=3000,
     min_time_interval=60,
     num_workers=32,
-    csv_filename="embeddings.csv",
+    csv_embeddings_filename="embeddings.csv",
+    csv_errors_filename="tmp.csv",
     csv_overwrite=False,
 )

@@ -63,6 +73,60 @@ dm.batch_add(
     batch_size=3000,
     min_time_interval=60,
     num_workers=32,
-    csv_filename="embeddings.csv",
+    csv_embeddings_filename="embeddings.csv",
+    csv_errors_filename="tmp.csv",
+    csv_overwrite=False,
+)
+
+dm.batch_add(
+    df=df6,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    csv_embeddings_filename="embeddings.csv",
+    csv_overwrite=False,
+    csv_errors_filename="tmp.csv",
+)
+
+dm.batch_add(
+    df=df7,
+    batch_size=3000,
+    min_time_interval=60,
+    num_workers=32,
+    csv_embeddings_filename="embeddings.csv",
+    csv_errors_filename="tmp.csv",
     csv_overwrite=False,
 )
+
+
+# client = OpenAI()
+
+# openai_embeddings = OpenAIEmbeddings()
+# def get_embedding(text, model="text-embedding-ada-002"):
+#     # Call to OpenAI's API to create the embedding
+#     response = client.embeddings.create(input=[text], model=model)
+
+#     # Extract the embedding data from the response
+#     embedding = response.data[0].embedding
+
+#     # Convert the ndarray to a list
+#     if isinstance(embedding, np.ndarray):
+#         embedding = embedding.tolist()
+
+#     return embedding
+
+
+# vs = VectorStore(
+#     dataset_path,
+#     runtime='compute_engine',
+#     token=os.environ['ACTIVELOOP_TOKEN']
+# )
+
+# data = vs.search(query="select * where shape(embedding)[0] == 0")
+
+# vs.update_embedding(embedding_source_tensor="text",
+#                     query="select * where shape(embedding)[0] == 0",
+#                     exec_option="compute_engine",
+#                     embedding_function=get_embedding)
+
+# data2 = vs.search(query="select * where shape(embedding)[0] == 0")
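Every batch_add call above shares the same parameters, so the repetition invites a loop. A behavior-preserving refactor sketch of those seven calls (safe because all arguments are passed by keyword, so their relative order does not matter):

# Equivalent loop over the seven dataframes ingested above.
for df in (df1, df2, df3, df4, df5, df6, df7):
    dm.batch_add(
        df=df,
        batch_size=3000,
        min_time_interval=60,
        num_workers=32,
        csv_embeddings_filename="embeddings.csv",
        csv_errors_filename="tmp.csv",  # rows that failed to embed land here for a later retry
        csv_overwrite=False,
    )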
data/tmp.py ADDED
@@ -0,0 +1,21 @@
+# import pandas as pd
+
+# # Load the CSV
+# df = pd.read_csv('data/wiki.csv')
+
+
+# # Count the number of unique titles in the 'title' column
+# unique_titles_count = df['title']
+
+# print(len(df))
+
+# # # Remove the 'ranking' column
+# # df.drop('ranking', axis=1, inplace=True)
+
+# # # Save the CSV again
+# # df.to_csv('data/wiki.csv', index=False)
+
+
+import gradio as gr
+
+gr.themes.builder()
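data/tmp.py is scratch work; its only live lines launch Gradio's interactive theme builder. A minimal sketch of applying a theme designed there to an app (the Soft base theme, hue, and Blocks layout are illustrative assumptions, not choices from this repo):

import gradio as gr

# A theme tuned in gr.themes.builder() can be recreated in code and
# passed to the app; Soft is just one of the built-in base themes.
theme = gr.themes.Soft(primary_hue="blue")

with gr.Blocks(theme=theme) as demo:
    gr.Markdown("Theme preview")

demo.launch()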
requirements.txt CHANGED
@@ -1,3 +1,3 @@
-git+https://github.com/jerpint/buster@multiple-sources
-gradio
+git+https://github.com/jerpint/buster@better-fallback
+gradio==3.50.2
 deeplake
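Pinning gradio==3.50.2 matches the "revert to gradio3.50 for concurrency" commit note: Gradio 3.x sets request concurrency globally on the queue, an API that changed in 4.x. A minimal sketch of the 3.50-style setting (the echo app and worker count are illustrative assumptions):

import gradio as gr

def echo(message):
    return message

with gr.Blocks() as demo:
    inp = gr.Textbox(label="in")
    out = gr.Textbox(label="out")
    inp.submit(echo, inp, out)

# Gradio 3.x: concurrency_count is how many requests the queue serves in
# parallel (Gradio 4.x replaced this with per-event concurrency_limit).
demo.queue(concurrency_count=16)
demo.launch()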