Omar Solano committed
Commit 7fb4bde
1 Parent(s): df8fed2

refactor data processing in process_csvs_store.py; update dataset paths

Files changed (1)
  1. data/process_csvs_store.py +91 -25
data/process_csvs_store.py CHANGED
@@ -1,33 +1,39 @@
-import pandas as pd
-import time
+import json
 import os
-from buster.documents_manager import DeepLakeDocumentsManager
+import time
 
-# from deeplake.core.vectorstore import VectorStore
-# from langchain.embeddings.openai import OpenAIEmbeddings
 import numpy as np
+import pandas as pd
+from buster.documents_manager import DeepLakeDocumentsManager
 
-# from openai import OpenAI
-
-DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
+DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset-2")
 DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
 
-# df1 = pd.read_csv("./data/jobs.csv", encoding='ISO-8859-1')  # or 'latin1' or 'cp1252'
-# df2 = pd.read_csv("./data/hf_transformers.csv")
-# df3 = pd.read_csv("./data/langchain_course.csv")
-# df4 = pd.read_csv("./data/filtered_tai_v2.csv")
-# df5 = pd.read_csv("./data/wiki.csv")  # , encoding="ISO-8859-1")
-# df6 = pd.read_csv("./data/openai.csv")
-df1 = pd.read_csv("./advanced_rag_course.csv")
-
-# print(len(df1), len(df2), len(df3), len(df4), len(df5), len(df6))
-print(len(df1))
-
-dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
+df1 = pd.read_csv("data/advanced_rag_course.csv")
+df2 = pd.read_csv("data/hf_transformers.csv")
+df3 = pd.read_csv("data/langchain_course.csv")
+df4 = pd.read_csv("data/filtered_tai_v2.csv")
+df5 = pd.read_csv("data/wiki.csv")  # , encoding="ISO-8859-1")
+# df6 = pd.read_csv("data/openai.csv")  # Broken
+df7 = pd.read_csv("data/activeloop.csv")
+df8 = pd.read_csv("data/llm_course.csv")
+df9 = pd.read_csv("data/langchain_docs.csv")  # , encoding="ISO-8859-1")
+
+print(len(df1), df1.columns)
+print(len(df2), df2.columns)
+print(len(df3), df3.columns)
+print(len(df4), df4.columns)
+print(len(df5), df5.columns)
+# print(len(df6), df6.columns)
+print(len(df7), df7.columns)
+print(len(df8), df8.columns)
+print(len(df9), df9.columns)
+
+
+# dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
+dataset_path = f"local_dataset"
 # dataset_path = f"{DEEPLAKE_DATASET}"
-# because wrong name
-# df1['content'] = df1['cleaned_description']
-# print(np.sum(df1.content.isna()), len(df1) )
+
 
 dm = DeepLakeDocumentsManager(
     vector_store_path=dataset_path,
@@ -35,11 +41,71 @@ dm = DeepLakeDocumentsManager(
     required_columns=["url", "content", "source", "title"],
 )
 
+
 dm.batch_add(
     df=df1,
     batch_size=3000,
-    min_time_interval=60,
-    num_workers=32,
+    min_time_interval=5,
+    num_workers=15,
+    csv_overwrite=False,
+)
+dm.batch_add(
+    df=df2,
+    batch_size=3000,
+    min_time_interval=5,
+    num_workers=15,
+    csv_overwrite=False,
+)
+dm.batch_add(
+    df=df3,
+    batch_size=3000,
+    min_time_interval=5,
+    num_workers=15,
+    csv_overwrite=False,
+)
+dm.batch_add(
+    df=df4,
+    batch_size=3000,
+    min_time_interval=5,
+    num_workers=15,
+    csv_overwrite=False,
+)
+dm.batch_add(
+    df=df5,
+    batch_size=3000,
+    min_time_interval=5,
+    num_workers=15,
+    csv_overwrite=False,
+)
+
+# ERROR DO NOT ADD
+# dm.batch_add(
+#     df=df6,
+#     batch_size=3000,
+#     min_time_interval=5,
+#     num_workers=15,
+#     csv_overwrite=False,
+# )
+
+dm.batch_add(
+    df=df7,
+    batch_size=3000,
+    min_time_interval=5,
+    num_workers=15,
+    csv_overwrite=False,
+)
+dm.batch_add(
+    df=df8,
+    batch_size=3000,
+    min_time_interval=5,
+    num_workers=15,
+    csv_overwrite=False,
+)
+dm.batch_add(
+    df=df9,
+    batch_size=3000,
+    min_time_interval=5,
+    num_workers=15,
     csv_overwrite=False,
 )
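
Note: the new script repeats the same `batch_add` call nine times with identical parameters. A natural follow-up refactor is to drive those calls from a single list. Below is a minimal sketch under the assumptions visible in the diff; the hunk header elides one `DeepLakeDocumentsManager` constructor argument between `vector_store_path` and `required_columns`, so the real constructor may take more, and `CSV_FILES` is an illustrative name that the commit does not define.

import pandas as pd
from buster.documents_manager import DeepLakeDocumentsManager

# CSV sources taken from the diff; data/openai.csv is excluded because the
# committed script marks it "ERROR DO NOT ADD". CSV_FILES is an illustrative
# name, not something the commit defines.
CSV_FILES = [
    "data/advanced_rag_course.csv",
    "data/hf_transformers.csv",
    "data/langchain_course.csv",
    "data/filtered_tai_v2.csv",
    "data/wiki.csv",
    "data/activeloop.csv",
    "data/llm_course.csv",
    "data/langchain_docs.csv",
]

dm = DeepLakeDocumentsManager(
    vector_store_path="local_dataset",
    required_columns=["url", "content", "source", "title"],
)

for path in CSV_FILES:
    df = pd.read_csv(path)
    # Same sanity check the committed script performs for each frame.
    print(path, len(df), df.columns)
    dm.batch_add(
        df=df,
        batch_size=3000,
        min_time_interval=5,
        num_workers=15,
        csv_overwrite=False,
    )

Centralizing the parameters this way means a later change to `batch_size` or `min_time_interval` touches one line instead of nine.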