Omar Solano committed on
Commit
cebdf84
·
unverified ·
1 Parent(s): e9199c3

change dataset name to new DB (#40)

Browse files

* change dataset name to new DB

* add gitignore files

* remove commented code

* remove comments

Files changed (4) hide show
  1. .gitignore +160 -0
  2. app.py +0 -2
  3. cfg.py +2 -1
  4. data/process_csvs_store.py +12 -38
.gitignore CHANGED
@@ -1,3 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.csv
2
  *.zip
3
  deeplake_store/
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
  *.csv
162
  *.zip
163
  deeplake_store/
app.py CHANGED
@@ -52,8 +52,6 @@ logging.basicConfig(level=logging.INFO)
52
 
53
 
54
  def log_likes(completion: Completion, like_data: gr.LikeData):
55
- # make it a str so json-parsable
56
-
57
  collection = "liked_data-test"
58
 
59
  completion_json = completion.to_json(
 
52
 
53
 
54
  def log_likes(completion: Completion, like_data: gr.LikeData):
 
 
55
  collection = "liked_data-test"
56
 
57
  completion_json = completion.to_json(
cfg.py CHANGED
@@ -23,13 +23,14 @@ ACTIVELOOP_TOKEN = os.getenv("ACTIVELOOP_TOKEN")
23
  if ACTIVELOOP_TOKEN is None:
24
  logger.warning("No activeloop token found, you will not be able to fetch data.")
25
 
26
- DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset_debug")
27
  DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
28
 
29
  # if you want to use a local dataset, set the env. variable, it overrides all others
30
  DEEPLAKE_DATASET_PATH = os.getenv(
31
  "DEEPLAKE_DATASET_PATH", f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
32
  )
 
33
  logger.info(f"{DEEPLAKE_DATASET_PATH=}")
34
 
35
  example_questions = [
 
23
  if ACTIVELOOP_TOKEN is None:
24
  logger.warning("No activeloop token found, you will not be able to fetch data.")
25
 
26
+ DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
27
  DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
28
 
29
  # if you want to use a local dataset, set the env. variable, it overrides all others
30
  DEEPLAKE_DATASET_PATH = os.getenv(
31
  "DEEPLAKE_DATASET_PATH", f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
32
  )
33
+
34
  logger.info(f"{DEEPLAKE_DATASET_PATH=}")
35
 
36
  example_questions = [
data/process_csvs_store.py CHANGED
@@ -2,12 +2,8 @@ import pandas as pd
2
  import time
3
  import os
4
  from buster.documents_manager import DeepLakeDocumentsManager
5
- from deeplake.core.vectorstore import VectorStore
6
- from langchain.embeddings.openai import OpenAIEmbeddings
7
 
8
- # from openai import OpenAI
9
-
10
- DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "ai-tutor-dataset")
11
  DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
12
 
13
  df1 = pd.read_csv("./data/llm_course.csv")
@@ -17,8 +13,9 @@ df4 = pd.read_csv("./data/filtered_tai_v2.csv")
17
  df5 = pd.read_csv("./data/wiki.csv") # , encoding="ISO-8859-1")
18
  df6 = pd.read_csv("./data/openai.csv")
19
  df7 = pd.read_csv("./data/activeloop.csv")
 
20
 
21
- print(len(df1), len(df2), len(df3), len(df4), len(df5), len(df6), len(df7))
22
 
23
  dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
24
 
@@ -98,35 +95,12 @@ dm.batch_add(
98
  csv_overwrite=False,
99
  )
100
 
101
-
102
- # client = OpenAI()
103
-
104
- # openai_embeddings = OpenAIEmbeddings()
105
- # def get_embedding(text, model="text-embedding-ada-002"):
106
- # # Call to OpenAI's API to create the embedding
107
- # response = client.embeddings.create(input=[text], model=model)
108
-
109
- # # Extract the embedding data from the response
110
- # embedding = response.data[0].embedding
111
-
112
- # # Convert the ndarray to a list
113
- # if isinstance(embedding, np.ndarray):
114
- # embedding = embedding.tolist()
115
-
116
- # return embedding
117
-
118
-
119
- # vs = VectorStore(
120
- # dataset_path,
121
- # runtime='compute_engine',
122
- # token=os.environ['ACTIVELOOP_TOKEN']
123
- # )
124
-
125
- # data = vs.search(query = "select * where shape(embedding)[0] == 0")
126
-
127
- # vs.update_embedding(embedding_source_tensor = "text",
128
- # query = "select * where shape(embedding)[0] == 0",
129
- # exec_option = "compute_engine",
130
- # embedding_function=get_embedding)
131
-
132
- # data2 = vs.search(query = "select * where shape(embedding)[0] == 0")
 
2
  import time
3
  import os
4
  from buster.documents_manager import DeepLakeDocumentsManager
 
 
5
 
6
+ DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "dataset-ai-tutor")
 
 
7
  DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")
8
 
9
  df1 = pd.read_csv("./data/llm_course.csv")
 
13
  df5 = pd.read_csv("./data/wiki.csv") # , encoding="ISO-8859-1")
14
  df6 = pd.read_csv("./data/openai.csv")
15
  df7 = pd.read_csv("./data/activeloop.csv")
16
+ df8 = pd.read_csv("./data/langchain_docs.csv")
17
 
18
+ print(len(df1), len(df2), len(df3), len(df4), len(df5), len(df6), len(df7), len(df8))
19
 
20
  dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"
21
 
 
95
  csv_overwrite=False,
96
  )
97
 
98
+ dm.batch_add(
99
+ df=df8,
100
+ batch_size=3000,
101
+ min_time_interval=60,
102
+ num_workers=32,
103
+ csv_embeddings_filename="embeddings.csv",
104
+ csv_errors_filename="tmp.csv",
105
+ csv_overwrite=False,
106
+ )