nsthorat-lilac committed on
Commit
bfc0ec6
0 Parent(s):

Duplicate from lilacai/nikhil_staging

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .dockerignore +10 -0
  2. .env +40 -0
  3. .env.demo +4 -0
  4. .gitignore +5 -0
  5. Dockerfile +27 -0
  6. LICENSE +201 -0
  7. README.md +12 -0
  8. data/.cache/lilac/concept/lilac/negative-sentiment/gte-small.pkl +0 -0
  9. data/.cache/lilac/concept/lilac/non-english/gte-small.pkl +0 -0
  10. data/.cache/lilac/concept/lilac/source-code/gte-small.pkl +0 -0
  11. demo_config.yml +3 -0
  12. docker_start.py +111 -0
  13. docker_start.sh +10 -0
  14. lilac/.gitignore +1 -0
  15. lilac/__init__.py +42 -0
  16. lilac/auth.py +87 -0
  17. lilac/batch_utils.py +92 -0
  18. lilac/cli.py +47 -0
  19. lilac/concepts/__init__.py +12 -0
  20. lilac/concepts/concept.py +339 -0
  21. lilac/concepts/db_concept.py +567 -0
  22. lilac/concepts/legal-termination/concept.json +185 -0
  23. lilac/concepts/negative-sentiment/concept.json +634 -0
  24. lilac/concepts/non-english/concept.json +1024 -0
  25. lilac/concepts/positive-sentiment/concept.json +564 -0
  26. lilac/concepts/profanity/concept.json +0 -0
  27. lilac/concepts/question/concept.json +0 -0
  28. lilac/concepts/source-code/concept.json +389 -0
  29. lilac/concepts/toxicity/concept.json +0 -0
  30. lilac/config.py +268 -0
  31. lilac/conftest.py +28 -0
  32. lilac/data/__init__.py +25 -0
  33. lilac/data/dataset.py +510 -0
  34. lilac/data/dataset_duckdb.py +1833 -0
  35. lilac/data/dataset_test_utils.py +153 -0
  36. lilac/data/dataset_utils.py +313 -0
  37. lilac/data_loader.py +110 -0
  38. lilac/db_manager.py +96 -0
  39. lilac/embeddings/__init__.py +7 -0
  40. lilac/embeddings/cohere.py +59 -0
  41. lilac/embeddings/default_vector_stores.py +10 -0
  42. lilac/embeddings/embedding.py +110 -0
  43. lilac/embeddings/gte.py +63 -0
  44. lilac/embeddings/openai.py +68 -0
  45. lilac/embeddings/palm.py +62 -0
  46. lilac/embeddings/sbert.py +38 -0
  47. lilac/embeddings/transformer_utils.py +35 -0
  48. lilac/embeddings/vector_store.py +201 -0
  49. lilac/embeddings/vector_store_hnsw.py +112 -0
  50. lilac/embeddings/vector_store_numpy.py +92 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
+ # Python
+ **/__pycache__
+ **/*.pyc
+ **/*.pyo
+ **/*.pyd
+ # Ignore unit tests.
+ **/*_test.py
+
+ # Mac OS.
+ .DS_Store
.env ADDED
@@ -0,0 +1,40 @@
+ # To overwrite these variables, create a .env.local file
+
+ # The path to the directory where the data will be downloaded on the machine
+ LILAC_DATA_PATH=./data
+
+ # Set to 1 for duckdb to use views instead of materialized tables (lower memory usage, but slower).
+ DUCKDB_USE_VIEWS=0
+
+ # Set to true to enable read-only mode, disabling the ability to add datasets & compute dataset
+ # signals.
+ # LILAC_AUTH_ENABLED=true
+
+ # Variables that can be set in .env.local
+ #
+ # Get key from https://dashboard.cohere.ai/api-keys
+ # COHERE_API_KEY=
+
+ # GCS_REGION=
+ # GCS_ACCESS_KEY=
+ # GCS_SECRET_KEY=
+
+ # Get key from https://platform.openai.com/account/api-keys
+ # OPENAI_API_KEY=
+ # Get key from https://makersuite.google.com/app/apikey
+ # PALM_API_KEY=
+
+ # HuggingFace demos: machine that uploads to HuggingFace.
+
+ # For authenticating with HuggingFace to deploy to a Space.
+ # HF_USERNAME=
+ # The default repo to deploy to for a staging demo. Can be overridden by a command line flag.
+ # HF_STAGING_DEMO_REPO='HF_ORG/HF_REPO_NAME'
+
+ # For Google-login. This is generated from the Google Cloud Console for a web client.
+ # See: https://developers.google.com/identity/protocols/oauth2
+ GOOGLE_CLIENT_ID='279475920249-i8llm8vbos1vj5m1qocir8narb3r0enu.apps.googleusercontent.com'
+ # The client secret of the above client.
+ # GOOGLE_CLIENT_SECRET=
+ # A random string for oauth sessions.
+ # LILAC_OAUTH_SECRET_KEY=
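
For reference, a minimal sketch of how this layered configuration could be loaded at startup — the use of python-dotenv and the override order are assumptions for illustration, not Lilac's documented loading logic:

import os
from dotenv import load_dotenv  # assumed helper library, not part of this repo

load_dotenv('.env')                       # checked-in defaults
load_dotenv('.env.local', override=True)  # local overrides win
print(os.environ.get('LILAC_DATA_PATH'))  # './data' unless overridden
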
.env.demo ADDED
@@ -0,0 +1,4 @@
+ LILAC_DATA_PATH='/data'
+ HF_HOME=/data/.huggingface
+ TRANSFORMERS_CACHE=/data/.cache
+ XDG_CACHE_HOME=/data/.cache
.gitignore ADDED
@@ -0,0 +1,5 @@
+ __pycache__/
+ **/*.pyc
+ **/*.pyo
+ **/*.pyd
+ **/*_test.py
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ # NOTE: When we upgrade to 3.11 we can use a slimmer docker image which comes with gcc.
+ FROM python:3.9-bullseye
+
+ # Allow statements and log messages to immediately appear in the Knative logs
+ ENV PYTHONUNBUFFERED True
+
+ # Set the working directory in the container.
+ WORKDIR /server
+
+ # Install the dependencies. This requires exporting requirements.txt from poetry first, which
+ # happens from ./build_docker.sh.
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY .env .
+ COPY .env.demo .
+ COPY demo_config.yml .
+ # Copy the README so we can read the datasets from the HuggingFace config.
+ COPY README.md .
+ COPY LICENSE .
+
+ # Copy python files.
+ COPY /lilac ./lilac/
+
+ COPY docker_start.sh docker_start.py ./
+
+ CMD ["bash", "docker_start.sh"]
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2023 Lilac AI Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ app_port: 5432
+ colorFrom: purple
+ colorTo: purple
+ datasets:
+ - lilacai/nikhil_staging-local-glue
+ - lilacai/nikhil_staging-local-imdb
+ emoji: 🌷
+ sdk: docker
+ title: Lilac
+ duplicated_from: lilacai/nikhil_staging
+ ---
data/.cache/lilac/concept/lilac/negative-sentiment/gte-small.pkl ADDED
Binary file (202 kB).
data/.cache/lilac/concept/lilac/non-english/gte-small.pkl ADDED
Binary file (331 kB).
data/.cache/lilac/concept/lilac/source-code/gte-small.pkl ADDED
Binary file (126 kB).
demo_config.yml ADDED
@@ -0,0 +1,3 @@
+ lilac_hf_datasets:
+ - {hf_dataset_repo_id: lilacai/nikhil_staging-local-glue, lilac_name: glue, lilac_namespace: local}
+ - {hf_dataset_repo_id: lilacai/nikhil_staging-local-imdb, lilac_name: imdb, lilac_namespace: local}
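
demo_config.yml is plain YAML, so it round-trips through yaml.safe_load the same way docker_start.py below parses the README front matter. A small illustrative sketch:

import yaml

with open('demo_config.yml') as f:
  config = yaml.safe_load(f)

for entry in config['lilac_hf_datasets']:
  # e.g. lilacai/nikhil_staging-local-glue -> local/glue
  print(entry['hf_dataset_repo_id'], '->',
        f"{entry['lilac_namespace']}/{entry['lilac_name']}")
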
docker_start.py ADDED
@@ -0,0 +1,111 @@
+ """Startup work before running the web server."""
+
+ import os
+ import shutil
+ from typing import TypedDict
+
+ import yaml
+ from huggingface_hub import scan_cache_dir, snapshot_download
+
+ from lilac.concepts.db_concept import CONCEPTS_DIR, DiskConceptDB, get_concept_output_dir
+ from lilac.env import data_path, env
+ from lilac.utils import get_datasets_dir, get_lilac_cache_dir, log
+
+
+ def delete_old_files() -> None:
+   """Delete old files from the cache."""
+   # Scan the cache.
+   try:
+     scan = scan_cache_dir()
+   except BaseException:
+     # Cache was not found.
+     return
+
+   # Select revisions to delete.
+   to_delete = []
+   for repo in scan.repos:
+     latest_revision = max(repo.revisions, key=lambda x: x.last_modified)
+     to_delete.extend(
+       [revision.commit_hash for revision in repo.revisions if revision != latest_revision])
+   strategy = scan.delete_revisions(*to_delete)
+
+   # Delete them.
+   log(f'Will delete {len(to_delete)} old revisions and save {strategy.expected_freed_size_str}')
+   strategy.execute()
+
+
+ class HfSpaceConfig(TypedDict):
+   """The huggingface space config, defined in README.md.
+
+   See:
+   https://huggingface.co/docs/hub/spaces-config-reference
+   """
+   title: str
+   datasets: list[str]
+
+
+ def main() -> None:
+   """Download dataset files from the HF space that was uploaded before building the image."""
+   # SPACE_ID is the HuggingFace Space ID environment variable that is automatically set by HF.
+   repo_id = env('SPACE_ID', None)
+   if not repo_id:
+     return
+
+   delete_old_files()
+
+   print('readme:', os.path.abspath('README.md'))
+   with open(os.path.abspath('README.md')) as f:
+     print(f.read())
+
+   with open(os.path.abspath('README.md')) as f:
+     # Strip the '---' for the huggingface readme config.
+     readme = f.read().strip('---')
+     hf_config: HfSpaceConfig = yaml.safe_load(readme)
+
+   # Download the huggingface space data. This includes code and datasets, so we move the datasets
+   # alone to the data directory.
+   for lilac_hf_dataset in hf_config['datasets']:
+     print('Downloading dataset from HuggingFace: ', lilac_hf_dataset)
+     snapshot_download(
+       repo_id=lilac_hf_dataset,
+       repo_type='dataset',
+       token=env('HF_ACCESS_TOKEN'),
+       local_dir=get_datasets_dir(data_path()),
+       ignore_patterns=['.gitattributes', 'README.md'])
+
+   snapshot_dir = snapshot_download(repo_id=repo_id, repo_type='space', token=env('HF_ACCESS_TOKEN'))
+   # Copy datasets.
+   spaces_data_dir = os.path.join(snapshot_dir, 'data')
+
+   # Delete cache files from persistent storage.
+   cache_dir = get_lilac_cache_dir(data_path())
+   if os.path.exists(cache_dir):
+     shutil.rmtree(cache_dir)
+
+   # NOTE: This is temporary during the move of concepts into the pip package. Once all the demos
+   # have been updated, this block can be deleted.
+   old_lilac_concepts_data_dir = os.path.join(data_path(), CONCEPTS_DIR, 'lilac')
+   if os.path.exists(old_lilac_concepts_data_dir):
+     shutil.rmtree(old_lilac_concepts_data_dir)
+
+   # Copy cache files from the space if they exist.
+   spaces_cache_dir = get_lilac_cache_dir(spaces_data_dir)
+   if os.path.exists(spaces_cache_dir):
+     shutil.copytree(spaces_cache_dir, cache_dir)
+
+   # Copy concepts.
+   concepts = DiskConceptDB(spaces_data_dir).list()
+   for concept in concepts:
+     # Ignore lilac concepts, they're already part of the source code.
+     if concept.namespace == 'lilac':
+       continue
+     spaces_concept_output_dir = get_concept_output_dir(spaces_data_dir, concept.namespace,
+                                                        concept.name)
+     persistent_output_dir = get_concept_output_dir(data_path(), concept.namespace, concept.name)
+     shutil.rmtree(persistent_output_dir, ignore_errors=True)
+     shutil.copytree(spaces_concept_output_dir, persistent_output_dir, dirs_exist_ok=True)
+     shutil.rmtree(spaces_concept_output_dir, ignore_errors=True)
+
+
+ if __name__ == '__main__':
+   main()
docker_start.sh ADDED
@@ -0,0 +1,10 @@
+ #!/bin/bash
+
+ # Fail if any of the commands below fail.
+ set -e
+
+ python docker_start.py
+ gunicorn lilac.server:app \
+   --bind 0.0.0.0:5432 \
+   --preload -k uvicorn.workers.UvicornWorker \
+   --timeout 120
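
Since the gunicorn target is lilac.server:app with a uvicorn worker class, the app is an ASGI application. A sketch of an equivalent single-process local run, assuming only that lilac.server exposes that app object:

import uvicorn
from lilac.server import app

# One worker, no gunicorn supervisor; mirrors the container's bind address.
uvicorn.run(app, host='0.0.0.0', port=5432)
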
lilac/.gitignore ADDED
@@ -0,0 +1 @@
+ web/
lilac/__init__.py ADDED
@@ -0,0 +1,42 @@
+ from importlib import metadata
+
+ from .concepts import *  # noqa: F403
+ from .config import DatasetConfig, DatasetSettings, EmbeddingConfig, SignalConfig
+ from .data import *  # noqa: F403
+ from .data.dataset_duckdb import DatasetDuckDB
+ from .data_loader import create_dataset
+ from .db_manager import get_dataset, set_default_dataset_cls
+ from .embeddings import *  # noqa: F403
+ from .embeddings.default_vector_stores import register_default_vector_stores
+ from .schema import *  # noqa: F403
+ from .server import start_server, stop_server
+ from .signals import *  # noqa: F403
+ from .signals.default_signals import register_default_signals
+ from .sources import *  # noqa: F403
+ from .sources.default_sources import register_default_sources
+ from .splitters import *  # noqa: F403
+
+ try:
+   __version__ = metadata.version('lilacai')
+ except metadata.PackageNotFoundError:
+   __version__ = ''
+
+ register_default_sources()
+ register_default_signals()
+ register_default_vector_stores()
+ set_default_dataset_cls(DatasetDuckDB)
+
+ # Avoids polluting the results of dir(__package__).
+ del (metadata, register_default_sources, register_default_signals, set_default_dataset_cls,
+      DatasetDuckDB)
+
+ __all__ = [
+   'start_server',
+   'stop_server',
+   'create_dataset',
+   'get_dataset',
+   'DatasetConfig',
+   'EmbeddingConfig',
+   'SignalConfig',
+   'DatasetSettings',
+ ]
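
A minimal usage sketch of the top-level exports; the keyword arguments to start_server match the call in cli.py further down, and the rest is illustrative:

import lilac

print(lilac.__version__)
lilac.start_server(host='127.0.0.1', port=5432)
# ... browse datasets and concepts in the UI ...
lilac.stop_server()
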
lilac/auth.py ADDED
@@ -0,0 +1,87 @@
+ """Authentication and ACL configuration."""
+
+ from typing import Optional
+
+ from fastapi import Request
+ from pydantic import BaseModel, ValidationError
+
+ from .env import env
+
+
+ class ConceptAuthorizationException(Exception):
+   """Authorization exceptions thrown by the concept database."""
+   pass
+
+
+ class DatasetUserAccess(BaseModel):
+   """User access for datasets."""
+   # Whether the user can compute a signal.
+   compute_signals: bool
+   # Whether the user can delete a dataset.
+   delete_dataset: bool
+   # Whether the user can delete a signal.
+   delete_signals: bool
+   # Whether the user can update settings.
+   update_settings: bool
+
+
+ class ConceptUserAccess(BaseModel):
+   """User access for concepts."""
+   # Whether the user can delete any concept (not just their own).
+   delete_any_concept: bool
+
+
+ class UserAccess(BaseModel):
+   """User access."""
+   create_dataset: bool
+
+   # TODO(nsthorat): Make this keyed to each dataset and concept.
+   dataset: DatasetUserAccess
+   concept: ConceptUserAccess
+
+
+ class UserInfo(BaseModel):
+   """User information."""
+   id: str
+   email: str
+   name: str
+   given_name: str
+   family_name: str
+
+
+ class AuthenticationInfo(BaseModel):
+   """Authentication information for the user."""
+   user: Optional[UserInfo] = None
+   access: UserAccess
+   auth_enabled: bool
+
+
+ def get_session_user(request: Request) -> Optional[UserInfo]:
+   """Get the user from the session."""
+   if not env('LILAC_AUTH_ENABLED'):
+     return None
+   user_info_dict = request.session.get('user', None)
+   if user_info_dict:
+     try:
+       return UserInfo.parse_obj(user_info_dict)
+     except ValidationError:
+       return None
+   return None
+
+
+ def get_user_access() -> UserAccess:
+   """Get the user access."""
+   auth_enabled = env('LILAC_AUTH_ENABLED')
+   if isinstance(auth_enabled, str):
+     auth_enabled = auth_enabled.lower() == 'true'
+   if auth_enabled:
+     return UserAccess(
+       create_dataset=False,
+       dataset=DatasetUserAccess(
+         compute_signals=False, delete_dataset=False, delete_signals=False, update_settings=False),
+       concept=ConceptUserAccess(delete_any_concept=False))
+   return UserAccess(
+     create_dataset=True,
+     dataset=DatasetUserAccess(
+       compute_signals=True, delete_dataset=True, delete_signals=True, update_settings=True),
+     concept=ConceptUserAccess(delete_any_concept=True))
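
A quick sketch of the two access modes above — this assumes env() reflects process environment variables, which the .env files earlier suggest but do not guarantee:

import os

os.environ['LILAC_AUTH_ENABLED'] = 'true'
from lilac.auth import get_user_access

access = get_user_access()
# With auth enabled, access defaults to locked down.
assert access.create_dataset is False
assert access.dataset.compute_signals is False
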
lilac/batch_utils.py ADDED
@@ -0,0 +1,92 @@
+ """Utils for the python server."""
+ import itertools
+ from typing import Any, Callable, Generator, Iterable, Iterator, TypeVar, Union, cast
+
+ from .schema import Item
+ from .utils import chunks, is_primitive
+
+
+ def _deep_flatten(input: Union[Iterator, object],
+                   is_primitive_predicate: Callable[[object], bool]) -> Generator:
+   """Flattens a nested iterable."""
+   if is_primitive_predicate(input):
+     yield input
+   elif isinstance(input, dict):
+     yield input
+   elif is_primitive(input):
+     yield input
+   else:
+     for elem in cast(Iterator, input):
+       yield from _deep_flatten(elem, is_primitive_predicate)
+
+
+ def deep_flatten(input: Union[Iterator, Iterable],
+                  is_primitive_predicate: Callable[[object], bool] = is_primitive) -> Iterator:
+   """Flattens a deeply nested iterator.
+
+   Primitives and dictionaries are not flattened. The user can also provide a predicate to determine
+   what is a primitive.
+   """
+   return _deep_flatten(input, is_primitive_predicate)
+
+
+ def _deep_unflatten(flat_input: Iterator[list[object]], original_input: Union[Iterable, object],
+                     is_primitive_predicate: Callable[[object], bool]) -> Union[list, dict]:
+   """Unflattens a deeply flattened iterable according to the original iterable's structure."""
+   if is_primitive_predicate(original_input):
+     return next(flat_input)
+   else:
+     values: Iterable
+     if isinstance(original_input, dict):
+       values = original_input.values()
+     else:
+       values = cast(Iterable, original_input)
+     return [_deep_unflatten(flat_input, orig_elem, is_primitive_predicate) for orig_elem in values]
+
+
+ def deep_unflatten(flat_input: Union[Iterable, Iterator],
+                    original_input: Union[Iterable, object],
+                    is_primitive_predicate: Callable[[object], bool] = is_primitive) -> list:
+   """Unflattens a deeply flattened iterable according to the original iterable's structure."""
+   return cast(list, _deep_unflatten(iter(flat_input), original_input, is_primitive_predicate))
+
+
+ TFlatten = TypeVar('TFlatten')
+
+
+ def flatten(inputs: Iterable[Iterable[TFlatten]]) -> Iterator[TFlatten]:
+   """Flattens a nested iterator.
+
+   Only supports flattening one level deep.
+   """
+   for input in inputs:
+     yield from input
+
+
+ TUnflatten = TypeVar('TUnflatten')
+
+
+ def unflatten(flat_inputs: Union[Iterable[TUnflatten], Iterator[TUnflatten]],
+               original_inputs: Iterable[Iterable[Any]]) -> Iterator[list[TUnflatten]]:
+   """Unflattens a flattened iterable according to the original iterable's structure."""
+   flat_inputs_iter = iter(flat_inputs)
+   for original_input in original_inputs:
+     yield [next(flat_inputs_iter) for _ in original_input]
+
+
+ TFlatBatchedInput = TypeVar('TFlatBatchedInput')
+ TFlatBatchedOutput = TypeVar('TFlatBatchedOutput')
+
+
+ def flat_batched_compute(input: Iterable[Iterable[TFlatBatchedInput]],
+                          f: Callable[[list[TFlatBatchedInput]], Iterable[TFlatBatchedOutput]],
+                          batch_size: int) -> Iterable[Iterable[TFlatBatchedOutput]]:
+   """Flatten the input, call f over batches, and return the output unflattened."""
+   # Tee the input so we can use it twice for the input and output shapes.
+   input_1, input_2 = itertools.tee(input, 2)
+   batches = chunks(flatten(input_1), batch_size)
+   batched_outputs = flatten((f(batch) for batch in batches))
+   return unflatten(batched_outputs, input_2)
+
+
+ TBatchSpanVectorOutput = TypeVar('TBatchSpanVectorOutput', bound=Item)
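
To make the flatten/unflatten round trip concrete, a small self-contained sketch (plain lists, no Lilac schema involved):

nested = [[1, 2], [3], [4, 5]]
flat = list(flatten(nested))                 # [1, 2, 3, 4, 5]
doubled = (x * 2 for x in flat)              # any per-item computation
restored = list(unflatten(doubled, nested))  # [[2, 4], [6], [8, 10]]

flat_batched_compute composes the same round trip, but feeds the flattened items to f in chunks of batch_size, so ragged nested inputs can be processed by batch-oriented functions.
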
lilac/cli.py ADDED
@@ -0,0 +1,47 @@
+ """Lilac CLI."""
+
+ import click
+
+ from . import __version__
+ from .concepts.db_concept import DISK_CONCEPT_DB
+ from .load import load_command as load
+ from .server import start_server
+
+
+ @click.command()
+ @click.option(
+   '--host',
+   help='The host address the web server will listen on.',
+   default='0.0.0.0',
+   type=str)
+ @click.option('--port', help='The port number of the web server.', type=int, default=5432)
+ def start(host: str, port: int) -> None:
+   """Starts the Lilac web server."""
+   start_server(host=host, port=port, open=True)
+
+
+ @click.command()
+ def version() -> None:
+   """Prints the version of Lilac."""
+   print(__version__)
+
+
+ @click.command()
+ def concepts() -> None:
+   """Lists lilac concepts."""
+   print(DISK_CONCEPT_DB.list())
+
+
+ @click.group()
+ def cli() -> None:
+   """Lilac CLI."""
+   pass
+
+
+ cli.add_command(start)
+ cli.add_command(version)
+ cli.add_command(load)
+ cli.add_command(concepts)
+
+ if __name__ == '__main__':
+   cli()
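
Click groups can also be exercised without a shell session; a short sketch using click's built-in test runner:

from click.testing import CliRunner
from lilac.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ['version'])
print(result.output)  # prints the lilacai package version
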
lilac/concepts/__init__.py ADDED
@@ -0,0 +1,12 @@
+ """Concepts are customizable signals that help enrich documents."""
+
+ from .concept import Example, ExampleIn
+ from .db_concept import ConceptUpdate, DiskConceptDB, DiskConceptModelDB
+
+ __all__ = [
+   'DiskConceptDB',
+   'DiskConceptModelDB',
+   'Example',
+   'ExampleIn',
+   'ConceptUpdate',
+ ]
lilac/concepts/concept.py ADDED
@@ -0,0 +1,339 @@
+ """Defines the concept and the concept models."""
+ import dataclasses
+ from enum import Enum
+ from typing import Callable, Literal, Optional, Union
+
+ import numpy as np
+ from joblib import Parallel, delayed
+ from pydantic import BaseModel, validator
+ from scipy.interpolate import interp1d
+ from sklearn.base import clone
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import precision_recall_curve, roc_auc_score
+ from sklearn.model_selection import KFold
+
+ from ..embeddings.embedding import get_embed_fn
+ from ..signal import TextEmbeddingSignal, get_signal_cls
+ from ..utils import DebugTimer
+
+ LOCAL_CONCEPT_NAMESPACE = 'local'
+
+ # The maximum number of cross-validation models to train.
+ MAX_NUM_CROSS_VAL_MODELS = 15
+ # The β weight to use for the F-beta score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.fbeta_score.html
+ # β = 0.5 means we value precision 2x as much as recall.
+ # β = 2 means we value recall 2x as much as precision.
+ F_BETA_WEIGHT = 0.5
+
+
+ class ConceptType(str, Enum):
+   """Enum holding the concept type."""
+   TEXT = 'text'
+   IMAGE = 'image'
+
+   def __repr__(self) -> str:
+     return self.value
+
+
+ class ExampleOrigin(BaseModel):
+   """The origin of an example."""
+   # The namespace that holds the dataset.
+   dataset_namespace: str
+
+   # The name of the dataset.
+   dataset_name: str
+
+   # The id of the row in the dataset that the example was added from.
+   dataset_row_id: str
+
+
+ DraftId = Union[Literal['main'], str]
+ DRAFT_MAIN = 'main'
+
+
+ class ExampleIn(BaseModel):
+   """An example in a concept without the id (used for adding new examples)."""
+   label: bool
+   text: Optional[str] = None
+   img: Optional[bytes] = None
+   origin: Optional[ExampleOrigin] = None
+   # The name of the draft to put the example in. If None, puts it in the main draft.
+   draft: Optional[DraftId] = DRAFT_MAIN
+
+   @validator('text')
+   def parse_text(cls, text: str) -> str:
+     """Fixes surrogate errors in text: https://github.com/ijl/orjson/blob/master/README.md#str ."""
+     return text.encode('utf-8', 'replace').decode('utf-8')
+
+
+ class Example(ExampleIn):
+   """A single example in a concept used for training a concept model."""
+   id: str
+
+
+ class Concept(BaseModel):
+   """A concept is a collection of examples."""
+   # The namespace of the concept.
+   namespace: str
+   # The name of the concept.
+   concept_name: str
+   # The type of the data format that this concept represents.
+   type: ConceptType
+   data: dict[str, Example]
+   version: int = 0
+
+   tags: list[str] = []
+   description: Optional[str] = None
+
+   def drafts(self) -> list[DraftId]:
+     """Gets all the drafts for the concept."""
+     drafts: set[DraftId] = set([DRAFT_MAIN])  # Always return the main draft.
+     for example in self.data.values():
+       if example.draft:
+         drafts.add(example.draft)
+     return list(sorted(drafts))
+
+
+ class OverallScore(str, Enum):
+   """Enum holding the overall score."""
+   NOT_GOOD = 'not_good'
+   OK = 'ok'
+   GOOD = 'good'
+   VERY_GOOD = 'very_good'
+   GREAT = 'great'
+
+
+ def _get_overall_score(f1_score: float) -> OverallScore:
+   if f1_score < 0.5:
+     return OverallScore.NOT_GOOD
+   if f1_score < 0.8:
+     return OverallScore.OK
+   if f1_score < 0.9:
+     return OverallScore.GOOD
+   if f1_score < 0.95:
+     return OverallScore.VERY_GOOD
+   return OverallScore.GREAT
+
+
+ class ConceptMetrics(BaseModel):
+   """Metrics for a concept."""
+   # The average F1 score for the concept computed using cross validation.
+   f1: float
+   precision: float
+   recall: float
+   roc_auc: float
+   overall: OverallScore
+
+
+ @dataclasses.dataclass
+ class LogisticEmbeddingModel:
+   """A model that uses logistic regression with embeddings."""
+
+   _metrics: Optional[ConceptMetrics] = None
+   _threshold: float = 0.5
+
+   def __post_init__(self) -> None:
+     # See `notebooks/Toxicity.ipynb` for an example of training a concept model.
+     self._model = LogisticRegression(
+       class_weight='balanced', C=30, tol=1e-5, warm_start=True, max_iter=5_000, n_jobs=-1)
+
+   def score_embeddings(self, embeddings: np.ndarray) -> np.ndarray:
+     """Get the scores for the provided embeddings."""
+     y_probs = self._model.predict_proba(embeddings)[:, 1]
+     # Map [0, threshold, 1] to [0, 0.5, 1].
+     interpolate_fn = interp1d([0, self._threshold, 1], [0, 0.4999, 1])
+     return interpolate_fn(y_probs)
+
+   def _setup_training(self, X_train: np.ndarray,
+                       labels: Union[list[bool], np.ndarray]) -> tuple[np.ndarray, np.ndarray]:
+     y_train = np.array(labels)
+     # Shuffle the data in unison.
+     p = np.random.permutation(len(X_train))
+     X_train = X_train[p]
+     y_train = y_train[p]
+     return X_train, y_train
+
+   def fit(self, embeddings: np.ndarray, labels: list[bool]) -> None:
+     """Fit the model to the provided embeddings and labels."""
+     label_set = set(labels)
+     if len(label_set) < 2:
+       dim = embeddings.shape[1]
+       random_vector = np.random.randn(dim).astype(np.float32)
+       random_vector /= np.linalg.norm(random_vector)
+       embeddings = np.vstack([embeddings, random_vector])
+       labels.append(False if True in label_set else True)
+
+     if len(labels) != len(embeddings):
+       raise ValueError(
+         f'Length of embeddings ({len(embeddings)}) must match length of labels ({len(labels)})')
+     X_train, y_train = self._setup_training(embeddings, labels)
+     self._model.fit(X_train, y_train)
+     self._metrics, self._threshold = self._compute_metrics(embeddings, labels)
+
+   def _compute_metrics(self, embeddings: np.ndarray,
+                        labels: list[bool]) -> tuple[Optional[ConceptMetrics], float]:
+     """Return the concept metrics."""
+     labels_np = np.array(labels)
+     n_splits = min(len(labels_np), MAX_NUM_CROSS_VAL_MODELS)
+     fold = KFold(n_splits, shuffle=True, random_state=42)
+
+     def _fit_and_score(model: LogisticRegression, X_train: np.ndarray, y_train: np.ndarray,
+                        X_test: np.ndarray, y_test: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+       if len(set(y_train)) < 2:
+         return np.array([]), np.array([])
+       model.fit(X_train, y_train)
+       y_pred = model.predict_proba(X_test)[:, 1]
+       return y_test, y_pred
+
+     # Compute the metrics for each validation fold in parallel.
+     jobs: list[Callable] = []
+     for (train_index, test_index) in fold.split(embeddings):
+       X_train, y_train = embeddings[train_index], labels_np[train_index]
+       X_train, y_train = self._setup_training(X_train, y_train)
+       X_test, y_test = embeddings[test_index], labels_np[test_index]
+       model = clone(self._model)
+       jobs.append(delayed(_fit_and_score)(model, X_train, y_train, X_test, y_test))
+     results = Parallel(n_jobs=-1)(jobs)
+
+     y_test = np.concatenate([y_test for y_test, _ in results], axis=0)
+     y_pred = np.concatenate([y_pred for _, y_pred in results], axis=0)
+     if len(set(y_test)) < 2:
+       return None, 0.5
+     roc_auc_val = roc_auc_score(y_test, y_pred)
+     precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
+     numerator = (1 + F_BETA_WEIGHT**2) * precision * recall
+     denom = (F_BETA_WEIGHT**2 * precision) + recall
+     f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom != 0))
+     max_f1: float = np.max(f1_scores)
+     max_f1_index = np.argmax(f1_scores)
+     max_f1_thresh: float = thresholds[max_f1_index]
+     max_f1_prec: float = precision[max_f1_index]
+     max_f1_recall: float = recall[max_f1_index]
+     metrics = ConceptMetrics(
+       f1=max_f1,
+       precision=max_f1_prec,
+       recall=max_f1_recall,
+       roc_auc=float(roc_auc_val),
+       overall=_get_overall_score(max_f1))
+     return metrics, max_f1_thresh
+
+
+ def draft_examples(concept: Concept, draft: DraftId) -> dict[str, Example]:
+   """Get the examples in the provided draft by overriding the main draft."""
+   draft_examples: dict[str, dict[str, Example]] = {}
+   for id, example in concept.data.items():
+     draft_examples.setdefault(example.draft or DRAFT_MAIN, {})[example.id] = example
+
+   if draft == DRAFT_MAIN:
+     return draft_examples.get(DRAFT_MAIN, {})
+
+   if draft not in draft_examples:
+     raise ValueError(
+       f'Draft {draft} not found in concept. Found drafts: {list(draft_examples.keys())}')
+
+   # Map the text of the draft to its id so we can dedupe with main.
+   draft_text_ids = {example.text: id for id, example in draft_examples[draft].items()}
+
+   # Write each of the examples from main to the draft examples only if the text does not appear
+   # in the draft.
+   for id, example in draft_examples[DRAFT_MAIN].items():
+     if example.text not in draft_text_ids:
+       draft_examples[draft][id] = example
+
+   return draft_examples[draft]
+
+
+ @dataclasses.dataclass
+ class ConceptModel:
+   """A concept model. Stores all concept model drafts and manages syncing."""
+   # The concept that this model is for.
+   namespace: str
+   concept_name: str
+
+   # The name of the embedding for this model.
+   embedding_name: str
+   version: int = 0
+
+   batch_size = 4096
+
+   # The following fields are excluded from JSON serialization, but still pickle-able.
+   # Maps a concept id to the embeddings.
+   _embeddings: dict[str, np.ndarray] = dataclasses.field(default_factory=dict)
+   _logistic_models: dict[DraftId, LogisticEmbeddingModel] = dataclasses.field(default_factory=dict)
+
+   def get_metrics(self) -> Optional[ConceptMetrics]:
+     """Return the metrics for this model."""
+     return self._get_logistic_model(DRAFT_MAIN)._metrics
+
+   def score_embeddings(self, draft: DraftId, embeddings: np.ndarray) -> np.ndarray:
+     """Get the scores for the provided embeddings."""
+     return self._get_logistic_model(draft).score_embeddings(embeddings)
+
+   def coef(self, draft: DraftId) -> np.ndarray:
+     """Get the coefficients of the underlying ML model."""
+     return self._get_logistic_model(draft)._model.coef_.reshape(-1)
+
+   def _get_logistic_model(self, draft: DraftId) -> LogisticEmbeddingModel:
+     """Get the logistic model for the provided draft."""
+     if draft not in self._logistic_models:
+       self._logistic_models[draft] = LogisticEmbeddingModel()
+     return self._logistic_models[draft]
+
+   def sync(self, concept: Concept) -> bool:
+     """Update the model with the latest labeled concept data."""
+     if concept.version == self.version:
+       # The model is up to date.
+       return False
+
+     concept_path = (f'{self.namespace}/{self.concept_name}/'
+                     f'{self.embedding_name}')
+     with DebugTimer(f'Computing embeddings for "{concept_path}"'):
+       self._compute_embeddings(concept)
+
+     # Fit each of the drafts, sorted by draft name for deterministic behavior.
+     for draft in concept.drafts():
+       examples = draft_examples(concept, draft)
+       embeddings = np.array([self._embeddings[id] for id in examples.keys()])
+       labels = [example.label for example in examples.values()]
+       model = self._get_logistic_model(draft)
+       with DebugTimer(f'Fitting model for "{concept_path}"'):
+         model.fit(embeddings, labels)
+
+     # Synchronize the model version with the concept version.
+     self.version = concept.version
+
+     return True
+
+   def _compute_embeddings(self, concept: Concept) -> None:
+     signal_cls = get_signal_cls(self.embedding_name)
+     if not signal_cls:
+       raise ValueError(f'Embedding signal "{self.embedding_name}" not found in the registry.')
+     embedding_signal = signal_cls()
+     if not isinstance(embedding_signal, TextEmbeddingSignal):
+       raise ValueError(f'Only text embedding signals are currently supported for concepts. '
+                        f'"{self.embedding_name}" is a {type(embedding_signal)}.')
+
+     embed_fn = get_embed_fn(self.embedding_name, split=False)
+     concept_embeddings: dict[str, np.ndarray] = {}
+
+     examples = concept.data.items()
+     if not examples:
+       raise ValueError(f'Cannot sync concept "{concept.concept_name}". It has no examples.')
+
+     # Compute the embeddings for the examples with cache miss.
+     texts_of_missing_embeddings: dict[str, str] = {}
+     for id, example in examples:
+       if id in self._embeddings:
+         # Cache hit.
+         concept_embeddings[id] = self._embeddings[id]
+       else:
+         # Cache miss.
+         # TODO(smilkov): Support images.
+         texts_of_missing_embeddings[id] = example.text or ''
+
+     missing_ids = texts_of_missing_embeddings.keys()
+     missing_embeddings = embed_fn(list(texts_of_missing_embeddings.values()))
+
+     for id, (embedding,) in zip(missing_ids, missing_embeddings):
+       concept_embeddings[id] = embedding['vector'] / np.linalg.norm(embedding['vector'])
+     self._embeddings = concept_embeddings
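
The threshold selection in _compute_metrics maximizes the F-beta score along the precision-recall curve. A standalone numpy sketch of that arithmetic with made-up precision/recall values:

import numpy as np

precision = np.array([0.9, 0.8, 0.7])
recall = np.array([0.5, 0.7, 0.9])
beta = 0.5  # F_BETA_WEIGHT: precision counts twice as much as recall.
numerator = (1 + beta**2) * precision * recall
denom = (beta**2 * precision) + recall
f_beta = np.divide(numerator, denom, out=np.zeros_like(denom), where=denom != 0)
print(f_beta.round(3))         # [0.776 0.778 0.733]
print(int(np.argmax(f_beta)))  # 1: the 0.8-precision / 0.7-recall point wins
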
lilac/concepts/db_concept.py ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """The concept database."""
2
+
3
+ import abc
4
+ import glob
5
+ import json
6
+ import os
7
+ import pathlib
8
+ import pickle
9
+ import shutil
10
+ import threading
11
+
12
+ # NOTE: We have to import the module for uuid so it can be mocked.
13
+ import uuid
14
+ from importlib import resources
15
+ from typing import Any, List, Optional, Union, cast
16
+
17
+ from pydantic import BaseModel
18
+ from typing_extensions import override
19
+
20
+ from ..auth import ConceptAuthorizationException, UserInfo
21
+ from ..env import data_path, env
22
+ from ..schema import SignalInputType
23
+ from ..signal import get_signal_cls
24
+ from ..utils import delete_file, file_exists, get_lilac_cache_dir, open_file
25
+ from .concept import DRAFT_MAIN, Concept, ConceptModel, ConceptType, DraftId, Example, ExampleIn
26
+
27
+ CONCEPTS_DIR = 'concept'
28
+ CONCEPT_JSON_FILENAME = 'concept.json'
29
+ # Under 'lilac' package.
30
+ LILAC_CONCEPTS_DIR = 'concepts'
31
+
32
+
33
+ class ConceptNamespaceACL(BaseModel):
34
+ """The access control list for a namespace."""
35
+ # Whether the current user can read concepts in the namespace.
36
+ read: bool
37
+ # Whether the current user can add concepts to the namespace.
38
+ write: bool
39
+
40
+
41
+ class ConceptACL(BaseModel):
42
+ """The access control list for an individual concept."""
43
+ # Whether the current user can read the concept.
44
+ read: bool
45
+ # Whether the current user can edit the concept, including adding examples or deleting the
46
+ # concept.
47
+ write: bool
48
+
49
+
50
+ class ConceptInfo(BaseModel):
51
+ """Information about a concept."""
52
+ namespace: str
53
+ name: str
54
+ description: Optional[str] = None
55
+ type: ConceptType
56
+ drafts: list[DraftId]
57
+ tags: list[str] = []
58
+
59
+ acls: ConceptACL
60
+
61
+
62
+ class ConceptUpdate(BaseModel):
63
+ """An update to a concept."""
64
+ # List of examples to be inserted.
65
+ insert: Optional[list[ExampleIn]] = []
66
+
67
+ # List of examples to be updated.
68
+ update: Optional[list[Example]] = []
69
+
70
+ # The ids of the examples to be removed.
71
+ remove: Optional[list[str]] = []
72
+
73
+
74
+ class ConceptDB(abc.ABC):
75
+ """Interface for the concept database."""
76
+
77
+ @abc.abstractmethod
78
+ def list(self, user: Optional[UserInfo] = None) -> list[ConceptInfo]:
79
+ """List all the concepts."""
80
+ pass
81
+
82
+ @abc.abstractmethod
83
+ def namespace_acls(self, namespace: str, user: Optional[UserInfo] = None) -> ConceptNamespaceACL:
84
+ """Return the ACL for a namespace."""
85
+ pass
86
+
87
+ @abc.abstractmethod
88
+ def concept_acls(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> ConceptACL:
89
+ """Return the ACL for a concept."""
90
+ pass
91
+
92
+ @abc.abstractmethod
93
+ def get(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> Optional[Concept]:
94
+ """Return a concept or None if there isn't one."""
95
+ pass
96
+
97
+ @abc.abstractmethod
98
+ def create(self,
99
+ namespace: str,
100
+ name: str,
101
+ type: Union[ConceptType, str],
102
+ description: Optional[str] = None,
103
+ user: Optional[UserInfo] = None) -> Concept:
104
+ """Create a concept.
105
+
106
+ Args:
107
+ namespace: The namespace of the concept.
108
+ name: The name of the concept.
109
+ type: The type of the concept.
110
+ description: The description of the concept.
111
+ user: The user creating the concept, if authentication is enabled.
112
+ """
113
+ pass
114
+
115
+ @abc.abstractmethod
116
+ def edit(self,
117
+ namespace: str,
118
+ name: str,
119
+ change: ConceptUpdate,
120
+ user: Optional[UserInfo] = None) -> Concept:
121
+ """Edit a concept. If the concept doesn't exist, throw an error."""
122
+ pass
123
+
124
+ @abc.abstractmethod
125
+ def remove(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> None:
126
+ """Remove a concept."""
127
+ pass
128
+
129
+ @abc.abstractmethod
130
+ def merge_draft(self,
131
+ namespace: str,
132
+ name: str,
133
+ draft: DraftId,
134
+ user: Optional[UserInfo] = None) -> Concept:
135
+ """Merge a draft concept.."""
136
+ pass
137
+
138
+
139
+ class ConceptModelDB(abc.ABC):
140
+ """Interface for the concept model database."""
141
+
142
+ _concept_db: ConceptDB
143
+ _sync_lock = threading.Lock()
144
+
145
+ def __init__(self, concept_db: ConceptDB) -> None:
146
+ self._concept_db = concept_db
147
+
148
+ @abc.abstractmethod
149
+ def create(self,
150
+ namespace: str,
151
+ concept_name: str,
152
+ embedding_name: str,
153
+ user: Optional[UserInfo] = None) -> ConceptModel:
154
+ """Create the concept model."""
155
+ pass
156
+
157
+ @abc.abstractmethod
158
+ def get(self,
159
+ namespace: str,
160
+ concept_name: str,
161
+ embedding_name: str,
162
+ user: Optional[UserInfo] = None) -> Optional[ConceptModel]:
163
+ """Get the model associated with the provided concept the embedding.
164
+
165
+ Returns None if the model does not exist.
166
+ """
167
+ pass
168
+
169
+ @abc.abstractmethod
170
+ def _save(self, model: ConceptModel) -> None:
171
+ """Save the concept model."""
172
+ pass
173
+
174
+ def in_sync(self, model: ConceptModel, user: Optional[UserInfo] = None) -> bool:
175
+ """Return True if the model is up to date with the concept."""
176
+ concept = self._concept_db.get(model.namespace, model.concept_name, user=user)
177
+ if not concept:
178
+ raise ValueError(f'Concept "{model.namespace}/{model.concept_name}" does not exist.')
179
+ return concept.version == model.version
180
+
181
+ def sync(self,
182
+ namespace: str,
183
+ concept_name: str,
184
+ embedding_name: str,
185
+ user: Optional[UserInfo] = None,
186
+ create: bool = False) -> ConceptModel:
187
+ """Sync the concept model. Returns true if the model was updated."""
188
+ with self._sync_lock:
189
+ model = self.get(namespace, concept_name, embedding_name, user=user)
190
+ if not model:
191
+ if create:
192
+ model = self.create(namespace, concept_name, embedding_name, user=user)
193
+ else:
194
+ raise ValueError(f'Model "{namespace}/{concept_name}/{embedding_name}" does not exist.')
195
+
196
+ concept = self._concept_db.get(model.namespace, model.concept_name, user=user)
197
+ if not concept:
198
+ raise ValueError(f'Concept "{model.namespace}/{model.concept_name}" does not exist.')
199
+ model_updated = model.sync(concept)
200
+ if model_updated:
201
+ self._save(model)
202
+ return model
203
+
204
+ @abc.abstractmethod
205
+ def remove(self, namespace: str, concept_name: str, embedding_name: str) -> None:
206
+ """Remove the model of a concept."""
207
+ pass
208
+
209
+ @abc.abstractmethod
210
+ def get_models(self, namespace: str, concept_name: str) -> list[ConceptModel]:
211
+ """List all the models associated with a concept."""
212
+ pass
213
+
214
+
215
+ class DiskConceptModelDB(ConceptModelDB):
216
+ """Interface for the concept model database."""
217
+
218
+ def __init__(self,
219
+ concept_db: ConceptDB,
220
+ base_dir: Optional[Union[str, pathlib.Path]] = None) -> None:
221
+ super().__init__(concept_db)
222
+ self._base_dir = base_dir
223
+
224
+ def _get_base_dir(self) -> str:
225
+ return str(self._base_dir) if self._base_dir else data_path()
226
+
227
+ @override
228
+ def create(self,
229
+ namespace: str,
230
+ concept_name: str,
231
+ embedding_name: str,
232
+ user: Optional[UserInfo] = None) -> ConceptModel:
233
+ if self.get(namespace, concept_name, embedding_name, user=user):
234
+ raise ValueError('Concept model already exists.')
235
+ concept = self._concept_db.get(namespace, concept_name, user=user)
236
+ if not concept:
237
+ raise ValueError(f'Concept "{namespace}/{concept_name}" does not exist.')
238
+ model = ConceptModel(
239
+ namespace=namespace, concept_name=concept_name, embedding_name=embedding_name)
240
+ self._save(model)
241
+ return model
242
+
243
+ @override
244
+ def get(self,
245
+ namespace: str,
246
+ concept_name: str,
247
+ embedding_name: str,
248
+ user: Optional[UserInfo] = None) -> Optional[ConceptModel]:
249
+ # Make sure the concept exists.
250
+ concept = self._concept_db.get(namespace, concept_name, user=user)
251
+ if not concept:
252
+ raise ValueError(f'Concept "{namespace}/{concept_name}" does not exist.')
253
+
254
+ # Make sure that the embedding signal exists.
255
+ if not get_signal_cls(embedding_name):
256
+ raise ValueError(f'Embedding signal "{embedding_name}" not found in the registry.')
257
+
258
+ concept_model_path = _concept_model_path(self._get_base_dir(), namespace, concept_name,
259
+ embedding_name)
260
+ if not file_exists(concept_model_path):
261
+ return None
262
+
263
+ with open_file(concept_model_path, 'rb') as f:
264
+ return pickle.load(f)
265
+
+   def _save(self, model: ConceptModel) -> None:
+     """Save the concept model."""
+     concept_model_path = _concept_model_path(self._get_base_dir(), model.namespace,
+                                              model.concept_name, model.embedding_name)
+     with open_file(concept_model_path, 'wb') as f:
+       pickle.dump(model, f)
+
+   @override
+   def remove(self,
+              namespace: str,
+              concept_name: str,
+              embedding_name: str,
+              user: Optional[UserInfo] = None) -> None:
+     concept_model_path = _concept_model_path(self._get_base_dir(), namespace, concept_name,
+                                              embedding_name)
+
+     if not file_exists(concept_model_path):
+       raise ValueError(f'Concept model {namespace}/{concept_name}/{embedding_name} does not exist.')
+
+     delete_file(concept_model_path)
+
+   @override
+   def get_models(self,
+                  namespace: str,
+                  concept_name: str,
+                  user: Optional[UserInfo] = None) -> list[ConceptModel]:
+     """List all the models associated with a concept."""
+     model_files = glob.iglob(
+       os.path.join(_concept_cache_dir(self._get_base_dir(), namespace, concept_name), '*.pkl'))
+     models: list[ConceptModel] = []
+     for model_file in model_files:
+       embedding_name = os.path.basename(model_file)[:-len('.pkl')]
+       model = self.get(namespace, concept_name, embedding_name, user=user)
+       if model:
+         models.append(model)
+     return models
+
+
+ def get_concept_output_dir(base_dir: str, namespace: str, name: str) -> str:
+   """Return the output directory for a given concept."""
+   if namespace == 'lilac':
+     # Lilac concepts are stored in the resources directory and shipped with the pip package.
+     return str(resources.files('lilac').joinpath(os.path.join(LILAC_CONCEPTS_DIR, name)))
+
+   return os.path.join(base_dir, CONCEPTS_DIR, namespace, name)
+
+
+ def _concept_json_path(base_dir: str, namespace: str, name: str) -> str:
+   return os.path.join(get_concept_output_dir(base_dir, namespace, name), CONCEPT_JSON_FILENAME)
+
+
+ def _concept_cache_dir(base_dir: str, namespace: str, concept_name: str) -> str:
+   return os.path.join(get_lilac_cache_dir(base_dir), CONCEPTS_DIR, namespace, concept_name)
+
+
+ def _concept_model_path(base_dir: str, namespace: str, concept_name: str,
+                         embedding_name: str) -> str:
+   return os.path.join(
+     _concept_cache_dir(base_dir, namespace, concept_name), f'{embedding_name}.pkl')
+
+
328
+ class DiskConceptDB(ConceptDB):
+   """A concept database backed by files on disk."""
+
+   def __init__(self, base_dir: Optional[Union[str, pathlib.Path]] = None) -> None:
+     self._base_dir = base_dir
+
+   def _get_base_dir(self) -> str:
+     return str(self._base_dir) if self._base_dir else data_path()
+
+   @override
+   def namespace_acls(self, namespace: str, user: Optional[UserInfo] = None) -> ConceptNamespaceACL:
+     if not env('LILAC_AUTH_ENABLED'):
+       return ConceptNamespaceACL(read=True, write=True)
+
+     if namespace == 'lilac':
+       return ConceptNamespaceACL(read=True, write=False)
+     if user and user.id == namespace:
+       return ConceptNamespaceACL(read=True, write=True)
+
+     return ConceptNamespaceACL(read=False, write=False)
+
+   @override
+   def concept_acls(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> ConceptACL:
+     namespace_acls = self.namespace_acls(namespace, user=user)
+     # Concept ACLs inherit from the namespace ACL; we currently don't have concept-specific ACLs.
+     return ConceptACL(read=namespace_acls.read, write=namespace_acls.write)
+
+   @override
+   def list(self, user: Optional[UserInfo] = None) -> list[ConceptInfo]:
+     namespaces: Optional[list[str]] = None
+     if env('LILAC_AUTH_ENABLED'):
+       namespaces = ['lilac']
+       if user:
+         namespaces += [user.id]
+
+     concept_infos: list[ConceptInfo] = []
+
+     namespace_concept_dirs: list[tuple[Optional[str], str]] = [
+       # None = Read the namespace from the directory.
+       (None, os.path.join(self._get_base_dir(), CONCEPTS_DIR)),
+       # Read lilac concepts from the resources directory.
+       ('lilac', str(resources.files('lilac').joinpath(LILAC_CONCEPTS_DIR)))
+     ]
+
+     for (default_namespace, concept_dir) in namespace_concept_dirs:
+       # Read the concepts from the data dir and return a ConceptInfo containing the namespace and
+       # name.
+       for root, _, files in os.walk(concept_dir):
+         for file in files:
+           if file == CONCEPT_JSON_FILENAME:
+             namespace, name = root.split('/')[-2:]
+             if default_namespace is not None:
+               namespace = default_namespace
+             if namespaces and namespace not in namespaces:
+               # Skip concepts outside the allowed namespaces, if a filter is provided.
+               continue
+
+             concept = cast(Concept, self.get(namespace, name, user=user))
+             concept_infos.append(
+               _info_from_concept(concept, self.concept_acls(namespace, name, user=user)))
+
+     return concept_infos
391
+
+   @override
+   def get(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> Optional[Concept]:
+     # If the user does not have read access to the concept, raise an authorization error.
+     acls = self.concept_acls(namespace, name, user=user)
+     if not acls.read:
+       raise ConceptAuthorizationException(
+         f'Concept "{namespace}/{name}" does not exist or user does not have access.')
+
+     concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
+     if not file_exists(concept_json_path):
+       return None
+
+     with open_file(concept_json_path) as f:
+       obj: dict[str, Any] = json.load(f)
+       if 'namespace' not in obj:
+         obj['namespace'] = namespace
+       return Concept.parse_obj(obj)
+
+   @override
+   def create(self,
+              namespace: str,
+              name: str,
+              type: Union[ConceptType, str] = ConceptType.TEXT,
+              description: Optional[str] = None,
+              user: Optional[UserInfo] = None) -> Concept:
+     """Create a concept."""
+     # If the user does not have write access to the concept namespace, raise.
+     acls = self.namespace_acls(namespace, user=user)
+     if not acls.write:
+       raise ConceptAuthorizationException(
+         f'Concept namespace "{namespace}" does not exist or user does not have access.')
+
+     concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
+     if file_exists(concept_json_path):
+       raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" already exists.')
+
+     if isinstance(type, str):
+       type = ConceptType(type)
+     concept = Concept(
+       namespace=namespace, concept_name=name, type=type, data={}, description=description)
+     self._save(concept)
+     return concept
+
+   def _validate_examples(self, examples: list[Union[ExampleIn, Example]],
+                          type: ConceptType) -> None:
+     for example in examples:
+       inferred_type = 'text' if example.text else 'unknown'
+       if inferred_type != type:
+         raise ValueError(f'Example type "{inferred_type}" does not match concept type "{type}".')
441
+
+   @override
+   def edit(self,
+            namespace: str,
+            name: str,
+            change: ConceptUpdate,
+            user: Optional[UserInfo] = None) -> Concept:
+     # If the user does not have write access to the concept, raise an authorization error.
+     acls = self.concept_acls(namespace, name, user=user)
+     if not acls.write:
+       raise ConceptAuthorizationException(
+         f'Concept "{namespace}/{name}" does not exist or user does not have access.')
+
+     concept_json_path = _concept_json_path(self._get_base_dir(), namespace, name)
+
+     if not file_exists(concept_json_path):
+       raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" does not exist. '
+                        'Please call create() first.')
+
+     inserted_points = change.insert or []
+     updated_points = change.update or []
+     removed_points = change.remove or []
+
+     concept = cast(Concept, self.get(namespace, name, user=user))
+
+     self._validate_examples([*inserted_points, *updated_points], concept.type)
+
+     for remove_example in removed_points:
+       if remove_example not in concept.data:
+         raise ValueError(f'Example with id "{remove_example}" does not exist.')
+       concept.data.pop(remove_example)
+
+     for example in inserted_points:
+       id = uuid.uuid4().hex
+       concept.data[id] = Example(id=id, **example.dict())
+
+     for example in updated_points:
+       if example.id not in concept.data:
+         raise ValueError(f'Example with id "{example.id}" does not exist.')
+
+       # Remove the old example and replace it with the updated copy under the same id.
+       concept.data.pop(example.id)
+       concept.data[example.id] = example.copy()
+
+     concept.version += 1
+
+     self._save(concept)
+
+     return concept
490
+
+   def _save(self, concept: Concept) -> None:
+     concept_json_path = _concept_json_path(self._get_base_dir(), concept.namespace,
+                                            concept.concept_name)
+     with open_file(concept_json_path, 'w') as f:
+       f.write(concept.json(exclude_none=True, indent=2, exclude_defaults=True))
+
+   @override
+   def remove(self, namespace: str, name: str, user: Optional[UserInfo] = None) -> None:
+     # If the user does not have write access to the concept, raise an authorization error.
+     acls = self.concept_acls(namespace, name, user=user)
+     if not acls.write:
+       raise ConceptAuthorizationException(
+         f'Concept "{namespace}/{name}" does not exist or user does not have access.')
+
+     concept_dir = get_concept_output_dir(self._get_base_dir(), namespace, name)
+
+     if not file_exists(concept_dir):
+       raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" does not exist.')
+
+     shutil.rmtree(concept_dir, ignore_errors=True)
+
+   @override
+   def merge_draft(self,
+                   namespace: str,
+                   name: str,
+                   draft: DraftId,
+                   user: Optional[UserInfo] = None) -> Concept:
+     """Merge a draft concept into the main concept."""
+     # If the user does not have write access to the concept, raise an authorization error.
+     acls = self.concept_acls(namespace, name, user=user)
+     if not acls.write:
+       raise ConceptAuthorizationException(
+         f'Concept "{namespace}/{name}" does not exist or user does not have access.')
+
+     concept = self.get(namespace, name, user=user)
+     if not concept:
+       raise ValueError(f'Concept with namespace "{namespace}" and name "{name}" does not exist.')
+
+     if draft == DRAFT_MAIN:
+       return concept
+
+     # Map the text of examples in main so we can remove them if they are duplicates.
+     main_text_ids: dict[Optional[str], str] = {
+       example.text: id for id, example in concept.data.items() if example.draft == DRAFT_MAIN
+     }
+
+     draft_examples: dict[str, Example] = {
+       id: example for id, example in concept.data.items() if example.draft == draft
+     }
+     for example in draft_examples.values():
+       example.draft = DRAFT_MAIN
+       # Remove duplicates in main.
+       main_text_id = main_text_ids.get(example.text)
+       if main_text_id:
+         del concept.data[main_text_id]
+
+     concept.version += 1
+
+     self._save(concept)
+
+     return concept
+
+
+ def _info_from_concept(concept: Concept, acls: ConceptACL) -> ConceptInfo:
+   return ConceptInfo(
+     namespace=concept.namespace,
+     name=concept.concept_name,
+     description=concept.description,
+     type=SignalInputType.TEXT,
+     drafts=concept.drafts(),
+     tags=concept.tags,
+     acls=acls)
+
+
+ # A singleton concept database.
+ DISK_CONCEPT_DB = DiskConceptDB()
+ DISK_CONCEPT_MODEL_DB = DiskConceptModelDB(DISK_CONCEPT_DB)
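
For orientation, here is a minimal usage sketch of the two singletons defined above. It is an illustration, not code from this commit: the import paths and the `ExampleIn(label=..., text=...)` fields are assumptions inferred from this file and from the concept JSON files below, and `gte-small` is assumed to be a registered embedding signal.

    from lilac.concepts.concept import ConceptUpdate, ExampleIn  # assumed import path
    from lilac.concepts.db_concept import DISK_CONCEPT_DB, DISK_CONCEPT_MODEL_DB

    # Create a text concept in a writable (user) namespace.
    concept = DISK_CONCEPT_DB.create('my-user', 'urgency', 'text')

    # Insert a labeled example; edit() bumps the concept version.
    DISK_CONCEPT_DB.edit(
      'my-user', 'urgency',
      ConceptUpdate(insert=[ExampleIn(label=True, text='Please respond ASAP!')]))

    # Build or refresh the per-embedding model; create=True makes it if missing.
    model = DISK_CONCEPT_MODEL_DB.sync('my-user', 'urgency', 'gte-small', create=True)
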
lilac/concepts/legal-termination/concept.json ADDED
@@ -0,0 +1,185 @@
+ {
+   "namespace": "lilac",
+   "concept_name": "legal-termination",
+   "type": "text",
+   "tags": ["legal"],
+   "data": {
+     "731b1338cf1949958c3526c555f88058": {
+       "label": true,
+       "text": "In the event that any provision of this agreement is found to be unenforceable, the remaining provisions shall continue to be valid and binding.",
+       "id": "731b1338cf1949958c3526c555f88058"
+     },
+     "99a20e547e38474dbc24507a1658d0c9": {
+       "label": true,
+       "text": "The parties agree that in the event of a natural disaster or other unforeseen event, both parties will make reasonable efforts to fulfill their obligations under this contract.",
+       "id": "99a20e547e38474dbc24507a1658d0c9"
+     },
+     "3f27b47c526a4c5896a0a100024535c7": {
+       "label": true,
+       "text": "If any party breaches the terms of this agreement, the non-breaching party shall have the right to seek legal remedies.",
+       "id": "3f27b47c526a4c5896a0a100024535c7"
+     },
+     "d403dbb1ab9c4594bc7f7dcb0ad5b333": {
+       "label": true,
+       "text": "This lease agreement shall survive the termination or expiration of the lease term, and continue to be binding upon the parties.",
+       "id": "d403dbb1ab9c4594bc7f7dcb0ad5b333"
+     },
+     "b7deba9f7e80444abe14448f53f45c43": {
+       "label": true,
+       "text": "In the event of a dispute arising from this contract, the parties agree to first attempt to resolve the dispute through mediation before pursuing any legal action.",
+       "id": "b7deba9f7e80444abe14448f53f45c43"
+     },
+     "a82231b490174e62aad733cb0c75024d": {
+       "label": true,
+       "text": "This Agreement may be terminated, and the transactions contemplated hereby may be abandoned, at any time prior to the Effective Time, whether prior to or after the Company Stockholders' Approval:",
+       "id": "a82231b490174e62aad733cb0c75024d"
+     },
+     "160b25dbf14e4759a0065bbd652ce33f": {
+       "label": true,
+       "text": "This Agreement may be terminated and abandoned at any time prior to the Effective Time of the Merger, whether before or after the Company Stockholder Approval:",
+       "id": "160b25dbf14e4759a0065bbd652ce33f"
+     },
+     "8f5f9f96b16441228bb0c9b8a14c4e25": {
+       "label": false,
+       "text": "any jurisdiction, then such provision shall, as to such jurisdiction, be modified or restricted to the extent necessary to make such provision valid, binding and enforceable, or if such provision cannot be so modified or restricted, then such provision shall, as to such jurisdiction, be deemed to be excised from this Agreement; provided, however, that the legality, binding effect and",
+       "id": "8f5f9f96b16441228bb0c9b8a14c4e25"
+     },
+     "87b6c31b04a346b4a3e0da8d2cc5a7ac": {
+       "label": true,
+       "text": "This Agreement shall terminate automatically without any further action by any party hereto upon the earliest to occur of (a) the Effective Time of the Merger, (b) the termination of the Merger Agreement in accordance with its terms and (c) any amendment or other modification of the Merger Agreement that reduces the amount of the Merger Consideration or provides that the Merger Consideration shall",
+       "id": "87b6c31b04a346b4a3e0da8d2cc5a7ac"
+     },
+     "985344f7ecfb41f4a69ba101973221a1": {
+       "label": false,
+       "text": " During the Employment Period, the Corporation shall pay ----------- the Executive a base salary which, as of the commencement of the Employment Period, shall be at an annual rate of Two Hundred Fifty Thousand Dollars ($250,000). The base salary shall be payable in equal periodic installments which are not less frequent than the periodic installments in effect for salaries of other senior",
+       "id": "985344f7ecfb41f4a69ba101973221a1"
+     },
+     "5d53ff48376046fdab41e95c7f4bad54": {
+       "label": true,
+       "text": "This Agreement may be terminated at any time prior to the Closing Date solely:",
+       "id": "5d53ff48376046fdab41e95c7f4bad54"
+     },
+     "bdeb785be2154b21b4eb052466fa9bcb": {
+       "label": true,
+       "text": "(a) This Agreement may be terminated by you by notice to the Company at any time prior to the Closing Date if any of the following has occurred: (i) since the respective dates as of which information is given in the Registration Statement and the Prospectus, any material adverse change or any development involving a prospective material adverse change in or affecting the earnings, busi ness,",
+       "id": "bdeb785be2154b21b4eb052466fa9bcb"
+     },
+     "fe6871e9070441f8a9e4b3db26b077d7": {
+       "label": true,
+       "text": "Section 3(b), this Section 7 and Section 8 of this Agreement shall survive a termination of this Agreement pursuant to (a) or (b) above in this Section 7 until the date that is two years following the date of such termination. Notwithstanding anything else to the contrary contained herein or in the Merger Agreement, if the Effective Time occurs, the representations and warranties contained in",
+       "id": "fe6871e9070441f8a9e4b3db26b077d7"
+     },
+     "bf1a51751d0748e58c344aec8e5fc789": {
+       "label": false,
+       "text": "This Agreement may be executed in one or more counterparts (including counterparts executed and delivered by facsimile, which shall be as counterparts executed and delivered manually), all of which shall be considered one and the same agreement and shall become effective when one or more counterparts have been signed by each of the parties and delivered to the other party, it being understood that",
+       "id": "bf1a51751d0748e58c344aec8e5fc789"
+     },
+     "bc1b2affa6d848fd92d4dee033e30659": {
+       "label": false,
+       "text": "would, in your judgment, make it impracticable or inadvisable to market the Units or to enforce contracts for the sale of the Units, (iii) suspension of trading in securities generally on the New York Stock Exchange, the American Stock Exchange or the Nasdaq National Market or limitation on prices (other than limitations on hours or numbers of days of trading) for securities on any such Exchange,",
+       "id": "bc1b2affa6d848fd92d4dee033e30659"
+     },
+     "67a73d5887f74a91bed190ca8f64b17c": {
+       "label": false,
+       "text": " The authorized capital stock of FM consists of 1,000 shares of Common Stock, no par value each, of which 1,000 shares are issued and outstanding. There are no outstanding or authorized options, warrants, calls, subscriptions, rights (including any preemptive rights or rights of first refusal), agreements or commitments of any character obligating FM to issue any stock or any other Equity",
+       "id": "67a73d5887f74a91bed190ca8f64b17c"
+     },
+     "025b2ca5147849c8a921d9aaa31cd9cd": {
+       "label": false,
+       "text": "Taxes that are being contested in good faith by appropriate proceedings, provided that Holdings, the Borrower or Restricted Subsidiary, as the case may be, has set aside on its books adequate reserves therefor in accordance with GAAP.",
+       "id": "025b2ca5147849c8a921d9aaa31cd9cd"
+     },
+     "76acff27f13743f4822a094c707d8b75": {
+       "label": false,
+       "text": "have been a suspension or material limitation in trading in the Company\u2019s common stock on the New York Stock Exchange; (iii) there shall have been a general moratorium on commercial banking activities declared by either federal or New York state authorities or a material disruption in commercial banking or securities settlement or clearance services in the United States; (iv) there shall have been",
+       "id": "76acff27f13743f4822a094c707d8b75"
+     },
+     "b11a95c0eb564445b1a473e90622f861": {
+       "label": true,
+       "text": "10.1. This Agreement will terminate:",
+       "id": "b11a95c0eb564445b1a473e90622f861"
+     },
+     "d536428a02084d94ba18d412851cb913": {
+       "label": false,
+       "text": "may not be limited to his Base Salary and that the Employee may receive an annual bonus in the amount, if any, determined annually by the Employer. The Employee shall also participate in employee compensation and benefit plans available generally to executives of the Employer (including, without limitation, any tax-qualified profit sharing plan, nonqualified profit sharing plan, life insurance",
+       "id": "d536428a02084d94ba18d412851cb913"
+     },
+     "368bb1d9c7d0419d9ca58f28565eeb2e": {
+       "label": true,
+       "text": "This Agreement may be terminated in the absolute discretion of the Representatives, by notice to the Bank, if after execution and delivery of this Agreement and prior to the Closing Date (i) there has been, since the date of this Agreement or since the respective dates as of which information is given in the Registration Statement, the Time of Sale Information or the Prospectus, any material",
+       "id": "368bb1d9c7d0419d9ca58f28565eeb2e"
+     },
+     "1b5fd7b037a84404bf85c858953c79e8": {
+       "label": true,
+       "text": "however, (i) the right to terminate this Agreement under this Section 8 shall not be available to such Buyer if the failure of the transactions contemplated by this Agreement to have been consummated by such date is the result of such Buyer\u2019s breach of this Agreement and (ii) the abandonment of the sale and purchase of the Notes and the Warrants shall be applicable only to such Buyer providing",
+       "id": "1b5fd7b037a84404bf85c858953c79e8"
+     },
+     "6d5a23d2663f457cab96df03d9dc8ab7": {
+       "label": true,
+       "text": "In addition, any Stockholder may terminate this Agreement if Weatherford, WEUS, or the Company breaches any representation, warranty, covenant or other agreement contained in the Merger Agreement that (A) would give rise to the failure of Weatherford, WEUS, or the Company to satisfy any condition set forth in Section 8.2(a) thereof, and (B) cannot be or has not been cured within 45 days after the",
+       "id": "6d5a23d2663f457cab96df03d9dc8ab7"
+     },
+     "4a8223a48f83491b9b3eafd7ad37baf9": {
+       "label": true,
+       "text": "The obligations of the Underwriters hereunder may be terminated by the Representatives, in their absolute discretion, by notice given to and received by the Depositor or the Bank prior to delivery of and payment for the Notes if, prior to that time, any of the events described in Section 5(v) shall have occurred or any of the other conditions described in Section 5 shall not be satisfied.",
+       "id": "4a8223a48f83491b9b3eafd7ad37baf9"
+     },
+     "fbb152eae00c440bb2d0df0fbd82c262": {
+       "label": true,
+       "text": "Either of the parties hereto may terminate this Agreement by giving to the other party a notice in writing specifying the date of such termination, which shall be not less than 60 days after the date of receipt of such notice. In the event such notice is given by the Customer, it shall be accompanied by a copy of a resolution of the Board of Directors of the Customer, certified by its Secretary,",
+       "id": "fbb152eae00c440bb2d0df0fbd82c262"
+     },
+     "1d21880f426c45ada31409d22815cc87": {
+       "label": false,
+       "text": "Prospectus or the Final Prospectus (exclusive of any amendment or supplement thereof or thereto after the date hereof).",
+       "id": "1d21880f426c45ada31409d22815cc87"
+     },
+     "795cac72a3504740bc7401a84fc6fba4": {
+       "label": true,
+       "text": "This Agreement may be terminated by the Customer or the Bank by giving ninety (90) days written notice to the other, provided that such notice to the Bank shall specify the names of the persons to whom the Bank shall deliver the Assets in the Accounts. If notice of termination is given by the Bank, the Customer shall, within ninety (90) days following receipt of the notice, deliver to the Bank Instructions specifying the names of the persons to whom the Bank shall deliver the Assets.",
+       "id": "795cac72a3504740bc7401a84fc6fba4"
+     },
+     "3b82e6eba4894ac0b9f7f12aba2aab2e": {
+       "label": false,
+       "text": "of this Agreement, or to Authorized Persons, or may continue to hold the Assets until Instructions are provided to the Bank.",
+       "id": "3b82e6eba4894ac0b9f7f12aba2aab2e"
+     },
+     "da16bd0e9dce4d4c87400eab61b9b14c": {
+       "label": false,
+       "text": "into force of the Convention. In such event, the Convention shall cease to have effect:",
+       "id": "da16bd0e9dce4d4c87400eab61b9b14c"
+     },
+     "02cc328109984db094b0b02caec0d575": {
+       "label": true,
+       "text": "Survival. The rights and obligations contained in Sections 3 (\u201cOwnership of Work Product\u201d), 4 (\u201cOther Rights\u201d), 5 (\u201cLicense to Preexisting IP\u201d), 6 (\u201cRepresentations and Warranties\u201d), 8 (\u201cConfidential Information\u201d) and 12 (\u201cNon-solicitation\u201d) will survive any termination or expiration of this Agreement. ",
+       "id": "02cc328109984db094b0b02caec0d575"
+     },
+     "f8edf65d9acf4ff4a04459a3492ac426": {
+       "label": false,
+       "text": "Severability. Should any provisions of this Agreement be held by a court of law to be illegal, invalid or unenforceable, the legality, validity and enforceability of the remaining provisions of this Agreement will not be affected or impaired thereby. ",
+       "id": "f8edf65d9acf4ff4a04459a3492ac426"
+     },
+     "5a8517f359494ead8c11b6aff440480d": {
+       "label": false,
+       "text": "\u0095\tCommitted to deliver the best, we leave no room for customer grievances.\r\n\r\n",
+       "id": "5a8517f359494ead8c11b6aff440480d"
+     },
+     "a47d327d0f6e46fc861f86b2e0e54a2f": {
+       "label": false,
+       "text": "the due diligence and using our agreement creator to close the deal successfully. \r",
+       "id": "a47d327d0f6e46fc861f86b2e0e54a2f"
+     },
+     "811d0dcc92e14c5c881e903c7d4ff7b6": {
+       "label": false,
+       "text": "in accordance with customary procedures in the relevant markets, but in any event for a settlement period no longer than three months following the date of such commitment.",
+       "id": "811d0dcc92e14c5c881e903c7d4ff7b6"
+     },
+     "907f92e0d5704418944a559a4bfb96c7": {
+       "label": false,
+       "text": "terminate in accordance with Section 2 of the Investors\u2019 Rights Agreement.",
+       "id": "907f92e0d5704418944a559a4bfb96c7"
+     }
+   },
+   "version": 33,
+   "description": "Termination or survival clause in a legal document"
+ }
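
Each concept JSON in this commit shares the schema visible above: top-level metadata plus a `data` map of `{id: {label, text, id}}` examples and a `version` counter that `DiskConceptDB.edit` increments. As an illustrative sketch (not part of the commit), the label balance of a concept file can be inspected with the stdlib alone:

    import json
    from collections import Counter

    # Path assumes the repository layout shown in this commit.
    with open('lilac/concepts/legal-termination/concept.json') as f:
      concept = json.load(f)

    # Count positive vs. negative examples, e.g. Counter({True: 19, False: 16}).
    counts = Counter(example['label'] for example in concept['data'].values())
    print(concept['concept_name'], counts)
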
lilac/concepts/negative-sentiment/concept.json ADDED
@@ -0,0 +1,634 @@
+ {
+   "namespace": "lilac",
+   "concept_name": "negative-sentiment",
+   "type": "text",
+   "data": {
+     "0": {
+       "label": true,
+       "text": "Starting To Be Annoyed By Becky...: I'm not sure why I keep reading these books, but I guess it's because I've read the first two so I'll keep reading the rest of the books. In the first book, I really found it amusing. I was a little annoyed by the fact that Becky couldn't stop spending, but then again that's why she is called a Shopaholic. In the second book, I felt more of the same it was just magniifed more. Now in the third book, I'm just down right annoyed by Becky Bloomwood. In this book, she wasn't going on crazy shopping sprees, just planning two different weddings because she was afraid to tell each person and because I feel she's really selfish. Still, I read the book because I wanted to see how she could get herself out of another situation. I will say that I love her friends Suze and Danny, her client Laurel and her husband Luke. Maybe that's why I keep reading. I will read the next book, but I'm sure I'll be just as annoyed when I'm done.",
+       "id": "0"
+     },
+     "1": {
+       "label": true,
+       "text": "the cover is fine - the pool is horrible: The entire pool was horrible. The cover was fine once we got it on, but we finally got rid of the pool after 2 weeks because it was so hard to set up and keep clean.",
+       "id": "1"
+     },
+     "2": {
+       "label": false,
+       "text": "Good album, not their best.: This album is probably the most marketable and radio friendly of all of dashboard's albums. For the peripheral listener it may be the right one to get to introduce you to this band. But as a Dashboard fan of 5 or so years I truly hope they return to their original sound for their next work. Not for the listen-ability but for the show. To this day the fans react best to the songs from \"Places\" or \"A Mark, A Mission.\" I recommend this album to everyone but I also recommend any and all of their other work.",
+       "id": "2"
+     },
+     "3": {
+       "label": true,
+       "text": "This is a horror novel, right?: Never one to pass up any vampire novel, I purchased Sips because the description seemed interesting. Vampires, Marquis de Sade, fetishism, yada yada yada. If this is a comedy, I give it 4 stars; however, I'll give it 1 star as a horror novel. Sade was rather boring; I would think a character as intense and multi-faceted as the Marquis de Sade would make for a more interesting vampire. The writing style isn't too bad, but overall I found the characters to be mildly amusing at best. The plot was thin, the end was anti-climactic, and the vampires were not very frightening. The book had little suspense, and it leaves a mile-wide opening for a sequel at the conclusion. I would, however, like to see something more of the vampire mutants lurking in the graveyard. They were the most riveting of any of the characters.",
+       "id": "3"
+     },
+     "4": {
+       "label": false,
+       "text": "Superb mix of global non secular musical denominations: I first heard Ms. Pook's music on the \"Eyes Wide Shut\" soundtrack (the masquerade ball scene) and was blown away; if ever there was a necessity for music to permeate a scene in a film this was it. She incorporates a blend of the traditional songs from faiths across continents and mixes them, for lack of a better comparison than similar to your quintessential raver d.j. (though these are better and definitively more original :) \"Oppenheimer\" is my favorite, and if you let the last track run for a few minutes a portion of the song will play once more. I can't wait to hear more of her stuff - these hymns are awesome.",
+       "id": "4"
+     },
+     "5": {
+       "label": false,
+       "text": "A moving and suspenseful story!: For anyone familiar with the occult, this book is likely to raise hairs on the back of your neck as you read. Even if you're not, the storyline is suspenseful and fascinating, and the characters evoke great sympathy and admiration. An excellent read.",
+       "id": "5"
+     },
+     "6": {
+       "label": false,
+       "text": "Simple & Easy to Use - A Practical approach to eating out: This guide is extremely to use. It provides sample menus that you'd see at Chinese, Indian and Thai restaurants. Then you are provided with descriptions of each dish and how it is prepared and the ingredients used. From there you are provided with specific considerations as to how the preparation or ingredient list may affect you if you have Gluten or Allergen issues.This book is the size of a passport and very organized and well written. The Chinese, Indian and Thai Cuisine Passport is perfect for making choices while traveling, or while dining at your favorite local restaurant.",
+       "id": "6"
+     },
+     "7": {
+       "label": true,
+       "text": "Being Fair....I am a fan of hers: and I really enjoyed her previous works, more than I could have ever imagined, but this record is horrible. The songs are trite, the lyrics are incredibly boring, indulgent and immature. The music is pop staple, with forgetable melodies and repetative chorus lines, I feel as if the studio wrote the entire album for her while she was sleeping, this just doesn't speak to any of her previous works at all. This album fits on the same shelf with a Nickelodeon-themed CD. Instead of heading in the direction of an artist like Alanis Morrisette, she is going backward and joining the ranks of Hannah Montana and the Naked Brothers Band. She is a great artist and her first two records are amazing. She is better than this CD and I am looking forward to her next effort.",
+       "id": "7"
+     },
+     "8": {
+       "label": true,
+       "text": "Sucked: I thought the DVD sucked tremendously. It was very boring and if I could, I would return it for a refund. There was only one \"small\" clip of Dylan himself. I'm very disappointed.",
+       "id": "8"
+     },
+     "9": {
+       "label": false,
+       "text": "Excellent product: Easy to install. If you have a newer furnace you probably do not need the swail switch as the HE220A comes with a Humistat which can be connected to the furnace. They recommend the Honeywell 32005847-001 Installation Kit, Bypass which is a little pricey and you can probably buy the pieces of this kit cheaper individually from Home Depot or Lowes or ACO as well as the filters.",
+       "id": "9"
+     },
+     "10": {
+       "label": false,
+       "text": "Very happy.: I am very happy with this trashcan. I was unable to find one in the stores to fit the space in my cabinet, but this one does the job. It is very sturdy and looks like it will put up with years of use.",
+       "id": "10"
+     },
+     "11": {
+       "label": true,
+       "text": "These aren't Throughbreds!: This makes me so mad. All these new authors are coming and changing the series. Nothings the same anymore and the plots are repeditive. Don't even bother reading these books until #32 these are like a different series. I don't know excactly what's happing but these new authors suck!",
+       "id": "11"
+     },
+     "12": {
+       "label": true,
+       "text": "Large and slow are a bad combination.: I bought this TV and returned it a week later, because it blurred so badly with motion that sports were unwatchable. I ended up buying a smaller Sony XBR4, and I have none of the issues (plus the picture is far, far better).This has nothing to do with 60 vs 120Hz. That is more important for DVDs and Blu-Ray signals that are 24fps (which doesn't divide evenly into 60 but does for 120). The LT52133 has an 8ms response time, which is extremely slow. A decent LCD should be 5 or lower.If you want an LCD, choose speed and quality over size. If you want size and quality but want to spend less, buy a plasma. Don't buy a big, cheap, slow LCD!I gave it 2 stars because I like the interface and remote.",
+       "id": "12"
+     },
+     "13": {
+       "label": true,
+       "text": "Skip it: This movie is very poorly written and the result is not distressing, just lame. The actors do their best but from very early on it is obvious that the material gives them nothing to work with. Fans of Colin Firth will experience a certain dim level of enjoyment. Minnie Driver is a treat but her character is no better written than the others. Vermont locations are worth something. With one or two moments of exception it's neither comedic nor romantic.",
+       "id": "13"
+     },
+     "14": {
+       "label": false,
+       "text": "Belive it i went to the concert?: hi everyone let me tell you i went to the concert i was amazed with what i saw cher was awsome i tell you buy the dvd. as i sat in front of the stage cher was doing a great job to us the she is living proof . So i urge you to buy it?",
+       "id": "14"
+     },
+     "15": {
+       "label": false,
+       "text": "Vale la pena.: En este libro se narra de una forma muy interesante la vida de una familia en particular. Lo que mas me gusto de este libro fue la manera en que la autora describe a lo largo del libro las personalidades de los sujetos envueltos en la novela; que vienen a ser muy distintos y extremos, lo cual, intensifica el drama... Definitivamente es un buen libro y lo recomiendo a todos.",
+       "id": "15"
+     },
+     "16": {
+       "label": false,
+       "text": "Nummie Children's story: I ordered this book for our grandchildren. Two boys 5 & 3 and a 4 month old girl. All love the story. The mouse is determined.",
+       "id": "16"
+     },
+     "17": {
+       "label": true,
+       "text": "Seem to be alone on this one: Looking at the other reviews, I seem to be the only one that was disappointed with this book. The content is too babyish in most of it for older tweens and the more \"grown up\" content would be over a younger tween's head. I had a quick read through and with every paged turned, I thought duh. I'll be looking around for another book shortly.",
+       "id": "17"
+     },
+     "18": {
+       "label": false,
+       "text": "Best yet: by far the best EA game yet. I especially like the easy controls and kick - a graphics. the playbook is extremely accurate and detailed. Also the fight songs and cheerleaders were a nice touch. this is an excellent game and worth checking out.",
+       "id": "18"
+     },
+     "19": {
+       "label": true,
+       "text": "washed out: A bit like Simply Reds version of the Valentine bros hit \"Moneys too tight to mention\" - this cover version has nothing of the driving energy that characterised the original recording.",
+       "id": "19"
+     },
+     "20": {
+       "label": false,
+       "text": "great water bottle: I love this bottle it is great. I like knowing it is non toxic and it just works very well. You can have it full and lay it down and it doesn't leak at all.",
+       "id": "20"
+     },
+     "21": {
+       "label": false,
+       "text": "Nice goggles: I am pretty happy with these goggles. They work well during swim workouts in the pool. I do notice a little bit of fogging from time to time. I had hoped to wear them during an upcoming triathlon, but based on a few instances where they slipped a little in the pool I am concerned that they won't be secure enough. I will keep using them in the pool, but will likely get different ones for open water races.",
+       "id": "21"
+     },
+     "22": {
+       "label": true,
+       "text": "aaahhh nnnoooooo!: Hopefully the last film in one of the worst horror trilogys ever made. This series pretty much ruined the horror film for years to come, for one its too self aware, thats incredibley annoying, second everyone acts like they are on Friends or some sitcom. The acting is just plain bad and unconvincing. Now the gore, if you're going with material this weak you should load it up with disgusting violence, is there any in the Scream series? No.Everyone went to see this movie just to see who THE KILLER is. This movie sets low standards to be met, you expect alot of people to die, one shock, then we find out who the killer is, then you go home. Every horror film being made today is like that, there's nothing new or exciting or risk taking, its the same stuff over and over and people are laping it up like dog food.This film is what you've come to expect, bad acting, some people die and we eventually find out who the killer is and all is merry and well. Pathetic.",
+       "id": "22"
+     },
+     "23": {
+       "label": false,
+       "text": "A classic of its kind: This movie is a classic of its kind and much better that a lot of movies, that followed. It is not one of the best, but it still deserves five stars...",
+       "id": "23"
+     },
+     "24": {
+       "label": true,
+       "text": "Nice suite, but Virtual PC 7 disappoints on my G5: I purchased the upgrade since I'd already bought both Office v.X and Virtual PC 6.1 last year.The biggest letdown is that Microsoft's promised support for the G5 is nearly non-existent. I have a dual processor G5 with an ATI Radeon 9800 card (Apple), and after trying to install Virtual PC 7 three times, I cannot get a VM to work. It did install (and work) flawlessly on my G4 Powerbook. Googling for reviews finds it's very hit or miss, but if (when) it misses, you'll regret investing the extra $$$ in an immature product.",
+       "id": "24"
+     },
+     "25": {
+       "label": true,
+       "text": "Okay player, don't expect a miracle: I bought this DVD player at Circuit City earlier this yr for about a $100. I hooked it up to a 47\" Vizio LCD (which by the way has an awesome picture) using a HDMI cable. After fine tuning this product, I was very, very, very diasppointed. The picture was very \"grainy\" (lots of pixels). I have a $35 DVD player that only utilizes an s-video cable that produces a much more crisp picture. Be warned, the picture stinks.",
+       "id": "25"
+     },
+     "26": {
+       "label": false,
+       "text": "A revelation of the science of consciousness evolution and all natural growth: Here is a readable and fascinating account of the development of the new science of chaos theory, the only body of ideas that describes how the natural world as experienced by human beings emerges out of basic quantum processes. The different explorers and innovators of the new science are introduced in a personable way that will enchant the interested reader.",
+       "id": "26"
+     },
+     "27": {
+       "label": true,
+       "text": "Don't say that I didn't warn ya' !: I'm absolutely convinced that Delbert McClinton had no controlover the release of this CD. I rated it 1 star simplybecause there is no 0 star rating ! In actuality , I am not certain that the vocalist on this recording IS Delbert McClinton. Only on the Mr. Pitiful track is there any similarity at all to Delbert's voice. This is the perfect CD for someone with money to burn who would like to have a recording of a 1960's garage band recorded in a garage and who should be working in a garage ! Delbert fans...run fast and run far away from this ! END",
+       "id": "27"
+     },
+     "28": {
+       "label": true,
+       "text": "This item is not available: I ordered this unit on February 7th. Every time I checked back on the status of the order, it read \"not shipped\" and the estimated shipping date got moved out. I really don't think this unit is avaialble from the company anytime soon. I cancelled the order.",
+       "id": "28"
+     },
+     "29": {
+       "label": true,
+       "text": "I used to like ABBA...: I used to like ABBA, until I saw Mama Mia! A horribly disjointed musical, where songs feel contrived to fit into the story; a story that doesn't seem to come together. Individual songs are usually done alright, but don't segue from one to another very well.The cast butchered several of the songs, but especially S.O.S, Take A Chance On Me, and anything where Pierce Brosnan sang. On a side note, I also counted at least two violations of Chekov's Gun. And finally, I think it has a bad moral message. Which you only recognize if you manage to sit through the whole thing.If there is justice in the world, cast members without established careers won't get to have them as punishment for the worst movies I've seen since The Talented Mr. Ripley.",
+       "id": "29"
+     },
+     "30": {
+       "label": true,
+       "text": "A complete disaster!: If you're like me, you probably wanted to check out this movie because it sounded like it really could be an excellent supernatural Gothic horror tale full of goblins and wicked things alike. Well, don't make the same mistake I did and actually watch it. It's horrible. Terrible. An honest to goodness waste of film. The acting is wretched, the film quality is rotten (it actually looks twenty years older than it is), and the plot is thin, weak, and does not give you what it's supposed to. The only reason I bothered to give this film 1 star is because of Alexis Arquette -- he's great looking, but should have left this film out of his career.",
+       "id": "30"
+     },
+     "31": {
+       "label": false,
+       "text": "beautiful detail: I just purchased these Dover COloring Books for my mother and she loves them. The detail is out of this world and the variety of colors you can use are only limited by your inagination. HIGHLY RECOMMENDED!",
+       "id": "31"
+     },
+     "32": {
+       "label": true,
+       "text": "Very disappointed: I looked forward to getting this movie as I had heard many good things about it but it was nothing like I had imagined or been led to believe. There is very little actual history in it or real Christian experience except for the background because the main focus is a soap opera style romance and caricature figures. I agree with the reviewer who described it as a mixture of \"tawdry Hollywood sex\" somehow interspersed with a vague nod to Christianity. The only decent scene was the arena scene where the Christians are going to their deaths singing hymns - but that's not enough to make it a great or even a good movie. Not personally to my taste anyway.",
+       "id": "32"
+     },
+     "33": {
+       "label": true,
+       "text": "Unreliable minikit: I bought this minikit because it got good reviews and it would be perfect for my purposes. However it switches on and off whenever it wants, it looses contact with the phone. Very often the on/off button works only in a horizontal position (?) I use a Treo 650, which is on the compatible phone list. When I contacted Parrot, they said it wasn't (?) At last I opened the unit, but there are no moving parts inside except the micro switches. It is giving me a headache, so I will go searching for an alternative.",
+       "id": "33"
+     },
+     "34": {
+       "label": false,
+       "text": "A Christmas Classic!: This is surely one of the best classical Christmas recordings available. Don't buy the older version, as the quality of this recording is excellent. This is one of those \"Every Christmas - Can't have Christmas without\" recordings.",
+       "id": "34"
+     },
+     "35": {
+       "label": true,
+       "text": "too narrow: These were the narrowest pair of D size shoes I have ever tried on. I don't care how nice a shoe looks. If it don't fit it just don't fit.",
+       "id": "35"
+     },
+     "36": {
+       "label": true,
+       "text": "Lack of extension: This earphones lack a descent extension cord. ITs very small cable, but its of good quality. Sadly, cord its too short, and the extension is useless.",
+       "id": "36"
+     },
+     "37": {
+       "label": false,
+       "text": "Easy-Reading: This is the 3rd Southern Sisters Mystery I've read. They're easy, fast and funny murder mysteries, with lots of cute family stories intertwined in the intrigue.",
+       "id": "37"
+     },
+     "38": {
+       "label": true,
+       "text": "it'd be great if it worked like it was supposed to: for the first 30 seconds it was lovely, but i believe that either the motor isn't powerful enough to keep the shaft rotating smoothly or 3 AA batteries just don't provide enough juice for the motor to work more than 30 seconds. it was a nice idea, but i'm rather dissapointed. the jelly material is somewhat difficult to maintain also. i think if it were hooked up to a larger battery pack it'd be WONDERFUL... which i think i may have a macgyver friend with a knack for electronics attempt to do for me.",
+       "id": "38"
+     },
+     "39": {
+       "label": false,
+       "text": "Not Hornby's best but still good: I loved About a Boy and really, really loved the sardonic wit of High Fidelity. About a Boy is much deeper but just as cynical. Maybe even more so. The characters are richly drawn and just complex enough to keep the reader wanting more. Good read, but best to take some time with this one. Not recommended for a summer beach read.",
+       "id": "39"
+     },
+     "40": {
+       "label": true,
+       "text": "A Disappointment: As with most Taunton Press publications, the illustrations and photographs in this book are spectacular and the organization and layout is superb. Nonetheless, I found this book disappointing. It lacks both depth and breadth. I had hoped for a detailed review of wood joinery including some of the more unusual joinery found in Japanese woodworking. This book, however, is targeted more toward the beginner. Even so, it does not cover the details and \"tricks\" of even the most basic techniques in sufficient detail to allow beginners to easily reproduce them. Consequently, it is unclear who this book was written for - not the beginner as it lacks depth, and not the advanced woodworker as it lacks breadth. Far more effort appears to have been put into appearance and organization than in content.",
+       "id": "40"
+     },
+     "41": {
+       "label": true,
+       "text": "Horrible. Don't do it!: Great price for the item when a 6' one of these at Best Buy is $20. Thing is, the one from Best Buy fits in the outlet and stays there. This cord fits very loose and does not connect. I bought 2 of them, neither did what they were suppose to.As much as I hate to say it, but, buy the more expensive one. At least it works.",
+       "id": "41"
+     },
+     "42": {
+       "label": false,
+       "text": "Given as a gift...: Given to my best friend as a gift. She loves it. Her fiance enjoys making coffee for her in the mornings. :)",
+       "id": "42"
+     },
+     "43": {
+       "label": false,
+       "text": "Love the ring.: This is a nice ring. I was worried it out be thin and cheap looking, but it's not. It's a very pretty stylish ring. Go for it.",
+       "id": "43"
+     },
+     "44": {
+       "label": true,
+       "text": "Beautiful writing Marred by One-Note Characterizations: How could Kingsolver have ruined her book with such an obvious error? Nathan is a strident paper doll that flattens the whole story. Just as bad, the author has all the narrators using the same ironic tone to decribe him, deadening their voices as well. At the same time, Kingsolver doesn't have the guts to show him doing something trully terrible. I don't trust an author who can't let the reader make up his own mind, and as a consequence I couldn't trust her views about ANYTHING in the story. I'm giving this two stars for her descriptions of the African landscape, and that is all.",
+       "id": "44"
+     },
+     "45": {
+       "label": true,
+       "text": "Much worse than any cordless phone I've ever had: This phone cuts out only 2 rooms away from the base station. There is static noise, and callers on the other end complain about sound quality. I can't go into the garden, which used to be no problem with my old 900 MHz phone.",
+       "id": "45"
+     },
+     "46": {
+       "label": true,
+       "text": "Waste of time & money: The first Hangover was not too bad, this one was just terrible. The acting is bad, the script is bad, everything about this movie was just bad. Do yourself a favor, don't buy this movie as it is a total waste of time and money.",
+       "id": "46"
+     },
+     "47": {
+       "label": true,
+       "text": "Did Not Work For Me!: Impressive You Tube Video (Like a Sci-Fi Fantasy). In reality it's a high speed Easy Out so unsurprisingly it broke faster than an Easy out. This product did not work for me. The drill part did not drlil, the puller part did not pull. It was a total zero.",
+       "id": "47"
+     },
+     "48": {
+       "label": false,
+       "text": "Excellent book, long overdue.: From a very long time women were told that looking good was of utmost importance. This was without regard to health or fitness and how age affected these parameters. Witness the whalebone and other types of corsets, the spike heeled shoes and the numerous weight loss programmes on the market (some of which are downright dangerous). Now there is a book, backed by solid research, that allows women of all ages to remain fit and healthy for a lifetime. I am certainly going to recommend this book to all the women I know.Bentley Norville",
+       "id": "48"
+     },
+     "49": {
+       "label": true,
+       "text": "not an all star: Not a practical guide in this collecting age. Does NOT have a comprehensive list; meaning it does NOT cover all manufacturers and, more importantly, for the ones it does, only provides listings of the base set. That means no insert or variation pricing whatsoever. Also, no oddball or minor league issues are listed. Generally speaking, unless you are collecting base sets prior to the advent of inserts and alternate versions of the base set, this guide is fairly useless.",
+       "id": "49"
+     },
+     "50": {
+       "label": true,
+       "text": "Again, second rate city, third rate writer: Just another example of Mr. Lindberg's pitiful attempt at exhibiting a strong expertise on a subject with which he is clearly obsessed. Don't waste your time with this book, either. It is poorly written and fails to engage the reader. You might consider using this book and the first book he wrote on the same subject, as a pair of bookends. That is about all they are worth.",
+       "id": "50"
+     },
+     "51": {
+       "label": false,
+       "text": "Reality: No one should need to convince you to buy this book, you should just do it! It's so well written and worded and brings you right to the heart of a sexual reality that most people like to pretend doesn't really live and breath in their fair cities. I never again want to hear someone bad mouth a working girl for what she does. I will and do now however look at men with a curious eye wondering if they are depraved peep show window lickers :)",
+       "id": "51"
+     },
+     "52": {
+       "label": true,
+       "text": "Bummer: Visual effects and Battle footage were great...the other 85% of the movie was just lousy fluff...",
+       "id": "52"
+     },
+     "53": {
+       "label": false,
+       "text": "The spark of idependence: Filled with the independent spark that made us all love life at one point or another. A fun, introspective and nonsensical movie that sticks with you.",
+       "id": "53"
+     },
+     "54": {
+       "label": false,
+       "text": "What I expected from Mirman's website. Funny. Funny. Russian.: lol, gotta love Eugene. Even when his audience doesn't initially laugh, he gets in a good zinger at himself and they laugh at that. He's witty without being condescending, and uncomplicated without seeing contrived. However, if you're not a fan of irreverant humor, this may not be for you.",
+       "id": "54"
+     },
+     "55": {
+       "label": true,
+       "text": "Do not...repeat...do not bother!: It is not often that I offer a negative review but this compilation while attractive does not deliver at all.The foot massage gizmo is awkward and uncomfortable.The pumice stone leaves rough splinter like skin.The foot scrub doesn't reall scrub.The rotary action tool has five heads, none of which work well and you must hold the switch in place or it turns off. It is cumbersome and ineffective.The one star was initially given for a foot brush (which later lost its bristles very easily as I update the review) and a sweet smelling foot repair balm.Don't waist your money. Soak your feet and invest in an inexpensive German Titania file, smooth and coarser side, or a like product. It will last for years.",
+       "id": "55"
+     },
+     "56": {
+       "label": true,
+       "text": "Not Sandra's Best: Ms. Brown has written better romance novels. Don't give up on her if this was your first Sandra book.The feeble female lead struggles with a 15-year crush that walks back into her life. The smug male lead acts like a jerk through most of the novel. The romance scenes grapple to muster up passion but fall short. Both of the main characters bothered me; my favorite character was the 17-year old.A quick read...about 4 hours (with interruptions) for me...but probably not worth it.",
+       "id": "56"
+     },
+     "57": {
+       "label": false,
+       "text": "Impressed: Lots-O-Fun. Wood and glass toys are high quality and are a good fall back for the kids to play with they are \"bored\". Would buy again.",
+       "id": "57"
+     },
+     "58": {
+       "label": true,
+       "text": "Light turned on by itself 3 times: The installation was easy. I used it for a week, everything worked fine, EXCEPT the light it connected to turned on by itself 3 times so far, with no one near to either one of the switch. Not sure whether it is a defective unit, or this product is too sensitive to noise. I'm returning this product and will just install a regular switch instead.",
+       "id": "58"
+     },
+     "59": {
+       "label": false,
+       "text": "good battery: I feel kind of silly writing a review for a battery, but have to say that these last a LONG time. Work very well.",
+       "id": "59"
+     },
+     "60": {
+       "label": false,
+       "text": "Even a Woman finds it funny: Yes, even a woman finds \"Married to Mommy\" funny. The book gets you laughing aloud when it is trying to make fun of \"Mommies\". The truth is that it really is making fun of the stupidity of men and their simple basic needs of sex, getting out of work, and beer. Of course, the truth is always funny.A definite MUST for any woman, married or not. We will now know all the secret tricks the men try to use on us.By the way, I am NOT a MOMMY!",
+       "id": "60"
+     },
+     "61": {
+       "label": false,
+       "text": "Gungrave...not quite what you might expect: Those thinking this is another version of Trigun will be disappointed. Gungrave is actually a lot deeper and more complex. The lead is short on dialouge, but the story has more depth and character development than most anime. The first DVD is more about the main character's past than about the reanimated killing machine he's become, but it definitely leaves you wanting more.",
+       "id": "61"
+     },
+     "62": {
+       "label": false,
+       "text": "Error in product description: It's great in every way. However, if you'd prefer a digital tuner (as I do), then you might need to look further. The product description boasts a digital AM/FM tuner, but it's disappointingly an analog AM/FM tuner.Overall - especially for the price - I think it's pretty good.",
+       "id": "62"
+     },
+     "63": {
+       "label": false,
+       "text": "good phone but not as user friendly as it could be: Battery life is very good. Phone has good range. My only complaint is it's to involved to get your message from the handset.",
+       "id": "63"
+     },
+     "64": {
+       "label": true,
+       "text": "Big waste of money (and space in my house!): My 5 year old son wanted this so bad, but when we got it for him, there were so many pieces to put together that didn't fit together well, he never played with it. It just sits on our floor in many pieces taking up toy space! What a waste!",
+       "id": "64"
+     },
+     "65": {
+       "label": false,
+       "text": "Don't want to take it off: Very satisfied with an earlier purchase of this Bali bra model, I was just as pleased with the new one. Very comfortable, well made and a good neutral color. It will be my next choice, too.",
+       "id": "65"
+     },
+     "66": {
+       "label": false,
+       "text": "Fantastico: If anybody who's into rock music is ever looking for a band to keep you on your toes, this is the band. I've been a fan for 10 years now, and no album has ever sounded like any of their previous albums. This disc is fantastic with such a variety of styles, as are the previous releases, even back to the Rainbow Butt Monkey days.",
+       "id": "66"
+     },
+     "67": {
+       "label": true,
+       "text": "too much visual: There are far too much designs, visuals, colors, etc in the book - this is highly distracting, as TV screen can be...By way of example (among so many...), what is the use of colors with the three squares of the Pyth. theorem???? this is as useless as writting 2+3=5 with 2 in blue, 3 in red and 5 in yellow...I wish I had purchased the 2nd edition, which according to reviews was closer to what I was looking for.",
+       "id": "67"
+     },
+     "68": {
+       "label": false,
+ "text": "Aretha's First Arista Release Showed Pleasures to Come: After a long and musically satisfying career with Atlantic, Aretha severed her ties with that company and moved under the wing of Arista's Clive Davis. With the start of the 1980's, Aretha was looking for new territory to conquer and almost succeeded with this mixed bag.\"United Together\" is a fine tune that benefits from beautiful orchestral arrangement that is matched by Aretha's superb vocal instrument. The remake of \"Can't Turn You Loose\" allows Aretha to show why she is the Queen of Soul\" for she really belts this one out. Another cover, that of the Doobies' \"What a Fool Believes,\" is an interesting interpretation. The final cut \"School Days\" appears to be \"autobiographical\" for every girl growing up in the fifties.Although not as strong as her Atlantic work, \"Aretha\" is still a suitable addition to the artist's discography.",
349
+ "id": "68"
350
+ },
351
+ "69": {
352
+ "label": true,
353
+ "text": "Misguided Purchase: The photo and description do not reflect the product. The screen panel kit I received was white. What a huge inconvenience during a time-crunch.",
354
+ "id": "69"
355
+ },
356
+ "70": {
357
+ "label": true,
358
+ "text": "Banacek: My husband and were looking forward to seeing this series.The first show was SO boring, we finally just quit watching it.Actually, we haven't gotten around to watching anymore. I guess we were afraid of a repeat.Maybe that was just once, I hope!",
359
+ "id": "70"
360
+ },
361
+ "71": {
362
+ "label": false,
363
+ "text": "JDT: Uncle Tupelo is without doubt one of the most under appreciated groups of the 90's. Anodyne, like each of the three albums that came before it, has everything that a remarkable recording requires: great songs, honest lyrics, and artists who really care about the music they are making. Like the best of Dylan and Springsteen, the songs are about real people with real troubles and joys. When you hear them you know they are coming from the heart. The songs contributed by Jay Farrar and Jeff Tweedy are easily differentiated by the voacls, music, and lyrics. What makes this record interesting is how well these unique sounds compliment each other. The union is seamless.",
364
+ "id": "71"
365
+ },
366
+ "72": {
367
+ "label": false,
368
+ "text": "Well Worth Reading: First a confession: Miriam Wasserman was my mother. However, she published several books, but this is the only one I really found useful. She walks the reader through the New York City school system and the attitudes of different groups involved in the system back in the 1960s. This includes parents, teachers and administrators. Her view is that the further away one got from parents and students, the more prestige one had. She meticulously describes the teachers' strike of 1968 against \"community control of schools\", a strike of which she is extremely critical. She explores the racism that was involved in this strike, including using quotes from striking teachers, etc. It should be emphasized that the author was pro-union all her life, so her views don't stem from an anti-union bias. The book also covers the high school student rebellion which coincided with and followed the strike.",
369
+ "id": "72"
370
+ },
371
+ "73": {
372
+ "label": false,
373
+ "text": "compact and loaded: I bought this phone after reading the cnet reviews and really liked it. It looks small and really compact. I like the camera pics at 2 mega pixel and bright flash. The mp3 player is crisp. The headset that comes along delvers amazing fM radio. I think my phone is not very loud and you have a problem when you are around a noisy crowd. I just bought this phone again for my cousin. He likes it too. Almost forgot the display is very good.",
374
+ "id": "73"
375
+ },
376
+ "74": {
377
+ "label": false,
378
+ "text": "Outstanding text!: Brooks/Cole should keep this text in their catalog for ages! It is well-written, examples are generally quite clear, vocabulary is introduced well, and the exercises develop real skills, rather than simply be busy-work. One of the best calculus books ever!",
379
+ "id": "74"
380
+ },
381
+ "75": {
382
+ "label": false,
383
+ "text": "Excel 2003 Bible: Very good source of information. I will most likely buy other books in this series.",
384
+ "id": "75"
385
+ },
386
+ "76": {
387
+ "label": false,
388
+ "text": "Tasting is Believing: Gluten-free breads used to have a gritty texture from the rice flour, and were too soft for sandwiches. Bette Hagman uses garbanzo/fava bean flour, sorghum flour, tapioca flour, and corn starch to create breads which have a similar texture to wheat flour breads, and the flavors of her breads are fabulous.My BF bought me this book and a great tasting beverage to drink it with. Since he knows I quit coffee recently, he's been really wonderful helping me in cope with my mood swings. S o y f e e is made from soy beans that is roasted just like coffee. I enjoy the taste and don't miss coffee one bit. Buy it online at www.s o y c o f fee.com.This is a 'must have' for anyone baking gluten-free. I think all of Bette Hagman's books are wonderful and a must for those with gluten intolerance.",
389
+ "id": "76"
390
+ },
391
+ "77": {
392
+ "label": false,
393
+ "text": "5 stars for the show, no stars for the \"Collector's Edition\": I was really looking forward to getting this Collector's Edition and see what extras were added. I knew it wasn't a lot - just a mini-book and a documentary - but I figured it would be packaged in a cool way.Wrong.As others have already mentioned, the Collector's Edition is *literally* theAvatar: The Last Airbender - The Complete Book 1 Collectionslipped into another cardboard box, with a little booklet and DVD in an envelope (not even a case!) wedged in. It's really disappointing; it would have been so easy to create a quality Collector's Edition but the studio couldn't be bothered, I guess.",
394
+ "id": "77"
395
+ },
396
+ "78": {
397
+ "label": false,
398
+ "text": "sula scottcampos: Sula, a book that talks about the issues of being a black women is a really good novel to read.One of the reasons I recommend it is because of its realism and its themes - death, sex, friendship and poverty.I also think that its characters are very good, its easy to identify with one or both of them. I really recommend this book to anyone who enjoys good literature.",
399
+ "id": "78"
400
+ },
401
+ "79": {
402
+ "label": false,
403
+ "text": "Fantastic! It's a must-have for girls!: I hated razor, tried shaving but it did not work for me. Shaving made the hair grows thicker and faster afterwards, plus the roots are impossible to be getting rid of. After reading the reviews, I ordered it to try, I used it for once and already fall in love with this. I used to use small tweezer to pluck out my leg's hair, in order to avoid the razor, it took me a few hours to do that but this super electronic tweezer works wonder! You won't see the black roots and I have smooth and silkly legs in 20 mins. It does not hurt at all, if you use it on your legs. But, if you use it at your under arm, it won't be a pleasant feeling, of course! I will never use anything else besides this for hair removing anymore! highly recommended!",
404
+ "id": "79"
405
+ },
406
+ "80": {
407
+ "label": true,
408
+ "text": "This is not a toy: I guess I was expecting more out of these leave window decals. I just didn't find them attractive after placing them on my window, they seem very cheap, I guess because they are cheap.I threw them away.",
409
+ "id": "80"
410
+ },
411
+ "81": {
412
+ "label": false,
413
+ "text": "Wonderful book for anyone running a professional hatchery: This book is aimed more for hatcheries that are raising Trout, Salmon, Catfish and other food fishes. However, there is so much information in this book that even ornamental fish hatcheries will find an incredible amount of useful information. The chapters on Fish Nutrition are especially helpful.",
414
+ "id": "81"
415
+ },
416
+ "82": {
417
+ "label": false,
418
+ "text": "Amazing book!!: Once again, Eric Victorino's artistic talent is put into this great free-verse poetry book. I couldn't put it down and I finished it the day I received it in the mail. All of the poems are awesome but the one I found the most interesting was \"It's A People Business.\" All of the experiences in his life, personally and with his band, come to life in this book. Please check it out! It's worth every penny!!",
419
+ "id": "82"
420
+ },
421
+ "83": {
422
+ "label": false,
423
+ "text": "The white trumpet contender respect Miles Davis!: The story of the Jazz in the Fifties certainly would be remain unfinished without the ominous presence of this outstanding virtuoso. Baker sound still possesses this alluring hook, this magnetic engagement charm, eloquent expressiveness, enrapturing lyricism and contagious rhythm, despite the elapsed time, which confirms by itself the status of his musicianship.This selection is jus a little sample of the broad universe of his genius. A well thought selection of great musical successes, available, preserved and immortalized by the Digital Technology for our future enjoyment.Absolutely indispensable in your treasured collection.",
424
+ "id": "83"
425
+ },
426
+ "84": {
427
+ "label": true,
428
+ "text": "What the?: I'm sorry, maybe it's just me but I can't helping stating that this has to be one of the wrost movies I've seen in my life!Can you say boring? Can you say doesn't make sense at all? The first 30 minutes of the movie were O.K. But it went downhill after that. This movie is a prime example of a director attempting to make a deep movie with a meaningful lesson but failed on all levels. I don't recommend this movie unless you want to go to sleep or you don't have anything else to do.",
429
+ "id": "84"
430
+ },
431
+ "85": {
432
+ "label": false,
433
+ "text": "very very good!!!!: linda blair is a young girl who is possessed. and her mother doesn't know what to do until one day when she hears her daughter screaming and stabbind herself she knows what to do GET AN EXORCIZIM!!!",
434
+ "id": "85"
435
+ },
436
+ "86": {
437
+ "label": false,
438
+ "text": "Awesome product for the price!: This range extender works as advertised! I am very happy with the purchase. I was a little worried after reading some of the horror stories here, but I have to say, Chovy's review instructions (on this site) were just this ticket to get the repeater up and running in less than 30 minutes. It was unbelievably easy to install! Do not be frightened by negative reviews. If you can set up a wireless network, you can set up this repeater. However, I did upgrade the firmware before I did anything else and maybe that helped. I got the firmware update from the Belkin site.",
439
+ "id": "86"
440
+ },
441
+ "87": {
442
+ "label": true,
443
+ "text": "Slight: This book is either a heavily illustrated short story collection or a text-heavy comic. Its unusual format is its most original feature. Its plots are negligible, but its illustrations and text evoke a unique atmosphere of self-conscious nonconformism. Although its target audience is dare-to-be-different teens and college students, its interesting turns of phrase and expressive line drawings are not devoid of interest for general audences.",
444
+ "id": "87"
445
+ },
446
+ "88": {
447
+ "label": false,
448
+ "text": "ANgeleyes: Seem to dry up their eyes fairly well, although I haven't seen the color (brown stain) change much yet.",
449
+ "id": "88"
450
+ },
451
+ "89": {
452
+ "label": true,
453
+ "text": "Nice Try: Salt Lake 2002 is not a bad game, but it isn't good either. The graphics are excellent, but some of the events are bad. Bobsleigh, and skiing aren't bad but the others are. You dont stay into it for long. I liked it for a while, but it gets boring.",
454
+ "id": "89"
455
+ },
456
+ "90": {
457
+ "label": true,
458
+ "text": "Cutler's share of the pie: This book was a major disappointment. I am familiar with books written solely by the Dalai Lama, such as the \"Library of Tibet\" series, which are much more engrossing and have much more substance than Cutler's book. Cutler attempts (successfully, sadly) to have his share of the profitable market that involves the Dalai Lama's writings. The book is insipid, does not try to explain any important issue in the light of Buddhist philosophy, and only rehashes issues that several other westerners already wrote about. It's another big ego trip: we keep hearing time and again about his opportunities to be with the Dalai Lama. What a shame, Cutler. I sold the book as soon as I finished it.",
459
+ "id": "90"
460
+ },
461
+ "91": {
462
+ "label": true,
463
+ "text": "Mostly tedious, with interesting parts: I found the writing interesting, and the subject fascinating, but I found myself frustrated by the author's difficulty in talking directly about the status of Muslim women with her interview subjects. The author spent many pages writing about the menus and dress of the many middle and upper-middle class women she interviewed. It seemed as though her interview subjects resisted her efforts to discuss the status of women in their countries, so we too as readers had to wade through much distracting material and misunderstandings about feminism and gender. Great travel stories, but not a great source of information about Muslim women.",
464
+ "id": "91"
465
+ },
466
+ "92": {
467
+ "label": true,
468
+ "text": "Sesame Street Toddler: I did not find this game to be as educationally sound as I would expect from Sesame street. There is too much talking before the program will react to a command. The graphics are jerky and the cursor acts like the target is magnetically charged and keeps pushing away the cursor. When the child actually does manage to click on a target, the cursor may still fly to another target and the child is told that his answer is wrong. Another example of educational problems is the pronunciation of \"eggs\" using a long \"a\" sound instead of a short \"e.\" This is not very helpful in teaching a child the sound for short \"e.\" Children that are used to playing computer games by themselves may find that this game is too frustrating to do alone. The open ended learning curve is a great idea. I just wish Sesame Street would hire a truly qualified literacy expert to help clean up the many problems in this program.",
469
+ "id": "92"
470
+ },
471
+ "93": {
472
+ "label": true,
473
+ "text": "needs a buzz cut and a point: I avoided reading this book, not because of the hermaphrodite subject matter, but because I have never read a multigenerational family saga that I liked. Many books let me down in the middle, and this was no exception. The beginning of the book was incredible and harrowing, with momentum and characterization. The post-America nextgens part of the saga was so boring I found myself flipping and flipping - always a bad sign. If there was some kind of larger point to all of that, then I must have missed it. Yes there's the identity duality and trinity themes playing out here: man/woman, greek/turkish/american modern/old world sick/healthy innocent/guilty original/reinvented. But it was almost as if the author was saying - here it is again - get it? I like my fiction much more subtle than this.",
474
+ "id": "93"
475
+ },
476
+ "94": {
477
+ "label": true,
478
+ "text": "OMG! DO NOT BUY!: I normally don't take the time to submit a review.In this case however, I feel obligated to do so.This is by far one of the worst purchases I have ever made.Here's why.....The contraption is far too bulky.The case's enclosing is unbearable, takes a good minute or so to open it.The texture of the material feels like a cheap toy.The overall design is horrible, something I could make in my basement.For the love of everything sacred, do not buy this thing.",
479
+ "id": "94"
480
+ },
481
+ "95": {
482
+ "label": false,
483
+ "text": "Good price, good quality: Comparable HDMI cables can be bought for 45 or more. Even though the price is cheap the quality is good, no problems so far.",
484
+ "id": "95"
485
+ },
486
+ "96": {
487
+ "label": false,
488
+ "text": "Good rock music: This is what i call rock music good beat and good lyrics, don't listen to the other reviews. This cd is one of the best, listen to a few songs and you will get hooked. I recommend this cd its awesome.",
489
+ "id": "96"
490
+ },
491
+ "97": {
492
+ "label": true,
493
+ "text": "BORING!: This movie is soo boring. How in the hell did this movie make so much at the box office. Do people really want to pay for crappy movies like this. bottom line this is a chick flick nothing is good. And now they are re-releasing this movie with more boring stuff. This is the worst movie ever.",
494
+ "id": "97"
495
+ },
496
+ "98": {
497
+ "label": true,
498
+ "text": "Already Rusting: Inferior quality. The plating is thin and rust is coming through the finish. Inexcusable for a product that is designed for use in a humid environment.",
499
+ "id": "98"
500
+ },
501
+ "99": {
502
+ "label": true,
503
+ "text": "confusing internet setup: i wanted a camera that could email photos but this camera will not go out through the router and the manual setup , to punch a hole thru router is confusing.",
504
+ "id": "99"
505
+ },
506
+ "04c7dfc0f94e4e88968d09b40edbfa14": {
507
+ "label": true,
508
+ "text": "The new gaming console is unaffordable.",
509
+ "id": "04c7dfc0f94e4e88968d09b40edbfa14"
510
+ },
511
+ "58f58a1a4cbb4bb699772ed934006ec8": {
512
+ "label": true,
513
+ "text": "How can it be sure difficult for @115830 to deliver a package to a University address? Two failed attempts so far ...",
514
+ "id": "58f58a1a4cbb4bb699772ed934006ec8"
515
+ },
516
+ "d4a3cd4877c54aef81c376eff8008df4": {
517
+ "label": false,
518
+ "text": "@204780 Glad they showed up! Hope you have a great flight! -Sean",
519
+ "id": "d4a3cd4877c54aef81c376eff8008df4"
520
+ },
521
+ "affe1d6548f84bed84238bac45cc10a1": {
522
+ "label": false,
523
+ "text": "@British_Airways Thank you! All looks good then \ud83c\uddec\ud83c\udde7\u2708\ufe0f",
524
+ "id": "affe1d6548f84bed84238bac45cc10a1"
525
+ },
526
+ "e304ea77a94c450a95690c7b605a035f": {
527
+ "label": false,
528
+ "text": "@246667 Thank you for reaching out, Andrea. The built in application in Windows 10 are exempted to be uninstalled. However, you can send this suggestion directly to our developers via the Feedback Hub so they can take a look at it: https://t.co/jowrfbgQm6. Keep in touch.",
529
+ "id": "e304ea77a94c450a95690c7b605a035f"
530
+ },
531
+ "76b694b019eb4e6888a422e144030bd0": {
532
+ "label": true,
533
+ "text": "@GWRHelp It\u2019s mainly the constant short forming and cancellations due to mechanical faults Phil. As a company, these excuses have been used ad nauseam for years and years. It just gets worse and no amount of rhetoric and IET self promotion can hide that fact.",
534
+ "id": "76b694b019eb4e6888a422e144030bd0"
535
+ },
536
+ "ce0698020b7a457396c7674b04db10e6": {
537
+ "label": false,
538
+ "text": "English gangster flick.",
539
+ "id": "ce0698020b7a457396c7674b04db10e6"
540
+ },
541
+ "52bda6cbab224899845e66e0474cdefc": {
542
+ "label": false,
543
+ "text": "sees the formula graph, the chip calculates the formula, able to \"survive\" thanks to its connection to Edit, develops a parallel personality and affords her abilities greater than she ever imagined...",
544
+ "id": "52bda6cbab224899845e66e0474cdefc"
545
+ },
546
+ "435aabe68c294963a05e090d479582bc": {
547
+ "label": false,
548
+ "text": "Aanandam is a 2016 Indian Malayalam campus musical film written and directed by Ganesh Raj in his directorial debut. Vineeth Sreenivasan produces the film under the banner of Habit Of Life with Vinod Shornur under Cast N Crew.",
549
+ "id": "435aabe68c294963a05e090d479582bc"
550
+ },
551
+ "f96313d0087e4941a359783634ef9e86": {
552
+ "label": false,
553
+ "text": "The remarkable story of The Weather Underground, radical activists of the 1970s, and of radical politics at its best and most disastrous.",
554
+ "id": "f96313d0087e4941a359783634ef9e86"
555
+ },
556
+ "f63e4502791a409fa2d750687d3841eb": {
557
+ "label": false,
558
+ "text": "A young widow on a trip to the backwoods stumbles upon the operation of a gang of drug smugglers. They attempt to kill her in order to keep their operation a secret, but she turns out to be more resourceful than they thought, and starts to turn the tables on them.",
559
+ "id": "f63e4502791a409fa2d750687d3841eb"
560
+ },
561
+ "108ac02949324b02bdcbe4c7a77bacdc": {
562
+ "label": false,
563
+ "text": "The story of a young Marine, fresh from Camp Pendleton, who is forced to confront the complexities of adulthood and a volatile home life during a four-day Thanksgiving leave.",
564
+ "id": "108ac02949324b02bdcbe4c7a77bacdc"
565
+ },
566
+ "44fc412246964b2393fa0035ff093a00": {
567
+ "label": false,
568
+ "text": "Exploring the rough and tumble world of hockey, Academy Award winner Alex Gibney (\"Taxi to the Dark Side\") looks at the world of the NHL enforcers and specifically the career of Chris \"Knuckles\" Nilan who helped the Montreal Canadiens win the Stanley Cup.",
569
+ "id": "44fc412246964b2393fa0035ff093a00"
570
+ },
571
+ "409350c111af4ba3a94c842b797ddb95": {
572
+ "label": false,
573
+ "text": "Two fishing fanatics get in trouble when their fishing boat gets stolen while on a trip.",
574
+ "id": "409350c111af4ba3a94c842b797ddb95"
575
+ },
576
+ "d48d8f3b5a524ecea69bae718d1f1513": {
577
+ "label": false,
578
+ "text": "A willful young boy follows his just as obstinate grandmother in a journey across Iraq, determined to discover the fate of her missing son, Ahmed's father, who never returned from war.",
579
+ "id": "d48d8f3b5a524ecea69bae718d1f1513"
580
+ },
581
+ "283e96de5b474240a044c50dbc2551fb": {
582
+ "label": false,
583
+ "text": "A group of people are sitting in a theatre watching a movie when one realises that the woman on the screen is her. (IMDb)",
584
+ "id": "283e96de5b474240a044c50dbc2551fb"
585
+ },
586
+ "516d0f2f3a854a97a87c64db19a89fac": {
587
+ "label": false,
588
+ "text": "of the fake prediction. Fantastic swashbuckling adventures in a 18th century setting, with a light criticism of the war and the mighty.",
589
+ "id": "516d0f2f3a854a97a87c64db19a89fac"
590
+ },
591
+ "c2f55710669b40aa937625fe0ab04065": {
592
+ "label": false,
593
+ "text": "famous for his reputation as a Don Juan, to seduce C\u00e9cile and emotionally destroy her. While on his mission, Valmont gets sidetracked when he goes to visit his aunt and falls for Madame Tourvel, a virtuous, married woman who knows of his womanizing ways, but that only makes the challenge more exciting to Valmont. Together, Madame de Merteuil and Valmont make a dangerous team and they will stop at nothing when it comes to matters of the heart.",
594
+ "id": "c2f55710669b40aa937625fe0ab04065"
595
+ },
596
+ "ba0261b2ee3244d29bb3a8c6d77195a6": {
597
+ "label": false,
598
+ "text": "sees the formula graph, the chip calculates the formula, able to \"survive\" thanks to its connection to Edit, develops a parallel personality and affords her abilities greater than she ever imagined...",
599
+ "id": "ba0261b2ee3244d29bb3a8c6d77195a6"
600
+ },
601
+ "5e724fbde8ee44d9a8fc87a6e6667f01": {
602
+ "label": false,
603
+ "text": "telling the story about people who despite all obstacles strive for their goal.",
604
+ "id": "5e724fbde8ee44d9a8fc87a6e6667f01"
605
+ },
606
+ "557eba5ebfc9467a9d88688afed41354": {
607
+ "label": false,
608
+ "text": "A young playboy who learns he has one month until he becomes infertile sets out to procreate as much as possible.",
609
+ "id": "557eba5ebfc9467a9d88688afed41354"
610
+ },
611
+ "aa20e22fbe96487d8ee1223a6ef4da0b": {
612
+ "label": false,
613
+ "text": "Set in modern times, Alex finds King Arthur's sword Excalibur and must prove himself worthy of it.",
614
+ "id": "aa20e22fbe96487d8ee1223a6ef4da0b"
615
+ },
616
+ "bea56d34f6df408c9ec9653b17a90a93": {
617
+ "label": false,
618
+ "text": "Kostis is a 40-year-old doctor that finds himself in the small island of Antiparos, in order to take over the local clinic. His whole life and routine will turn upside down when he meets an international group of young and beautiful tourists and he falls in love with Anna, a 19-year-old goddess.",
619
+ "id": "bea56d34f6df408c9ec9653b17a90a93"
620
+ },
621
+ "e61a3251720d425c9f4770cb4b11d2d9": {
622
+ "label": false,
623
+ "text": "Friends on a weekend excursion take a path into a forest that leads to death and destruction.",
624
+ "id": "e61a3251720d425c9f4770cb4b11d2d9"
625
+ },
626
+ "5471008376cf44518f2ff1f67f057c08": {
627
+ "label": false,
628
+ "text": "Mr Bournelis suggested all 30 lineal metres of blockwork should be removed and replaced, which would require removing and reinstalling the fence. The total cost of his suggested method of rectification was said to be $14,650 for each unit, giving a total cost of rectification of $29,300.",
629
+ "id": "5471008376cf44518f2ff1f67f057c08"
630
+ }
631
+ },
632
+ "version": 27,
633
+ "description": "Negative sentiment"
634
+ }
lilac/concepts/non-english/concept.json ADDED
@@ -0,0 +1,1024 @@
1
+ {
2
+ "namespace": "lilac",
3
+ "concept_name": "non-english",
4
+ "type": "text",
5
+ "data": {
6
+ "c727f30a2d2d40f69b81aa981515fb62": {
7
+ "label": true,
8
+ "text": "Je suis fatigu\u00e9.",
9
+ "id": "c727f30a2d2d40f69b81aa981515fb62"
10
+ },
11
+ "834121208555439b976a5f228ec138a5": {
12
+ "label": true,
13
+ "text": "Ich spreche Deutsch.",
14
+ "id": "834121208555439b976a5f228ec138a5"
15
+ },
16
+ "61a4130d8eb447ba88e52e09fc3860d7": {
17
+ "label": true,
18
+ "text": "\u79c1\u306f\u65e5\u672c\u8a9e\u3092\u8a71\u305b\u307e\u3059\u3002",
19
+ "id": "61a4130d8eb447ba88e52e09fc3860d7"
20
+ },
21
+ "083d9218def443e1adf7ff18150203c4": {
22
+ "label": true,
23
+ "text": "Eu n\u00e3o entendo portugu\u00eas.",
24
+ "id": "083d9218def443e1adf7ff18150203c4"
25
+ },
26
+ "0e540cf8599f419b81d11fd89a95b119": {
27
+ "label": true,
28
+ "text": "Non capisco italiano.",
29
+ "id": "0e540cf8599f419b81d11fd89a95b119"
30
+ },
31
+ "2278f313117e40b7846d2dbc9cb7f690": {
32
+ "label": false,
33
+ "text": "cotton ball",
34
+ "id": "2278f313117e40b7846d2dbc9cb7f690"
35
+ },
36
+ "7a3e058ff74b401485185a51a9d07606": {
37
+ "label": false,
38
+ "text": "To ensure sensor switch is not actuated by the weight of the cat bed place on it, but only by the cat laying in the bed.",
39
+ "id": "7a3e058ff74b401485185a51a9d07606"
40
+ },
41
+ "8c2fd2d3a7f049dea2d4161d17ae02dd": {
42
+ "label": false,
43
+ "text": "Make turmeric milk",
44
+ "id": "8c2fd2d3a7f049dea2d4161d17ae02dd"
45
+ },
46
+ "bccf69d2771640b6a7b436b731d5cc85": {
47
+ "label": false,
48
+ "text": "To make a double boiler",
49
+ "id": "bccf69d2771640b6a7b436b731d5cc85"
50
+ },
51
+ "84ce78b3ccfc4007b0aceabe7004436c": {
52
+ "label": false,
53
+ "text": "To encourage your child to behave better,",
54
+ "id": "84ce78b3ccfc4007b0aceabe7004436c"
55
+ },
56
+ "cc4c4eb91b40473c826ae400fdec6c1e": {
57
+ "label": false,
58
+ "text": "How do you peel asparagus before cooking?",
59
+ "id": "cc4c4eb91b40473c826ae400fdec6c1e"
60
+ },
61
+ "cae1ab05a7584231afa9be93f0029105": {
62
+ "label": false,
63
+ "text": "How can I dry citrus salt?",
64
+ "id": "cae1ab05a7584231afa9be93f0029105"
65
+ },
66
+ "7eded97e58614ca0b7e9421f955cef3c": {
67
+ "label": false,
68
+ "text": "How do I melt chocolate?",
69
+ "id": "7eded97e58614ca0b7e9421f955cef3c"
70
+ },
71
+ "35ecb6b2835b427ba15432471dda4bde": {
72
+ "label": false,
73
+ "text": "How to get rid of crows.",
74
+ "id": "35ecb6b2835b427ba15432471dda4bde"
75
+ },
76
+ "91c70b93890a47b3ad1925e143c2ac69": {
77
+ "label": false,
78
+ "text": "How to Kill Green Hair Algae in a Freshwater Aquarium",
79
+ "id": "91c70b93890a47b3ad1925e143c2ac69"
80
+ },
81
+ "dd6b9fb4280f4ff0b4c371c834525692": {
82
+ "label": false,
83
+ "text": "how do you translate from spanish to english?",
84
+ "id": "dd6b9fb4280f4ff0b4c371c834525692"
85
+ },
86
+ "350bb48432614e69b5b8aa58f82fe7c4": {
87
+ "label": false,
88
+ "text": "Learn a new language quickly.",
89
+ "id": "350bb48432614e69b5b8aa58f82fe7c4"
90
+ },
91
+ "057e3e9d624d48b896a87e97fa6f468c": {
92
+ "label": false,
93
+ "text": "how do you keep a cat from going in heat?",
94
+ "id": "057e3e9d624d48b896a87e97fa6f468c"
95
+ },
96
+ "f9c15155cc264a9da3642d420efab4ea": {
97
+ "label": false,
98
+ "text": "how ot melt crayons",
99
+ "id": "f9c15155cc264a9da3642d420efab4ea"
100
+ },
101
+ "a6acb5343d9c42a6b29dffe9314dafa1": {
102
+ "label": true,
103
+ "text": "Chi \u00e8 il presidente del consiglio in Italia ora?",
104
+ "id": "a6acb5343d9c42a6b29dffe9314dafa1"
105
+ },
106
+ "e808987d2364440bbbc4e0cb71bb2307": {
107
+ "label": true,
108
+ "text": "1. Frenar: Es importante se\u00f1alar cuando vas a frenar, especialmente si vas a hacerlo bruscamente. Esto lo puedes hacer extendiendo tu brazo izquierdo hacia abajo con la palma de la mano abierta.\n\n2. Detenerse: Si necesitas detenerte por completo, debes se\u00f1alarlo tambi\u00e9n. Esto lo puedes hacer extendiendo tu brazo izquierdo hacia abajo con la palma de la mano abierta y con los dedos hacia abajo.",
109
+ "id": "e808987d2364440bbbc4e0cb71bb2307"
110
+ },
111
+ "0a4ba4f4ed404a1692d8d7b96c76de05": {
112
+ "label": false,
113
+ "text": "It depends on the size of your house, if it is small one access point is enough, if it is bigger it might be better to have two or three.",
114
+ "id": "0a4ba4f4ed404a1692d8d7b96c76de05"
115
+ },
116
+ "16396e6f1d3b47d58d3294c4cf4b9a9d": {
117
+ "label": true,
118
+ "text": "In definitiva, per Nietzsche, il significato della vita non era qualcosa che poteva essere dato, ma era piuttosto qualcosa che gli individui dovevano creare per se stessi attraverso l'arte, la cultura, la creazione di valore e la superazione della sofferenza.",
119
+ "id": "16396e6f1d3b47d58d3294c4cf4b9a9d"
120
+ },
121
+ "ecce57f8ecad45ef8b3826d27e233080": {
122
+ "label": false,
123
+ "text": "Evidence for the existence of a God or multiple Gods is subjective and varies depending on one's beliefs and personal experiences. Some people may cite religious texts, miracles, or spiritual experiences as evidence, while others may argue that the lack of evidence is evidence in itself. I can also suggest that the absence of evidence is not evidence of absence, and that there may be multiple",
124
+ "id": "ecce57f8ecad45ef8b3826d27e233080"
125
+ },
126
+ "79f897e21d27403097aed9b9800689c4": {
127
+ "label": true,
128
+ "text": "Infine, nella sua ultima fase filosofica, Nietzsche ha sviluppato la sua critica alla moralit\u00e0 tradizionale e alla religione, sostenendo che questi sistemi erano basati su valori falsi e che la vita aveva bisogno di una nuova valutazione morale e spirituale. In questa fase, Nietzsche ha sostenuto che la vita aveva bisogno di un nuovo senso e di una nuova direzione, e che era compito dell'individuo",
129
+ "id": "79f897e21d27403097aed9b9800689c4"
130
+ },
131
+ "da1b4988188642368b4f683f0418496e": {
132
+ "label": true,
133
+ "text": "Pueden un perro y una gato procrear juntos?",
134
+ "id": "da1b4988188642368b4f683f0418496e"
135
+ },
136
+ "2c54d8a5bb6742ada15549ad7007fe6b": {
137
+ "label": true,
138
+ "text": "In generale, Nietzsche ha visto la vita come una sfida continua, dove ogni individuo deve trovare il proprio significato e scopo attraverso la creativit\u00e0, l'arte e la moralit\u00e0 personale.",
139
+ "id": "2c54d8a5bb6742ada15549ad7007fe6b"
140
+ },
141
+ "28a055c7637c440bb0912bc5274d79c3": {
142
+ "label": true,
143
+ "text": "De nada, fue un placer ayudarte. \u00bfEn qu\u00e9 m\u00e1s puedo ayudarte?",
144
+ "id": "28a055c7637c440bb0912bc5274d79c3"
145
+ },
146
+ "b61a36355ae943208090ccdd3b736dce": {
147
+ "label": true,
148
+ "text": "\u00bfPor qu\u00e9 deber\u00edamos preocuparnos por estos gases? Bueno, porque est\u00e1n causando cambios dr\u00e1sticos en el clima, lo que a su vez est\u00e1 afectando a nuestro medio ambiente.",
149
+ "id": "b61a36355ae943208090ccdd3b736dce"
150
+ },
151
+ "7ea3d807733044c69a4e35f6ff6b66a3": {
152
+ "label": true,
153
+ "text": "Pi\u00f9 di recente, gli studiosi sono giunti a riconoscere che ci sono altri criteri essenziali per un\u2019unione monetaria di successo, che sono difficili da realizzare senza una profonda integrazione politica. Alla fine degli anni sessanta, Peter Kenen&amp; ha sostenuto che senza i movimenti dei tassi di cambio come ammortizzatori, l\u2019unione monetaria necessita dei trasferimenti fiscali come modalit\u00e0 per",
154
+ "id": "7ea3d807733044c69a4e35f6ff6b66a3"
155
+ },
156
+ "c06cf297c9564f9baa5cbc2130f1ef1f": {
157
+ "label": true,
158
+ "text": "QUESTION: \u00bfPor qu\u00e9 se oponen Alemania y Austria?",
159
+ "id": "c06cf297c9564f9baa5cbc2130f1ef1f"
160
+ },
161
+ "cc5a1a04352d471cb5b7e6831f19c86a": {
162
+ "label": false,
163
+ "text": "- wing parties , with the aid of the Democratic Renewal Party , the government fell and M\u00e1rio Soares , the President at the time , called for a new election . The PSD was very popular going into the election , and was elected to a landslide majority government -- the biggest that a Portuguese party had ever won in a free election . The left - wing Democratic Unity Coalition lost some of its MPs to",
164
+ "id": "cc5a1a04352d471cb5b7e6831f19c86a"
165
+ },
166
+ "8a95fab4b058420aa102b25fd7afc211": {
167
+ "label": true,
168
+ "text": "reticente, Holanda. Alemania y Austria, por el contrario, permanecen firmes en su oposici\u00f3n al texto, ya que consideran que menoscaba claramente el derecho a la defensa reconocido en sus Constituciones. La nueva directiva pretende extender a abogados y contables, incluidos los asesores fiscales, agentes inmobiliarios, marchantes de arte, anticuarios y casinos las mismas obligaciones que ahora",
169
+ "id": "8a95fab4b058420aa102b25fd7afc211"
170
+ },
171
+ "1ffe093d849040b6baaaadb5b71c04af": {
172
+ "label": true,
173
+ "text": "\"Hay una cosa afectiva muy fuerte, que cuesta, me ha costado mucho tiempo\" asimilar esos cinco a\u00c3\u00b1os que viv\u00c3\u00ad en las ciudades de Buenos Aires y C\u00c3\u00b3rdoba, admiti\u00c3\u00b3. En sus a\u00c3\u00b1os por la naci\u00c3\u00b3n sudamericana se cas\u00c3\u00b3 con un argentino, del que m\u00c3\u00a1s tarde se separ\u00c3\u00b3, y con quien tuvo a su primer y \u00c3\u00banico hijo.",
174
+ "id": "1ffe093d849040b6baaaadb5b71c04af"
175
+ },
176
+ "e95da58452044682ab05ad688544e907": {
177
+ "label": true,
178
+ "text": "M\u00e9lodieux et heureux !! . Cet album est magnifique. Apr\u00e8s plusieurs \u00e9coutes, je suis enchant\u00e9 de l'\u00e9couter et d'appr\u00e9cier les m\u00e9lodies qui s'y trouvent. Beaucoup de changements dans les musiques je trouve et aussi dans les paroles car je trouve que Myl\u00e8ne est plus directe dans la mani\u00e8re de parler de l'amour qui est tr\u00e8s pr\u00e9sent dans cet album. Je suis heureux d'avoir attendu pour entendre ses",
179
+ "id": "e95da58452044682ab05ad688544e907"
180
+ },
181
+ "34bc60b878e546b6af0e9bba1ec3879f": {
182
+ "label": true,
183
+ "text": "\u79c1\u306f\u65e5\u672c\u8a9e\u3092\u8a71\u305b\u307e\u3059\u3002",
184
+ "id": "34bc60b878e546b6af0e9bba1ec3879f"
185
+ },
186
+ "67f0416e7b3148698b02964bce412e8f": {
187
+ "label": true,
188
+ "text": "===============================\nConfidencial. Sujeito a privil,gio legal de comunica??o advogado/cliente.\nPrivileged and confidential attorney/client communication.\n\nPinheiro Neto - Advogados\n===============================\n - chart.doc\n - enron-q2_.doc\n\n\n",
189
+ "id": "67f0416e7b3148698b02964bce412e8f"
190
+ },
191
+ "1ee8405759884e078250db51a51960fe": {
192
+ "label": false,
193
+ "text": "\t\t\t\t\t\t\t<TD COLSPAN=\"2\"><SPAN CLASS=\"ArticleTitle\"><A HREF=\"https://www.wpo.org/benefits_and_services/publications/article_view.cfm?ArticleID=60&NewsletterID=11\" TARGET=\"new\" CLASS=\"ArticleTitle\">Splendors of La Serenissima</A></SPAN></TD>\n\t\t\t\t\t</TR>\n\t\t\t\t\t<TR>\n\t\t\t\t\t <TD WIDTH=\"15\"><IMG SRC=\"https://www.wpo.org/images/admin/enewsletter_admin/enewsletter/spacer.gif\" HEIGHT=\"8\" WIDTH=\"1\"></TD>",
194
+ "id": "1ee8405759884e078250db51a51960fe"
195
+ },
196
+ "6e2b830f1af94031a81380982e0eee06": {
197
+ "label": true,
198
+ "text": "===============================\nConfidencial. Sujeito a privil,gio legal de comunica??o advogado/cliente.\nPrivileged and confidential attorney/client communication.\n\nPinheiro Neto - Advogados\n===============================\n - enron-question.doc",
199
+ "id": "6e2b830f1af94031a81380982e0eee06"
200
+ },
201
+ "97e62b35b9974c9bb543fed193aed9d5": {
202
+ "label": false,
203
+ "text": "Most Democrats support legislation to reduce the role of money in politics. GOP leaders oppose it, and Democrats have long labored to depict Republicans as beholden to special interests. ",
204
+ "id": "97e62b35b9974c9bb543fed193aed9d5"
205
+ },
206
+ "e83e6092a67249e89b6ad77b39d35268": {
207
+ "label": false,
208
+ "text": "known since the 2nd century. In the 8th century it was the capital of Spain. There is also an important city in Venezuela named Valencia. When was Valencia the most important city in Spain?",
209
+ "id": "e83e6092a67249e89b6ad77b39d35268"
210
+ },
211
+ "824df6ad092f436ca5c923bb90b916f6": {
212
+ "label": true,
213
+ "text": "wife, as well as its Italian name: \"La Gioconda.\" Which of the following statements is true according to the passage?",
214
+ "id": "824df6ad092f436ca5c923bb90b916f6"
215
+ },
216
+ "9fb56ac53d444ae5b6a018d90a5808d8": {
217
+ "label": false,
218
+ "text": "known since the 2nd century. In the 8th century it was the capital of Spain. There is also an important city in Venezuela named Valencia. What is the main difference between the two parts of the city?",
219
+ "id": "9fb56ac53d444ae5b6a018d90a5808d8"
220
+ },
221
+ "1348ac65099049f9abf8401822f48966": {
222
+ "label": false,
223
+ "text": "Italian during the day, evening or on a onetoone basis. What does this passage mainly talk about?",
224
+ "id": "1348ac65099049f9abf8401822f48966"
225
+ },
226
+ "718e4099864145aba77fad8a6d77ed47": {
227
+ "label": false,
228
+ "text": "may imply us, only love can solve the problems between people, between the poor and the rich, love is everything. Which of the following is TRUE according to the passage?",
229
+ "id": "718e4099864145aba77fad8a6d77ed47"
230
+ },
231
+ "7ff37b233af54e978d0051deaa866b27": {
232
+ "label": true,
233
+ "text": "Ton camarade peut parfaitement exiger d'\u00eatre pay\u00e9 ce qui est inscrit dans le contrat. Il est possible que cela d\u00e9grade fortement ses relations avec l'entreprise et elle peut tr\u00e8s bien lui faire rater son ann\u00e9e en donnant un avis n\u00e9gatif sur ses performances. \u00c0 lui de voir si il souhaite continuer avec l'entreprise ou non.",
234
+ "id": "7ff37b233af54e978d0051deaa866b27"
235
+ },
236
+ "b52a661d85e04e8288abe2d87cb9cb74": {
237
+ "label": true,
238
+ "text": "sino un CONTRATO, por lo que de momento no voy a iniciar acciones legales pensando en que reconsidere su posici\u00f3n, pero si su decisi\u00f3n fuese no cumplir con EUROSEPER, mi gabinete jur\u00eddico HISPACOLEX, iniciar\u00e1 acciones legales contra usted por DA\u00d1OS Y PERJUICIOS, pues sabe que usted fu\u00e9 el profesor el a\u00f1o pasado, y sabe que hay muchas familias que le esperan como profesor a usted. Asi que espero",
239
+ "id": "b52a661d85e04e8288abe2d87cb9cb74"
240
+ },
241
+ "a97e74c1fb5940daac413e7d384e1ad7": {
242
+ "label": true,
243
+ "text": "Si l'entreprise refuse de payer ton camarade ce salaire l\u00e0 elle peut soit proposer une rupture conventionnelle avec les indemnit\u00e9s qui vont avec, soit s'il y a une p\u00e9riode d'essai ne pas continuer, soit aller aux prud'hommes pour faire casser le contrat si ils peuvent r\u00e9ellement prouver que c'est une erreur et pas simplement que l'\u00e9tudiant a n\u00e9goci\u00e9 \u00e7a.",
244
+ "id": "a97e74c1fb5940daac413e7d384e1ad7"
245
+ },
246
+ "22c1380ee8714ee9af2f89ac8899adc0": {
247
+ "label": true,
248
+ "text": "Maintenant, il ne semble pas y avoir beaucoup de preuves \u00e9tant donn\u00e9 la distance dans le temps. On se retrouve un peu dans une situation de il dit v. elle dit alors c'est pas vraiment clair jusqu'o\u00f9 \u00e7a peut aller si vous avez rien dit d'incriminant, mais je ne suis pas un avocat et encore moins un expert en droit p\u00e9nal. Consultez votre avocat(e) (c'est cette personne l'experte et votre personne",
249
+ "id": "22c1380ee8714ee9af2f89ac8899adc0"
250
+ },
251
+ "753c9b1a8de24131b30d283ce83a78b2": {
252
+ "label": false,
253
+ "text": "this is the whole section of Arbitration for my agreement:",
254
+ "id": "753c9b1a8de24131b30d283ce83a78b2"
255
+ },
256
+ "a41fec48ceb44c76b7b11407b74066c6": {
257
+ "label": false,
258
+ "text": "that add up to about $470 (like 47 lunches at the sandwich shop you like), or even a mix of items (like a new toaster oven, ten fancy coffees, and whatever), but put together a list of ten of them and write them down. ",
259
+ "id": "a41fec48ceb44c76b7b11407b74066c6"
260
+ },
261
+ "246967893bf14f818abd779c7c1d18bd": {
262
+ "label": true,
263
+ "text": "colocataire. Et de toute fa\u00e7on (et je dis \u00e7a sans conna\u00eetre tes ant\u00e9c\u00e9dents), il est tr\u00e8s peu probable que tu ailles en prison pour \u00e7a.",
264
+ "id": "246967893bf14f818abd779c7c1d18bd"
265
+ },
266
+ "07a05f094b074c84b19a6637261fdabc": {
267
+ "label": true,
268
+ "text": "op basis daarvan verzoeken om af te zien van het uitzenden.",
269
+ "id": "07a05f094b074c84b19a6637261fdabc"
270
+ },
271
+ "2b4276f61d014f1ca84c0cb5861bb312": {
272
+ "label": false,
273
+ "text": "Use a credit card like everyone else, instead of using your checking account.\n\nAlso, you are a grown-up, open your own damn checking &amp; savings account. It takes about 20 minutes. Then, put your own money in it, and spend it as you wish.",
274
+ "id": "2b4276f61d014f1ca84c0cb5861bb312"
275
+ },
276
+ "36f6833a45d340b78a3909624fbfcc3b": {
277
+ "label": true,
278
+ "text": "\"EStimado Sr. Adam Miller, me sorprende su falta de fomalidad, como usted bien sabe esta empresa siempre ha cumplido con usted, incluso me dice que no tuvo vacaciones, cuando en realidad estuvo trabajando solo de 6 a 8, y el contrato era de 4 a 8. No obstante reconozco su val\u00eda profesional y mi intenci\u00f3n es seguir contando con usted este a\u00f1o en la Carlota, y durante el curso ir\u00e1 recibiendo m\u00e1s",
279
+ "id": "36f6833a45d340b78a3909624fbfcc3b"
280
+ },
281
+ "2bf2c86ce5494156b1bf7b86a2325a17": {
282
+ "label": false,
283
+ "text": "Answer #1: Most flea drops are meant for a single cat who can not lick the drops off themselves. When put on 11 cats they are going to ingest a ton of it licking it off their buddies. Odds are that is what happened and the primary groomers are the ones who got sick and died.Answer #2: Usually those medications are aimed to be used on one to two animals at a time, the assumption being that most",
284
+ "id": "2bf2c86ce5494156b1bf7b86a2325a17"
285
+ },
286
+ "12f4657613b543ed90caaf1d52808dc1": {
287
+ "label": true,
288
+ "text": "decentemente. As\u00ed que no debo haberlo hecho tan mal.",
289
+ "id": "12f4657613b543ed90caaf1d52808dc1"
290
+ },
291
+ "21b1929e322c41fbba05d71cdc143aa2": {
292
+ "label": true,
293
+ "text": "Porsi a distanza, quel tanto che basta per mettere bene a fuoco e osservare che le cose non sono esattamente come credevi che fossero.",
294
+ "id": "21b1929e322c41fbba05d71cdc143aa2"
295
+ },
296
+ "792ffd2c51ea4c119c0718dd19f2acae": {
297
+ "label": true,
298
+ "text": "Cinq \u00e9oliennes en plus, un nouveau souffle sur la plaine de l'Orbieu - http",
299
+ "id": "792ffd2c51ea4c119c0718dd19f2acae"
300
+ },
301
+ "cc5826b82642497790882e22667c69ba": {
302
+ "label": true,
303
+ "text": "Bestes Gef\u00fchl der Welt: Schuhe aus, barfu\u00df laufen",
304
+ "id": "cc5826b82642497790882e22667c69ba"
305
+ },
306
+ "ae1f9ba6bb9e41549cba2c4f3857dabf": {
307
+ "label": true,
308
+ "text": "Troco o time vermelho todo com super brindes chamados Miriam e Leo, e voc\u00eas devolvem o @user #MasterChefBR",
309
+ "id": "ae1f9ba6bb9e41549cba2c4f3857dabf"
310
+ },
311
+ "2585790e439d4ff0926f0b26bbfd6a43": {
312
+ "label": true,
313
+ "text": "Training: Starker Smog verz\u00f6gert letztes Training zum Indien-Rennen - Abendzeitung M\u00fcnchen: ... http #Muenchen #Munich",
314
+ "id": "2585790e439d4ff0926f0b26bbfd6a43"
315
+ },
316
+ "6f7883441aad4a0d9e32768e6400163d": {
317
+ "label": true,
318
+ "text": "#Encontro ACHO QUE EU AMEI MESMO DISTANTE! Que lindo Nando Reis...",
319
+ "id": "6f7883441aad4a0d9e32768e6400163d"
320
+ },
321
+ "df1084a871054de3970c251fb65b32e5": {
322
+ "label": true,
323
+ "text": "@user \u0641\u0631\u0646\u0633\u0627 ..\u0647\u0648\u0644\u0627\u0646\u062f \u0645\u0627\u0634\u0649 \u0648\u062c\u0649 \u0627\u0644\u064a\u0645\u0646 \u0627\u0644\u0645\u062a\u0637\u0631\u0641 \u0628\u0642\u064a\u0627\u062f\u0629 \u0645\u0627\u0631\u0649 \u0644\u0648\u0628\u0627\u0646 \u0632\u0649 \u062a\u0631\u0627\u0645\u0628 \u0643\u062f\u0647",
324
+ "id": "df1084a871054de3970c251fb65b32e5"
325
+ },
326
+ "58332759cf794aba857d927a14994b88": {
327
+ "label": true,
328
+ "text": "\u0646\u062c\u0631\u0627\u0646: \u0625\u0637\u0644\u0627\u0642 \u0635\u0644\u064a\u0629 \u0635\u0648\u0627\u0631\u064a\u062e \u0639\u0644\u0649 \u062a\u062c\u0645\u0639\u0627\u062a \u0644\u0640 #\u0627\u0644\u062c\u064a\u0634_\u0627\u0644\u0633\u0639\u0648\u062f\u064a \u0648\u0622\u0644\u064a\u0627\u062a\u0647 \u0641\u064a \u0645\u0648\u0642\u0639 \u0627\u0644\u0647\u0631\u0645 \u0645\u062d\u0642\u0642\u0629 \u0625\u0635\u0627\u0628\u0627\u062a \u0645\u0628\u0627\u0634\u0631\u0629 #\u0644\u0628\u0646\u0627\u0646_\u0627\u0644\u0622\u0646",
329
+ "id": "58332759cf794aba857d927a14994b88"
330
+ },
331
+ "32cbb4e235dc4206a5f00bf40e98857f": {
332
+ "label": true,
333
+ "text": "RT @user: Ich muss wieder mehr Beats machen. NEVER SLEEP CAUSE SLEEP IS THE CAUSE OF DEATH #nasvoice #music\u2026 http",
334
+ "id": "32cbb4e235dc4206a5f00bf40e98857f"
335
+ },
336
+ "e7f8abdf87db4a89b703d2ccc097adfa": {
337
+ "label": true,
338
+ "text": "@user heute kein neuer 40DOD Bericht :(",
339
+ "id": "e7f8abdf87db4a89b703d2ccc097adfa"
340
+ },
341
+ "3bd5a83708a84289b2af8edbb56de338": {
342
+ "label": false,
343
+ "text": "Lazy Sunday Ray Allen Is Lamar Odom really going to the D League ?http://t.co/w6juHFgR ",
344
+ "id": "3bd5a83708a84289b2af8edbb56de338"
345
+ },
346
+ "4ef05c2d3a3543bf8a1c0b25bde57e2a": {
347
+ "label": true,
348
+ "text": "mujhe to sidha daant padti thi . . . bacho k hath me paise nahi diye jate warna bigad jayenge :d :d",
349
+ "id": "4ef05c2d3a3543bf8a1c0b25bde57e2a"
350
+ },
351
+ "45311ee95c2e4cb1925d0040fb934f71": {
352
+ "label": true,
353
+ "text": "@user Oh ja, bitte mal Bescheid geben, wenn Helene Fischer dran ist!",
354
+ "id": "45311ee95c2e4cb1925d0040fb934f71"
355
+ },
356
+ "e719e02d417542b69b86d13ad7cad8ce": {
357
+ "label": false,
358
+ "text": "@user Hey, just thought I'd remind you it's Deezer's birthday tomorrow! Also, you have any idea what he looks like?\" ",
359
+ "id": "e719e02d417542b69b86d13ad7cad8ce"
360
+ },
361
+ "561960a730de49c58423a8bf85df3dd1": {
362
+ "label": true,
363
+ "text": "#MaisVoce sou muito f\u00e3 do Dan, nossa que del\u00edcia essa entrevista, adorei ele no filme \"tempos de paz\" @user \u00e9 um ser humano lindo!",
364
+ "id": "561960a730de49c58423a8bf85df3dd1"
365
+ },
366
+ "d6b6fa3a919d4770a6586e007be914bf": {
367
+ "label": true,
368
+ "text": "RT @user: \u201c@KusXAnke: Straks buikdansen w/ @user &amp; haar nichten ! Was super!",
369
+ "id": "d6b6fa3a919d4770a6586e007be914bf"
370
+ },
371
+ "40b1078ffcec4a7da9899f0dd82f9d7f": {
372
+ "label": true,
373
+ "text": "#Grillo ieri a Conegliano ha detto che pagare le tasse e' giusto ma vuole conoscere la destinazione d'uso. Anche la Lega diceva cosi'...",
374
+ "id": "40b1078ffcec4a7da9899f0dd82f9d7f"
375
+ },
376
+ "758697fd8ebe4848af77a44757256203": {
377
+ "label": true,
378
+ "text": "#clouds #staatskanzlei #munich #m\u00fcnchen #blau #wolken #himmel #sky #silhouette #instagood\u2026 http",
379
+ "id": "758697fd8ebe4848af77a44757256203"
380
+ },
381
+ "95b69bc9b1214b14a600d7dfaea192f5": {
382
+ "label": true,
383
+ "text": "Depois desse #MaisVoce de hoje, se nadar der certo, eu viro #meseira",
384
+ "id": "95b69bc9b1214b14a600d7dfaea192f5"
385
+ },
386
+ "be6bad976b5e4d0c88e166aa583bd9cd": {
387
+ "label": true,
388
+ "text": "@user H\u00f6chst unbefriedigend...",
389
+ "id": "be6bad976b5e4d0c88e166aa583bd9cd"
390
+ },
391
+ "8cb329f9d5744ac6a52a4c3e823212c4": {
392
+ "label": false,
393
+ "text": "$15 minimum wage is a win-win. If businesses continue to boom, it's a win. Or, if it puts @user out of b\u2026 ",
394
+ "id": "8cb329f9d5744ac6a52a4c3e823212c4"
395
+ },
396
+ "c1542633006b4f62aaf7fd84b3962266": {
397
+ "label": true,
398
+ "text": "\u0638\u0647\u0648\u0631 \u0635\u0648\u0631 \u0633\u0627\u0639\u0629 HTC Halfbeak \u0628\u0646\u0638\u0627\u0645 \u0623\u0646\u062f\u0631\u0648\u064a\u062f \u0648\u064a\u0631http #\u0631\u064a\u0627\u0644_\u0645\u062f\u0631\u064a\u062f #\u0628\u0631\u0634\u0644\u0648\u0646\u0629",
399
+ "id": "c1542633006b4f62aaf7fd84b3962266"
400
+ },
401
+ "f2bb8836726d49a7ad23df371a877519": {
402
+ "label": true,
403
+ "text": "Heute nicht nur den Herzensmenschen geheiratet, sondern auch ganz viel Liebe von @user bekommen. Unbezahlbar &lt;3",
404
+ "id": "f2bb8836726d49a7ad23df371a877519"
405
+ },
406
+ "e5d6b7cb7c25419fbd8cb29119c26577": {
407
+ "label": true,
408
+ "text": "Invention de la premi\u00e8re cellule solaire qui stocke l\u2019\u00e9lectricit\u00e9 http",
409
+ "id": "e5d6b7cb7c25419fbd8cb29119c26577"
410
+ },
411
+ "46ae96b8c72e48f8992bd716116dc761": {
412
+ "label": true,
413
+ "text": "#Mussi : \\\"#Grillo farebbe bene a spararsi nei coglioni\\\" . E poi dicono di #Grillo...",
414
+ "id": "46ae96b8c72e48f8992bd716116dc761"
415
+ },
416
+ "f4912b96e2d34a5db245de82a8f7a463": {
417
+ "label": true,
418
+ "text": "RT @user: \u0639\u062c\u0628\u0627 \u0644\u0645\u0646 \u062e\u0631\u062c \u0639\u0644\u0649 \u0641\u0633\u0627\u062f \u0645\u0628\u0627\u0631\u0643 \u0648\u064a\u0645\u062a\u0646\u0639 \u0639\u0646 \u0627\u0644\u062e\u0631\u0648\u062c \u0639\u0644\u0649 \u0627\u0644\u0642\u062a\u0644 \u0648\u0627\u0644\u0627\u0639\u062a\u0642\u0627\u0644 \u0648\u0627\u0644\u0641\u0633\u0627\u062f \u0648\u0627\u0644\u062a\u0642\u0634\u0641 \u0648\u0627\u0644\u0643\u0633\u0627\u062f \u0648\u0627\u0644\u062a\u0631\u062f\u0649#\u062b\u0648\u0631\u0648 #\u0633\u064a\u0633\u064a_\u062a\u0627\u0646\u064a_\u0644\u0627",
419
+ "id": "f4912b96e2d34a5db245de82a8f7a463"
420
+ },
421
+ "35f179c9260b4a22a9ac5d11fc9e81ad": {
422
+ "label": true,
423
+ "text": "Valls 2 : apr\u00e8s les \u00e9cologistes, le socialisme quitte le gouvernement | Les Jeunes Ecologistes http #EELV @user",
424
+ "id": "35f179c9260b4a22a9ac5d11fc9e81ad"
425
+ },
426
+ "1e649a8d62a54b13b1f7cb8297473147": {
427
+ "label": true,
428
+ "text": "Ich werde gerade dezent nicht wach...",
429
+ "id": "1e649a8d62a54b13b1f7cb8297473147"
430
+ },
431
+ "22548ea5d02848c6990303a6fed08189": {
432
+ "label": true,
433
+ "text": "bhai totlly phadu super hero h nagraj ... hollywood ki trh bollywood me bhi inki muvies bnni chahiye ... wese doga ki new muvi bn rhi h nxt year tk ajayegi .... ",
434
+ "id": "22548ea5d02848c6990303a6fed08189"
435
+ },
436
+ "0955950e8a914ee69e580275c5c3f34b": {
437
+ "label": true,
438
+ "text": "...il fatto che Di Pietro non sia d'accordo su Mario Monti Premier conferma senza ombra di dubbio che sia la scelta giusta ! Cit. Gabri",
439
+ "id": "0955950e8a914ee69e580275c5c3f34b"
440
+ },
441
+ "87639635c3d14e50a227344cfcab7345": {
442
+ "label": true,
443
+ "text": "project ki deadline par daudte the , baki time to \" are kar lenge bahut time he \" . . . . . . . . . . ",
444
+ "id": "87639635c3d14e50a227344cfcab7345"
445
+ },
446
+ "560917c067374423bece98c64a628fbe": {
447
+ "label": false,
448
+ "text": "The decision to recount votes in Wisconsin is a joke. Leftists are still spotting-the-dummy of their loss. #TrumpTransition ",
449
+ "id": "560917c067374423bece98c64a628fbe"
450
+ },
451
+ "a4c1fc4296c14ccf95989796747f8753": {
452
+ "label": false,
453
+ "text": "#TOLOnews TOLOnews 08 October 2012: Top news in this Bulletin: The International Committee of the ... #Afghanistan ",
454
+ "id": "a4c1fc4296c14ccf95989796747f8753"
455
+ },
456
+ "2a188e097cc044458eb1a6e6d39114f4": {
457
+ "label": true,
458
+ "text": "\u201c@gabrielepinese: #Grillo fa bene a evitare la tv\\\" a dire il vero non fa mai neanche un contraddittorio neanche via web che lui ama tanto",
459
+ "id": "2a188e097cc044458eb1a6e6d39114f4"
460
+ },
461
+ "1c00eacdad1645e99574d588e893e7ea": {
462
+ "label": true,
463
+ "text": "oi, queria saber por que existe tanta manifesta\u00e7\u00e3o contraria a criminaliza\u00e7\u00e3o da homofobia e quem s\u00e3o os principais opositores? #encontro",
464
+ "id": "1c00eacdad1645e99574d588e893e7ea"
465
+ },
466
+ "1baa09b1506f46aea084875bd4148d33": {
467
+ "label": true,
468
+ "text": "Non sono forse titolato a dirlo.... Ma professor #monti mi ha deluso \u00e8 sicuramente capace ma le \u00e8 mancato il coraggio di vere scelte....",
469
+ "id": "1baa09b1506f46aea084875bd4148d33"
470
+ },
471
+ "629b60cc494444e295154826367bf5df": {
472
+ "label": true,
473
+ "text": "pdai kaisi chl rhi h ",
474
+ "id": "629b60cc494444e295154826367bf5df"
475
+ },
476
+ "175810d344da4dcf8718aac980264599": {
477
+ "label": true,
478
+ "text": "#Grillo non \u00e8 il peggiore dei mali, e se tra mali devo scegliere lui \u00e8 quello minore, o quello di cui ancora non ho sofferto...",
479
+ "id": "175810d344da4dcf8718aac980264599"
480
+ },
481
+ "96ff52f3b004487dbe23dd0bfb253890": {
482
+ "label": true,
483
+ "text": "\u30e9\u30a4\u30c8\u3092\u63a2\u3057\u3066\u308b",
484
+ "id": "96ff52f3b004487dbe23dd0bfb253890"
485
+ },
486
+ "26cdcf37c3ab4d9a9eafc5826c1258f4": {
487
+ "label": true,
488
+ "text": "\u4e2d\u5fc3",
489
+ "id": "26cdcf37c3ab4d9a9eafc5826c1258f4"
490
+ },
491
+ "caa57d9500934e5db76b08b1714a48d8": {
492
+ "label": true,
493
+ "text": "\u4f55\u304b\u5f15\u3063\u304b\u304b\u308a\u307e\u3057\u305f",
494
+ "id": "caa57d9500934e5db76b08b1714a48d8"
495
+ },
496
+ "e692438f7dbd4913a3dc46c4d9c52bcd": {
497
+ "label": true,
498
+ "text": "\u5e78\u904b\u306b\u3082",
499
+ "id": "e692438f7dbd4913a3dc46c4d9c52bcd"
500
+ },
501
+ "39612fd3901149beacb9b5acc7fe3dfc": {
502
+ "label": true,
503
+ "text": "\u3082\u3046\uff01",
504
+ "id": "39612fd3901149beacb9b5acc7fe3dfc"
505
+ },
506
+ "7401e3a8c6294ce0a8d2cfe3201e1cd0": {
507
+ "label": true,
508
+ "text": "\u53ce\u5bb9\u6240\u3067\u30ec\u30a4\u3068\u5c45\u305f\u8005\u306b...",
509
+ "id": "7401e3a8c6294ce0a8d2cfe3201e1cd0"
510
+ },
511
+ "455070045f3a49b193d679118d5265a5": {
512
+ "label": true,
513
+ "text": "\u304a\u524d\u306f\u4f55\u8005\u3060\uff1f",
514
+ "id": "455070045f3a49b193d679118d5265a5"
515
+ },
516
+ "3378c04edaab407388f2d364b9d39218": {
517
+ "label": true,
518
+ "text": "\u3042\u3089\u3001\u9b45\u529b\u7684\u3002",
519
+ "id": "3378c04edaab407388f2d364b9d39218"
520
+ },
521
+ "cdd47a2fa483487e8e29c7fab7d142ad": {
522
+ "label": true,
523
+ "text": "\u304a\u524d\u306e\u5973\u306b\u306a\u3093\u304b\u306b\u306a\u3089\u306a\u3044\uff01",
524
+ "id": "cdd47a2fa483487e8e29c7fab7d142ad"
525
+ },
526
+ "fa55a10d8c564d2880f14c8f9aba86bd": {
527
+ "label": true,
528
+ "text": "\u63a5\u7d9a\u6e08\u307f: %1, [%2], %3",
529
+ "id": "fa55a10d8c564d2880f14c8f9aba86bd"
530
+ },
531
+ "54d13a0d6ade412bb8f5c3b534b995ba": {
532
+ "label": true,
533
+ "text": "\u305d\u3046\u3044\u3046\u3053\u3068\u306d",
534
+ "id": "54d13a0d6ade412bb8f5c3b534b995ba"
535
+ },
536
+ "53d3699c466d4428a0b6abbc497ed83f": {
537
+ "label": true,
538
+ "text": "\u554f\u984c\uff1f \u90e8\u9577\u3001\u7dca\u6025\u4e8b\u614b\u3067\u3059",
539
+ "id": "53d3699c466d4428a0b6abbc497ed83f"
540
+ },
541
+ "cbc53f06cfd947bb80f51035a08f7333": {
542
+ "label": true,
543
+ "text": "\u4f55\u304c\u597d\u304d\uff1f",
544
+ "id": "cbc53f06cfd947bb80f51035a08f7333"
545
+ },
546
+ "8fe58a19fad845dfa3929b32ddaae4a4": {
547
+ "label": true,
548
+ "text": "\u30cf\u30ed\u30eb\u30c9\u306e\u5b50\u3060\u3068 \u77e5\u3063\u3066\u3044\u305f\u3060\u308d\u3046 \u3068",
549
+ "id": "8fe58a19fad845dfa3929b32ddaae4a4"
550
+ },
551
+ "b96fea844e3f4ac9b876f4de3ab2cc05": {
552
+ "label": true,
553
+ "text": "\u9055\u3046\u3000\u9055\u3046\u3000\u50d5\u3058\u3083\u306a\u3044\uff01",
554
+ "id": "b96fea844e3f4ac9b876f4de3ab2cc05"
555
+ },
556
+ "d176c2a4730043edb0ffbd1dd604710f": {
557
+ "label": true,
558
+ "text": "\u4f5c\u6226\u306b\u4f7f\u3063\u305f\u8eca\u306f\u4f55?",
559
+ "id": "d176c2a4730043edb0ffbd1dd604710f"
560
+ },
561
+ "2c3983f02d0344d58c4c1624e380f699": {
562
+ "label": false,
563
+ "text": "(c) 2000-2008, The KDE Team (c) 2003-2005, Klaus Niederkr\u00fcger (c) 1996-2000, Bernd Johannes Wuebben",
564
+ "id": "2c3983f02d0344d58c4c1624e380f699"
565
+ },
566
+ "923a3c65d5d544f483d56a0295ea2960": {
567
+ "label": true,
568
+ "text": "\u049a\u043e\u044e\u041a\u04e9\u043a\u0448\u0456\u043b\u0421\u04b1\u0440color",
569
+ "id": "923a3c65d5d544f483d56a0295ea2960"
570
+ },
571
+ "6d141199809f446b802cbaec666cb227": {
572
+ "label": false,
573
+ "text": "This is a searchable index. Enter search keywords:",
574
+ "id": "6d141199809f446b802cbaec666cb227"
575
+ },
576
+ "e1f1fda5f08041feb4a00f30c058b37d": {
577
+ "label": false,
578
+ "text": "And some people are suggesting that he's joined up with Obote's exiles.",
579
+ "id": "e1f1fda5f08041feb4a00f30c058b37d"
580
+ },
581
+ "1024bbe7dd53462e99c7cbd502975062": {
582
+ "label": true,
583
+ "text": "\u049a\u0430\u0441\u0438\u0435\u0442\u0442\u0435\u0440\u0456Comment",
584
+ "id": "1024bbe7dd53462e99c7cbd502975062"
585
+ },
586
+ "b6ec30c58ba741d6ad8283c5ad902dfa": {
587
+ "label": true,
588
+ "text": "\u041a\u0456\u0440\u0456\u0441 \u0434\u0435\u0440\u0435\u043a\u0442\u0435\u0440 \u0444\u0430\u0439\u043b\u044b",
589
+ "id": "b6ec30c58ba741d6ad8283c5ad902dfa"
590
+ },
591
+ "15f7d3f7557743df8502ad1e27a0ec73": {
592
+ "label": false,
593
+ "text": "PythonLanguage",
594
+ "id": "15f7d3f7557743df8502ad1e27a0ec73"
595
+ },
596
+ "da4a227e77314496b91ad060c2fe0418": {
597
+ "label": true,
598
+ "text": "\u0416\u043e\u0493\u0430\u0440\u044b\u041a\u043e\u043d\u0442\u0440\u0430\u0441\u0442Comment",
599
+ "id": "da4a227e77314496b91ad060c2fe0418"
600
+ },
601
+ "903074456936489186d4882d7267abfb": {
602
+ "label": true,
603
+ "text": "\u0416\u0430\u0443\u0430\u043f \u043c\u04d9\u0442\u0456\u043d\u0456\u043d\u0434\u0435 \u043a\u0435\u043b\u0435\u0441\u0456 \u0430\u0439\u043d\u044b\u043c\u0430\u043b\u044b\u043b\u0430\u0440 \u049b\u043e\u043b\u0434\u0430\u043d\u044b\u043b\u0430\u0434\u044b:% NAME =\u0436\u0456\u0431\u0435\u0440\u0443\u0448\u0456\u043d\u0456\u04a3 \u0430\u0442\u044b,% EMAIL =\u0436\u0456\u0431\u0435\u0440\u0443\u0448\u0456\u043d\u0456\u04a3 \u044d\u043b. \u043f\u043e\u0448\u0442\u0430 \u0430\u0434\u0440\u0435\u0441\u0456",
604
+ "id": "903074456936489186d4882d7267abfb"
605
+ },
606
+ "bbda9c3b01e543afb4009eb7f262b822": {
607
+ "label": false,
608
+ "text": "CORREL( A1: A3; B1: B3)",
609
+ "id": "bbda9c3b01e543afb4009eb7f262b822"
610
+ },
611
+ "47781ac273574ad0bc1e15b40ba9f6d3": {
612
+ "label": true,
613
+ "text": "\u041e\u0440\u0430\u043d\u0434\u0430\u043b\u0443\u0434\u0430",
614
+ "id": "47781ac273574ad0bc1e15b40ba9f6d3"
615
+ },
616
+ "9f98bff4c92a4f09a1502f4256c485ae": {
617
+ "label": true,
618
+ "text": "\u0411\u043e\u0437\u0430\u04a3\u041a\u04af\u043b\u0433\u0456\u043d\u049a\u044b\u0437\u044b\u043b3color",
619
+ "id": "9f98bff4c92a4f09a1502f4256c485ae"
620
+ },
621
+ "ccf5d3c35fdd45fe80f9cd5488e52dd6": {
622
+ "label": true,
623
+ "text": "\u040f\u0435\u0441, \u043e\u0434\u0438!",
624
+ "id": "ccf5d3c35fdd45fe80f9cd5488e52dd6"
625
+ },
626
+ "5c0d9b1b7d864949ac1e3352067598e4": {
627
+ "label": true,
628
+ "text": "\u0422\u043e\u0433\u0430\u0448 \u0442\u0430\u0430 \u0431\u0438\u043b\u0430 \u043f\u0440\u0435\u0441\u0440\u0435\u0442\u043d\u0430\u0442\u0430 \u043d\u0430\u0434\u0432\u043e\u0440 \u043e\u0434 \u043e\u0431\u043b\u0430\u0441\u0442\u0430 \u0410\u0458\u0430\u043c\u043a\u0443 \u0413\u0430\u0434\u043e\u043d\u0433 \u0438 \u0442\u043e\u0430 \u0435 \u043e\u043d\u0430 \u0448\u0442\u043e \u0441\u0435 \u0441\u043b\u0443\u0447\u0438\u043b\u043e.",
629
+ "id": "5c0d9b1b7d864949ac1e3352067598e4"
630
+ },
631
+ "462c4fa658da42599217c7e857ea93b9": {
632
+ "label": false,
633
+ "text": "The Convergence Reports issued by the EC and the ECB on Wednesday said the two countries fulfilled the membership criteria, including inflation rate, government finance, exchange rate and long- term interest rates.",
634
+ "id": "462c4fa658da42599217c7e857ea93b9"
635
+ },
636
+ "9ed529642c114ee8a8e1634bf8d4275a": {
637
+ "label": true,
638
+ "text": "53-\u0433\u043e\u0434\u0438\u0448\u043d\u0438\u043e\u0442 \u0408\u0443\u0440\u0447\u0438\u045c \u0435 \u043f\u0440\u043e\u0444\u0435\u0441\u043e\u0440 \u043f\u043e \u0435\u043a\u043e\u043d\u043e\u043c\u0438\u0458\u0430 \u043d\u0430 \u0423\u043d\u0438\u0432\u0435\u0440\u0437\u0438\u0442\u0435\u0442\u043e\u0442 \u0432\u043e \u0417\u0430\u0433\u0440\u0435\u0431 \u0438 \u0431\u0435\u0448\u0435 \u043c\u0438\u043d\u0438\u0441\u0442\u0435\u0440 \u0437\u0430 \u0444\u0438\u043d\u0430\u043d\u0441\u0438\u0438 \u043e\u0434 2000-\u0442\u0430 \u0434\u043e 2003-\u0442\u0430 \u0433\u043e\u0434\u0438\u043d\u0430.",
639
+ "id": "9ed529642c114ee8a8e1634bf8d4275a"
640
+ },
641
+ "6b507d9297c44a44b492beaa05a743c2": {
642
+ "label": true,
643
+ "text": "\u041f\u0430, \u0437\u0430 \u0432\u0430\u0441 \u0442\u0438\u043d\u0435\u0458\u045f\u0435\u0440\u0438 \u043a\u043e\u0438 \u043f\u0430\u0442\u0438\u0442\u0435 \u043e\u0434 \u0421\u041d\u0412...",
644
+ "id": "6b507d9297c44a44b492beaa05a743c2"
645
+ },
646
+ "d9e8d8e96a494171bc349cdc843bef65": {
647
+ "label": true,
648
+ "text": "\u0413\u0443\u0431\u0438 \u043c\u0438 \u0441\u0435 \u043e\u0434 \u043f\u0430\u0442\u043e\u0442!",
649
+ "id": "d9e8d8e96a494171bc349cdc843bef65"
650
+ },
651
+ "c4b56ee00c1343db9c95693493ba85e4": {
652
+ "label": true,
653
+ "text": "\u0412\u0430\u0436\u043d\u0435\u0439\u0448\u0435\u0435 \u043c\u0435\u0441\u0442\u043e \u0432 \u043f\u043e\u0432\u0435\u0441\u0442\u043a\u0435 \u0434\u043d\u044f \u0421\u043e\u0432\u0435\u0442\u0430 \u043f\u043e-\u043f\u0440\u0435\u0436\u043d\u0435\u043c\u0443 \u0437\u0430\u043d\u0438\u043c\u0430\u043b\u0438 \u0432\u043e\u043f\u0440\u043e\u0441\u044b, \u043a\u0430\u0441\u0430\u044e\u0449\u0438\u0435\u0441\u044f \u0410\u0444\u0440\u0438\u043a\u0438.",
654
+ "id": "c4b56ee00c1343db9c95693493ba85e4"
655
+ },
656
+ "525e90725cb147e9a5474613924f2dc5": {
657
+ "label": true,
658
+ "text": "\u0438 \u041e\u0431\u044a\u0435\u0434\u0438\u043d\u0435\u043d\u043d\u044b\u043c\u0438 \u0410\u0440\u0430\u0431\u0441\u043a\u0438\u043c\u0438 \u042d\u043c\u0438\u0440\u0430\u0442\u0430\u043c\u0438 (1991 \u0433\u043e\u0434)",
659
+ "id": "525e90725cb147e9a5474613924f2dc5"
660
+ },
661
+ "98a3f90eafd642779ebeb30ccb68dbee": {
662
+ "label": false,
663
+ "text": "MDA reached 1.3m",
664
+ "id": "98a3f90eafd642779ebeb30ccb68dbee"
665
+ },
666
+ "0d36a456b4244ff3841a222efac7da99": {
667
+ "label": true,
668
+ "text": "i) \u043e\u0431\u0440\u0430\u0431\u0430\u0442\u044b\u0432\u0430\u043b\u0438\u0441\u044c, \u0441\u043e\u0431\u0438\u0440\u0430\u043b\u0438\u0441\u044c, \u0442\u0440\u0430\u043d\u0441\u043f\u043e\u0440\u0442\u0438\u0440\u043e\u0432\u0430\u043b\u0438\u0441\u044c \u0438 \u0445\u0440\u0430\u043d\u0438\u043b\u0438\u0441\u044c \u044d\u043a\u043e\u043b\u043e\u0433\u0438\u0447\u0435\u0441\u043a\u0438 \u0431\u0435\u0437\u043e\u043f\u0430\u0441\u043d\u044b\u043c \u043e\u0431\u0440\u0430\u0437\u043e\u043c;",
669
+ "id": "0d36a456b4244ff3841a222efac7da99"
670
+ },
671
+ "56e27d24b0b04e52bbb1de4be037602c": {
672
+ "label": true,
673
+ "text": "\u0415\u0435 \u0438\u043d\u0442\u0435\u0440\u0435\u0441\u0443\u0435\u0442, \u043f\u043b\u0430\u043d\u0438\u0440\u0443\u0435\u0442\u0441\u044f \u043b\u0438 \u043f\u0440\u0435\u0434\u043e\u0441\u0442\u0430\u0432\u0438\u0442\u044c \u0442\u0430\u043a\u0438\u0435 \u0436\u0435 \u043f\u0440\u0430\u0432\u0430 \u0436\u0435\u043d\u0449\u0438\u043d\u0430\u043c, \u0441\u043e\u0441\u0442\u043e\u044f\u0449\u0438\u043c \u0432 \u0431\u0440\u0430\u043a\u0435 \u0434\u0435 \u0444\u0430\u043a\u0442\u043e, \u0438 \u0432\u043a\u043b\u044e\u0447\u0435\u043d\u044b \u043b\u0438 \u043f\u043e\u043b\u043e\u0436\u0435\u043d\u0438\u044f, \u043f\u0440\u0435\u0434\u0443\u0441\u043c\u0430\u0442\u0440\u0438\u0432\u0430\u044e\u0449\u0438\u0435 \u0432\u044b\u043f\u043b\u0430\u0442\u0443 \u0430\u043b\u0438\u043c\u0435\u043d\u0442\u043e\u0432 \u0441\u0443\u043f\u0440\u0443\u0433\u0443, \u043d\u0430\u0445\u043e\u0434\u044f\u0449\u0435\u043c\u0443\u0441\u044f \u0432 \u043c\u0435\u043d\u0435\u0435 \u0431\u043b\u0430\u0433\u043e\u043f\u0440\u0438\u044f\u0442\u043d\u043e\u043c \u043f\u043e\u043b\u043e\u0436\u0435\u043d\u0438\u0438.",
674
+ "id": "56e27d24b0b04e52bbb1de4be037602c"
675
+ },
676
+ "1654d6e38f1c4a959a8e7e64867c5f73": {
677
+ "label": true,
678
+ "text": "\u5f88\u62b1\u6b49\u8ba9\u4f60\u4e45\u7b49",
679
+ "id": "1654d6e38f1c4a959a8e7e64867c5f73"
680
+ },
681
+ "23102d4274b34d94823d9d5791f7007a": {
682
+ "label": true,
683
+ "text": "141\u653f",
684
+ "id": "23102d4274b34d94823d9d5791f7007a"
685
+ },
686
+ "668c79418de44d50919623f76bba1526": {
687
+ "label": true,
688
+ "text": "\u6211\u5728\u8fd9\u91cc\u624d\u80fd\u505a\u771f\u6b63\u7684\u81ea\u5df1 Where I can be who I am,",
689
+ "id": "668c79418de44d50919623f76bba1526"
690
+ },
691
+ "fc5c14db340041af907312914e4b7a25": {
692
+ "label": true,
693
+ "text": "\u8bf4\u5440",
694
+ "id": "fc5c14db340041af907312914e4b7a25"
695
+ },
696
+ "c62d0f67fec04d7b93dd2ed0d1c67448": {
697
+ "label": true,
698
+ "text": "\u4ed6\u6709\u5ba1\u7406\u8fc7\u5f3a\u5978\u3001\u51f6\u6740\u548c\u5176\u4ed6\u4e25\u91cd\u7684\u66b4\u529b\u548c\u6027\u653b\u51fb\u7b49\u6848\u4ef6\u7684\u7ecf\u9a8c\u3002",
699
+ "id": "c62d0f67fec04d7b93dd2ed0d1c67448"
700
+ },
701
+ "5053e48dcf6748669b3d47ff5b537772": {
702
+ "label": true,
703
+ "text": "Pse nuk mund te kerkoje dot nje falje dhe cdo gje do te ishte rregulluar por mban inatin sikur e kisha une fajin. \nNuk me vjen mire qe ndahemi te zemeruar me njeri-tjetrin.\n\nte dua,\nMonika",
704
+ "id": "5053e48dcf6748669b3d47ff5b537772"
705
+ },
706
+ "c14e863d2afa452a8fe563c0e2f14b50": {
707
+ "label": true,
708
+ "text": "Me ke bere shume merak se nuk arrij ta kuptos se ku je tani. Te lutem mos me bej merak keshtu. Koli me degjon. Te lutem me informo se ku je. Nuk eshte menyre e mire kjo te mbash inat me mua.\n\npergjigju sa me shpejt\nte dua\nMOnika",
709
+ "id": "c14e863d2afa452a8fe563c0e2f14b50"
710
+ },
711
+ "33808b705c7241b789f60e4feea42289": {
712
+ "label": false,
713
+ "text": "\nAs we discussed at the Board meeting last week, the impetus for a single\nmaster agreement will need to come from several fronts, but especially from\nwithin each of your firms. The various trade associations will be most\nresponsive to the idea if they are hearing strong support for a single\nagreement from decision-making levels within member firms. We will",
714
+ "id": "33808b705c7241b789f60e4feea42289"
715
+ },
716
+ "7d3df48ed3c44324ac8814049ab5c581": {
717
+ "label": false,
718
+ "text": "Straightforward? Yes. Easy to accomplish? No. ",
719
+ "id": "7d3df48ed3c44324ac8814049ab5c581"
720
+ },
721
+ "5fbf30f5097747eda8ae327aeba95443": {
722
+ "label": true,
723
+ "text": "Pse nuk me puthe sot kur u ndame? Ti e di qe une te dua shume dhe dua qe ti ulim nervat shpejt. Une u nevrikosa pasi nuk e duroj dot fjalorin e keq dhe dua qe ta heqim te dy, edhe ti edhe une. Por ti nuk e kupton se sa e rendesishme eshte per mua nje dicka e tille, qe ne te punojme te dy per te hequr nje ves te keq qe kemi. ",
724
+ "id": "5fbf30f5097747eda8ae327aeba95443"
725
+ },
726
+ "8b30620eaa104c3699c64201b7a94f53": {
727
+ "label": true,
728
+ "text": "\u6226\u4e89\u304c\u4e00\u523b\u3082\u65e9\u304f\u96c6\u7d50\u3057\u3066\u304f\u308c\u308b\u3068\u3044\u3044\u3067\u3059\u306d\u3002\n\u4eba\u985e\u304c\u5b87\u5b99\u306b\u9032\u51fa\u3057\u3066\u3001\u4ed6\u60d1\u661f\u7a2e\u65cf\u306b\u306a\u308b\u3068\u304d\u3001\nOpenAssistant\u304c\u305d\u306e\u508d\u3067\u304a\u624b\u4f1d\u3044\u3067\u304d\u308b\u3053\u3068\u3092\u671b\u3093\u3067\u3044\u307e\u3059\uff01",
729
+ "id": "8b30620eaa104c3699c64201b7a94f53"
730
+ },
731
+ "9b3093964b3e4658a95de453fdd10e40": {
732
+ "label": true,
733
+ "text": "En general, en un sistema num\u00e9rico posicional con base b, el peso de cada s\u00edmbolo en un n\u00famero es igual a b elevado a la posici\u00f3n del s\u00edmbolo en el n\u00famero, empezando a contar desde cero. Por ejemplo, en el sistema binario (que tiene una base de 2), el n\u00famero 10000110 se puede escribir como 1x27+0x26+0x25+0x24+0x23+1x22+1x21+0x20, lo que equivale a 1x128+0x64+0x32+0x16+0x8+1x4+1x2+0x1.",
734
+ "id": "9b3093964b3e4658a95de453fdd10e40"
735
+ },
736
+ "ab2a743350d54c46ba035afafbae6b17": {
737
+ "label": false,
738
+ "text": "5. Avoid multitasking. If your schoolwork has several parts, try to only work on one part at a time. This eliminates having to switch back and forth, leading you to starting over every time you switch tasks.",
739
+ "id": "ab2a743350d54c46ba035afafbae6b17"
740
+ },
741
+ "cab7b1a9183042d8aff7fe8290dda6d2": {
742
+ "label": false,
743
+ "text": "Sure! Here are five creative Facebook posts targeting food lovers for daily lunch specials:\n\n- \"Feast like a king on a budget! Our daily lunch specials are only $7.99 and will leave your taste buds feeling royal. Join us Monday - Friday from 11am - 3pm and treat yourself to a delicious meal. #LunchSpecials #FoodieHeaven #BudgetFriendly\"",
744
+ "id": "cab7b1a9183042d8aff7fe8290dda6d2"
745
+ },
746
+ "0e56d5aaef8d48dd95e124c1dbf5f29d": {
747
+ "label": true,
748
+ "text": "4. Advertir obst\u00e1culos: Si hay un obst\u00e1culo en el camino, como un bache o un objeto en la v\u00eda, es importante se\u00f1alarlo para que los dem\u00e1s ciclistas puedan evitarlo. Esto lo puedes hacer se\u00f1alando con tu mano hacia abajo y hacia el lado en el que se encuentra el obst\u00e1culo.",
749
+ "id": "0e56d5aaef8d48dd95e124c1dbf5f29d"
750
+ },
751
+ "9da2c8a12cdc41c59032674ed188e5ab": {
752
+ "label": true,
753
+ "text": "\u0420\u0430\u0434\u0430, \u0447\u0442\u043e \u0441\u043c\u043e\u0433\u043b\u0430 \u043f\u043e\u043c\u043e\u0447\u044c! \u0415\u0441\u043b\u0438 \u0443 \u0432\u0430\u0441 \u0431\u0443\u0434\u0443\u0442 \u0435\u0449\u0435 \u043a\u0430\u043a\u0438\u0435-\u043b\u0438\u0431\u043e \u0432\u043e\u043f\u0440\u043e\u0441\u044b, \u043e\u0431\u044f\u0437\u0430\u0442\u0435\u043b\u044c\u043d\u043e \u043f\u0438\u0448\u0438\u0442\u0435. \u0421 \u0443\u0434\u043e\u0432\u043e\u043b\u044c\u0441\u0442\u0432\u0438\u0435\u043c \u043e\u0442\u0432\u0435\u0447\u0443!",
754
+ "id": "9da2c8a12cdc41c59032674ed188e5ab"
755
+ },
756
+ "22f7c478a3ad463380227fef1f89687a": {
757
+ "label": false,
758
+ "text": "The process of galaxy formation is thought to have begun in the early universe, shortly after the Big Bang. As the universe expanded and cooled, small fluctuations in the distribution of matter caused regions of the universe to become denser. These dense regions then became the seeds for the formation of galaxies.",
759
+ "id": "22f7c478a3ad463380227fef1f89687a"
760
+ },
761
+ "38884d61983a4d1d9e678d41ece4e41e": {
762
+ "label": true,
763
+ "text": "\u00a1Por supuesto! Aqu\u00ed te dejo la receta para preparar leche asada, un postre t\u00edpico de la gastronom\u00eda peruana:\n\nIngredientes:\n\n1 litro de leche entera\n1 taza de az\u00facar\n6 huevos\n1 cucharadita de esencia de vainilla\nCanela en polvo al gusto\nInstrucciones:",
764
+ "id": "38884d61983a4d1d9e678d41ece4e41e"
765
+ },
766
+ "8fca4aa007c44447b923e6cc64a7c320": {
767
+ "label": true,
768
+ "text": "\u041a\u0442\u043e \u0442\u0430\u043a\u043e\u0439 \u043f\u0443\u0442\u0438\u043d?",
769
+ "id": "8fca4aa007c44447b923e6cc64a7c320"
770
+ },
771
+ "1ab2747206ad465c9559b43605097469": {
772
+ "label": true,
773
+ "text": "\u306a\u308b\u307b\u3069\u3001\u78ba\u304b\u306b\u732b\u3061\u3083\u3093\u306f\u5168\u3066\u306e\u8981\u7d20\u304c\u611b\u304f\u308b\u3057\u3044\u3067\u3059\u3088\u306d\uff01",
774
+ "id": "1ab2747206ad465c9559b43605097469"
775
+ },
776
+ "80df07bae22f450c878651fda513d458": {
777
+ "label": false,
778
+ "text": "It's believed that galaxies continue to evolve and change over time, through processes such as merging with other galaxies, the formation of new stars, and the movement of stars and gas within the galaxy.",
779
+ "id": "80df07bae22f450c878651fda513d458"
780
+ },
781
+ "6cf7dc42850f4f0298edf5679fd1accf": {
782
+ "label": false,
783
+ "text": "Writing an operating system from scratch is a complex and challenging project, but it can be a rewarding experience and a great way to learn about the inner workings of computers. If you have any questions or need help with specific parts of the process, don't hesitate to ask.",
784
+ "id": "6cf7dc42850f4f0298edf5679fd1accf"
785
+ },
786
+ "2435940d95e14c5eb7c2aba59ec9216d": {
787
+ "label": true,
788
+ "text": "\u0414\u0443\u043c\u0430\u044e, \u0447\u0442\u043e \u0432 \u0441\u043b\u0443\u0447\u0430\u0435 \u0441 \u043c\u0443\u0436\u0441\u043a\u043e\u0439 \u043e\u0441\u043e\u0431\u044c\u044e \u043d\u0435 \u0431\u0443\u0434\u0435\u0442 \u043e\u0441\u043e\u0431\u044b\u0445 \u043e\u0442\u043b\u0438\u0447\u0438\u0439 \u0432 \u0438\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438\u0438. \u041f\u043e\u043f\u0440\u043e\u0431\u0443\u0439\u0442\u0435, \u043d\u043e \u043e\u0431\u044f\u0437\u0430\u0442\u0435\u043b\u044c\u043d\u043e \u043e\u0442\u043f\u0438\u0448\u0438\u0442\u0435\u0441\u044c, \u0435\u0441\u043b\u0438 \u0447\u0442\u043e-\u0442\u043e \u043f\u043e\u0439\u0434\u0451\u0442 \u043d\u0435 \u043f\u043e \u043f\u043b\u0430\u043d\u0443!",
789
+ "id": "2435940d95e14c5eb7c2aba59ec9216d"
790
+ },
791
+ "2466b2cd774c4d0c8028ad773ee7235c": {
792
+ "label": true,
793
+ "text": "\u78ba\u304b\u306b\u300c\u9014\u4e2d\u30d9\u30b9\u30c8\u76e4\u300d\u3002\u3060\u3051\u3069\u661f5\u3064\uff01 . \uff32\uff2f\uff23\uff2f\u306e\uff23\uff24\u306f\u5168\u90e8\u6301\u3063\u3066\u307e\u3059\u3002\u3060\u3093\u3060\u3093\u65b9\u5411\u6027\u304c\u5b9a\u307e\u3063\u3066\u304d\u3066\u3001\u304a\u3082\u3061\u3083\uff2a\uff21\uff3a\uff3a\u3068\u3044\u3046\u72ec\u81ea\u306e\u8def\u7dda\u3082\u78ba\u7acb\u3057\u3066\u3044\u307e\u3059\u306d\u3002\u5185\u5bb9\u306f\u65b0\u66f23\u66f2\u3068\u3001\u524d\u4f5c\u30b3\u30df\u30ab\u30eb\u30e9\u30a4\u30d5\u304b\u30893\u66f2\u3002\u5b9f\u8cea\u65b0\u66f2\u306f3\u66f2\u306a\u306e\u3060\u3051\u308c\u3069\u3001\u305d\u306e\u66f2\u5168\u90e8\u304c\u3044\u3044\u3002\u5916\u308c\u306a\u3057\u3002\u8efd\u5feb\u3001\u660e\u308b\u3055\u3001\u8aac\u6559\u81ed\u304f\u306a\u3044\u697d\u3057\u3044\u4eba\u751f\u89b3\u3001\u4ed6\u306e\u30a2\u30fc\u30c6\u30a3\u30b9\u30c8\u306b\u306f\u306a\u3044\u6301\u3061\u5473\u304c\u3042\u308a\u307e\u3059\u3002\u3055\u3089\u306b\u3001\u4eca\u307e\u3067\u3088\u308a\u3001\u82e5\u5e72\u5927\u4eba\u3063\u307d\u3044\u6b4c\u8a5e\u306b\u306a\u3063\u3066\u307e\u3059\u3002\u6b4c\u8a5e\u306e\u5185\u5bb9\u306f\u8074\u3044\u3066\u306e\u304a\u697d\u3057\u307f\u3002\u8efd\u5feb\u306a\u66f2\u3084\u3001\uff2a\uff21\uff3a\uff3a\u3068\u3044\u3046\u30b8\u30e3\u30f3\u30eb\u304c\u597d\u304d\u3067\u3001\u304b\u3064\u3001\u30c0\u30f3\u30c7\u30a3\u306a\u304a\u3058\u69d8\u304c\u6b4c\u308f\u306a\u3044\u3068\uff2a\uff21\uff3a\uff3a\u3068\u8a8d\u3081\u306a\u3044\u3001\u3068\u3044\u3046\u4eba\u4ee5\u5916\u306f\u30ec\u30f3\u30bf\u30eb\u3067\u3082\u662f\u975e\u8074\u3044\u3066\u307b\u3057\u3044\u3067\u3059\u3002\u30a4\u30f3\u30c7\u30a3\u30fc\u30ba\u306a\u306e\u3067\u3001\u30ec\u30f3\u30bf\u30eb\u306b\u3042\u308b\u304b\u306f\u4e0d\u660e\u3067\u3059\u304c\u3002\u5143\u6c17\u304c\u6b32\u3057\u3044\u4eba\u3001\u4f7f\u3044\u53e4\u3055\u308c\u305f\u6b4c\u8a5e\u306b\u98fd\u304d\u98fd\u304d\u3057\u3066\u3044\u308b\u4eba\u3001\u662f\u975e\u8074\u3044\u3066\u304f\u3060\u3055\u3044\u3002\u304a\u85a6\u3081\u3067\u3059\u3002\n",
794
+ "id": "2466b2cd774c4d0c8028ad773ee7235c"
795
+ },
796
+ "cbad9a9f73564f6fb203481836d0c917": {
797
+ "label": false,
798
+ "text": "praise and encouragement for his concept. After solidifying the rules and a business plan, and supplemented with sketches by a professional artist, Foster presented his idea to various television networks. He reached an agreement with NBC for a \"test game\".",
799
+ "id": "cbad9a9f73564f6fb203481836d0c917"
800
+ },
801
+ "362c41112ca44967a9f6c0e3ec88b56c": {
802
+ "label": false,
803
+ "text": "to goals from G*khan Inler and Kosovo-born Xherdan Shaqiri. _He_didn't believe that there were 12,000 Albanian fans in the stands which was more than how many Swiss fans turned up for the game. <sep>, Pronoun: He <sep>, A: Ottmar Hitzfeld <sep>, B: G*khan Inler",
804
+ "id": "362c41112ca44967a9f6c0e3ec88b56c"
805
+ },
806
+ "7dfcba6b07ff490980be0f10136df7d3": {
807
+ "label": false,
808
+ "text": "years ago with fair results \u2014 absolutely delicious results, actually, they were just not as fluffy and bouncy as expertly made ones from your favourite pastry shop where panettoni are hung upside-down to maintain their height and airiness. But when I came across the familiar brown and gold paper forms for making colomba at the supermarket, I thought I\u2019m only ever going to get a chance to make this",
809
+ "id": "7dfcba6b07ff490980be0f10136df7d3"
810
+ },
811
+ "5548de45255e4b47868d6e060509778c": {
812
+ "label": false,
813
+ "text": " \n This is discipline!! \n \n And citizen responsibility. Japanese fans cleaning their places after the football game. In #Russia The World Cup pic.twitter.com/t4MnuUlSBg \u2014 Danu Motta (@shadanka) June 20, 2018 \n \n For the Japanese fans, the act isn't certainly an isolated one. They were also spotted cleaning up the stadium after a game against the Ivory Coast during the 2014 World Cup in Brazil. ",
814
+ "id": "5548de45255e4b47868d6e060509778c"
815
+ },
816
+ "7adf61954048418ba86bdfedaa482443": {
817
+ "label": true,
818
+ "text": " WEB\u3067\u306e\u63b2\u8f09\u6570\u3084\u30d6\u30ed\u30b0\u306e\u8a18\u4e8b\u6570\u3001\u30dd\u30b8\u30cd\u30ac\u306e\u8ad6\u8abf\u5206\u6790\u306a\u3069\u306e\u8a55\u4fa1\u65b9\u6cd5\u306a\u3069\u306f\u3042\u308a\u307e\u3057\u305f\u304c\u3001\u30e2\u30ce\u306e\u52d5\u304d\u306b\u95a2\u3057\u3066\u306e\u8a55\u4fa1\u306f\u306f\u305a\u3055\u308c\u3066\u3044\u308b\u3088\u3046\u306a\u6c17\u304c\u3057\u307e\u3059\u3002 \u300cWEBPR\u3067\u306f\u3001\u305d\u3082\u305d\u3082\u6d88\u8cbb\u8005\u306b\u76f4\u63a5\u50cd\u304d\u304b\u3051\u3066\u30e2\u30ce\u3092\u8cb7\u308f\u305b\u308b\u3088\u3046\u306a\u30b3\u30df\u30e5\u30cb\u30b1\u30fc\u30b7\u30e7\u30f3\u306e\u53d6\u308a\u65b9\u3092\u3057\u3066\u3044\u306a\u3044\u3002\u7a76\u6975\u7684\u306a\u76ee\u6a19\u306f\u300c\u58f2\u308a\u4e0a\u3052\u300d\u3067\u3042\u3063\u3066\u3082\u3001WEBPR\u306f\u300c\u3053\u3046\u306a\u308c\u3070\u58f2\u308a\u4e0a\u3052\u306b\u3064\u306a\u304c\u308b\u306f\u305a\u3060\u300d\u3068\u3044\u3046\u3072\u3068\u3064\u524d\u6bb5\u968e\u3067\u76ee\u6a19\u306b\u529b\u3092\u6ce8\u3050\u306e\u3067\u3042\u308b\u3002\u3068\u3042\u308a\u307e\u3059\u304c\u3001\u5e97\u982d\u306b\u884c\u3063\u305f\u308a\u3059\u308b\u3053\u3068\u306a\u304f\u3001EC\u3067\u8cb7\u3044\u7269\u3092\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u6642\u4ee3\u306a\u306e\u3067\u3059\u304b\u3089\u3001WEBPR\u3092\u99c6\u4f7f\u3057\u3066\u3001EC\u3067\u30e2\u30ce\u3092\u58f2\u3063\u3066\u3044\u304f\u3068\u3044\u3046\u8996\u70b9\u304c\u3042\u3063\u3066\u3082\u3044\u3044\u306e\u3067\u306f\u306a\u3044\u304b\u3068\u3082\u601d\u3044\u307e\u3059\u3002\u7d50\u5c40\u3001PR\u3067\u8a71\u984c\u306b\u306a\u308c\u3070\u3044\u3044\u3002\u8a18\u4e8b\u304c\u5897\u3048\u308c\u3070\u3044\u3044\u3068\u3044\u3046\u60aa\u3057\u304dPR\u4f1a\u793e\u30ed\u30b8\u30c3\u30af\u304cWEB\u306e\u4e2d\u3067\u5c55\u958b\u3055\u308c\u3066\u3044\u308b\u3088\u3046\u3067\u6b8b\u5ff5\u3067\u3059\u3002\u6226\u7565\u3092\u3046\u305f\u3063\u3066\u3044\u308b\u4e2d\u3067\u306e\u305f\u3068\u3048\u3070\u8a71\u3082\u30ea\u30a2\u30ea\u30c6\u30a3\u304c\u306a\u304f\u3001\u7a1a\u62d9\u306a\u5370\u8c61\u3092\u53d7\u3051\u3066\u3057\u307e\u3044\u307e\u3057\u305f\u3002 ",
819
+ "id": "7adf61954048418ba86bdfedaa482443"
820
+ },
821
+ "4f15ecfd5fb444a8a73b53e69fbecddf": {
822
+ "label": false,
823
+ "text": "native Wales. They encouraged their son's interest in music, buying him a Broadwood piano, on which his mother gave him lessons. The young Wood also learned to play the violin and viola. Wood received little religious inspiration at St Sepulchre, but was deeply stirred by the playing of the resident organist, George Cooper, who allowed him into the organ loft and gave him his first lessons on the",
824
+ "id": "4f15ecfd5fb444a8a73b53e69fbecddf"
825
+ },
826
+ "e83f7f0b4d4243759b6c2f2babec64c4": {
827
+ "label": false,
828
+ "text": "How many points were the Eagles behind after the end of the first quarter?",
829
+ "id": "e83f7f0b4d4243759b6c2f2babec64c4"
830
+ },
831
+ "fcae698b41474ab5b9611bf64eb1192f": {
832
+ "label": false,
833
+ "text": "Translate the following sentence to Turkish:\nSmoking is still allowed in most cafes in Zagreb. [Davor Konjikusic]",
834
+ "id": "fcae698b41474ab5b9611bf64eb1192f"
835
+ },
836
+ "553c38f4fbc54e699756657b2c5a9bb8": {
837
+ "label": true,
838
+ "text": "Dans le bureau de cin\u00e9ma, vous pouvez \u00e9teindre votre musique cellulaire.",
839
+ "id": "553c38f4fbc54e699756657b2c5a9bb8"
840
+ },
841
+ "67f6d03b8bf14d509ad66a0f951fa641": {
842
+ "label": false,
843
+ "text": "In week 5, the Lions hosted the Philadelphia Eagles to start a three-game home stand. The Lions took a 14-0 lead in the first quarter with a pair of touchdown catches by Theo Riddick, from one and 17 yards out respectively. The Eagles responded in the second quarter with a one-yard touchdown pass from Carson Wentz to Ryan Mathews, cutting the Lions lead to seven points. The Lions added to their",
844
+ "id": "67f6d03b8bf14d509ad66a0f951fa641"
845
+ },
846
+ "019201c459cd4a0d9f9c2cd23efa6059": {
847
+ "label": false,
848
+ "text": "measurement units like miles to kilometers during your translation. 5) Note the input is in sentence case except for special placeholders. Please do the same in your translations.",
849
+ "id": "019201c459cd4a0d9f9c2cd23efa6059"
850
+ },
851
+ "a4b87ff433b443c88b329692b6e217d7": {
852
+ "label": true,
853
+ "text": "weg von diesem langweiligen Film --- er ist keinen Cent wert!!!!",
854
+ "id": "a4b87ff433b443c88b329692b6e217d7"
855
+ },
856
+ "6b0d9a26fd6048818ef2349852ef1f7d": {
857
+ "label": true,
858
+ "text": "Title: \u0e04\u0e19\u0e41\u0e1b\u0e14\u0e23\u0e34\u0e49\u0e27\u0e44\u0e21\u0e48\u0e40\u0e2d\u0e32\u0e42\u0e23\u0e07\u0e44\u0e1f\u0e1f\u0e49\u0e32\u0e16\u0e48\u0e32\u0e19\u0e2b\u0e34\u0e19\u0e40\u0e02\u0e32\u0e2b\u0e34\u0e19\u0e0b\u0e49\u0e2d\u0e19 \u0e22\u0e31\u0e19\u0e01\u0e23\u0e30\u0e17\u0e1a\u0e40\u0e01\u0e29\u0e15\u0e23\u0e2d\u0e34\u0e19\u0e17\u0e23\u0e35\u0e22\u0e4c \n Body: \u0e0a\u0e32\u0e27\u0e09\u0e30\u0e40\u0e0a\u0e34\u0e07\u0e40\u0e17\u0e23\u0e32\u0e23\u0e13\u0e23\u0e07\u0e04\u0e4c\u0e2b\u0e22\u0e38\u0e14\u0e42\u0e23\u0e07\u0e44\u0e1f\u0e1f\u0e49\u0e32\u0e16\u0e48\u0e32\u0e19\u0e2b\u0e34\u0e19\u0e40\u0e02\u0e32\u0e2b\u0e34\u0e19\u0e0b\u0e49\u0e2d\u0e19 \u0e28\u0e36\u0e01\u0e29\u0e32\u0e1e\u0e1a\u0e2a\u0e32\u0e23\u0e1b\u0e19\u0e40\u0e1b\u0e37\u0e49\u0e2d\u0e19\u0e01\u0e23\u0e30\u0e17\u0e1a\u0e40\u0e01\u0e29\u0e15\u0e23\u0e2d\u0e34\u0e19\u0e17\u0e23\u0e35\u0e22\u0e4c \u0e22\u0e31\u0e19\u0e1b\u0e49\u0e2d\u0e07\u0e1e\u0e37\u0e49\u0e19\u0e17\u0e35\u0e48\u0e2d\u0e38\u0e14\u0e21\u0e2a\u0e21\u0e1a\u0e39\u0e23\u0e13\u0e4c\u0e17\u0e32\u0e07\u0e2d\u0e32\u0e2b\u0e32\u0e23 \u0e41\u0e19\u0e30\u0e43\u0e0a\u0e49\u0e1e\u0e25\u0e31\u0e07\u0e07\u0e32\u0e19\u0e2b\u0e21\u0e38\u0e19\u0e40\u0e27\u0e35\u0e22\u0e19\u0e17\u0e32\u0e07\u0e40\u0e25\u0e37\u0e2d\u0e01 \u0e14\u0e49\u0e32\u0e19\u0e01\u0e25\u0e38\u0e48\u0e21\u0e17\u0e38\u0e19\u0e22\u0e37\u0e48\u0e19 EHIA \u0e23\u0e2d\u0e1a\u0e17\u0e35\u0e48 4 \u0e0a\u0e32\u0e27\u0e1a\u0e49\u0e32\u0e19\u0e19\u0e31\u0e14\u0e23\u0e27\u0e21\u0e15\u0e31\u0e27\u0e23\u0e2d\u0e1f\u0e31\u0e07\u0e1c\u0e25\u0e01\u0e27\u0e48\u0e32 100 \u0e04\u0e19 \u0e2b\u0e25\u0e31\u0e07 \u0e2a\u0e1c. \u0e40\u0e25\u0e37\u0e48\u0e2d\u0e19\u0e1e\u0e34\u0e08\u0e32\u0e23\u0e13\u0e32\u0e40\u0e1b\u0e47\u0e19\u0e27\u0e31\u0e19\u0e17\u0e35\u0e48 23 \u0e21\u0e35.\u0e04. \u0e19\u0e35\u0e49\n\u00a0\n\u00a0",
859
+ "id": "6b0d9a26fd6048818ef2349852ef1f7d"
860
+ },
861
+ "c34863e2f4cc4df798271d25bd15b107": {
862
+ "label": true,
863
+ "text": "Einfach nur geil!!! . Diesen Film muss mann einfach gesehen haben!Der 1. Teil war schon lustig bis zum abwinken,aber jetzt der... Und da \"Red Bull\" (auch bekannt aus Hausmeister Krause;Axel Stein) sowieso der beste ist muss man diesen Film einfach gesehen haben!!!",
864
+ "id": "c34863e2f4cc4df798271d25bd15b107"
865
+ },
866
+ "5f59f03d5425404693ac4bbbd9dc9cb9": {
867
+ "label": false,
868
+ "text": "Read the following context and choose the correct option to answer the question. Context: Surely the best thing about colomba, the Easter equivalent to panettone, is the sugared, toasted almond topping that covers the whole thing and crumbles when you cut it, so you sort of have no choice but just to pick up the crusty sugary bits and eat those on their own. I\u2019d always thought that colomba would",
869
+ "id": "5f59f03d5425404693ac4bbbd9dc9cb9"
870
+ },
871
+ "cb6ef8e32c9a4daeae95b1be358a34a2": {
872
+ "label": true,
873
+ "text": "oameni reactioneaza mai puternic la provocari, informeaza luni Reuters.",
874
+ "id": "cb6ef8e32c9a4daeae95b1be358a34a2"
875
+ },
876
+ "702ddf2463fd481ea7d4bf17e8a4487f": {
877
+ "label": true,
878
+ "text": "Write a title for this article:\n\nActualizado nov 16, 2011 6:21 p.m. ET\n\nLa selecci\u00f3n de Uruguay (foto de archivo) cerro con broche de oro el 2011, a\u00f1o en que gan\u00f3 la Copa Am\u00e9rica y alcanz\u00f3 el cuarto lugar en el ranking de la Fifa. (AFP)",
879
+ "id": "702ddf2463fd481ea7d4bf17e8a4487f"
880
+ },
881
+ "733cdb5aada7496da5a3e0c0b58104ef": {
882
+ "label": true,
883
+ "text": "et Vikas Swarup a sa mani\u00e8re \u00e0 lui de raconter l'histoire. Mine de rien, au gr\u00e9 des balades de Ram, on apprend (beaucoup) sur l'histoire de l'Inde, Bollywood, le Taj Mahal, le sport, et une quantit\u00e9 de choses. A chaque flashback, on se demande \"qu'est-ce qu'on va apprendre ce coup-ci ?\" et je m'amusais \u00e0 me dire \"tiens, la prochaine question, ils parleront de \u00e7a !\". Je me trompais une fois sur",
884
+ "id": "733cdb5aada7496da5a3e0c0b58104ef"
885
+ },
886
+ "f86f0217ba544366a1757c4da78f70ab": {
887
+ "label": false,
888
+ "text": "He hoped they would not need him. The church would pay his regular part-time salary while he was serving on a jury, but any private lessons he missed would be money lost. Greg's red 1965 Pontiac Bonneville convertible always turned heads as he drove through the small town. He had purchased it two months earlier from a career Navy man down in Longview who had babied the thing for years. It spent",
889
+ "id": "f86f0217ba544366a1757c4da78f70ab"
890
+ },
891
+ "e09b26ee3d0a4622ac6d96d32ccb60ab": {
892
+ "label": true,
893
+ "text": "Write a title for this article:\n\nSaturday, November 22nd 2008, 4:00 AM\n\nMIAMI \u00e2\u0080\u0094 Ha sido un parto muy lento ... el m\u00c3\u00a1s lento que haya tenido Laura Restrepo para alumbrar alguno de sus libros.",
894
+ "id": "e09b26ee3d0a4622ac6d96d32ccb60ab"
895
+ },
896
+ "12555f602cb34c6e9388340f3291740b": {
897
+ "label": true,
898
+ "text": "Puoi usare un triangolo per produrre musica.\n",
899
+ "id": "12555f602cb34c6e9388340f3291740b"
900
+ },
901
+ "eb28652669174d5a952e820c0a31dc3f": {
902
+ "label": false,
903
+ "text": "if there's a program near you. Washing multiple small loads of laundry in your washing machine wastes both water and energy. Setting your machine to the appropriate load size can reduce water waste. Grease can seriously clog up your pipes and add to the scum levels in your septic system. Don't pour grease down the drain. Instead, pour it into a separate container and throw it away in the trash.",
904
+ "id": "eb28652669174d5a952e820c0a31dc3f"
905
+ },
906
+ "5c566f7b0a1d4cb2813c102e64094392": {
907
+ "label": false,
908
+ "text": "adopted by various branches of the church, often referred to as \"subordinate standards\". It is generally considered that the point of such learning is to enable one to put one's faith into practice; some Presbyterians generally exhibit their faith in action as well as words, by generosity, hospitality, as well as proclaiming the gospel of Christ.\". Can you tell me what it is?",
909
+ "id": "5c566f7b0a1d4cb2813c102e64094392"
910
+ },
911
+ "6d4119f5ae80413a9e79c14ad4d21bda": {
912
+ "label": false,
913
+ "text": " \u201cThere is no relationship between this woman and Salvador Dal\u00ed,\u201d he told Spanish agency Efe at the time. ||||| A judge has ordered the exhumation of Salvador Dali's body for a biological test to determine the paternity of Maria Pilar Abel Martinez, 61, who claims to be his daughter. The order came from a Madrid judge who said the measures were \"necessary\" because \"there are no biological remains",
914
+ "id": "6d4119f5ae80413a9e79c14ad4d21bda"
915
+ },
916
+ "dbc88798632c4ff98799907a94b3896b": {
917
+ "label": true,
918
+ "text": "pintura que el film hace del dolor de crecer en la horfandad, tanto f\u00edsica como espiritual.",
919
+ "id": "dbc88798632c4ff98799907a94b3896b"
920
+ },
921
+ "d04570f03fec4569ae019a1a76fd5b45": {
922
+ "label": false,
923
+ "text": "plugged in an address and then set off to their destination. And, then it wasn't until they were driving for thirty minutes that they realized they actually put in a destination back on the West Coast where they lived. They actually put their home address in. So again, the GPS is kind of 'garbage in garbage out'.\" Mister Brown says this is a common human error. But, he says, what makes the problem",
924
+ "id": "d04570f03fec4569ae019a1a76fd5b45"
925
+ },
926
+ "49c85f4e2ffd4ee688db1cf747bc7ce1": {
927
+ "label": false,
928
+ "text": "in which he was played by Cesar Romero. The show's popularity compelled Schwartz to keep the comics in a similar vein. As the show's popularity waned, however, so did that of the Batman comics. After the TV series ended in 1968, the increase in public visibility had not stopped the comic's sales decline; editorial director Carmine Infantino resolved to turn things around, moving stories away from",
929
+ "id": "49c85f4e2ffd4ee688db1cf747bc7ce1"
930
+ },
931
+ "235a4b30f15a4d0d85f560e12239fdd1": {
932
+ "label": true,
933
+ "text": "QUESTION: \u00bfPor qu\u00e9 ped\u00eda esa compensaci\u00f3n econ\u00f3mica la pol\u00edtica colombiana?\nA:",
934
+ "id": "235a4b30f15a4d0d85f560e12239fdd1"
935
+ },
936
+ "73881faa52944be08a4367ff818df4db": {
937
+ "label": true,
938
+ "text": " \n Add this video to your website by copying the code below. Gehiago jakin \n \n Hmm, arazo bat egon da zerbitzariarenera iristeko. Berriro saiatu? Gehitu Txio gurasoak Media gehitu \n \n Zure webgunean edo aplikazioan Twitter-eko edukia kapsulatzean, Garatzaile Akordioa eta Garatzaile Politika onartzen dituzu. \n ",
939
+ "id": "73881faa52944be08a4367ff818df4db"
940
+ },
941
+ "c639bba0bb6c48e6959645356d967e5c": {
942
+ "label": false,
943
+ "text": "Summarize this article:",
944
+ "id": "c639bba0bb6c48e6959645356d967e5c"
945
+ },
946
+ "543ac52d22514df98edf1ad77cfc6280": {
947
+ "label": true,
948
+ "text": "Q: CONTEXT: El vicepresidente segundo y ministro de Econom\u00eda y Hacienda, Rodrigo Rato, reconoci\u00f3 hoy que el Gobierno conoc\u00eda \"hace tiempo\" los planes del BBVA y Telef\u00f3nica de firmar una alianza estrat\u00e9gica, pero asegur\u00f3 que no impuls\u00f3 la operaci\u00f3n. En unas declaraciones a los periodistas antes de visitar la feria de arte Arco, Rato afirm\u00f3 que, en contra de lo que suced\u00eda durante la Presidencia de",
949
+ "id": "543ac52d22514df98edf1ad77cfc6280"
950
+ },
951
+ "35fde42dab104edd9b9e8dbfa976ae97": {
952
+ "label": true,
953
+ "text": "\u0e22\u0e01\u0e07\u0e32\u0e19\u0e27\u0e34\u0e08\u0e31\u0e22\u0e0a\u0e35\u0e49\u0e2a\u0e32\u0e23\u0e1b\u0e23\u0e2d\u0e17\u0e2a\u0e48\u0e07\u0e1c\u0e25\u0e15\u0e48\u0e2d\u0e2b\u0e48\u0e27\u0e07\u0e42\u0e0b\u0e48\u0e2d\u0e32\u0e2b\u0e32\u0e23 \u0e1e\u0e31\u0e12\u0e19\u0e32\u0e01\u0e32\u0e23\u0e2a\u0e21\u0e2d\u0e07",
954
+ "id": "35fde42dab104edd9b9e8dbfa976ae97"
955
+ },
956
+ "96da755c0d6c4f3ab3b8f34b925f1ffc": {
957
+ "label": true,
958
+ "text": "/><br />Do watch this movie!!! a Total Masala Flick and Enjoyable Family Film!<br /><br />OYE AAJA NACHLE!!!!!!!!",
959
+ "id": "96da755c0d6c4f3ab3b8f34b925f1ffc"
960
+ },
961
+ "f758b25d05c043dcbf3df2c6b9f56705": {
962
+ "label": false,
963
+ "text": "first day of its release and v both get bore in cinema-hall......................................................<br /><br />Role of CIRCUIT was very small n useless n this movie . I think SANJAY-DUTT cut down the role of ARSHAD VARSHI........................<br /><br />Character of the movie is also not well define like the previous one .this movie show u the result of OVER-CONFIDENCE",
964
+ "id": "f758b25d05c043dcbf3df2c6b9f56705"
965
+ },
966
+ "50bb7f2fa5b14c139badd2be2a13bcda": {
967
+ "label": false,
968
+ "text": "is the emotion, and Hache is the doubt. And here they are mixed in Spain at the end of twentieth century.<br /><br />The performance is simply wonderful. Cecilia Roth (All about my mother) is splendid and what can i say about Federico Luppi who is one of the best actors in Spanish language that exists. I can imagine nobody except Eusebio Poncela as Dante. Juan Diego Botto is quite good.<br /><br",
969
+ "id": "50bb7f2fa5b14c139badd2be2a13bcda"
970
+ },
971
+ "82ad499ab5494feeb9f35444a29a8f0f": {
972
+ "label": false,
973
+ "text": "NOTICE: I do touch on the plot, but not so as to spoil the movie...<br /><br />This long and sensuous movie set in 1942 Shanghai during the Japanese occupation is centered on a beautiful and elegant young woman Wong Chia Chi (played by newcomer Wei Tang) who - with her platonic friend Kuang Yu Min (played by Chinese male hottie Lee-Hom Wang) - is a willing participant in a group of 6 young actor",
974
+ "id": "82ad499ab5494feeb9f35444a29a8f0f"
975
+ },
976
+ "ec3ea869f1444df1aa91a47e4eeb4bb2": {
977
+ "label": false,
978
+ "text": "This is the best movie ever! Don't miss out on it! Vivek Oberoi and Rani Mukherjee have done SUPERB EXCELLENT acting!! The story, its not really very special or unique but the way Vivek and Rani have acted, it seems even better. So if you haven't seen it yet, go see it right now. This isn't something to be missed!!!!!!!!!!!",
979
+ "id": "ec3ea869f1444df1aa91a47e4eeb4bb2"
980
+ },
981
+ "07579e0ceaed429bacce1dcaefa73980": {
982
+ "label": false,
983
+ "text": "of 10. Feel free for mailing me about any of my comments and posts here. <br /><br />Sorry for my bad English.",
984
+ "id": "07579e0ceaed429bacce1dcaefa73980"
985
+ },
986
+ "1a2d210ee30f4a3d84ec6d0f4ef77f1c": {
987
+ "label": false,
988
+ "text": "and the argument with hirko in the walkway with a roof on it???? need to know so I can win an argumrnt with me Japanese ex-wife. thanks",
989
+ "id": "1a2d210ee30f4a3d84ec6d0f4ef77f1c"
990
+ },
991
+ "ae005e1fadd546cbb82b733e6d68edad": {
992
+ "label": false,
993
+ "text": "conclusion packs a mean and lingering wallop right to the gut. A solid and satisfying winner.",
994
+ "id": "ae005e1fadd546cbb82b733e6d68edad"
995
+ },
996
+ "7c116c5decaf4e65a89405aed0277ccc": {
997
+ "label": true,
998
+ "text": "\"Como Era Gostoso o Meu Franc\u00eas\" (\"How Tasty Was My Frenchman\")",
999
+ "id": "7c116c5decaf4e65a89405aed0277ccc"
1000
+ },
1001
+ "6006ace2058742d3b776274e5334f613": {
1002
+ "label": false,
1003
+ "text": "song for kids (I think... it could also be south American, I'm not sure)). This two songs that have the same melody... but people don't usually realize that... it's just grate! I tried to write this in both Spanish and English, because it's an Argentinian movie... but the page wouldn't allow me :( Hope you enjoy it!",
1004
+ "id": "6006ace2058742d3b776274e5334f613"
1005
+ },
1006
+ "ded4f2384df44b22a4425312aaea3499": {
1007
+ "label": true,
1008
+ "text": "is biased in favour of Chavez, nothing's stopping you from doing your homework. One crucial message of the film is questioning info sources, as was clearly demonstrated by the snippers casualties being shamefully blamed on Chavez's supporters. Venezuela puts American alleged democracy to shame. Hasta la revolucion siempre!",
1009
+ "id": "ded4f2384df44b22a4425312aaea3499"
1010
+ },
1011
+ "b393f9b5c01b4388af5f9c8a1fa70843": {
1012
+ "label": true,
1013
+ "text": "(Brazil): \"Invas\u00e3o de Domic\u00edlio\" (\"Invasion of Domicile\")",
1014
+ "id": "b393f9b5c01b4388af5f9c8a1fa70843"
1015
+ },
1016
+ "95cc03b8508b44a18a4aee4b27743f1f": {
1017
+ "label": false,
1018
+ "text": "/><br />PS: tried to write in Spanish but the system does not accept it!",
1019
+ "id": "95cc03b8508b44a18a4aee4b27743f1f"
1020
+ }
1021
+ },
1022
+ "version": 189,
1023
+ "description": "Text that contains non-English."
1024
+ }
lilac/concepts/positive-sentiment/concept.json ADDED
@@ -0,0 +1,564 @@
1
+ {
2
+ "namespace": "lilac",
3
+ "concept_name": "positive-sentiment",
4
+ "type": "text",
5
+ "data": {
6
+ "0": {
7
+ "label": false,
8
+ "text": "Starting To Be Annoyed By Becky...: I'm not sure why I keep reading these books, but I guess it's because I've read the first two so I'll keep reading the rest of the books. In the first book, I really found it amusing. I was a little annoyed by the fact that Becky couldn't stop spending, but then again that's why she is called a Shopaholic. In the second book, I felt more of the same it was just magniifed more. Now in the third book, I'm just down right annoyed by Becky Bloomwood. In this book, she wasn't going on crazy shopping sprees, just planning two different weddings because she was afraid to tell each person and because I feel she's really selfish. Still, I read the book because I wanted to see how she could get herself out of another situation. I will say that I love her friends Suze and Danny, her client Laurel and her husband Luke. Maybe that's why I keep reading. I will read the next book, but I'm sure I'll be just as annoyed when I'm done.",
9
+ "id": "0"
10
+ },
11
+ "1": {
12
+ "label": false,
13
+ "text": "the cover is fine - the pool is horrible: The entire pool was horrible. The cover was fine once we got it on, but we finally got rid of the pool after 2 weeks because it was so hard to set up and keep clean.",
14
+ "id": "1"
15
+ },
16
+ "2": {
17
+ "label": true,
18
+ "text": "Good album, not their best.: This album is probably the most marketable and radio friendly of all of dashboard's albums. For the peripheral listener it may be the right one to get to introduce you to this band. But as a Dashboard fan of 5 or so years I truly hope they return to their original sound for their next work. Not for the listen-ability but for the show. To this day the fans react best to the songs from \"Places\" or \"A Mark, A Mission.\" I recommend this album to everyone but I also recommend any and all of their other work.",
19
+ "id": "2"
20
+ },
21
+ "3": {
22
+ "label": false,
23
+ "text": "This is a horror novel, right?: Never one to pass up any vampire novel, I purchased Sips because the description seemed interesting. Vampires, Marquis de Sade, fetishism, yada yada yada. If this is a comedy, I give it 4 stars; however, I'll give it 1 star as a horror novel. Sade was rather boring; I would think a character as intense and multi-faceted as the Marquis de Sade would make for a more interesting vampire. The writing style isn't too bad, but overall I found the characters to be mildly amusing at best. The plot was thin, the end was anti-climactic, and the vampires were not very frightening. The book had little suspense, and it leaves a mile-wide opening for a sequel at the conclusion. I would, however, like to see something more of the vampire mutants lurking in the graveyard. They were the most riveting of any of the characters.",
24
+ "id": "3"
25
+ },
26
+ "4": {
27
+ "label": true,
28
+ "text": "Superb mix of global non secular musical denominations: I first heard Ms. Pook's music on the \"Eyes Wide Shut\" soundtrack (the masquerade ball scene) and was blown away; if ever there was a necessity for music to permeate a scene in a film this was it. She incorporates a blend of the traditional songs from faiths across continents and mixes them, for lack of a better comparison than similar to your quintessential raver d.j. (though these are better and definitively more original :) \"Oppenheimer\" is my favorite, and if you let the last track run for a few minutes a portion of the song will play once more. I can't wait to hear more of her stuff - these hymns are awesome.",
29
+ "id": "4"
30
+ },
31
+ "5": {
32
+ "label": true,
33
+ "text": "A moving and suspenseful story!: For anyone familiar with the occult, this book is likely to raise hairs on the back of your neck as you read. Even if you're not, the storyline is suspenseful and fascinating, and the characters evoke great sympathy and admiration. An excellent read.",
34
+ "id": "5"
35
+ },
36
+ "6": {
37
+ "label": true,
38
+ "text": "Simple & Easy to Use - A Practical approach to eating out: This guide is extremely to use. It provides sample menus that you'd see at Chinese, Indian and Thai restaurants. Then you are provided with descriptions of each dish and how it is prepared and the ingredients used. From there you are provided with specific considerations as to how the preparation or ingredient list may affect you if you have Gluten or Allergen issues.This book is the size of a passport and very organized and well written. The Chinese, Indian and Thai Cuisine Passport is perfect for making choices while traveling, or while dining at your favorite local restaurant.",
39
+ "id": "6"
40
+ },
41
+ "7": {
42
+ "label": false,
43
+ "text": "Being Fair....I am a fan of hers: and I really enjoyed her previous works, more than I could have ever imagined, but this record is horrible. The songs are trite, the lyrics are incredibly boring, indulgent and immature. The music is pop staple, with forgetable melodies and repetative chorus lines, I feel as if the studio wrote the entire album for her while she was sleeping, this just doesn't speak to any of her previous works at all. This album fits on the same shelf with a Nickelodeon-themed CD. Instead of heading in the direction of an artist like Alanis Morrisette, she is going backward and joining the ranks of Hannah Montana and the Naked Brothers Band. She is a great artist and her first two records are amazing. She is better than this CD and I am looking forward to her next effort.",
44
+ "id": "7"
45
+ },
46
+ "8": {
47
+ "label": false,
48
+ "text": "Sucked: I thought the DVD sucked tremendously. It was very boring and if I could, I would return it for a refund. There was only one \"small\" clip of Dylan himself. I'm very disappointed.",
49
+ "id": "8"
50
+ },
51
+ "9": {
52
+ "label": true,
53
+ "text": "Excellent product: Easy to install. If you have a newer furnace you probably do not need the swail switch as the HE220A comes with a Humistat which can be connected to the furnace. They recommend the Honeywell 32005847-001 Installation Kit, Bypass which is a little pricey and you can probably buy the pieces of this kit cheaper individually from Home Depot or Lowes or ACO as well as the filters.",
54
+ "id": "9"
55
+ },
56
+ "10": {
57
+ "label": true,
58
+ "text": "Very happy.: I am very happy with this trashcan. I was unable to find one in the stores to fit the space in my cabinet, but this one does the job. It is very sturdy and looks like it will put up with years of use.",
59
+ "id": "10"
60
+ },
61
+ "11": {
62
+ "label": false,
63
+ "text": "These aren't Throughbreds!: This makes me so mad. All these new authors are coming and changing the series. Nothings the same anymore and the plots are repeditive. Don't even bother reading these books until #32 these are like a different series. I don't know excactly what's happing but these new authors suck!",
64
+ "id": "11"
65
+ },
66
+ "12": {
67
+ "label": false,
68
+ "text": "Large and slow are a bad combination.: I bought this TV and returned it a week later, because it blurred so badly with motion that sports were unwatchable. I ended up buying a smaller Sony XBR4, and I have none of the issues (plus the picture is far, far better).This has nothing to do with 60 vs 120Hz. That is more important for DVDs and Blu-Ray signals that are 24fps (which doesn't divide evenly into 60 but does for 120). The LT52133 has an 8ms response time, which is extremely slow. A decent LCD should be 5 or lower.If you want an LCD, choose speed and quality over size. If you want size and quality but want to spend less, buy a plasma. Don't buy a big, cheap, slow LCD!I gave it 2 stars because I like the interface and remote.",
69
+ "id": "12"
70
+ },
71
+ "13": {
72
+ "label": false,
73
+ "text": "Skip it: This movie is very poorly written and the result is not distressing, just lame. The actors do their best but from very early on it is obvious that the material gives them nothing to work with. Fans of Colin Firth will experience a certain dim level of enjoyment. Minnie Driver is a treat but her character is no better written than the others. Vermont locations are worth something. With one or two moments of exception it's neither comedic nor romantic.",
74
+ "id": "13"
75
+ },
76
+ "14": {
77
+ "label": true,
78
+ "text": "Belive it i went to the concert?: hi everyone let me tell you i went to the concert i was amazed with what i saw cher was awsome i tell you buy the dvd. as i sat in front of the stage cher was doing a great job to us the she is living proof . So i urge you to buy it?",
79
+ "id": "14"
80
+ },
81
+ "15": {
82
+ "label": true,
83
+ "text": "Vale la pena.: En este libro se narra de una forma muy interesante la vida de una familia en particular. Lo que mas me gusto de este libro fue la manera en que la autora describe a lo largo del libro las personalidades de los sujetos envueltos en la novela; que vienen a ser muy distintos y extremos, lo cual, intensifica el drama... Definitivamente es un buen libro y lo recomiendo a todos.",
84
+ "id": "15"
85
+ },
86
+ "16": {
87
+ "label": true,
88
+ "text": "Nummie Children's story: I ordered this book for our grandchildren. Two boys 5 & 3 and a 4 month old girl. All love the story. The mouse is determined.",
89
+ "id": "16"
90
+ },
91
+ "17": {
92
+ "label": false,
93
+ "text": "Seem to be alone on this one: Looking at the other reviews, I seem to be the only one that was disappointed with this book. The content is too babyish in most of it for older tweens and the more \"grown up\" content would be over a younger tween's head. I had a quick read through and with every paged turned, I thought duh. I'll be looking around for another book shortly.",
94
+ "id": "17"
95
+ },
96
+ "18": {
97
+ "label": true,
98
+ "text": "Best yet: by far the best EA game yet. I especially like the easy controls and kick - a graphics. the playbook is extremely accurate and detailed. Also the fight songs and cheerleaders were a nice touch. this is an excellent game and worth checking out.",
99
+ "id": "18"
100
+ },
101
+ "19": {
102
+ "label": false,
103
+ "text": "washed out: A bit like Simply Reds version of the Valentine bros hit \"Moneys too tight to mention\" - this cover version has nothing of the driving energy that characterised the original recording.",
104
+ "id": "19"
105
+ },
106
+ "20": {
107
+ "label": true,
108
+ "text": "great water bottle: I love this bottle it is great. I like knowing it is non toxic and it just works very well. You can have it full and lay it down and it doesn't leak at all.",
109
+ "id": "20"
110
+ },
111
+ "21": {
112
+ "label": true,
113
+ "text": "Nice goggles: I am pretty happy with these goggles. They work well during swim workouts in the pool. I do notice a little bit of fogging from time to time. I had hoped to wear them during an upcoming triathlon, but based on a few instances where they slipped a little in the pool I am concerned that they won't be secure enough. I will keep using them in the pool, but will likely get different ones for open water races.",
114
+ "id": "21"
115
+ },
116
+ "22": {
117
+ "label": false,
118
+ "text": "aaahhh nnnoooooo!: Hopefully the last film in one of the worst horror trilogys ever made. This series pretty much ruined the horror film for years to come, for one its too self aware, thats incredibley annoying, second everyone acts like they are on Friends or some sitcom. The acting is just plain bad and unconvincing. Now the gore, if you're going with material this weak you should load it up with disgusting violence, is there any in the Scream series? No.Everyone went to see this movie just to see who THE KILLER is. This movie sets low standards to be met, you expect alot of people to die, one shock, then we find out who the killer is, then you go home. Every horror film being made today is like that, there's nothing new or exciting or risk taking, its the same stuff over and over and people are laping it up like dog food.This film is what you've come to expect, bad acting, some people die and we eventually find out who the killer is and all is merry and well. Pathetic.",
119
+ "id": "22"
120
+ },
121
+ "23": {
122
+ "label": true,
123
+ "text": "A classic of its kind: This movie is a classic of its kind and much better that a lot of movies, that followed. It is not one of the best, but it still deserves five stars...",
124
+ "id": "23"
125
+ },
126
+ "24": {
127
+ "label": false,
128
+ "text": "Nice suite, but Virtual PC 7 disappoints on my G5: I purchased the upgrade since I'd already bought both Office v.X and Virtual PC 6.1 last year.The biggest letdown is that Microsoft's promised support for the G5 is nearly non-existent. I have a dual processor G5 with an ATI Radeon 9800 card (Apple), and after trying to install Virtual PC 7 three times, I cannot get a VM to work. It did install (and work) flawlessly on my G4 Powerbook. Googling for reviews finds it's very hit or miss, but if (when) it misses, you'll regret investing the extra $$$ in an immature product.",
129
+ "id": "24"
130
+ },
131
+ "25": {
132
+ "label": false,
133
+ "text": "Okay player, don't expect a miracle: I bought this DVD player at Circuit City earlier this yr for about a $100. I hooked it up to a 47\" Vizio LCD (which by the way has an awesome picture) using a HDMI cable. After fine tuning this product, I was very, very, very diasppointed. The picture was very \"grainy\" (lots of pixels). I have a $35 DVD player that only utilizes an s-video cable that produces a much more crisp picture. Be warned, the picture stinks.",
134
+ "id": "25"
135
+ },
136
+ "26": {
137
+ "label": true,
138
+ "text": "A revelation of the science of consciousness evolution and all natural growth: Here is a readable and fascinating account of the development of the new science of chaos theory, the only body of ideas that describes how the natural world as experienced by human beings emerges out of basic quantum processes. The different explorers and innovators of the new science are introduced in a personable way that will enchant the interested reader.",
139
+ "id": "26"
140
+ },
141
+ "27": {
142
+ "label": false,
143
+ "text": "Don't say that I didn't warn ya' !: I'm absolutely convinced that Delbert McClinton had no controlover the release of this CD. I rated it 1 star simplybecause there is no 0 star rating ! In actuality , I am not certain that the vocalist on this recording IS Delbert McClinton. Only on the Mr. Pitiful track is there any similarity at all to Delbert's voice. This is the perfect CD for someone with money to burn who would like to have a recording of a 1960's garage band recorded in a garage and who should be working in a garage ! Delbert fans...run fast and run far away from this ! END",
144
+ "id": "27"
145
+ },
146
+ "28": {
147
+ "label": false,
148
+ "text": "This item is not available: I ordered this unit on February 7th. Every time I checked back on the status of the order, it read \"not shipped\" and the estimated shipping date got moved out. I really don't think this unit is avaialble from the company anytime soon. I cancelled the order.",
149
+ "id": "28"
150
+ },
151
+ "29": {
152
+ "label": false,
153
+ "text": "I used to like ABBA...: I used to like ABBA, until I saw Mama Mia! A horribly disjointed musical, where songs feel contrived to fit into the story; a story that doesn't seem to come together. Individual songs are usually done alright, but don't segue from one to another very well.The cast butchered several of the songs, but especially S.O.S, Take A Chance On Me, and anything where Pierce Brosnan sang. On a side note, I also counted at least two violations of Chekov's Gun. And finally, I think it has a bad moral message. Which you only recognize if you manage to sit through the whole thing.If there is justice in the world, cast members without established careers won't get to have them as punishment for the worst movies I've seen since The Talented Mr. Ripley.",
154
+ "id": "29"
155
+ },
156
+ "30": {
157
+ "label": false,
158
+ "text": "A complete disaster!: If you're like me, you probably wanted to check out this movie because it sounded like it really could be an excellent supernatural Gothic horror tale full of goblins and wicked things alike. Well, don't make the same mistake I did and actually watch it. It's horrible. Terrible. An honest to goodness waste of film. The acting is wretched, the film quality is rotten (it actually looks twenty years older than it is), and the plot is thin, weak, and does not give you what it's supposed to. The only reason I bothered to give this film 1 star is because of Alexis Arquette -- he's great looking, but should have left this film out of his career.",
159
+ "id": "30"
160
+ },
161
+ "31": {
162
+ "label": true,
163
+ "text": "beautiful detail: I just purchased these Dover COloring Books for my mother and she loves them. The detail is out of this world and the variety of colors you can use are only limited by your inagination. HIGHLY RECOMMENDED!",
164
+ "id": "31"
165
+ },
166
+ "32": {
167
+ "label": false,
168
+ "text": "Very disappointed: I looked forward to getting this movie as I had heard many good things about it but it was nothing like I had imagined or been led to believe. There is very little actual history in it or real Christian experience except for the background because the main focus is a soap opera style romance and caricature figures. I agree with the reviewer who described it as a mixture of \"tawdry Hollywood sex\" somehow interspersed with a vague nod to Christianity. The only decent scene was the arena scene where the Christians are going to their deaths singing hymns - but that's not enough to make it a great or even a good movie. Not personally to my taste anyway.",
169
+ "id": "32"
170
+ },
171
+ "33": {
172
+ "label": false,
173
+ "text": "Unreliable minikit: I bought this minikit because it got good reviews and it would be perfect for my purposes. However it switches on and off whenever it wants, it looses contact with the phone. Very often the on/off button works only in a horizontal position (?) I use a Treo 650, which is on the compatible phone list. When I contacted Parrot, they said it wasn't (?) At last I opened the unit, but there are no moving parts inside except the micro switches. It is giving me a headache, so I will go searching for an alternative.",
174
+ "id": "33"
175
+ },
176
+ "34": {
177
+ "label": true,
178
+ "text": "A Christmas Classic!: This is surely one of the best classical Christmas recordings available. Don't buy the older version, as the quality of this recording is excellent. This is one of those \"Every Christmas - Can't have Christmas without\" recordings.",
179
+ "id": "34"
180
+ },
181
+ "35": {
182
+ "label": false,
183
+ "text": "too narrow: These were the narrowest pair of D size shoes I have ever tried on. I don't care how nice a shoe looks. If it don't fit it just don't fit.",
184
+ "id": "35"
185
+ },
186
+ "36": {
187
+ "label": false,
188
+ "text": "Lack of extension: This earphones lack a descent extension cord. ITs very small cable, but its of good quality. Sadly, cord its too short, and the extension is useless.",
189
+ "id": "36"
190
+ },
191
+ "37": {
192
+ "label": true,
193
+ "text": "Easy-Reading: This is the 3rd Southern Sisters Mystery I've read. They're easy, fast and funny murder mysteries, with lots of cute family stories intertwined in the intrigue.",
194
+ "id": "37"
195
+ },
196
+ "38": {
197
+ "label": false,
198
+ "text": "it'd be great if it worked like it was supposed to: for the first 30 seconds it was lovely, but i believe that either the motor isn't powerful enough to keep the shaft rotating smoothly or 3 AA batteries just don't provide enough juice for the motor to work more than 30 seconds. it was a nice idea, but i'm rather dissapointed. the jelly material is somewhat difficult to maintain also. i think if it were hooked up to a larger battery pack it'd be WONDERFUL... which i think i may have a macgyver friend with a knack for electronics attempt to do for me.",
199
+ "id": "38"
200
+ },
201
+ "39": {
202
+ "label": true,
203
+ "text": "Not Hornby's best but still good: I loved About a Boy and really, really loved the sardonic wit of High Fidelity. About a Boy is much deeper but just as cynical. Maybe even more so. The characters are richly drawn and just complex enough to keep the reader wanting more. Good read, but best to take some time with this one. Not recommended for a summer beach read.",
204
+ "id": "39"
205
+ },
206
+ "40": {
207
+ "label": false,
208
+ "text": "A Disappointment: As with most Taunton Press publications, the illustrations and photographs in this book are spectacular and the organization and layout is superb. Nonetheless, I found this book disappointing. It lacks both depth and breadth. I had hoped for a detailed review of wood joinery including some of the more unusual joinery found in Japanese woodworking. This book, however, is targeted more toward the beginner. Even so, it does not cover the details and \"tricks\" of even the most basic techniques in sufficient detail to allow beginners to easily reproduce them. Consequently, it is unclear who this book was written for - not the beginner as it lacks depth, and not the advanced woodworker as it lacks breadth. Far more effort appears to have been put into appearance and organization than in content.",
209
+ "id": "40"
210
+ },
211
+ "41": {
212
+ "label": false,
213
+ "text": "Horrible. Don't do it!: Great price for the item when a 6' one of these at Best Buy is $20. Thing is, the one from Best Buy fits in the outlet and stays there. This cord fits very loose and does not connect. I bought 2 of them, neither did what they were suppose to.As much as I hate to say it, but, buy the more expensive one. At least it works.",
214
+ "id": "41"
215
+ },
216
+ "42": {
217
+ "label": true,
218
+ "text": "Given as a gift...: Given to my best friend as a gift. She loves it. Her fiance enjoys making coffee for her in the mornings. :)",
219
+ "id": "42"
220
+ },
221
+ "43": {
222
+ "label": true,
223
+ "text": "Love the ring.: This is a nice ring. I was worried it out be thin and cheap looking, but it's not. It's a very pretty stylish ring. Go for it.",
224
+ "id": "43"
225
+ },
226
+ "44": {
227
+ "label": false,
228
+ "text": "Beautiful writing Marred by One-Note Characterizations: How could Kingsolver have ruined her book with such an obvious error? Nathan is a strident paper doll that flattens the whole story. Just as bad, the author has all the narrators using the same ironic tone to decribe him, deadening their voices as well. At the same time, Kingsolver doesn't have the guts to show him doing something trully terrible. I don't trust an author who can't let the reader make up his own mind, and as a consequence I couldn't trust her views about ANYTHING in the story. I'm giving this two stars for her descriptions of the African landscape, and that is all.",
229
+ "id": "44"
230
+ },
231
+ "45": {
232
+ "label": false,
233
+ "text": "Much worse than any cordless phone I've ever had: This phone cuts out only 2 rooms away from the base station. There is static noise, and callers on the other end complain about sound quality. I can't go into the garden, which used to be no problem with my old 900 MHz phone.",
234
+ "id": "45"
235
+ },
236
+ "46": {
237
+ "label": false,
238
+ "text": "Waste of time & money: The first Hangover was not too bad, this one was just terrible. The acting is bad, the script is bad, everything about this movie was just bad. Do yourself a favor, don't buy this movie as it is a total waste of time and money.",
239
+ "id": "46"
240
+ },
241
+ "47": {
242
+ "label": false,
243
+ "text": "Did Not Work For Me!: Impressive You Tube Video (Like a Sci-Fi Fantasy). In reality it's a high speed Easy Out so unsurprisingly it broke faster than an Easy out. This product did not work for me. The drill part did not drlil, the puller part did not pull. It was a total zero.",
244
+ "id": "47"
245
+ },
246
+ "48": {
247
+ "label": true,
248
+ "text": "Excellent book, long overdue.: From a very long time women were told that looking good was of utmost importance. This was without regard to health or fitness and how age affected these parameters. Witness the whalebone and other types of corsets, the spike heeled shoes and the numerous weight loss programmes on the market (some of which are downright dangerous). Now there is a book, backed by solid research, that allows women of all ages to remain fit and healthy for a lifetime. I am certainly going to recommend this book to all the women I know.Bentley Norville",
249
+ "id": "48"
250
+ },
251
+ "49": {
252
+ "label": false,
253
+ "text": "not an all star: Not a practical guide in this collecting age. Does NOT have a comprehensive list; meaning it does NOT cover all manufacturers and, more importantly, for the ones it does, only provides listings of the base set. That means no insert or variation pricing whatsoever. Also, no oddball or minor league issues are listed. Generally speaking, unless you are collecting base sets prior to the advent of inserts and alternate versions of the base set, this guide is fairly useless.",
254
+ "id": "49"
255
+ },
256
+ "50": {
257
+ "label": false,
258
+ "text": "Again, second rate city, third rate writer: Just another example of Mr. Lindberg's pitiful attempt at exhibiting a strong expertise on a subject with which he is clearly obsessed. Don't waste your time with this book, either. It is poorly written and fails to engage the reader. You might consider using this book and the first book he wrote on the same subject, as a pair of bookends. That is about all they are worth.",
259
+ "id": "50"
260
+ },
261
+ "51": {
262
+ "label": true,
263
+ "text": "Reality: No one should need to convince you to buy this book, you should just do it! It's so well written and worded and brings you right to the heart of a sexual reality that most people like to pretend doesn't really live and breath in their fair cities. I never again want to hear someone bad mouth a working girl for what she does. I will and do now however look at men with a curious eye wondering if they are depraved peep show window lickers :)",
264
+ "id": "51"
265
+ },
266
+ "52": {
267
+ "label": false,
268
+ "text": "Bummer: Visual effects and Battle footage were great...the other 85% of the movie was just lousy fluff...",
269
+ "id": "52"
270
+ },
271
+ "53": {
272
+ "label": true,
273
+ "text": "The spark of idependence: Filled with the independent spark that made us all love life at one point or another. A fun, introspective and nonsensical movie that sticks with you.",
274
+ "id": "53"
275
+ },
276
+ "54": {
277
+ "label": true,
278
+ "text": "What I expected from Mirman's website. Funny. Funny. Russian.: lol, gotta love Eugene. Even when his audience doesn't initially laugh, he gets in a good zinger at himself and they laugh at that. He's witty without being condescending, and uncomplicated without seeing contrived. However, if you're not a fan of irreverant humor, this may not be for you.",
279
+ "id": "54"
280
+ },
281
+ "55": {
282
+ "label": false,
283
+ "text": "Do not...repeat...do not bother!: It is not often that I offer a negative review but this compilation while attractive does not deliver at all.The foot massage gizmo is awkward and uncomfortable.The pumice stone leaves rough splinter like skin.The foot scrub doesn't reall scrub.The rotary action tool has five heads, none of which work well and you must hold the switch in place or it turns off. It is cumbersome and ineffective.The one star was initially given for a foot brush (which later lost its bristles very easily as I update the review) and a sweet smelling foot repair balm.Don't waist your money. Soak your feet and invest in an inexpensive German Titania file, smooth and coarser side, or a like product. It will last for years.",
284
+ "id": "55"
285
+ },
286
+ "56": {
287
+ "label": false,
288
+ "text": "Not Sandra's Best: Ms. Brown has written better romance novels. Don't give up on her if this was your first Sandra book.The feeble female lead struggles with a 15-year crush that walks back into her life. The smug male lead acts like a jerk through most of the novel. The romance scenes grapple to muster up passion but fall short. Both of the main characters bothered me; my favorite character was the 17-year old.A quick read...about 4 hours (with interruptions) for me...but probably not worth it.",
289
+ "id": "56"
290
+ },
291
+ "57": {
292
+ "label": true,
293
+ "text": "Impressed: Lots-O-Fun. Wood and glass toys are high quality and are a good fall back for the kids to play with they are \"bored\". Would buy again.",
294
+ "id": "57"
295
+ },
296
+ "58": {
297
+ "label": false,
298
+ "text": "Light turned on by itself 3 times: The installation was easy. I used it for a week, everything worked fine, EXCEPT the light it connected to turned on by itself 3 times so far, with no one near to either one of the switch. Not sure whether it is a defective unit, or this product is too sensitive to noise. I'm returning this product and will just install a regular switch instead.",
299
+ "id": "58"
300
+ },
301
+ "59": {
302
+ "label": true,
303
+ "text": "good battery: I feel kind of silly writing a review for a battery, but have to say that these last a LONG time. Work very well.",
304
+ "id": "59"
305
+ },
306
+ "60": {
307
+ "label": true,
308
+ "text": "Even a Woman finds it funny: Yes, even a woman finds \"Married to Mommy\" funny. The book gets you laughing aloud when it is trying to make fun of \"Mommies\". The truth is that it really is making fun of the stupidity of men and their simple basic needs of sex, getting out of work, and beer. Of course, the truth is always funny.A definite MUST for any woman, married or not. We will now know all the secret tricks the men try to use on us.By the way, I am NOT a MOMMY!",
309
+ "id": "60"
310
+ },
311
+ "61": {
312
+ "label": true,
313
+ "text": "Gungrave...not quite what you might expect: Those thinking this is another version of Trigun will be disappointed. Gungrave is actually a lot deeper and more complex. The lead is short on dialouge, but the story has more depth and character development than most anime. The first DVD is more about the main character's past than about the reanimated killing machine he's become, but it definitely leaves you wanting more.",
314
+ "id": "61"
315
+ },
316
+ "62": {
317
+ "label": true,
318
+ "text": "Error in product description: It's great in every way. However, if you'd prefer a digital tuner (as I do), then you might need to look further. The product description boasts a digital AM/FM tuner, but it's disappointingly an analog AM/FM tuner.Overall - especially for the price - I think it's pretty good.",
319
+ "id": "62"
320
+ },
321
+ "63": {
322
+ "label": true,
323
+ "text": "good phone but not as user friendly as it could be: Battery life is very good. Phone has good range. My only complaint is it's to involved to get your message from the handset.",
324
+ "id": "63"
325
+ },
326
+ "64": {
327
+ "label": false,
328
+ "text": "Big waste of money (and space in my house!): My 5 year old son wanted this so bad, but when we got it for him, there were so many pieces to put together that didn't fit together well, he never played with it. It just sits on our floor in many pieces taking up toy space! What a waste!",
329
+ "id": "64"
330
+ },
331
+ "65": {
332
+ "label": true,
333
+ "text": "Don't want to take it off: Very satisfied with an earlier purchase of this Bali bra model, I was just as pleased with the new one. Very comfortable, well made and a good neutral color. It will be my next choice, too.",
334
+ "id": "65"
335
+ },
336
+ "66": {
337
+ "label": true,
338
+ "text": "Fantastico: If anybody who's into rock music is ever looking for a band to keep you on your toes, this is the band. I've been a fan for 10 years now, and no album has ever sounded like any of their previous albums. This disc is fantastic with such a variety of styles, as are the previous releases, even back to the Rainbow Butt Monkey days.",
339
+ "id": "66"
340
+ },
341
+ "67": {
342
+ "label": false,
343
+ "text": "too much visual: There are far too much designs, visuals, colors, etc in the book - this is highly distracting, as TV screen can be...By way of example (among so many...), what is the use of colors with the three squares of the Pyth. theorem???? this is as useless as writting 2+3=5 with 2 in blue, 3 in red and 5 in yellow...I wish I had purchased the 2nd edition, which according to reviews was closer to what I was looking for.",
344
+ "id": "67"
345
+ },
346
+ "68": {
347
+ "label": true,
348
+ "text": "Aretha's First Arista Release Showed Pleasures to Come: After a long and musically satisfying career with Atlantic, Aretha severed her ties with that company and moved under the wing of Arista's Clive Davis. With the start of the 1980's, Aretha was looking for new territory to conquer and almost succeeded with this mixed bag.\"United Together\" is a fine tune that benefits from beautiful orchestral arrangement that is matched by Aretha's superb vocal instrument. The remake of \"Can't Turn You Loose\" allows Aretha to show why she is the Queen of Soul\" for she really belts this one out. Another cover, that of the Doobies' \"What a Fool Believes,\" is an interesting interpretation. The final cut \"School Days\" appears to be \"autobiographical\" for every girl growing up in the fifties.Although not as strong as her Atlantic work, \"Aretha\" is still a suitable addition to the artist's discography.",
349
+ "id": "68"
350
+ },
351
+ "69": {
352
+ "label": false,
353
+ "text": "Misguided Purchase: The photo and description do not reflect the product. The screen panel kit I received was white. What a huge inconvenience during a time-crunch.",
354
+ "id": "69"
355
+ },
356
+ "70": {
357
+ "label": false,
358
+ "text": "Banacek: My husband and were looking forward to seeing this series.The first show was SO boring, we finally just quit watching it.Actually, we haven't gotten around to watching anymore. I guess we were afraid of a repeat.Maybe that was just once, I hope!",
359
+ "id": "70"
360
+ },
361
+ "71": {
362
+ "label": true,
363
+ "text": "JDT: Uncle Tupelo is without doubt one of the most under appreciated groups of the 90's. Anodyne, like each of the three albums that came before it, has everything that a remarkable recording requires: great songs, honest lyrics, and artists who really care about the music they are making. Like the best of Dylan and Springsteen, the songs are about real people with real troubles and joys. When you hear them you know they are coming from the heart. The songs contributed by Jay Farrar and Jeff Tweedy are easily differentiated by the voacls, music, and lyrics. What makes this record interesting is how well these unique sounds compliment each other. The union is seamless.",
364
+ "id": "71"
365
+ },
366
+ "72": {
367
+ "label": true,
368
+ "text": "Well Worth Reading: First a confession: Miriam Wasserman was my mother. However, she published several books, but this is the only one I really found useful. She walks the reader through the New York City school system and the attitudes of different groups involved in the system back in the 1960s. This includes parents, teachers and administrators. Her view is that the further away one got from parents and students, the more prestige one had. She meticulously describes the teachers' strike of 1968 against \"community control of schools\", a strike of which she is extremely critical. She explores the racism that was involved in this strike, including using quotes from striking teachers, etc. It should be emphasized that the author was pro-union all her life, so her views don't stem from an anti-union bias. The book also covers the high school student rebellion which coincided with and followed the strike.",
369
+ "id": "72"
370
+ },
371
+ "73": {
372
+ "label": true,
373
+ "text": "compact and loaded: I bought this phone after reading the cnet reviews and really liked it. It looks small and really compact. I like the camera pics at 2 mega pixel and bright flash. The mp3 player is crisp. The headset that comes along delvers amazing fM radio. I think my phone is not very loud and you have a problem when you are around a noisy crowd. I just bought this phone again for my cousin. He likes it too. Almost forgot the display is very good.",
374
+ "id": "73"
375
+ },
376
+ "74": {
377
+ "label": true,
378
+ "text": "Outstanding text!: Brooks/Cole should keep this text in their catalog for ages! It is well-written, examples are generally quite clear, vocabulary is introduced well, and the exercises develop real skills, rather than simply be busy-work. One of the best calculus books ever!",
379
+ "id": "74"
380
+ },
381
+ "75": {
382
+ "label": true,
383
+ "text": "Excel 2003 Bible: Very good source of information. I will most likely buy other books in this series.",
384
+ "id": "75"
385
+ },
386
+ "76": {
387
+ "label": true,
388
+ "text": "Tasting is Believing: Gluten-free breads used to have a gritty texture from the rice flour, and were too soft for sandwiches. Bette Hagman uses garbanzo/fava bean flour, sorghum flour, tapioca flour, and corn starch to create breads which have a similar texture to wheat flour breads, and the flavors of her breads are fabulous.My BF bought me this book and a great tasting beverage to drink it with. Since he knows I quit coffee recently, he's been really wonderful helping me in cope with my mood swings. S o y f e e is made from soy beans that is roasted just like coffee. I enjoy the taste and don't miss coffee one bit. Buy it online at www.s o y c o f fee.com.This is a 'must have' for anyone baking gluten-free. I think all of Bette Hagman's books are wonderful and a must for those with gluten intolerance.",
389
+ "id": "76"
390
+ },
391
+ "77": {
392
+ "label": true,
393
+ "text": "5 stars for the show, no stars for the \"Collector's Edition\": I was really looking forward to getting this Collector's Edition and see what extras were added. I knew it wasn't a lot - just a mini-book and a documentary - but I figured it would be packaged in a cool way.Wrong.As others have already mentioned, the Collector's Edition is *literally* theAvatar: The Last Airbender - The Complete Book 1 Collectionslipped into another cardboard box, with a little booklet and DVD in an envelope (not even a case!) wedged in. It's really disappointing; it would have been so easy to create a quality Collector's Edition but the studio couldn't be bothered, I guess.",
394
+ "id": "77"
395
+ },
396
+ "78": {
397
+ "label": true,
398
+ "text": "sula scottcampos: Sula, a book that talks about the issues of being a black women is a really good novel to read.One of the reasons I recommend it is because of its realism and its themes - death, sex, friendship and poverty.I also think that its characters are very good, its easy to identify with one or both of them. I really recommend this book to anyone who enjoys good literature.",
399
+ "id": "78"
400
+ },
401
+ "79": {
402
+ "label": true,
403
+ "text": "Fantastic! It's a must-have for girls!: I hated razor, tried shaving but it did not work for me. Shaving made the hair grows thicker and faster afterwards, plus the roots are impossible to be getting rid of. After reading the reviews, I ordered it to try, I used it for once and already fall in love with this. I used to use small tweezer to pluck out my leg's hair, in order to avoid the razor, it took me a few hours to do that but this super electronic tweezer works wonder! You won't see the black roots and I have smooth and silkly legs in 20 mins. It does not hurt at all, if you use it on your legs. But, if you use it at your under arm, it won't be a pleasant feeling, of course! I will never use anything else besides this for hair removing anymore! highly recommended!",
404
+ "id": "79"
405
+ },
406
+ "80": {
407
+ "label": false,
408
+ "text": "This is not a toy: I guess I was expecting more out of these leave window decals. I just didn't find them attractive after placing them on my window, they seem very cheap, I guess because they are cheap.I threw them away.",
409
+ "id": "80"
410
+ },
411
+ "81": {
412
+ "label": true,
413
+ "text": "Wonderful book for anyone running a professional hatchery: This book is aimed more for hatcheries that are raising Trout, Salmon, Catfish and other food fishes. However, there is so much information in this book that even ornamental fish hatcheries will find an incredible amount of useful information. The chapters on Fish Nutrition are especially helpful.",
414
+ "id": "81"
415
+ },
416
+ "82": {
417
+ "label": true,
418
+ "text": "Amazing book!!: Once again, Eric Victorino's artistic talent is put into this great free-verse poetry book. I couldn't put it down and I finished it the day I received it in the mail. All of the poems are awesome but the one I found the most interesting was \"It's A People Business.\" All of the experiences in his life, personally and with his band, come to life in this book. Please check it out! It's worth every penny!!",
419
+ "id": "82"
420
+ },
421
+ "83": {
422
+ "label": true,
423
+ "text": "The white trumpet contender respect Miles Davis!: The story of the Jazz in the Fifties certainly would be remain unfinished without the ominous presence of this outstanding virtuoso. Baker sound still possesses this alluring hook, this magnetic engagement charm, eloquent expressiveness, enrapturing lyricism and contagious rhythm, despite the elapsed time, which confirms by itself the status of his musicianship.This selection is jus a little sample of the broad universe of his genius. A well thought selection of great musical successes, available, preserved and immortalized by the Digital Technology for our future enjoyment.Absolutely indispensable in your treasured collection.",
424
+ "id": "83"
425
+ },
426
+ "84": {
427
+ "label": false,
428
+ "text": "What the?: I'm sorry, maybe it's just me but I can't helping stating that this has to be one of the wrost movies I've seen in my life!Can you say boring? Can you say doesn't make sense at all? The first 30 minutes of the movie were O.K. But it went downhill after that. This movie is a prime example of a director attempting to make a deep movie with a meaningful lesson but failed on all levels. I don't recommend this movie unless you want to go to sleep or you don't have anything else to do.",
429
+ "id": "84"
430
+ },
431
+ "85": {
432
+ "label": true,
433
+ "text": "very very good!!!!: linda blair is a young girl who is possessed. and her mother doesn't know what to do until one day when she hears her daughter screaming and stabbind herself she knows what to do GET AN EXORCIZIM!!!",
434
+ "id": "85"
435
+ },
436
+ "86": {
437
+ "label": true,
438
+ "text": "Awesome product for the price!: This range extender works as advertised! I am very happy with the purchase. I was a little worried after reading some of the horror stories here, but I have to say, Chovy's review instructions (on this site) were just this ticket to get the repeater up and running in less than 30 minutes. It was unbelievably easy to install! Do not be frightened by negative reviews. If you can set up a wireless network, you can set up this repeater. However, I did upgrade the firmware before I did anything else and maybe that helped. I got the firmware update from the Belkin site.",
439
+ "id": "86"
440
+ },
441
+ "87": {
442
+ "label": false,
443
+ "text": "Slight: This book is either a heavily illustrated short story collection or a text-heavy comic. Its unusual format is its most original feature. Its plots are negligible, but its illustrations and text evoke a unique atmosphere of self-conscious nonconformism. Although its target audience is dare-to-be-different teens and college students, its interesting turns of phrase and expressive line drawings are not devoid of interest for general audences.",
444
+ "id": "87"
445
+ },
446
+ "88": {
447
+ "label": true,
448
+ "text": "ANgeleyes: Seem to dry up their eyes fairly well, although I haven't seen the color (brown stain) change much yet.",
449
+ "id": "88"
450
+ },
451
+ "89": {
452
+ "label": false,
453
+ "text": "Nice Try: Salt Lake 2002 is not a bad game, but it isn't good either. The graphics are excellent, but some of the events are bad. Bobsleigh, and skiing aren't bad but the others are. You dont stay into it for long. I liked it for a while, but it gets boring.",
454
+ "id": "89"
455
+ },
456
+ "90": {
457
+ "label": false,
458
+ "text": "Cutler's share of the pie: This book was a major disappointment. I am familiar with books written solely by the Dalai Lama, such as the \"Library of Tibet\" series, which are much more engrossing and have much more substance than Cutler's book. Cutler attempts (successfully, sadly) to have his share of the profitable market that involves the Dalai Lama's writings. The book is insipid, does not try to explain any important issue in the light of Buddhist philosophy, and only rehashes issues that several other westerners already wrote about. It's another big ego trip: we keep hearing time and again about his opportunities to be with the Dalai Lama. What a shame, Cutler. I sold the book as soon as I finished it.",
459
+ "id": "90"
460
+ },
461
+ "91": {
462
+ "label": false,
463
+ "text": "Mostly tedious, with interesting parts: I found the writing interesting, and the subject fascinating, but I found myself frustrated by the author's difficulty in talking directly about the status of Muslim women with her interview subjects. The author spent many pages writing about the menus and dress of the many middle and upper-middle class women she interviewed. It seemed as though her interview subjects resisted her efforts to discuss the status of women in their countries, so we too as readers had to wade through much distracting material and misunderstandings about feminism and gender. Great travel stories, but not a great source of information about Muslim women.",
464
+ "id": "91"
465
+ },
466
+ "92": {
467
+ "label": false,
468
+ "text": "Sesame Street Toddler: I did not find this game to be as educationally sound as I would expect from Sesame street. There is too much talking before the program will react to a command. The graphics are jerky and the cursor acts like the target is magnetically charged and keeps pushing away the cursor. When the child actually does manage to click on a target, the cursor may still fly to another target and the child is told that his answer is wrong. Another example of educational problems is the pronunciation of \"eggs\" using a long \"a\" sound instead of a short \"e.\" This is not very helpful in teaching a child the sound for short \"e.\" Children that are used to playing computer games by themselves may find that this game is too frustrating to do alone. The open ended learning curve is a great idea. I just wish Sesame Street would hire a truly qualified literacy expert to help clean up the many problems in this program.",
469
+ "id": "92"
470
+ },
471
+ "93": {
472
+ "label": false,
473
+ "text": "needs a buzz cut and a point: I avoided reading this book, not because of the hermaphrodite subject matter, but because I have never read a multigenerational family saga that I liked. Many books let me down in the middle, and this was no exception. The beginning of the book was incredible and harrowing, with momentum and characterization. The post-America nextgens part of the saga was so boring I found myself flipping and flipping - always a bad sign. If there was some kind of larger point to all of that, then I must have missed it. Yes there's the identity duality and trinity themes playing out here: man/woman, greek/turkish/american modern/old world sick/healthy innocent/guilty original/reinvented. But it was almost as if the author was saying - here it is again - get it? I like my fiction much more subtle than this.",
474
+ "id": "93"
475
+ },
476
+ "94": {
477
+ "label": false,
478
+ "text": "OMG! DO NOT BUY!: I normally don't take the time to submit a review.In this case however, I feel obligated to do so.This is by far one of the worst purchases I have ever made.Here's why.....The contraption is far too bulky.The case's enclosing is unbearable, takes a good minute or so to open it.The texture of the material feels like a cheap toy.The overall design is horrible, something I could make in my basement.For the love of everything sacred, do not buy this thing.",
479
+ "id": "94"
480
+ },
481
+ "95": {
482
+ "label": true,
483
+ "text": "Good price, good quality: Comparable HDMI cables can be bought for 45 or more. Even though the price is cheap the quality is good, no problems so far.",
484
+ "id": "95"
485
+ },
486
+ "96": {
487
+ "label": true,
488
+ "text": "Good rock music: This is what i call rock music good beat and good lyrics, don't listen to the other reviews. This cd is one of the best, listen to a few songs and you will get hooked. I recommend this cd its awesome.",
489
+ "id": "96"
490
+ },
491
+ "97": {
492
+ "label": false,
493
+ "text": "BORING!: This movie is soo boring. How in the hell did this movie make so much at the box office. Do people really want to pay for crappy movies like this. bottom line this is a chick flick nothing is good. And now they are re-releasing this movie with more boring stuff. This is the worst movie ever.",
494
+ "id": "97"
495
+ },
496
+ "98": {
497
+ "label": false,
498
+ "text": "Already Rusting: Inferior quality. The plating is thin and rust is coming through the finish. Inexcusable for a product that is designed for use in a humid environment.",
499
+ "id": "98"
500
+ },
501
+ "99": {
502
+ "label": false,
503
+ "text": "confusing internet setup: i wanted a camera that could email photos but this camera will not go out through the router and the manual setup , to punch a hole thru router is confusing.",
504
+ "id": "99"
505
+ },
506
+ "55066581ad334ef5844c6f7707525010": {
507
+ "label": true,
508
+ "text": "Thought this was super cool, and a really important step in all the physical books' preservation.",
509
+ "id": "55066581ad334ef5844c6f7707525010"
510
+ },
511
+ "fef14d13366f482d9f4e0726b357f178": {
512
+ "label": true,
513
+ "text": "There are some amazing hikes around Mt. Fuji.",
514
+ "id": "fef14d13366f482d9f4e0726b357f178"
515
+ },
516
+ "70aed7369aa74031a06f5f3155476d7c": {
517
+ "label": true,
518
+ "text": "Thought this was super cool, and a really important step in preserving all the physical books.",
519
+ "id": "70aed7369aa74031a06f5f3155476d7c"
520
+ },
521
+ "ac65d14b710648b8bf3c2a53caf6ac91": {
522
+ "label": false,
523
+ "text": "The profits of the business that was most successful were still negative.",
524
+ "id": "ac65d14b710648b8bf3c2a53caf6ac91"
525
+ },
526
+ "ce00e6b1547444259a13c55654e66500": {
527
+ "label": true,
528
+ "text": "love them best, they reconnect in hysterically funny and emotionally significant ways.",
529
+ "id": "ce00e6b1547444259a13c55654e66500"
530
+ },
531
+ "8943a94d205b43ceb4420d5ab9c5611a": {
532
+ "label": true,
533
+ "text": "Walt Disney's timeless masterpiece is an extravaganza of sight and sound! See the music come to life, hear the pictures burst into song and experience the excitement that is Fantasia over and over again.",
534
+ "id": "8943a94d205b43ceb4420d5ab9c5611a"
535
+ },
536
+ "6af8fc3dd30d4f8caf5a2929fc88534b": {
537
+ "label": false,
538
+ "text": "A director struggles with a difficult sex scene between a young actor and actress who can't stand one another. Aided by her loyal assistant, she is hell-bent on getting the scene right without compromise.",
539
+ "id": "6af8fc3dd30d4f8caf5a2929fc88534b"
540
+ },
541
+ "dbe571ed810d40f48170147dcab1c90f": {
542
+ "label": false,
543
+ "text": "sound created by drawing directly on the soundtrack).",
544
+ "id": "dbe571ed810d40f48170147dcab1c90f"
545
+ },
546
+ "682102dfc5494f03926d16ae947a6250": {
547
+ "label": true,
548
+ "text": "one of glowing admiration! Written by Mark Toscano",
549
+ "id": "682102dfc5494f03926d16ae947a6250"
550
+ },
551
+ "9b044458bb0e4bd68359e62d5fb4b979": {
552
+ "label": false,
553
+ "text": "Seth McArdle (Samuel Davis) is a high school senior with an especially full plate. Not only must he navigate the usual social and academic pitfalls of high school, but he has to contend with his young twin sisters, serving as de facto parent in the absence of his deceased mother and deadbeat father. The pressure mounts when the bank calls with a foreclosure warning, and Seth's frustrations spill",
554
+ "id": "9b044458bb0e4bd68359e62d5fb4b979"
555
+ },
556
+ "abf2d24c7d8845769b7368be28f2c25d": {
557
+ "label": true,
558
+ "text": "Bjork is a beautiful creature and her music is stellar to anything I've ever heard. This DVD is essential for all Bjork fans, because you find something new every time you watch it.",
559
+ "id": "abf2d24c7d8845769b7368be28f2c25d"
560
+ }
561
+ },
562
+ "version": 11,
563
+ "description": "Positive sentiment"
564
+ }
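Each concept.json added in this commit shares the schema visible above: top-level `namespace`, `concept_name`, and `type` fields, a `data` map keyed by example id (sequential integers for the seed examples, 32-character hex ids for later additions, matching the format of Python's `uuid.uuid4().hex`), and closing `version` and `description` fields; every `data` entry holds a boolean `label`, the raw `text`, and a repeated `id`. As a minimal sketch of consuming that layout (assuming only the structure shown in this diff; the helper name is hypothetical, not part of the Lilac API):

```python
import json
from pathlib import Path


def load_concept_examples(path: str) -> tuple[list[str], list[str]]:
  """Split a concept.json file (schema as in this diff) into positive and negative texts."""
  concept = json.loads(Path(path).read_text(encoding='utf-8'))
  positives, negatives = [], []
  for example in concept['data'].values():
    # `label` is a boolean; `text` is the raw labeled example.
    (positives if example['label'] else negatives).append(example['text'])
  return positives, negatives


pos, neg = load_concept_examples('lilac/concepts/positive-sentiment/concept.json')
print(len(pos), 'positive /', len(neg), 'negative examples')
```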
lilac/concepts/profanity/concept.json ADDED
The diff for this file is too large to render. See raw diff
 
lilac/concepts/question/concept.json ADDED
The diff for this file is too large to render. See raw diff
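Label/text pairs like those in these files are exactly the shape needed to fit a small binary classifier. Purely as an illustration (this is not Lilac's concept scorer, which is not shown in this diff; scikit-learn is assumed to be available), a TF-IDF logistic regression over the loaded examples could look like:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# pos/neg as returned by the hypothetical load_concept_examples() sketch above.
texts = pos + neg
labels = [True] * len(pos) + [False] * len(neg)

model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
model.fit(texts, labels)

# Column 1 is the probability of the True (concept-positive) class.
print(model.predict_proba(['This movie was fantastic!'])[:, 1])
```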
 
lilac/concepts/source-code/concept.json ADDED
@@ -0,0 +1,389 @@
1
+ {
2
+ "namespace": "lilac",
3
+ "concept_name": "source-code",
4
+ "type": "text",
5
+ "data": {
6
+ "c7d0400c6e5442a59859ea7b0a7d6bab": {
7
+ "label": true,
8
+ "text": "const num1 = 10;\nconst num2 = 20;\nconst sum = num1 + num2;",
9
+ "id": "c7d0400c6e5442a59859ea7b0a7d6bab"
10
+ },
11
+ "cfa936b9ba9e4c72b835b44d8cfb393b": {
12
+ "label": true,
13
+ "text": "function calculateArea(radius) {\n return Math.PI * radius * radius;\n}",
14
+ "id": "cfa936b9ba9e4c72b835b44d8cfb393b"
15
+ },
16
+ "3952102e61a44fde92117a0519c4e8e6": {
17
+ "label": true,
18
+ "text": "let message = 'Hello, World!';\nconsole.log(message);",
19
+ "id": "3952102e61a44fde92117a0519c4e8e6"
20
+ },
21
+ "6e90cb4c8fdb46a1b38460b5d2eca907": {
22
+ "label": true,
23
+ "text": "for (let i = 0; i < 10; i++) {\n console.log(i);\n}",
24
+ "id": "6e90cb4c8fdb46a1b38460b5d2eca907"
25
+ },
26
+ "7e7a438002384ae194f35f27b9c85888": {
27
+ "label": true,
28
+ "text": "const colors = ['red', 'green', 'blue'];\nfor (const color of colors) {\n console.log(color);\n}",
29
+ "id": "7e7a438002384ae194f35f27b9c85888"
30
+ },
31
+ "91cc90d155ef4c1fb4f3c458cfdc8fac": {
32
+ "label": false,
33
+ "text": "No bathroom bill made it to Abbott\u2019s desk by the end of the legislative session in May.",
34
+ "id": "91cc90d155ef4c1fb4f3c458cfdc8fac"
35
+ },
36
+ "c67e408ec3544898a0d3fc21c2ee36c3": {
37
+ "label": false,
38
+ "text": "The theory that they are products of the radiation from the bomb is genius.",
39
+ "id": "c67e408ec3544898a0d3fc21c2ee36c3"
40
+ },
41
+ "fa78b497c7704c198fe0b2a320ed55e6": {
42
+ "label": false,
43
+ "text": "We built our society on clean energy.",
44
+ "id": "fa78b497c7704c198fe0b2a320ed55e6"
45
+ },
46
+ "a8bef6215f2346f7b67101b055724e99": {
47
+ "label": false,
48
+ "text": "No bathroom bill made it to Abbott\u2019s desk by the end of the legislative session in May.",
49
+ "id": "a8bef6215f2346f7b67101b055724e99"
50
+ },
51
+ "cab51176c2f74c8497410764628ea7cf": {
52
+ "label": false,
53
+ "text": "They should be attached to the lifting mechanism in the faucet.",
54
+ "id": "cab51176c2f74c8497410764628ea7cf"
55
+ },
56
+ "6585fbabe83444cfb43dee977dc3ebfe": {
57
+ "label": false,
58
+ "text": "This dataset is very big.",
59
+ "id": "6585fbabe83444cfb43dee977dc3ebfe"
60
+ },
61
+ "59e8b12ef7dd4e948dfae42c97f55721": {
62
+ "label": false,
63
+ "text": "The 15th Tank Corps was a corps of the Soviet Union's Red Army.",
64
+ "id": "59e8b12ef7dd4e948dfae42c97f55721"
65
+ },
66
+ "efaa58793e2840c6b966caa6a11ecaad": {
67
+ "label": false,
68
+ "text": "Every lunch hour I make it my goal to sift through one research paper.",
69
+ "id": "efaa58793e2840c6b966caa6a11ecaad"
70
+ },
71
+ "b529748962774d36a4ff781da0e327bf": {
72
+ "label": false,
73
+ "text": "On Sunday, Jane had a party.",
74
+ "id": "b529748962774d36a4ff781da0e327bf"
75
+ },
76
+ "b6f2f93f75f44d3780882cd1ebd3d311": {
77
+ "label": false,
78
+ "text": "TIL David Attenborough and Queen Elizabeth II are roughly the same age.",
79
+ "id": "b6f2f93f75f44d3780882cd1ebd3d311"
80
+ },
81
+ "2c04925ab5114925b9e891eb2706b83e": {
82
+ "label": true,
83
+ "text": "```js\nfor (var i = 1; i < 12; i++) {\n console.log(i);\n}\n```",
84
+ "id": "2c04925ab5114925b9e891eb2706b83e"
85
+ },
86
+ "030117a7d9044dd8b055c80853804d3d": {
87
+ "label": false,
88
+ "text": "\u00a1Desde luego! Aqu\u00ed tienes unas secuencias del tipo \"123456789\" convertidas en operaciones aritm\u00e9ticas (haciendo uso de los operadores + y -) cuyo resultado da 200:\n\n* 123 + 4 + 5 + 67 - 8 + 9 = 200\n* 123 - 4 + 5 - 6 - 7 + 89 = 200\n* 1 + 234 - 5 - 6 - 7 - 8 - 9 = 200",
89
+ "id": "030117a7d9044dd8b055c80853804d3d"
90
+ },
91
+ "c0095d13aed4431bb75bbc215b0d4819": {
92
+ "label": false,
93
+ "text": "and the Roman Senate was often more interested in maintaining their own power and wealth than in governing effectively.",
94
+ "id": "c0095d13aed4431bb75bbc215b0d4819"
95
+ },
96
+ "6ea50e4168ed42fe94ce15a9c330a54f": {
97
+ "label": true,
98
+ "text": "```javascript\n// Get the canvas element\nconst canvas = document.getElementById('myCanvas');\n\n// Get the 2D context of the canvas\nconst ctx = canvas.getContext('2d');\n\n// Set the position of the circle\nconst x = 100; // X coordinate\nconst y = 100; // Y coordinate\n\n// Set the radius of the circle\nconst radius = 50;\n\n// Set the color of the circle\nconst color = 'blue';",
99
+ "id": "6ea50e4168ed42fe94ce15a9c330a54f"
100
+ },
101
+ "7266060a88d04a6299922714f62091d8": {
102
+ "label": false,
103
+ "text": "Certainly! Here's a simple JavaScript function that takes a mathematical expression as a string and a value for x, then returns the result of the operation:\n\n",
104
+ "id": "7266060a88d04a6299922714f62091d8"
105
+ },
106
+ "6bf7df5363ff4eb3ac237eed4acda0af": {
107
+ "label": false,
108
+ "text": "Another factor in Rome's decline was the empire's overextended infrastructure. The Roman Empire was renowned for its impressive network of roads, aqueducts, and other public works. However, as the empire expanded and resources became scarce, the maintenance of these critical systems was neglected. This deterioration led to problems such as a lack of clean drinking water, food shortages, and the",
109
+ "id": "6bf7df5363ff4eb3ac237eed4acda0af"
110
+ },
111
+ "048952d5e620410282e09d625dc45f20": {
112
+ "label": true,
113
+ "text": "```javascript\nconst canvas = document.getElementById('canvas'); // note that you will have to use your own canvas\nconst context = canvas.getContext('2d');\nconst x = 100; // x-coordinate, you can set it up\nconst y = 100; // y-coordinate, you can set it up too\nconst radius = 50; // radius of the circle, set-up-able\nconst color = 'red'; // color of the circle, also set-up-able\ncontext.beginPath();",
114
+ "id": "048952d5e620410282e09d625dc45f20"
115
+ },
116
+ "eb5ee4b631bc400d95aa21aed4cb1dcd": {
117
+ "label": true,
118
+ "text": "```javascript\n// Get the canvas element from the DOM\nconst canvas = document.getElementById('canvas');\n\n// Get the 2D context of the canvas\nconst ctx = canvas.getContext('2d');\n\n// Set the position, radius, and color of the circle\nconst x = 50;\nconst y = 50;\nconst radius = 30;\nconst color = 'red';\n\n// Begin a new path\nctx.beginPath();\n\n// Draw the circle\nctx.arc(x, y, radius, 0, 2 * Math.PI);",
119
+ "id": "eb5ee4b631bc400d95aa21aed4cb1dcd"
120
+ },
121
+ "ca0e90e7a6f54427997b9c98ab86508e": {
122
+ "label": true,
123
+ "text": " const getColor = (depth) => {\n if (depth < 0.25) {\n return `rgb(0, ${Math.floor(depth * 1020)}, 255)`;\n } else if (depth < 0.5) {\n return `rgb(0, 255, ${Math.floor(1020 - depth * 1020)})`;\n } else if (depth < 0.75) {\n return `rgb(${Math.floor(depth * 1020) - 255}, 255, 0)`;\n } else {\n return `rgb(255, ${Math.floor(1020 - depth * 1020)}, 0)`;\n }\n };",
124
+ "id": "ca0e90e7a6f54427997b9c98ab86508e"
125
+ },
126
+ "bec8c965143b4a0dba7add98346996a0": {
127
+ "label": false,
128
+ "text": "Rome itself. Consequently, military leadership was fraught with intrigue, as ambitious generals vied for power, often at the expense of the empire's stability.",
129
+ "id": "bec8c965143b4a0dba7add98346996a0"
130
+ },
131
+ "8342a4029d384b5183a78ca0dc7e398e": {
132
+ "label": true,
133
+ "text": "\n \n \n \n \n \n \n \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-5', placement: 'Interstitial Gallery Thumbnails 5', target_type: 'mix' }); \n \n \n ",
134
+ "id": "8342a4029d384b5183a78ca0dc7e398e"
135
+ },
136
+ "aecfb1e4c7ba45fb8847f304c5c848af": {
137
+ "label": false,
138
+ "text": "miles an hour,\" and worried that their \"uteruses would fly out of [their] bodies as they were accelerated to that speed.\" ",
139
+ "id": "aecfb1e4c7ba45fb8847f304c5c848af"
140
+ },
141
+ "6a6aaea62afc43258b646b4b25d93692": {
142
+ "label": true,
143
+ "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-15', placement: 'Interstitial Gallery Thumbnails 15', target_type: 'mix' }); _taboola.push({flush: true}); \n ",
144
+ "id": "6a6aaea62afc43258b646b4b25d93692"
145
+ },
146
+ "4f94dc90c0c94420a99301ac1fc92171": {
147
+ "label": true,
148
+ "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-25', placement: 'Interstitial Gallery Thumbnails 25', target_type: 'mix' }); _taboola.push({flush: true}); \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ",
149
+ "id": "4f94dc90c0c94420a99301ac1fc92171"
150
+ },
151
+ "64e15fb5d64548ca8b0d2c288916d303": {
152
+ "label": false,
153
+ "text": "Choose the Ellipse, Rectangle, or Polygon tool from InDesign's Tools panel. Adjust your background shape's size by clicking one of your shape's handles with your Select tool and dragging it until your shape is the correct size. Make sure the object is selected, then open InDesign's Swatches panel and select the Fill button. Choose the color you want to apply from the Swatches panel. Your shape",
154
+ "id": "64e15fb5d64548ca8b0d2c288916d303"
155
+ },
156
+ "3374e377ae4440a98626ae06aba98dea": {
157
+ "label": true,
158
+ "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-20', placement: 'Interstitial Gallery Thumbnails 20', target_type: 'mix' }); _taboola.push({flush: true}); \n ",
159
+ "id": "3374e377ae4440a98626ae06aba98dea"
160
+ },
161
+ "2e14c0bbc5d84955b6b65f502b599eda": {
162
+ "label": true,
163
+ "text": "value=\"#000000\" /><param name=\"allowScriptAccess\" value=\"always\" /><param name=\"allowFullScreen\" value=\"true\" /><param name=\"flashvars\" value=\"embedType=noscriptObjectTag&embedCode=1udG03NTowIl-eAG5T0wYzU_zYNmmNht&videoPcode=BhdmY6l9g002rBhQ6aEBZiheacDu\" /><embed src=\"http://player.ooyala.com/player.swf?embedCode=1udG03NTowIl-eAG5T0wYzU_zYNmmNht&version=2\" bgcolor=\"#000000\" width=\"618\"",
164
+ "id": "2e14c0bbc5d84955b6b65f502b599eda"
165
+ },
166
+ "a556ea1110714d38ad2fdcaac47a4332": {
167
+ "label": false,
168
+ "text": "What are the best resources to learn Express.js?\nWhat is the best way to learn AngularJS and nodeJS?\n\nAre these two questions inquiring about the same information?",
169
+ "id": "a556ea1110714d38ad2fdcaac47a4332"
170
+ },
171
+ "20f20d8cff8c475f93b25298e5f58ff2": {
172
+ "label": true,
173
+ "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-10', placement: 'Interstitial Gallery Thumbnails 10', target_type: 'mix' }); _taboola.push({flush: true}); \n ",
174
+ "id": "20f20d8cff8c475f93b25298e5f58ff2"
175
+ },
176
+ "7f810a7c98bc4d44a3dd07688d39ae02": {
177
+ "label": false,
178
+ "text": "used on the left side and when used on the right side it extends too far forward. C'est la Vie. I will try it on my XBox next. Answer:",
179
+ "id": "7f810a7c98bc4d44a3dd07688d39ae02"
180
+ },
181
+ "3b476d5fbd0f472eb26485a58690f62a": {
182
+ "label": true,
183
+ "text": "let-Step-7Bullet1.jpg\",\"smallWidth\":460,\"smallHeight\":306,\"bigWidth\":\"728\",\"bigHeight\":\"485\",\"licensing\":\"<div",
184
+ "id": "3b476d5fbd0f472eb26485a58690f62a"
185
+ },
186
+ "5c51478ab56d4ef580aec02f4554a557": {
187
+ "label": true,
188
+ "text": " \n window._taboola = window._taboola || []; _taboola.push({ mode: 'thumbnails-c', container: 'taboola-interstitial-gallery-thumbnails-5', placement: 'Interstitial Gallery Thumbnails 5', target_type: 'mix' }); _taboola.push({flush: true}); \n ",
189
+ "id": "5c51478ab56d4ef580aec02f4554a557"
190
+ },
191
+ "8c1657d2e75b440aae9c0c36162f13b5": {
192
+ "label": false,
193
+ "text": "Worries increased with age. People ages 30 to 49 \u2014 a time when people are coping with tuition payments, car payments and child-care costs \u2014 were the most likely to say they are limiting their monthly spending. And seniors were three times as likely as those 18 to 29 to say stagnant income was the main reason for cutting back.",
194
+ "id": "8c1657d2e75b440aae9c0c36162f13b5"
195
+ },
196
+ "123e676caaba4266bc2cf174c33a9816": {
197
+ "label": true,
198
+ "text": "Aqu\u00ed hay un ejemplo de c\u00f3mo realizar una operaci\u00f3n AND en dos bytes en JavaScript:\n\nconst byte1 = 0b10101010; // 170 en decimal\nconst byte2 = 0b11110000; // 240 en decimal\n\nconst resultado = byte1 & byte2;\n\nconsole.log(resultado.toString(2)); // \"10100000\"",
199
+ "id": "123e676caaba4266bc2cf174c33a9816"
200
+ },
201
+ "5990d015dd1249a58af88ec3adca6a58": {
202
+ "label": false,
203
+ "text": "How in JavaScript draw a circle with radius 10 pixels in a html canvas?",
204
+ "id": "5990d015dd1249a58af88ec3adca6a58"
205
+ },
206
+ "ca62cf272b1e463aa973d1f23eb25fc5": {
207
+ "label": false,
208
+ "text": "such as Tacitus and Livy, and philosophers like Seneca and Marcus Aurelius influencing generations of thinkers and writers.",
209
+ "id": "ca62cf272b1e463aa973d1f23eb25fc5"
210
+ },
211
+ "5a7986b9233d45b1b194001c74c98af0": {
212
+ "label": true,
213
+ "text": "menu. {\"smallUrl\":\"https:\\/\\/www.wikihow.com\\/images\\/thumb\\/a\\/ad\\/Enable-Automatic-Updates-Step-5Bullet3.jpg\\/v4-460px-Enable-Automatic-Updates-Step-5Bullet3.jpg\",\"bigUrl\":\"\\/images\\/thumb\\/a\\/ad\\/Enable-Automatic-Updates-Step-5Bullet3.jpg\\/aid1480351-v4-728px-Enable-Automatic-Updates-Step-5Bullet3.jpg\",\"smallWidth\":460,\"smallHeight\":345,\"bigWidth\":\"728\",\"bigHeight\":\"546\",\"licensing\":\"<div",
214
+ "id": "5a7986b9233d45b1b194001c74c98af0"
215
+ },
216
+ "3148cb870d2a40009e9d17b0c6056470": {
217
+ "label": true,
218
+ "text": "allow_photos=false, maxitems=7, display_ugc_photos=false, includepause=true, canvas_allcomments_app_instance=6634zxcgfd, includepermalink=false}!!",
219
+ "id": "3148cb870d2a40009e9d17b0c6056470"
220
+ },
221
+ "874e2014361046e6988229dd17674847": {
222
+ "label": true,
223
+ "text": " {\"smallUrl\":\"https:\\/\\/www.wikihow.com\\/images\\/thumb\\/4\\/4c\\/Enable-Automatic-Updates-Step-5Bullet2.jpg\\/v4-460px-Enable-Automatic-Updates-Step-5Bullet2.jpg\",\"bigUrl\":\"\\/images\\/thumb\\/4\\/4c\\/Enable-Automatic-Updates-Step-5Bullet2.jpg\\/aid1480351-v4-728px-Enable-Automatic-Updates-Step-5Bullet2.jpg\",\"smallWidth\":460,\"smallHeight\":345,\"bigWidth\":\"728\",\"bigHeight\":\"546\",\"licensing\":\"<div",
224
+ "id": "874e2014361046e6988229dd17674847"
225
+ },
226
+ "9d5510fa41a24f3aa77ccb3e2f37c366": {
227
+ "label": true,
228
+ "text": " {\"smallUrl\":\"https:\\/\\/www.wikihow.com\\/images\\/thumb\\/e\\/ee\\/Enable-Automatic-Updates-Step-5Bullet4.jpg\\/v4-460px-Enable-Automatic-Updates-Step-5Bullet4.jpg\",\"bigUrl\":\"\\/images\\/thumb\\/e\\/ee\\/Enable-Automatic-Updates-Step-5Bullet4.jpg\\/aid1480351-v4-728px-Enable-Automatic-Updates-Step-5Bullet4.jpg\",\"smallWidth\":460,\"smallHeight\":345,\"bigWidth\":\"728\",\"bigHeight\":\"546\",\"licensing\":\"<div",
229
+ "id": "9d5510fa41a24f3aa77ccb3e2f37c366"
230
+ },
231
+ "4bf8a3da04e040b58b2216f231b23272": {
232
+ "label": false,
233
+ "text": "want to edit the size of the original image. Save the image as a separate copy. Add \"thumbnail\" or something similar to the end of the copy of the image (i.e. weddingphoto_thumbnail.jpg). Use the following steps to create a copy of the image: Click File Click Save As. Type a name for the image next to \"Filename\". Click Save. It's in the upper-left corner above the box labeled \"Image\". It's at",
234
+ "id": "4bf8a3da04e040b58b2216f231b23272"
235
+ },
236
+ "04c7fc7da2a44936accb79f9abc65a94": {
237
+ "label": false,
238
+ "text": "Two-thirds of Americans say they aren\u2019t spending as much as they could each month, according to the poll released Tuesday by Bankrate.com. And they may keep spending down even as they enter the holiday season, says Greg McBride, chief financial analyst for Bankrate.com. \u201cPeople just don\u2019t have a lot of extra money to throw around,\u201d he says.",
239
+ "id": "04c7fc7da2a44936accb79f9abc65a94"
240
+ },
241
+ "46f6088066c24694b5448648160540f1": {
242
+ "label": false,
243
+ "text": "Update 1 : I have discovered problems with logging in and with the display of the CAPTCHA images when using FireFox 3.0.1 browser . These are currently being investigated . Update 2 : The problems noted above seem to be fixed .\n\nGenerate a question about the above context.",
244
+ "id": "46f6088066c24694b5448648160540f1"
245
+ },
246
+ "1d44b65a89214a8eaaa305d18c5d13e9": {
247
+ "label": false,
248
+ "text": "One bright note: Young people are saving more. Millennials were most likely to cite the need to save as their main reason for cutting down spending. Younger consumers, who are also less likely to own a credit card, and may be overwhelmed by student loans, recognize the importance of having emergency savings. \u201cMany of them are building a solid financial foundation,\u201d McBride says.",
249
+ "id": "1d44b65a89214a8eaaa305d18c5d13e9"
250
+ },
251
+ "490ef080fa174d938dba33adc155fd4c": {
252
+ "label": true,
253
+ "text": "of time. {\"smallUrl\":\"https:\\/\\/www.wikihow.com\\/images\\/thumb\\/b\\/bd\\/Enable-Automatic-Updates-Step-5Bullet1.jpg\\/v4-460px-Enable-Automatic-Updates-Step-5Bullet1.jpg\",\"bigUrl\":\"\\/images\\/thumb\\/b\\/bd\\/Enable-Automatic-Updates-Step-5Bullet1.jpg\\/aid1480351-v4-728px-Enable-Automatic-Updates-Step-5Bullet1.jpg\",\"smallWidth\":460,\"smallHeight\":345,\"bigWidth\":\"728\",\"bigHeight\":\"546\",\"licensing\":\"<div",
254
+ "id": "490ef080fa174d938dba33adc155fd4c"
255
+ },
256
+ "8c39537e3827468fa16039966fbb2fec": {
257
+ "label": true,
258
+ "text": "VALUE=\"0\"><PARAM NAME=\"DisplayForeColor\" VALUE=\"16777215\"><PARAM NAME=\"DisplayMode\" VALUE=\"0\"><PARAM NAME=\"DisplaySize\" VALUE=\"4\"><PARAM NAME=\"Enabled\" VALUE=\"-1\"><PARAM NAME=\"EnableContextMenu\" VALUE=\"-1\"><PARAM NAME=\"EnablePositionControls\" VALUE=\"-1\"><PARAM NAME=\"EnableFullScreenControls\" VA!",
259
+ "id": "8c39537e3827468fa16039966fbb2fec"
260
+ },
261
+ "23bdbe40c81e45b487d653f936459bdf": {
262
+ "label": true,
263
+ "text": "<font class=\"smalltext\" color=\"#999966\">11/7 -- HOT JOBS </font><br />\n<font class=\"itext\"> <A HREF=\"http://www.businessweek.com/careers/content/nov2001/ca2001117_9208.htm?c=bwinsidernov09&n=link45&t=email\">\n<B>A Surge of Civic-Mindedness</B></a></font> <br />\n<font class=\"itext\">September 11 and a bad economy have folks flocking to find work in the government ",
264
+ "id": "23bdbe40c81e45b487d653f936459bdf"
265
+ },
266
+ "a2d7ac3aa9054cc193afb92aaaf2db10": {
267
+ "label": true,
268
+ "text": "howGotoBar\" VALUE=\"0\"><PARAM NAME=\"ShowPositionControls\" VALUE=\"-1\"><PARAM NAME=\"ShowStatusBar\" VALUE=\"0\"><PARAM NAME=\"ShowTracker\" VALUE=\"-1\"><PARAM NAME=\"TransparentAtStart\" VALUE=\"0\"><PARAM NAME=\"VideoBorderWidth\" VALUE=\"0\"><PARAM NAME=\"VideoBorderColor\" VALUE=\"0\"><PARAM NAME=\"VideoBorder3D\" VALUE=\"0\"><PARAM NAME=\"Volume\" VALUE=\"-260\"><PARAM NAME=\"WindowlessVideo\" VALUE=\"0\">",
269
+ "id": "a2d7ac3aa9054cc193afb92aaaf2db10"
270
+ },
271
+ "064bcd09ae674b3a93a8a28b2f2d776a": {
272
+ "label": true,
273
+ "text": "NAME=\"SendMouseClickEvents\" VALUE=\"0\"><PARAM NAME=\"SendMouseMoveEvents\" VALUE=\"0\"><PARAM NAME=\"SendPlayStateChangeEvents\" VALUE=\"-1\"><PARAM NAME=\"ShowCaptioning\" VALUE=\"0\"><PARAM NAME=\"ShowControls\" VALUE=\"0\"><PARAM NAME=\"ShowAudioControls\" VALUE=\"-1\"><PARAM NAME=\"ShowDisplay\" VALUE=\"0\"><PARAM NAME=\"S!",
274
+ "id": "064bcd09ae674b3a93a8a28b2f2d776a"
275
+ },
276
+ "3964174632e949bba35991a1ad18cd66": {
277
+ "label": false,
278
+ "text": "1) Through Terminal Server (the one that opens up a desktop within a desktop and it actually appears that you have two Start buttons). \nor\n2) By opening the Stack Manager directly from your \"native\" desktop.\n\nPlease reply to this message with either a corresponding \"1\" or a \"2\" in the subject field.",
279
+ "id": "3964174632e949bba35991a1ad18cd66"
280
+ },
281
+ "f79e3b6f46444f05ac04fc2661f2fc23": {
282
+ "label": true,
283
+ "text": "```\nimport React, { useState } from \"react\";\n\nfunction Ball() {\n const [position, setPosition] = useState({ x: 0, y: 0 });\n\n const handleMouseMove = (event) => {\n setPosition({ x: event.clientX, y: event.clientY });\n };",
284
+ "id": "f79e3b6f46444f05ac04fc2661f2fc23"
285
+ },
286
+ "2e71a1fa193a42e0b04c399666939997": {
287
+ "label": true,
288
+ "text": "7. Por \u00faltimo, itera a trav\u00e9s de los resultados de b\u00fasqueda y muestra el nombre del canal y una miniatura para cada uno:\n\nreturn (\n <div>\n <form>\n <input type=\"text\" value={query} onChange={handleInputChange} />\n <button type=\"button\" onClick={searchTwitch}>Search</button>\n </form>",
289
+ "id": "2e71a1fa193a42e0b04c399666939997"
290
+ },
291
+ "3c5fdf9f6a194342a83f59127257ad56": {
292
+ "label": false,
293
+ "text": "the Eastern Roman Empire, or the Byzantine Empire, would continue to survive and even flourish for many centuries, the fall of the Western Roman Empire in 476 CE marked a dramatic turning point in world history. The legacy of the Roman Empire endures in the many aspects of modern society that have been influenced by its achievements, from language and law to art and architecture. The Latin",
294
+ "id": "3c5fdf9f6a194342a83f59127257ad56"
295
+ },
296
+ "60a6019fa6da495aa395d80568867fdd": {
297
+ "label": true,
298
+ "text": "import React, { useState } from \"react\";\n\nfunction BallFollowingMouse() {\n const [position, setPosition] = useState({ x: 0, y: 0 });\n\n function handleMouseMove(event) {\n setPosition({ x: event.clientX, y: event.clientY });\n }",
299
+ "id": "60a6019fa6da495aa395d80568867fdd"
300
+ },
301
+ "24a37a8d1bde4548807c27fa75a611c2": {
302
+ "label": true,
303
+ "text": "```js\nwhile (condicion) {\n // sentencia(s);\n}\n```",
304
+ "id": "24a37a8d1bde4548807c27fa75a611c2"
305
+ },
306
+ "dd489a1ec0d0444f894c57d2f0f30fc1": {
307
+ "label": true,
308
+ "text": "```\nconst btn = document.getElementById(\"showFormBtn\");\nconst form = document.getElementById(\"form\");\n\nbtn.addEventListener(\"click\", function() {\n if (form.style.display === \"block\") {\n form.style.display = \"none\";\n } else {\n form.style.display = \"block\";\n form.elements[0].focus();\n }\n});\n```",
309
+ "id": "dd489a1ec0d0444f894c57d2f0f30fc1"
310
+ },
311
+ "1fd93665d3ea44fca509521c179ac54b": {
312
+ "label": false,
313
+ "text": "\u041f\u043e\u0441\u043b\u0435 \u0443\u0441\u0442\u0430\u043d\u043e\u0432\u043a\u0438 \u0431\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0438 \u0432\u044b \u043c\u043e\u0436\u0435\u0442\u0435 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0435\u0435 \u0432 \u0441\u0432\u043e\u0435\u043c \u043f\u0440\u043e\u0435\u043a\u0442\u0435. \u0411\u0438\u0431\u043b\u0438\u043e\u0442\u0435\u043a\u0430 FFT.js \u043f\u0440\u0435\u0434\u043b\u0430\u0433\u0430\u0435\u0442 \u043d\u0430\u0431\u043e\u0440 \u0444\u0443\u043d\u043a\u0446\u0438\u0439, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u043c\u043e\u0436\u043d\u043e \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0434\u043b\u044f \u0432\u044b\u043f\u043e\u043b\u043d\u0435\u043d\u0438\u044f \u0424\u0443\u0440\u044c\u0435-\u0430\u043d\u0430\u043b\u0438\u0437\u0430 \u043d\u0430 \u043e\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u043d\u043e\u043c \u043d\u0430\u0431\u043e\u0440\u0435 \u0434\u0430\u043d\u043d\u044b\u0445. \u0424\u0443\u043d\u043a\u0446\u0438\u0438 fft \u0438 ifft - \u044d\u0442\u043e \u0434\u0432\u0435 \u043a\u043b\u044e\u0447\u0435\u0432\u044b\u0435 \u0444\u0443\u043d\u043a\u0446\u0438\u0438, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u0432\u0430\u043c \u043d\u0443\u0436\u043d\u043e \u0431\u0443\u0434\u0435\u0442 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c. \u041d\u0430 \u0432\u0445\u043e\u0434 \u0444\u0443\u043d\u043a\u0446\u0438\u0438 fft \u043f\u0435\u0440\u0435\u0434\u0430\u0435\u0442\u0441\u044f \u043c\u0430\u0441\u0441\u0438\u0432 \u043a\u043e\u043c\u043f\u043b\u0435\u043a\u0441\u043d\u044b\u0445 \u0447\u0438\u0441\u0435\u043b, \u0438 \u043e\u043d\u0430 \u0432\u043e\u0437\u0432\u0440\u0430\u0449\u0430\u0435\u0442 \u043c\u0430\u0441\u0441\u0438\u0432 \u043a\u043e\u043c\u043f\u043b\u0435\u043a\u0441\u043d\u044b\u0445 \u0447\u0438\u0441\u0435\u043b, \u043a\u043e\u0442\u043e\u0440\u044b\u0439",
314
+ "id": "1fd93665d3ea44fca509521c179ac54b"
315
+ },
316
+ "0362a26ed9024f6d9fb39dfa39e113b0": {
317
+ "label": true,
318
+ "text": "const calculadora=document.getElementById('calculadora')const resultado=document.getElementById('resultado') calculadora.addEventListener('click',a\u00f1adirNumeros) let operaciones=[] function",
319
+ "id": "0362a26ed9024f6d9fb39dfa39e113b0"
320
+ },
321
+ "20d9a45df83e4ad1a092ea00e4d2204a": {
322
+ "label": true,
323
+ "text": ".calculator-keys button[value=\"calculate\"] {\n grid-column: 3/5;\n}\nAgregue la funcionalidad de la calculadora utilizando JavaScript. Debe agregar un controlador de eventos para cada bot\u00f3n de la calculadora y escribir la l\u00f3gica para realizar los c\u00e1lculos. Por ejemplo:\nvbnet",
324
+ "id": "20d9a45df83e4ad1a092ea00e4d2204a"
325
+ },
326
+ "dff7dec5c01d4fc185f633dbe3c32b60": {
327
+ "label": true,
328
+ "text": "const calculator = document.querySelector('.calculator');\nconst keys = calculator.querySelector('.calculator-keys');\nconst screen = calculator.querySelector('.calculator-screen');\n\nkeys.addEventListener('click', event => {\n if (!event.target.matches('button')) {\n return;\n }\n\n const key = event.target;\n const keyValue = key.value;\n const displayValue = screen.textContent;",
329
+ "id": "dff7dec5c01d4fc185f633dbe3c32b60"
330
+ },
331
+ "53d4066c6d704bd79f2d531a1f9562f4": {
332
+ "label": true,
333
+ "text": "\n if (!phoneRegex.test(phoneValue)) {return inputError(\"phone\")}\n\n if (passwordValue !== passwordConfirmValue) {\n inputError(\"password\");\n return inputError(\"confirm-password\");\n }\n\n\n});\n\nfunction inputError(inputName) {\n form.elements[inputName].style.border = \"1px solid red\";\n}\n```",
334
+ "id": "53d4066c6d704bd79f2d531a1f9562f4"
335
+ },
336
+ "59f75053fff04c879d1aacfb7bd8b51b": {
337
+ "label": true,
338
+ "text": "const App = () => {\n const [elements1, setElements1] = useState([\n { id: 1, text: 'Element 1' },\n { id: 2, text: 'Element 2' },\n { id: 3, text: 'Element 3' },\n ]);\n const [elements2, setElements2] = useState([\n { id: 4, text: 'Element 4' },\n { id: 5, text: 'Element 5' },\n { id: 6, text: 'Element 6' },\n ]);",
339
+ "id": "59f75053fff04c879d1aacfb7bd8b51b"
340
+ },
341
+ "cda55f47106b4a40b49bf9b8722cd7f1": {
342
+ "label": true,
343
+ "text": "```js\nfetch(\"/robots.txt\")\n .then(response => {\n return response.text(); // Devuelve una promesa\n })\n .then(data => {\n console.log(data);\n })\n .catch(error => { /* C\u00f3digo a realizar cuando se rechaza la promesa */ });\n```",
344
+ "id": "cda55f47106b4a40b49bf9b8722cd7f1"
345
+ },
346
+ "397f7180a5af495a874646051f8bc248": {
347
+ "label": true,
348
+ "text": " const handleButtonClick = async () => {\n try {\n const response = await axios.get(`https://example-api.com/${inputValue}`);\n setResult(response.data);\n } catch (error) {\n console.error(error);\n }\n };",
349
+ "id": "397f7180a5af495a874646051f8bc248"
350
+ },
351
+ "4439280d86e644da8c5a10277ba7777a": {
352
+ "label": true,
353
+ "text": " client.send(message);\n }\n });\n});\n```",
354
+ "id": "4439280d86e644da8c5a10277ba7777a"
355
+ },
356
+ "e54d850ca2cb41c5a869936baad2163f": {
357
+ "label": true,
358
+ "text": "```javascript\nconst axios = require('axios');\nconst cheerio = require('cheerio');\n\nconst getData = async () => {\n const response = await axios.get('https://www.realestate.com.au/rent/in-sydney,+nsw/list-1');\n const $ = cheerio.load(response.data);\n \n const properties = [];\n\n $('.listing-result').each((i, el) => {\n const property = {};",
359
+ "id": "e54d850ca2cb41c5a869936baad2163f"
360
+ },
361
+ "ae64416d43bc4a61b2783d1b7067b9de": {
362
+ "label": true,
363
+ "text": "const App = () => {\n const [list1, setList1] = useState([\n { id: 1, text: \"Item 1\" },\n { id: 2, text: \"Item 2\" },\n { id: 3, text: \"Item 3\" }\n ]);\n const [list2, setList2] = useState([\n { id: 4, text: \"Item 4\" },\n { id: 5, text: \"Item 5\" }\n ]);\n\n const onDragStart = (event, source) => {\n event.dataTransfer.setData(\"source\", source);\n };",
364
+ "id": "ae64416d43bc4a61b2783d1b7067b9de"
365
+ },
366
+ "5b6b91512f014660bc48d8527b3ca226": {
367
+ "label": true,
368
+ "text": "```javascript\nimport React, { useState } from 'react';\n\nconst App = () => {\n const [items1, setItems1] = useState(['Item 1', 'Item 2', 'Item 3']);\n const [items2, setItems2] = useState([]);",
369
+ "id": "5b6b91512f014660bc48d8527b3ca226"
370
+ },
371
+ "c751dfc325414603a16ac65e2fa62fd7": {
372
+ "label": false,
373
+ "text": "Detailed Instructions: In this task, you are given an input list. A list contains several comma-separated items written within brackets. You need to return the position of all the alphabetical elements in the given list in order. Assume the position of the 1st element to be 1. Return -1 if no alphabetical element is in the list.\nQ: ['g', '7171', 'v', 'i', 'f', 'c']\nA:",
374
+ "id": "c751dfc325414603a16ac65e2fa62fd7"
375
+ },
376
+ "0310318744f1425e88cdd053773681ca": {
377
+ "label": false,
378
+ "text": "In this task, you are given an input list. A list contains several comma-separated items written within brackets. You need to return the position of all the alphabetical elements in the given list in order. Assume the position of the 1st element to be 1. Return -1 if no alphabetical element is in the list.",
379
+ "id": "0310318744f1425e88cdd053773681ca"
380
+ },
381
+ "8d6eb8f1aac745359540c9f454aa9a58": {
382
+ "label": false,
383
+ "text": "not how you reconcile with your estranged wife,\u201d wrote Vulture.",
384
+ "id": "8d6eb8f1aac745359540c9f454aa9a58"
385
+ }
386
+ },
387
+ "version": 62,
388
+ "description": "Source code for a programming language."
389
+ }
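For reference, each record in this concept file pairs a raw `text` example with a boolean `label` and its own `id`, under a shared `version` and `description`. A minimal sketch of inspecting the file; the top-level `data` key holding the example map is an assumption, since only the tail of the file is visible above:

```python
import json

# Load the concept file added in this commit. The 'data' key is an assumption:
# only the tail of the file (version/description) is shown in the diff above.
with open('lilac/concepts/source-code/concept.json') as f:
  concept = json.load(f)

examples = concept['data']
positives = [ex['text'] for ex in examples.values() if ex['label']]
negatives = [ex['text'] for ex in examples.values() if not ex['label']]
print(f"v{concept['version']}: {len(positives)} positive / {len(negatives)} negative")
```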
lilac/concepts/toxicity/concept.json ADDED
The diff for this file is too large to render. See raw diff
 
lilac/config.py ADDED
@@ -0,0 +1,268 @@
1
+ """Configurations for a dataset run."""
2
+
3
+ import json
4
+ import pathlib
5
+ from typing import TYPE_CHECKING, Any, Optional, Union
6
+
7
+ import yaml
8
+
9
+ if TYPE_CHECKING:
10
+ from pydantic.typing import AbstractSetIntStr, MappingIntStrAny
11
+
12
+ from pydantic import BaseModel, Extra, ValidationError, validator
13
+
14
+ from .schema import Path, PathTuple, normalize_path
15
+ from .signal import Signal, TextEmbeddingSignal, get_signal_by_type, resolve_signal
16
+ from .sources.source import Source
17
+ from .sources.source_registry import resolve_source
18
+
19
+ CONFIG_FILENAME = 'config.yml'
20
+
21
+
22
+ def _serializable_path(path: PathTuple) -> Union[str, list]:
23
+ if len(path) == 1:
24
+ return path[0]
25
+ return list(path)
26
+
27
+
28
+ class SignalConfig(BaseModel):
29
+ """Configures a signal on a source path."""
30
+ path: PathTuple
31
+ signal: Signal
32
+
33
+ class Config:
34
+ extra = Extra.forbid
35
+
36
+ @validator('path', pre=True)
37
+ def parse_path(cls, path: Path) -> PathTuple:
38
+ """Parse a path."""
39
+ return normalize_path(path)
40
+
41
+ @validator('signal', pre=True)
42
+ def parse_signal(cls, signal: dict) -> Signal:
43
+ """Parse a signal to its specific subclass instance."""
44
+ return resolve_signal(signal)
45
+
46
+ def dict(
47
+ self,
48
+ *,
49
+ include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
50
+ exclude: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
51
+ by_alias: bool = False,
52
+ skip_defaults: Optional[bool] = None,
53
+ exclude_unset: bool = False,
54
+ exclude_defaults: bool = False,
55
+ exclude_none: bool = False,
56
+ ) -> dict[str, Any]:
57
+ """Override the default dict method to simplify the path tuples.
58
+
59
+ This is required to remove the python-specific tuple dump in the yaml file.
60
+ """
61
+ res = super().dict(
62
+ include=include,
63
+ exclude=exclude,
64
+ by_alias=by_alias,
65
+ skip_defaults=skip_defaults,
66
+ exclude_unset=exclude_unset,
67
+ exclude_defaults=exclude_defaults,
68
+ exclude_none=exclude_none)
69
+ res['path'] = _serializable_path(res['path'])
70
+ return res
71
+
72
+
73
+ class EmbeddingConfig(BaseModel):
74
+ """Configures an embedding on a source path."""
75
+ path: PathTuple
76
+ embedding: str
77
+
78
+ class Config:
79
+ extra = Extra.forbid
80
+
81
+ def dict(
82
+ self,
83
+ *,
84
+ include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
85
+ exclude: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
86
+ by_alias: bool = False,
87
+ skip_defaults: Optional[bool] = None,
88
+ exclude_unset: bool = False,
89
+ exclude_defaults: bool = False,
90
+ exclude_none: bool = False,
91
+ ) -> dict[str, Any]:
92
+ """Override the default dict method to simplify the path tuples.
93
+
94
+ This is required to remove the python-specific tuple dump in the yaml file.
95
+ """
96
+ res = super().dict(
97
+ include=include,
98
+ exclude=exclude,
99
+ by_alias=by_alias,
100
+ skip_defaults=skip_defaults,
101
+ exclude_unset=exclude_unset,
102
+ exclude_defaults=exclude_defaults,
103
+ exclude_none=exclude_none)
104
+ res['path'] = _serializable_path(res['path'])
105
+ return res
106
+
107
+ @validator('path', pre=True)
108
+ def parse_path(cls, path: Path) -> PathTuple:
109
+ """Parse a path."""
110
+ return normalize_path(path)
111
+
112
+ @validator('embedding', pre=True)
113
+ def validate_embedding(cls, embedding: str) -> str:
114
+ """Validate the embedding is registered."""
115
+ get_signal_by_type(embedding, TextEmbeddingSignal)
116
+ return embedding
117
+
118
+
119
+ class DatasetUISettings(BaseModel):
120
+ """The UI persistent settings for a dataset."""
121
+ media_paths: list[PathTuple] = []
122
+ markdown_paths: list[PathTuple] = []
123
+
124
+ class Config:
125
+ extra = Extra.forbid
126
+
127
+ @validator('media_paths', pre=True)
128
+ def parse_media_paths(cls, media_paths: list) -> list:
129
+ """Parse a path, ensuring it is a tuple."""
130
+ return [normalize_path(path) for path in media_paths]
131
+
132
+ def dict(
133
+ self,
134
+ *,
135
+ include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
136
+ exclude: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
137
+ by_alias: bool = False,
138
+ skip_defaults: Optional[bool] = None,
139
+ exclude_unset: bool = False,
140
+ exclude_defaults: bool = False,
141
+ exclude_none: bool = False,
142
+ ) -> dict[str, Any]:
143
+ """Override the default dict method to simplify the path tuples.
144
+
145
+ This is required to remove the python-specific tuple dump in the yaml file.
146
+ """
147
+ # TODO(nsthorat): Migrate this to @field_serializer when we upgrade to pydantic v2.
148
+ res = super().dict(
149
+ include=include,
150
+ exclude=exclude,
151
+ by_alias=by_alias,
152
+ skip_defaults=skip_defaults,
153
+ exclude_unset=exclude_unset,
154
+ exclude_defaults=exclude_defaults,
155
+ exclude_none=exclude_none)
156
+ if 'media_paths' in res:
157
+ res['media_paths'] = [_serializable_path(path) for path in res['media_paths']]
158
+ if 'markdown_paths' in res:
159
+ res['markdown_paths'] = [_serializable_path(path) for path in res['markdown_paths']]
160
+ return res
161
+
162
+
163
+ class DatasetSettings(BaseModel):
164
+ """The persistent settings for a dataset."""
165
+ ui: Optional[DatasetUISettings] = None
166
+ preferred_embedding: Optional[str] = None
167
+
168
+ class Config:
169
+ extra = Extra.forbid
170
+
171
+
172
+ class DatasetConfig(BaseModel):
173
+ """Configures a dataset with a source and transformations."""
174
+ # The namespace and name of the dataset.
175
+ namespace: str
176
+ name: str
177
+ # Tags to organize datasets.
178
+ tags: list[str] = []
179
+
180
+ # The source configuration.
181
+ source: Source
182
+
183
+ # Model configuration: embeddings and signals on paths.
184
+ embeddings: list[EmbeddingConfig] = []
185
+ # When defined, uses this list of signals instead of running all signals.
186
+ signals: list[SignalConfig] = []
187
+
188
+ # Dataset settings, default embeddings and UI settings like media paths.
189
+ settings: Optional[DatasetSettings] = None
190
+
191
+ class Config:
192
+ extra = Extra.forbid
193
+
194
+ @validator('source', pre=True)
195
+ def parse_source(cls, source: dict) -> Source:
196
+ """Parse a source to its specific subclass instance."""
197
+ return resolve_source(source)
198
+
199
+
200
+ class Config(BaseModel):
201
+ """Configures a set of datasets for a lilac instance."""
202
+ datasets: list[DatasetConfig]
203
+
204
+ # When defined, uses this list of signals to run over every dataset, over all media paths, unless
205
+ # signals is overridden by a specific dataset.
206
+ signals: list[Signal] = []
207
+
208
+ # A list of embeddings to compute the model caches for, for all concepts.
209
+ concept_model_cache_embeddings: list[str] = []
210
+
211
+ class Config:
212
+ extra = Extra.forbid
213
+
214
+ @validator('signals', pre=True)
215
+ def parse_signal(cls, signals: list[dict]) -> list[Signal]:
216
+ """Parse a list of signals to their specific subclass instances."""
217
+ return [resolve_signal(signal) for signal in signals]
218
+
219
+
220
+ def read_config(config_path: str) -> Config:
221
+ """Reads a config file.
222
+
223
+ The config file can either be a `Config` or a `DatasetConfig`.
224
+
225
+ The result is always a `Config` object. If the input is a `DatasetConfig`, the returned
226
+ config will contain just that single dataset.
227
+ """
228
+ config_ext = pathlib.Path(config_path).suffix
229
+ if config_ext in ['.yml', '.yaml']:
230
+ with open(config_path, 'r') as f:
231
+ config_dict = yaml.safe_load(f)
232
+ elif config_ext in ['.json']:
233
+ with open(config_path, 'r') as f:
234
+ config_dict = json.load(f)
235
+ else:
236
+ raise ValueError(f'Unsupported config file extension: {config_ext}')
237
+
238
+ config: Optional[Config] = None
239
+ is_config = True
240
+ try:
241
+ config = Config(**config_dict)
242
+ except ValidationError:
243
+ is_config = False
244
+
245
+ if not is_config:
246
+ try:
247
+ dataset_config = DatasetConfig(**config_dict)
248
+ config = Config(datasets=[dataset_config])
249
+ except ValidationError as error:
250
+ raise ValidationError(
251
+ 'Config is not a valid `Config` or `DatasetConfig`', model=DatasetConfig) from error
252
+ assert config is not None
253
+
254
+ return config
255
+
256
+
257
+ class LilacHuggingFaceDataset(BaseModel):
258
+ """A huggingface dataset that powers the demo."""
259
+ hf_dataset_repo_id: str
260
+ lilac_namespace: str
261
+ lilac_name: str
262
+
263
+
264
+ class DemoConfig(BaseModel):
265
+ """Configures a hosted demo."""
266
+
267
+ # A list of huggingface dataset repositories that power the demo.
268
+ lilac_hf_datasets: list[LilacHuggingFaceDataset] = []
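`read_config` above accepts either a full `Config` or a bare `DatasetConfig`, in YAML or JSON. A hedged sketch of the YAML shape it expects; the `huggingface` source name, its `dataset_name` field, and the `source_name`/`embedding` keys are illustrative assumptions based on how `resolve_source` and the embedding validator dispatch on registered names:

```python
import yaml

# Illustrative config only; the registered source and embedding names are
# assumptions, not taken from this file.
config_yaml = """
datasets:
  - namespace: local
    name: glue
    source:
      source_name: huggingface
      dataset_name: glue
    embeddings:
      - path: text
        embedding: gte-small
"""

config_dict = yaml.safe_load(config_yaml)
# read_config() parses the same structure; a bare DatasetConfig would be
# wrapped into Config(datasets=[...]) as the function above shows.
print(config_dict['datasets'][0]['name'])  # -> glue
```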
lilac/conftest.py ADDED
@@ -0,0 +1,28 @@
1
+ """Fixtures for dataset tests."""
2
+ import os
3
+ import pathlib
4
+ from typing import Generator, Optional, Type
5
+
6
+ import pytest
7
+ from pytest_mock import MockerFixture
8
+
9
+ from .data.dataset import Dataset
10
+ from .data.dataset_duckdb import DatasetDuckDB
11
+ from .data.dataset_test_utils import make_dataset
12
+ from .db_manager import set_default_dataset_cls
13
+ from .schema import Item, Schema
14
+
15
+
16
+ @pytest.fixture(scope='function', params=[DatasetDuckDB])
17
+ def make_test_data(tmp_path: pathlib.Path, mocker: MockerFixture,
18
+ request: pytest.FixtureRequest) -> Generator:
19
+ """A pytest fixture for creating temporary test datasets."""
20
+ mocker.patch.dict(os.environ, {'LILAC_DATA_PATH': str(tmp_path)})
21
+ dataset_cls: Type[Dataset] = request.param
22
+ set_default_dataset_cls(dataset_cls)
23
+
24
+ def _make_test_data(items: list[Item], schema: Optional[Schema] = None) -> Dataset:
25
+ return make_dataset(dataset_cls, tmp_path, items, schema)
26
+
27
+ # Return the factory for datasets that test methods can use.
28
+ yield _make_test_data
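A minimal sketch of a test consuming the `make_test_data` fixture above; the `text` field and items are illustrative:

```python
from typing import Callable

from lilac.data.dataset import Dataset


def test_two_rows(make_test_data: Callable[..., Dataset]) -> None:
  """Each call to the fixture factory builds a temporary DuckDB-backed dataset."""
  dataset = make_test_data([{'text': 'hello'}, {'text': 'world'}])
  rows = list(dataset.select_rows())
  assert len(rows) == 2
```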
lilac/data/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ from .dataset import (
2
+ BinaryOp,
3
+ Column,
4
+ ConceptSearch,
5
+ Dataset,
6
+ Filter,
7
+ FilterLike,
8
+ KeywordSearch,
9
+ ListOp,
10
+ SemanticSearch,
11
+ UnaryOp,
12
+ )
13
+
14
+ __all__ = [
15
+ 'Column',
16
+ 'KeywordSearch',
17
+ 'ConceptSearch',
18
+ 'SemanticSearch',
19
+ 'Filter',
20
+ 'UnaryOp',
21
+ 'BinaryOp',
22
+ 'ListOp',
23
+ 'Dataset',
24
+ 'FilterLike',
25
+ ]
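These re-exports make `lilac.data` the public query surface; a short sketch of building query objects from it (paths and values are illustrative):

```python
from lilac.data import Column, Filter, KeywordSearch

# An aliased column, a keyword search, and an equality filter.
col = Column(('text',), alias='doc')
search = KeywordSearch(path=('text',), query='canvas')
flt = Filter(path=('label',), op='equals', value=True)
```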
lilac/data/dataset.py ADDED
@@ -0,0 +1,510 @@
1
+ """The interface for the database."""
2
+ from __future__ import annotations
3
+
4
+ import abc
5
+ import enum
6
+ import pathlib
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from datetime import datetime
9
+ from typing import Any, Iterator, Literal, Optional, Sequence, Union
10
+
11
+ import pandas as pd
12
+ from pydantic import (
13
+ BaseModel,
14
+ StrictBool,
15
+ StrictBytes,
16
+ StrictFloat,
17
+ StrictInt,
18
+ StrictStr,
19
+ validator,
20
+ )
21
+ from typing_extensions import TypeAlias
22
+
23
+ from lilac.signals.concept_scorer import ConceptSignal
24
+
25
+ from ..auth import UserInfo
26
+ from ..config import DatasetConfig, DatasetSettings, DatasetUISettings
27
+ from ..schema import (
28
+ PATH_WILDCARD,
29
+ ROWID,
30
+ VALUE_KEY,
31
+ Bin,
32
+ DataType,
33
+ Path,
34
+ PathTuple,
35
+ Schema,
36
+ normalize_path,
37
+ )
38
+ from ..signal import Signal, TextEmbeddingSignal, get_signal_by_type, resolve_signal
39
+ from ..tasks import TaskStepId
40
+
41
+ # Threshold for rejecting certain queries (e.g. group by) for columns with large cardinality.
42
+ TOO_MANY_DISTINCT = 500_000
43
+ SAMPLE_AVG_TEXT_LENGTH = 1000
44
+ MAX_TEXT_LEN_DISTINCT_COUNT = 250
45
+
46
+
47
+ class SelectRowsResult:
48
+ """The result of a select rows query."""
49
+
50
+ def __init__(self, df: pd.DataFrame, total_num_rows: int) -> None:
51
+ """Initialize the result."""
52
+ self._df = df
53
+ self.total_num_rows = total_num_rows
54
+
55
+ def __iter__(self) -> Iterator:
56
+ return (row.to_dict() for _, row in self._df.iterrows())
57
+
58
+ def df(self) -> pd.DataFrame:
59
+ """Convert the result to a pandas DataFrame."""
60
+ return self._df
61
+
62
+
63
+ class StatsResult(BaseModel):
64
+ """The result of a stats() query."""
65
+ path: PathTuple
66
+ # The number of leaf values.
67
+ total_count: int
68
+ # The approximate number of distinct leaf values.
69
+ approx_count_distinct: int
70
+
71
+ # Defined for ordinal features.
72
+ min_val: Optional[Union[float, datetime]] = None
73
+ max_val: Optional[Union[float, datetime]] = None
74
+
75
+ # Defined for text features.
76
+ avg_text_length: Optional[float] = None
77
+
78
+
79
+ class MediaResult(BaseModel):
80
+ """The result of a media() query."""
81
+ data: bytes
82
+
83
+
84
+ BinaryOp = Literal['equals', 'not_equal', 'greater', 'greater_equal', 'less', 'less_equal']
85
+ UnaryOp = Literal['exists']
86
+ ListOp = Literal['in']
87
+
88
+ BINARY_OPS = set(['equals', 'not_equal', 'greater', 'greater_equal', 'less', 'less_equal'])
89
+ UNARY_OPS = set(['exists'])
90
+ LIST_OPS = set(['in'])
91
+
92
+ SearchType = Union[Literal['keyword'], Literal['semantic'], Literal['concept']]
93
+
94
+
95
+ class SortOrder(str, enum.Enum):
96
+ """The sort order for a database query."""
97
+ DESC = 'DESC'
98
+ ASC = 'ASC'
99
+
100
+
101
+ class GroupsSortBy(str, enum.Enum):
102
+ """The sort for groups queries.
103
+
104
+ Either "count", which sorts by the count of each feature value, or "value", which sorts
105
+ by the feature value itself.
106
+ """
107
+ COUNT = 'count'
108
+ VALUE = 'value'
109
+
110
+
111
+ class SortResult(BaseModel):
112
+ """The information about what is sorted after combining searches and explicit sorts."""
113
+ # The column that was sorted.
114
+ path: PathTuple
115
+ # The sort order.
116
+ order: SortOrder
117
+ # The alias of the column if it was aliased.
118
+ alias: Optional[str] = None
119
+ # The search index if the sort is by a search.
120
+ search_index: Optional[int] = None
121
+
122
+
123
+ class SearchResultInfo(BaseModel):
124
+ """The resulting sort order returned by the select rows schema."""
125
+ # The input path to the search.
126
+ search_path: PathTuple
127
+ # The resulting column that was searched.
128
+ result_path: PathTuple
129
+ # The alias of the UDF.
130
+ alias: Optional[str] = None
131
+
132
+
133
+ class SelectRowsSchemaUDF(BaseModel):
134
+ """The UDF for a select rows schema query."""
135
+ path: PathTuple
136
+ alias: Optional[str] = None
137
+
138
+
139
+ class SelectRowsSchemaResult(BaseModel):
140
+ """The result of a select rows schema query."""
141
+ data_schema: Schema
142
+ udfs: list[SelectRowsSchemaUDF] = []
143
+ search_results: list[SearchResultInfo] = []
144
+ sorts: Optional[list[SortResult]] = None
145
+
146
+
147
+ class Column(BaseModel):
148
+ """A column in the dataset."""
149
+ path: PathTuple
150
+ alias: Optional[str] = None # This is the renamed column during querying and response.
151
+
152
+ # Defined when the feature is another column.
153
+ signal_udf: Optional[Signal] = None
154
+
155
+ class Config:
156
+ smart_union = True
157
+
158
+ def __init__(self,
159
+ path: Path,
160
+ alias: Optional[str] = None,
161
+ signal_udf: Optional[Signal] = None,
162
+ **kwargs: Any):
163
+ """Initialize a column. We override __init__ to allow positional arguments for brevity."""
164
+ super().__init__(path=normalize_path(path), alias=alias, signal_udf=signal_udf, **kwargs)
165
+
166
+ @validator('signal_udf', pre=True)
167
+ def parse_signal_udf(cls, signal_udf: Optional[dict]) -> Optional[Signal]:
168
+ """Parse a signal to its specific subclass instance."""
169
+ if not signal_udf:
170
+ return None
171
+ return resolve_signal(signal_udf)
172
+
173
+
174
+ ColumnId = Union[Path, Column]
175
+
176
+
177
+ class DatasetManifest(BaseModel):
178
+ """The manifest for a dataset."""
179
+ namespace: str
180
+ dataset_name: str
181
+ data_schema: Schema
182
+ # Number of items in the dataset.
183
+ num_items: int
184
+
185
+
186
+ def column_from_identifier(column: ColumnId) -> Column:
187
+ """Create a column from a column identifier."""
188
+ if isinstance(column, Column):
189
+ return column.copy()
190
+ return Column(path=column)
191
+
192
+
193
+ FeatureValue = Union[StrictInt, StrictFloat, StrictBool, StrictStr, StrictBytes, datetime]
194
+ FeatureListValue = list[StrictStr]
195
+ BinaryFilterTuple = tuple[Path, BinaryOp, FeatureValue]
196
+ ListFilterTuple = tuple[Path, ListOp, FeatureListValue]
197
+ UnaryFilterTuple = tuple[Path, UnaryOp]
198
+
199
+ FilterOp = Union[BinaryOp, UnaryOp, ListOp]
200
+
201
+
202
+ class SelectGroupsResult(BaseModel):
203
+ """The result of a select groups query."""
204
+ too_many_distinct: bool
205
+ counts: list[tuple[Optional[FeatureValue], int]]
206
+ bins: Optional[list[Bin]] = None
207
+
208
+
209
+ class Filter(BaseModel):
210
+ """A filter on a column."""
211
+ path: PathTuple
212
+ op: FilterOp
213
+ value: Optional[Union[FeatureValue, FeatureListValue]] = None
214
+
215
+
216
+ FilterLike: TypeAlias = Union[Filter, BinaryFilterTuple, UnaryFilterTuple, ListFilterTuple]
217
+
218
+ SearchValue = StrictStr
219
+
220
+
221
+ class KeywordSearch(BaseModel):
222
+ """A keyword search query on a column."""
223
+ path: Path
224
+ query: SearchValue
225
+ type: Literal['keyword'] = 'keyword'
226
+
227
+
228
+ class SemanticSearch(BaseModel):
229
+ """A semantic search on a column."""
230
+ path: Path
231
+ query: SearchValue
232
+ embedding: str
233
+ type: Literal['semantic'] = 'semantic'
234
+
235
+
236
+ class ConceptSearch(BaseModel):
237
+ """A concept search query on a column."""
238
+ path: Path
239
+ concept_namespace: str
240
+ concept_name: str
241
+ embedding: str
242
+ type: Literal['concept'] = 'concept'
243
+
244
+
245
+ Search = Union[ConceptSearch, SemanticSearch, KeywordSearch]
246
+
247
+
248
+ class Dataset(abc.ABC):
249
+ """The database implementation to query a dataset."""
250
+
251
+ namespace: str
252
+ dataset_name: str
253
+
254
+ def __init__(self, namespace: str, dataset_name: str):
255
+ """Initialize a dataset.
256
+
257
+ Args:
258
+ namespace: The dataset namespace.
259
+ dataset_name: The dataset name.
260
+ """
261
+ self.namespace = namespace
262
+ self.dataset_name = dataset_name
263
+
264
+ @abc.abstractmethod
265
+ def delete(self) -> None:
266
+ """Deletes the dataset."""
267
+ pass
268
+
269
+ @abc.abstractmethod
270
+ def manifest(self) -> DatasetManifest:
271
+ """Return the manifest for the dataset."""
272
+ pass
273
+
274
+ @abc.abstractmethod
275
+ def config(self) -> DatasetConfig:
276
+ """Return the dataset config for this dataset."""
277
+ pass
278
+
279
+ @abc.abstractmethod
280
+ def settings(self) -> DatasetSettings:
281
+ """Return the persistent settings for the dataset."""
282
+ pass
283
+
284
+ @abc.abstractmethod
285
+ def update_settings(self, settings: DatasetSettings) -> None:
286
+ """Update the settings for the dataset."""
287
+ pass
288
+
289
+ @abc.abstractmethod
290
+ def compute_signal(self,
291
+ signal: Signal,
292
+ path: Path,
293
+ task_step_id: Optional[TaskStepId] = None) -> None:
294
+ """Compute a signal for a column.
295
+
296
+ Args:
297
+ signal: The signal to compute over the given columns.
298
+ path: The leaf path to compute the signal on.
299
+ task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
300
+ progress of the task.
301
+ """
302
+ pass
303
+
304
+ def compute_embedding(self,
305
+ embedding: str,
306
+ path: Path,
307
+ task_step_id: Optional[TaskStepId] = None) -> None:
308
+ """Compute an embedding for a given field path."""
309
+ signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
310
+ self.compute_signal(signal, path, task_step_id)
311
+
312
+ def compute_concept(self,
313
+ namespace: str,
314
+ concept_name: str,
315
+ embedding: str,
316
+ path: Path,
317
+ task_step_id: Optional[TaskStepId] = None) -> None:
318
+ """Compute concept scores for a given field path."""
319
+ signal = ConceptSignal(namespace=namespace, concept_name=concept_name, embedding=embedding)
320
+ self.compute_signal(signal, path, task_step_id)
321
+
322
+ @abc.abstractmethod
323
+ def delete_signal(self, signal_path: Path) -> None:
324
+ """Delete a computed signal from the dataset.
325
+
326
+ Args:
327
+ signal_path: The path holding the computed data of the signal.
328
+ """
329
+ pass
330
+
331
+ @abc.abstractmethod
332
+ def select_groups(
333
+ self,
334
+ leaf_path: Path,
335
+ filters: Optional[Sequence[FilterLike]] = None,
336
+ sort_by: Optional[GroupsSortBy] = None,
337
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
338
+ limit: Optional[int] = None,
339
+ bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None) -> SelectGroupsResult:
340
+ """Select grouped columns to power a histogram.
341
+
342
+ Args:
343
+ leaf_path: The leaf path to group by. The path can be a dot-separated string path, or a tuple
344
+ of fields.
345
+ filters: The filters to apply to the query.
346
+ sort_by: What to sort by, either "count" or "value".
347
+ sort_order: The sort order.
348
+ limit: The maximum number of rows to return.
349
+ bins: The bins to use when bucketizing a float column.
350
+
351
+ Returns:
352
+ A `SelectGroupsResult` iterator where each row is a group.
353
+ """
354
+ raise NotImplementedError
355
+
356
+ @abc.abstractmethod
357
+ def select_rows(self,
358
+ columns: Optional[Sequence[ColumnId]] = None,
359
+ searches: Optional[Sequence[Search]] = None,
360
+ filters: Optional[Sequence[FilterLike]] = None,
361
+ sort_by: Optional[Sequence[Path]] = None,
362
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
363
+ limit: Optional[int] = 100,
364
+ offset: Optional[int] = 0,
365
+ task_step_id: Optional[TaskStepId] = None,
366
+ resolve_span: bool = False,
367
+ combine_columns: bool = False,
368
+ user: Optional[UserInfo] = None) -> SelectRowsResult:
369
+ """Select a set of rows that match the provided filters, analogous to SQL SELECT.
370
+
371
+ Args:
372
+ columns: The columns to select. A column is an instance of `Column` which can either
373
+ define a path to a feature, or a column with an applied Transform, e.g. a Concept. If none,
374
+ it selects all columns.
375
+ searches: The searches to apply to the query.
376
+ filters: The filters to apply to the query.
377
+ sort_by: An ordered list of what to sort by. When defined, this is a list of aliases of column
378
+ names defined by the "alias" field in Column. If no alias is provided for a column, an
379
+ automatic alias is generated by combining each path element with a "."
380
+ For example: e.g. ('person', 'name') => person.name. For columns that are transform columns,
381
+ an alias must be provided explicitly. When sorting by a (nested) list of values, the sort
382
+ takes the minumum value when `sort_order` is `ASC`, and the maximum value when `sort_order`
383
+ is `DESC`.
384
+ sort_order: The sort order.
385
+ limit: The maximum number of rows to return.
386
+ offset: The offset to start returning rows from.
387
+ task_step_id: The TaskManager `task_step_id` for this process run. This is used to update the
388
+ progress.
389
+ resolve_span: Whether to resolve the span of the row.
390
+ combine_columns: Whether to combine columns into a single object. The object will be pruned
391
+ to only include sub-fields that correspond to the requested columns.
392
+ user: The authenticated user, if auth is enabled and the user is logged in. This is used to
393
+ apply ACL to the query, especially for concepts.
394
+
395
+ Returns:
396
+ A `SelectRowsResult` iterator with rows of `Item`s.
397
+ """
398
+ pass
399
+
400
+ @abc.abstractmethod
401
+ def select_rows_schema(self,
402
+ columns: Optional[Sequence[ColumnId]] = None,
403
+ sort_by: Optional[Sequence[Path]] = None,
404
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
405
+ searches: Optional[Sequence[Search]] = None,
406
+ combine_columns: bool = False) -> SelectRowsSchemaResult:
407
+ """Returns the schema of the result of `select_rows` above with the same arguments."""
408
+ pass
409
+
410
+ @abc.abstractmethod
411
+ def stats(self, leaf_path: Path) -> StatsResult:
412
+ """Compute stats for a leaf path.
413
+
414
+ Args:
415
+ leaf_path: The leaf path to compute stats for.
416
+
417
+ Returns:
418
+ A StatsResult.
419
+ """
420
+ pass
421
+
422
+ @abc.abstractmethod
423
+ def media(self, item_id: str, leaf_path: Path) -> MediaResult:
424
+ """Return the media for a leaf path.
425
+
426
+ Args:
427
+ item_id: The item id to get media for.
428
+ leaf_path: The leaf path for the media.
429
+
430
+ Returns:
431
+ A MediaResult.
432
+ """
433
+ pass
434
+
435
+ @abc.abstractmethod
436
+ def to_json(self,
437
+ filepath: Union[str, pathlib.Path],
438
+ jsonl: bool = True,
439
+ columns: Optional[Sequence[ColumnId]] = None) -> None:
440
+ """Export the dataset to a JSON file.
441
+
442
+ Args:
443
+ filepath: The path to the file to export to.
444
+ jsonl: Whether to export to JSONL or JSON.
445
+ columns: The columns to export.
446
+ """
447
+ pass
448
+
449
+ @abc.abstractmethod
450
+ def to_pandas(self, columns: Optional[Sequence[ColumnId]] = None) -> pd.DataFrame:
451
+ """Export the dataset to a pandas DataFrame.
452
+
453
+ Args:
454
+ columns: The columns to export.
455
+ """
456
+ pass
457
+
458
+ @abc.abstractmethod
459
+ def to_parquet(self,
460
+ filepath: Union[str, pathlib.Path],
461
+ columns: Optional[Sequence[ColumnId]] = None) -> None:
462
+ """Export the dataset to a parquet file.
463
+
464
+ Args:
465
+ filepath: The path to the file to export to.
466
+ columns: The columns to export.
467
+ """
468
+ pass
469
+
470
+ @abc.abstractmethod
471
+ def to_csv(self,
472
+ filepath: Union[str, pathlib.Path],
473
+ columns: Optional[Sequence[ColumnId]] = None) -> None:
474
+ """Export the dataset to a CSV file.
475
+
476
+ Args:
477
+ filepath: The path to the file to export to.
478
+ columns: The columns to export.
479
+ """
480
+ pass
481
+
482
+
483
+ def default_settings(dataset: Dataset) -> DatasetSettings:
484
+ """Gets the default settings for a dataset."""
485
+ schema = dataset.manifest().data_schema
486
+ leaf_paths = [
487
+ path for path, field in schema.leafs.items()
488
+ if field.dtype == DataType.STRING and path != (ROWID,)
489
+ ]
490
+ pool = ThreadPoolExecutor()
491
+ stats: list[StatsResult] = list(pool.map(lambda leaf: dataset.stats(leaf), leaf_paths))
492
+ sorted_stats = sorted([stat for stat in stats if stat.avg_text_length],
493
+ key=lambda stat: stat.avg_text_length or -1.0)
494
+ media_paths: list[PathTuple] = []
495
+ if sorted_stats:
496
+ media_paths = [sorted_stats[-1].path]
497
+
498
+ return DatasetSettings(ui=DatasetUISettings(media_paths=media_paths))
499
+
500
+
501
+ def make_parquet_id(signal: Signal,
502
+ source_path: PathTuple,
503
+ is_computed_signal: Optional[bool] = False) -> str:
504
+ """Return a unique identifier for this parquet table."""
505
+ # Remove the wildcards from the parquet id since they are implicit.
506
+ path = [*[p for p in source_path if p != PATH_WILDCARD], signal.key(is_computed_signal)]
507
+ # Don't use the VALUE_KEY as part of the parquet id to reduce the size of paths.
508
+ if path[-1] == VALUE_KEY:
509
+ path = path[:-1]
510
+ return '.'.join(path)
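Tying the interface together, a hedged sketch of a `select_rows` call; the dataset name, field path, and the `get_dataset` helper (from `lilac/db_manager.py` in this commit) are assumptions, and data must already be loaded:

```python
from lilac.data import SemanticSearch
from lilac.db_manager import get_dataset

# Assumes a dataset was previously loaded under this namespace/name.
dataset = get_dataset('local', 'glue')
result = dataset.select_rows(
  columns=[('text',)],
  searches=[SemanticSearch(path=('text',), query='negative reviews', embedding='gte-small')],
  limit=10)
for row in result:  # SelectRowsResult yields one dict per row.
  print(row)
```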
lilac/data/dataset_duckdb.py ADDED
@@ -0,0 +1,1833 @@
1
+ """The DuckDB implementation of the dataset database."""
2
+ import functools
3
+ import gc
4
+ import glob
5
+ import math
6
+ import os
7
+ import pathlib
8
+ import re
9
+ import shutil
10
+ import threading
11
+ from typing import Any, Iterable, Iterator, Optional, Sequence, Union, cast
12
+
13
+ import duckdb
14
+ import numpy as np
15
+ import pandas as pd
16
+ import yaml
17
+ from pandas.api.types import is_object_dtype
18
+ from pydantic import BaseModel, validator
19
+ from typing_extensions import override
20
+
21
+ from ..auth import UserInfo
22
+ from ..batch_utils import deep_flatten, deep_unflatten
23
+ from ..config import CONFIG_FILENAME, DatasetConfig, DatasetSettings, EmbeddingConfig, SignalConfig
24
+ from ..embeddings.vector_store import VectorDBIndex
25
+ from ..env import data_path, env
26
+ from ..schema import (
27
+ MANIFEST_FILENAME,
28
+ PATH_WILDCARD,
29
+ ROWID,
30
+ TEXT_SPAN_END_FEATURE,
31
+ TEXT_SPAN_START_FEATURE,
32
+ VALUE_KEY,
33
+ Bin,
34
+ DataType,
35
+ Field,
36
+ Item,
37
+ Path,
38
+ PathKey,
39
+ PathTuple,
40
+ RichData,
41
+ Schema,
42
+ SourceManifest,
43
+ column_paths_match,
44
+ is_float,
45
+ is_integer,
46
+ is_ordinal,
47
+ is_temporal,
48
+ normalize_path,
49
+ signal_type_supports_dtype,
50
+ )
51
+ from ..signal import Signal, TextEmbeddingSignal, VectorSignal, get_signal_by_type, resolve_signal
52
+ from ..signals.concept_labels import ConceptLabelsSignal
53
+ from ..signals.concept_scorer import ConceptSignal
54
+ from ..signals.semantic_similarity import SemanticSimilaritySignal
55
+ from ..signals.substring_search import SubstringSignal
56
+ from ..sources.source import Source
57
+ from ..tasks import TaskStepId, progress
58
+ from ..utils import DebugTimer, get_dataset_output_dir, log, open_file, to_yaml
59
+ from . import dataset
60
+ from .dataset import (
61
+ BINARY_OPS,
62
+ LIST_OPS,
63
+ MAX_TEXT_LEN_DISTINCT_COUNT,
64
+ SAMPLE_AVG_TEXT_LENGTH,
65
+ TOO_MANY_DISTINCT,
66
+ UNARY_OPS,
67
+ BinaryOp,
68
+ Column,
69
+ ColumnId,
70
+ Dataset,
71
+ DatasetManifest,
72
+ FeatureListValue,
73
+ FeatureValue,
74
+ Filter,
75
+ FilterLike,
76
+ GroupsSortBy,
77
+ MediaResult,
78
+ Search,
79
+ SearchResultInfo,
80
+ SelectGroupsResult,
81
+ SelectRowsResult,
82
+ SelectRowsSchemaResult,
83
+ SelectRowsSchemaUDF,
84
+ SortOrder,
85
+ SortResult,
86
+ StatsResult,
87
+ column_from_identifier,
88
+ make_parquet_id,
89
+ )
90
+ from .dataset_utils import (
91
+ count_primitives,
92
+ create_signal_schema,
93
+ flatten_keys,
94
+ merge_schemas,
95
+ schema_contains_path,
96
+ sparse_to_dense_compute,
97
+ wrap_in_dicts,
98
+ write_embeddings_to_disk,
99
+ write_items_to_parquet,
100
+ )
101
+
102
+ SIGNAL_MANIFEST_FILENAME = 'signal_manifest.json'
103
+ DATASET_SETTINGS_FILENAME = 'settings.json'
104
+ SOURCE_VIEW_NAME = 'source'
105
+
106
+ NUM_AUTO_BINS = 15
107
+
108
+ BINARY_OP_TO_SQL: dict[BinaryOp, str] = {
109
+ 'equals': '=',
110
+ 'not_equal': '!=',
111
+ 'greater': '>',
112
+ 'greater_equal': '>=',
113
+ 'less': '<',
114
+ 'less_equal': '<='
115
+ }
116
+
117
+
118
+ class DuckDBSearchUDF(BaseModel):
119
+ """The transformation of searches to column UDFs."""
120
+ udf: Column
121
+ search_path: PathTuple
122
+ output_path: PathTuple
123
+ sort: Optional[tuple[PathTuple, SortOrder]] = None
124
+
125
+
126
+ class DuckDBSearchUDFs(BaseModel):
127
+ """The transformation of searches to column UDFs with sorts."""
128
+ udfs: list[Column]
129
+ output_paths: list[PathTuple]
130
+ sorts: list[tuple[PathTuple, SortOrder]]
131
+
132
+
133
+ class DatasetDuckDB(Dataset):
134
+ """The DuckDB implementation of the dataset database."""
135
+
136
+ def __init__(self, namespace: str, dataset_name: str, vector_store: str = 'hnsw'):
137
+ super().__init__(namespace, dataset_name)
138
+
139
+ self.dataset_path = get_dataset_output_dir(data_path(), namespace, dataset_name)
140
+
141
+ # TODO: Infer the manifest from the parquet files so this is lighter weight.
142
+ self._source_manifest = read_source_manifest(self.dataset_path)
143
+ self._signal_manifests: list[SignalManifest] = []
144
+ self.con = duckdb.connect(database=':memory:')
145
+
146
+ # Maps a path and embedding to the vector index. This is lazily generated as needed.
147
+ self._vector_indices: dict[tuple[PathKey, str], VectorDBIndex] = {}
148
+ self.vector_store = vector_store
149
+ self._manifest_lock = threading.Lock()
150
+
151
+ self._config_lock = threading.Lock()
152
+ config_filepath = get_config_filepath(namespace, dataset_name)
153
+
154
+ if not os.path.exists(config_filepath):
155
+ # For backwards compatibility, if the config doesn't exist, create one. This will be out of
156
+ # sync but allow the server to still boot and update with new config changes.
157
+ # Make a metaclass so we get a valid `Source` class.
158
+ source_cls = type('Source_no_source', (Source,), {'name': 'no_source'})
159
+
160
+ old_settings_filepath = os.path.join(
161
+ get_dataset_output_dir(data_path(), namespace, dataset_name), 'settings.json')
162
+ settings = DatasetSettings()
163
+ if os.path.exists(old_settings_filepath):
164
+ with open(old_settings_filepath) as f:
165
+ settings = DatasetSettings.parse_raw(f.read())
166
+
167
+ config = DatasetConfig(
168
+ namespace=namespace, name=dataset_name, source=source_cls(), settings=settings)
169
+ with open(get_config_filepath(self.namespace, self.dataset_name), 'w') as f:
170
+ f.write(to_yaml(config.dict(exclude_none=True, exclude_defaults=True)))
171
+
172
+ # Create a join table from all the parquet files.
173
+ self.manifest()
174
+
175
+ @override
176
+ def delete(self) -> None:
177
+ """Deletes the dataset."""
178
+ self.con.close()
179
+ shutil.rmtree(self.dataset_path, ignore_errors=True)
180
+
181
+ def _create_view(self, view_name: str, files: list[str]) -> None:
182
+ self.con.execute(f"""
183
+ CREATE OR REPLACE VIEW {_escape_col_name(view_name)} AS (SELECT * FROM read_parquet({files}));
184
+ """)
185
+
186
+ # NOTE: This is cached, but when the latest mtime of any file in the dataset directory changes
187
+ # the results are invalidated.
188
+ @functools.cache
189
+ def _recompute_joint_table(self, latest_mtime_micro_sec: int) -> DatasetManifest:
190
+ del latest_mtime_micro_sec # This is used as the cache key.
191
+ merged_schema = self._source_manifest.data_schema.copy(deep=True)
192
+ self._signal_manifests = []
193
+ # Make a joined view of all the column groups.
194
+ self._create_view(SOURCE_VIEW_NAME,
195
+ [os.path.join(self.dataset_path, f) for f in self._source_manifest.files])
196
+
197
+ # Add the signal column groups.
198
+ for root, _, files in os.walk(self.dataset_path):
199
+ for file in files:
200
+ if not file.endswith(SIGNAL_MANIFEST_FILENAME):
201
+ continue
202
+
203
+ with open_file(os.path.join(root, file)) as f:
204
+ signal_manifest = SignalManifest.parse_raw(f.read())
205
+ self._signal_manifests.append(signal_manifest)
206
+ signal_files = [os.path.join(root, f) for f in signal_manifest.files]
207
+ if signal_files:
208
+ self._create_view(signal_manifest.parquet_id, signal_files)
209
+
210
+ merged_schema = merge_schemas([self._source_manifest.data_schema] +
211
+ [m.data_schema for m in self._signal_manifests])
212
+
213
+ # The logic below generates the following example query:
214
+ # CREATE OR REPLACE VIEW t AS (
215
+ # SELECT
216
+ # source.*,
217
+ # "parquet_id1"."root_column" AS "parquet_id1",
218
+ # "parquet_id2"."root_column" AS "parquet_id2"
219
+ # FROM source LEFT JOIN "parquet_id1" USING (rowid) LEFT JOIN "parquet_id2" USING (rowid)
220
+ # );
221
+ # NOTE: "root_column" for each signal is defined as the top-level column.
222
+ select_sql = ', '.join([f'{SOURCE_VIEW_NAME}.*'] + [(
223
+ f'{_escape_col_name(manifest.parquet_id)}.{_escape_col_name(_root_column(manifest))} '
224
+ f'AS {_escape_col_name(manifest.parquet_id)}')
225
+ for manifest in self._signal_manifests
226
+ if manifest.files])
227
+ join_sql = ' '.join([SOURCE_VIEW_NAME] + [
228
+ f'LEFT JOIN {_escape_col_name(manifest.parquet_id)} USING ({ROWID})'
229
+ for manifest in self._signal_manifests
230
+ if manifest.files
231
+ ])
232
+ view_or_table = 'TABLE'
233
+ use_views = env('DUCKDB_USE_VIEWS', 0) or 0
234
+ if int(use_views):
235
+ view_or_table = 'VIEW'
236
+ sql_cmd = f"""CREATE OR REPLACE {view_or_table} t AS (SELECT {select_sql} FROM {join_sql})"""
237
+ self.con.execute(sql_cmd)
238
+
239
+ # Get the total size of the table.
240
+ size_query = 'SELECT COUNT() as count FROM t'
241
+ size_query_result = cast(Any, self._query(size_query)[0])
242
+ num_items = cast(int, size_query_result[0])
243
+
244
+ return DatasetManifest(
245
+ namespace=self.namespace,
246
+ dataset_name=self.dataset_name,
247
+ data_schema=merged_schema,
248
+ num_items=num_items)
249
+
250
+ @override
251
+ def manifest(self) -> DatasetManifest:
252
+ # Use the latest modification time of all files under the dataset path as the cache key for
253
+ # re-computing the manifest and the joined view.
254
+ with self._manifest_lock:
255
+ all_dataset_files = glob.iglob(os.path.join(self.dataset_path, '**'), recursive=True)
256
+ latest_mtime = max(map(os.path.getmtime, all_dataset_files))
257
+ latest_mtime_micro_sec = int(latest_mtime * 1e6)
258
+ return self._recompute_joint_table(latest_mtime_micro_sec)
259
+
260
+ def _update_config(self,
261
+ settings: Optional[DatasetSettings] = None,
262
+ signals: Optional[list[SignalConfig]] = None,
263
+ embeddings: Optional[list[EmbeddingConfig]] = None) -> None:
264
+ with self._config_lock:
265
+ config = self.config()
266
+
267
+ if settings is not None:
268
+ config.settings = settings
269
+
270
+ if signals is not None:
271
+ # Update the config with the new signal, if the new signal has not already been added (this
272
+ # can happen if a signal is re-computed)
273
+ update_config = True
274
+ for signal_config in signals:
275
+ for existing_signal in config.signals:
276
+ if (existing_signal.path == signal_config.path and
277
+ existing_signal.signal.dict() == signal_config.signal.dict()):
278
+ update_config = False
279
+ break
280
+ if update_config:
281
+ config.signals.append(signal_config)
282
+
283
+ if embeddings is not None:
284
+ # Update the config with the new embedding, if it has not already been added (this
285
+ # can happen if an embedding is re-computed).
286
+ update_config = True
287
+ for embedding_config in embeddings:
288
+ for existing_embedding in config.embeddings:
289
+ if (existing_embedding.path == embedding_config.path and
290
+ existing_embedding.embedding == embedding_config.embedding):
291
+ update_config = False
292
+ break
293
+ if update_config:
294
+ config.embeddings.append(embedding_config)
295
+
296
+ with open(get_config_filepath(self.namespace, self.dataset_name), 'w') as f:
297
+ f.write(to_yaml(config.dict(exclude_none=True, exclude_defaults=True)))
298
+
299
+ @override
300
+ def config(self) -> DatasetConfig:
301
+ config_filepath = get_config_filepath(self.namespace, self.dataset_name)
302
+ with open(config_filepath) as f:
303
+ return DatasetConfig(**yaml.safe_load(f))
304
+
305
+ @override
306
+ def settings(self) -> DatasetSettings:
307
+ # Settings should always have a default.
308
+ settings = self.config().settings
309
+ assert settings is not None
310
+ return settings
311
+
312
+ @override
313
+ def update_settings(self, settings: DatasetSettings) -> None:
314
+ self._update_config(settings)
315
+
316
+ def count(self, filters: Optional[list[FilterLike]] = None) -> int:
317
+ """Count the number of rows."""
318
+ raise NotImplementedError('count is not yet implemented for DuckDB.')
319
+
320
+ def _get_vector_db_index(self, embedding: str, path: PathTuple) -> VectorDBIndex:
321
+ # Refresh the manifest to make sure we have the latest signal manifests.
322
+ self.manifest()
323
+ index_key = (path, embedding)
324
+ if index_key in self._vector_indices:
325
+ return self._vector_indices[index_key]
326
+
327
+ manifests = [
328
+ m for m in self._signal_manifests
329
+ if schema_contains_path(m.data_schema, path) and m.vector_store and m.signal.name == embedding
330
+ ]
331
+ if not manifests:
332
+ raise ValueError(f'No embedding found for path {path}.')
333
+ if len(manifests) > 1:
334
+ raise ValueError(f'Multiple embeddings found for path {path}. Got: {manifests}')
335
+ manifest = manifests[0]
336
+ if not manifest.vector_store:
337
+ raise ValueError(f'Signal manifest for path {path} is not an embedding. '
338
+ f'Got signal manifest: {manifest}')
339
+
340
+ base_path = os.path.join(self.dataset_path, _signal_dir(manifest.enriched_path),
341
+ manifest.signal.name)
342
+ path_id = f'{self.namespace}/{self.dataset_name}:{path}'
343
+ with DebugTimer(f'Loading vector store "{manifest.vector_store}" for {path_id}'
344
+ f' with embedding "{embedding}"'):
345
+ vector_index = VectorDBIndex(manifest.vector_store)
346
+ vector_index.load(base_path)
347
+ # Cache the vector index.
348
+ self._vector_indices[index_key] = vector_index
349
+ return vector_index
350
+
351
+ @override
352
+ def compute_signal(self,
353
+ signal: Signal,
354
+ path: Path,
355
+ task_step_id: Optional[TaskStepId] = None) -> None:
356
+ if isinstance(signal, TextEmbeddingSignal):
357
+ return self.compute_embedding(signal.name, path, task_step_id)
358
+ source_path = normalize_path(path)
359
+ manifest = self.manifest()
360
+
361
+ if task_step_id is None:
362
+ # Make a dummy task step so we report progress via tqdm.
363
+ task_step_id = ('', 0)
364
+
365
+ # The manifest may have changed after computing the dependencies.
366
+ manifest = self.manifest()
367
+
368
+ signal_col = Column(path=source_path, alias='value', signal_udf=signal)
369
+ select_rows_result = self.select_rows([ROWID, signal_col],
370
+ task_step_id=task_step_id,
371
+ resolve_span=True)
372
+ df = select_rows_result.df()
373
+ values = df['value']
374
+
375
+ enriched_path = _col_destination_path(signal_col, is_computed_signal=True)
376
+ spec = _split_path_into_subpaths_of_lists(enriched_path)
377
+ output_dir = os.path.join(self.dataset_path, _signal_dir(enriched_path))
378
+ signal_schema = create_signal_schema(signal, source_path, manifest.data_schema)
379
+ enriched_signal_items = cast(Iterable[Item], wrap_in_dicts(values, spec))
380
+ for rowid, item in zip(df[ROWID], enriched_signal_items):
381
+ item[ROWID] = rowid
382
+
383
+ enriched_signal_items = list(enriched_signal_items)
384
+ parquet_filename, _ = write_items_to_parquet(
385
+ items=enriched_signal_items,
386
+ output_dir=output_dir,
387
+ schema=signal_schema,
388
+ filename_prefix='data',
389
+ shard_index=0,
390
+ num_shards=1)
391
+
392
+ signal_manifest = SignalManifest(
393
+ files=[parquet_filename],
394
+ data_schema=signal_schema,
395
+ signal=signal,
396
+ enriched_path=source_path,
397
+ parquet_id=make_parquet_id(signal, source_path, is_computed_signal=True))
398
+ signal_manifest_filepath = os.path.join(output_dir, SIGNAL_MANIFEST_FILENAME)
399
+ with open_file(signal_manifest_filepath, 'w') as f:
400
+ f.write(signal_manifest.json(exclude_none=True, indent=2))
401
+
402
+ self._update_config(signals=[SignalConfig(path=source_path, signal=signal)])
403
+
404
+ log(f'Wrote signal output to {output_dir}')
405
+
406
+ @override
407
+ def compute_embedding(self,
408
+ embedding: str,
409
+ path: Path,
410
+ task_step_id: Optional[TaskStepId] = None) -> None:
411
+ source_path = normalize_path(path)
412
+ manifest = self.manifest()
413
+
414
+ if task_step_id is None:
415
+ # Make a dummy task step so we report progress via tqdm.
416
+ task_step_id = ('', 0)
417
+
418
+ signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
419
+ signal_col = Column(path=source_path, alias='value', signal_udf=signal)
420
+ select_rows_result = self.select_rows([ROWID, signal_col],
421
+ task_step_id=task_step_id,
422
+ resolve_span=True)
423
+ df = select_rows_result.df()
424
+ values = df['value']
425
+
426
+ enriched_path = _col_destination_path(signal_col, is_computed_signal=True)
427
+ output_dir = os.path.join(self.dataset_path, _signal_dir(enriched_path))
428
+ signal_schema = create_signal_schema(signal, source_path, manifest.data_schema)
429
+
430
+ write_embeddings_to_disk(
431
+ vector_store=self.vector_store, rowids=df[ROWID], signal_items=values, output_dir=output_dir)
432
+
433
+ del select_rows_result, df, values
434
+ gc.collect()
435
+
436
+ signal_manifest = SignalManifest(
437
+ files=[],
438
+ data_schema=signal_schema,
439
+ signal=signal,
440
+ enriched_path=source_path,
441
+ parquet_id=make_parquet_id(signal, source_path, is_computed_signal=True),
442
+ vector_store=self.vector_store)
443
+ signal_manifest_filepath = os.path.join(output_dir, SIGNAL_MANIFEST_FILENAME)
444
+
445
+ with open_file(signal_manifest_filepath, 'w') as f:
446
+ f.write(signal_manifest.json(exclude_none=True, indent=2))
447
+
448
+ self._update_config(embeddings=[EmbeddingConfig(path=source_path, embedding=embedding)])
449
+
450
+ log(f'Wrote embedding index to {output_dir}')
451
+
452
+ @override
453
+ def delete_signal(self, signal_path: Path) -> None:
454
+ signal_path = normalize_path(signal_path)
455
+ manifest = self.manifest()
456
+ if not manifest.data_schema.has_field(signal_path):
457
+ raise ValueError(f'Unknown signal path: {signal_path}')
458
+
459
+ output_dir = os.path.join(self.dataset_path, _signal_dir(signal_path))
460
+ shutil.rmtree(output_dir, ignore_errors=True)
461
+
462
+ def _validate_filters(self, filters: Sequence[Filter], col_aliases: dict[str, PathTuple],
463
+ manifest: DatasetManifest) -> None:
464
+ for filter in filters:
465
+ if filter.path[0] in col_aliases:
466
+ # This is a filter on a column alias, which is always allowed.
467
+ continue
468
+
469
+ current_field = Field(fields=manifest.data_schema.fields)
470
+ if filter.path == (ROWID,):
471
+ return
472
+ for path_part in filter.path:
473
+ if path_part == VALUE_KEY:
474
+ if not current_field.dtype:
475
+ raise ValueError(f'Unable to filter on path {filter.path}. The field has no value.')
476
+ continue
477
+ if current_field.fields:
478
+ if path_part not in current_field.fields:
479
+ raise ValueError(f'Unable to filter on path {filter.path}. '
480
+ f'Path part "{path_part}" not found in the dataset.')
481
+ current_field = current_field.fields[str(path_part)]
482
+ continue
483
+ elif current_field.repeated_field:
484
+ current_field = current_field.repeated_field
485
+ continue
486
+ else:
487
+ raise ValueError(f'Unable to filter on path {filter.path}. '
488
+ f'Path part "{path_part}" is not defined on a primitive value.')
489
+
490
+ while current_field.repeated_field:
491
+ current_field = current_field.repeated_field
492
+ filter.path = (*filter.path, PATH_WILDCARD)
493
+
494
+ if not current_field.dtype:
495
+ raise ValueError(f'Unable to filter on path {filter.path}. The field has no value.')
496
+
497
+ def _validate_udfs(self, udf_cols: Sequence[Column], source_schema: Schema) -> None:
498
+ for col in udf_cols:
499
+ path = col.path
500
+
501
+ # Signal transforms must operate on a leaf field.
502
+ leaf = source_schema.leafs.get(path)
503
+ if not leaf or not leaf.dtype:
504
+ raise ValueError(f'Leaf "{path}" not found in dataset. '
505
+ 'Signal transforms must operate on a leaf field.')
506
+
507
+ # Signal transforms must have the same dtype as the leaf field.
508
+ signal = cast(Signal, col.signal_udf)
509
+ if not signal_type_supports_dtype(signal.input_type, leaf.dtype):
510
+ raise ValueError(f'Leaf "{path}" has dtype "{leaf.dtype}" which is not supported '
511
+ f'by "{signal.key()}" with signal input type "{signal.input_type}".')
512
+
513
+ def _validate_selection(self, columns: Sequence[Column], select_schema: Schema) -> None:
514
+ # Validate all the columns and make sure they exist in the `select_schema`.
515
+ for column in columns:
516
+ current_field = Field(fields=select_schema.fields)
517
+ path = column.path
518
+ if path == (ROWID,):
519
+ return
520
+ for path_part in path:
521
+ if path_part == VALUE_KEY:
522
+ if not current_field.dtype:
523
+ raise ValueError(f'Unable to select path {path}. The field has no value.')
524
+ continue
525
+ if current_field.fields:
526
+ if path_part not in current_field.fields:
527
+ raise ValueError(f'Unable to select path {path}. '
528
+ f'Path part "{path_part}" not found in the dataset.')
529
+ current_field = current_field.fields[path_part]
530
+ continue
531
+ elif current_field.repeated_field:
532
+ if path_part.isdigit():
533
+ raise ValueError(f'Unable to select path {path}. Selecting a specific index of '
534
+ 'a repeated field is currently not supported.')
535
+ if path_part != PATH_WILDCARD:
536
+ raise ValueError(f'Unable to select path {path}. '
537
+ f'Path part "{path_part}" should be a wildcard.')
538
+ current_field = current_field.repeated_field
539
+ elif not current_field.dtype:
540
+ raise ValueError(f'Unable to select path {path}. '
541
+ f'Path part "{path_part}" is not defined on a primitive value.')
542
+
543
+ def _validate_columns(self, columns: Sequence[Column], source_schema: Schema,
544
+ select_schema: Schema) -> None:
545
+ udf_cols = [col for col in columns if col.signal_udf]
546
+ self._validate_udfs(udf_cols, source_schema)
547
+ self._validate_selection(columns, select_schema)
548
+
549
+ def _validate_sort_path(self, path: PathTuple, schema: Schema) -> None:
550
+ current_field = Field(fields=schema.fields)
551
+ if path == (ROWID,):
552
+ return
553
+ for path_part in path:
554
+ if path_part == VALUE_KEY:
555
+ if not current_field.dtype:
556
+ raise ValueError(f'Unable to sort by path {path}. The field has no value.')
557
+ continue
558
+ if current_field.fields:
559
+ if path_part not in current_field.fields:
560
+ raise ValueError(f'Unable to sort by path {path}. '
561
+ f'Path part "{path_part}" not found in the dataset.')
562
+ current_field = current_field.fields[path_part]
563
+ continue
564
+ elif current_field.repeated_field:
565
+ if path_part.isdigit():
566
+ raise ValueError(f'Unable to sort by path {path}. Selecting a specific index of '
567
+ 'a repeated field is currently not supported.')
568
+ if path_part != PATH_WILDCARD:
569
+ raise ValueError(f'Unable to sort by path {path}. '
570
+ f'Path part "{path_part}" should be a wildcard.')
571
+ current_field = current_field.repeated_field
572
+ elif not current_field.dtype:
573
+ raise ValueError(f'Unable to sort by path {path}. '
574
+ f'Path part "{path_part}" is not defined on a primitive value.')
575
+ if not current_field.dtype:
576
+ raise ValueError(f'Unable to sort by path {path}. The field has no value.')
577
+
578
+ @override
579
+ @functools.cache # Cache stats for leaf paths since we ask on every dataset page refresh.
580
+ def stats(self, leaf_path: Path) -> StatsResult:
581
+ if not leaf_path:
582
+ raise ValueError('leaf_path must be provided')
583
+ path = normalize_path(leaf_path)
584
+ manifest = self.manifest()
585
+ leaf = manifest.data_schema.get_field(path)
586
+ # Find the inner-most leaf in case this field is repeated.
587
+ while leaf.repeated_field:
588
+ leaf = leaf.repeated_field
589
+ path = (*path, PATH_WILDCARD)
590
+
591
+ if not leaf.dtype:
592
+ raise ValueError(f'Leaf "{path}" not found in dataset')
593
+
594
+ duckdb_path = self._leaf_path_to_duckdb_path(path, manifest.data_schema)
595
+ inner_select = _select_sql(
596
+ duckdb_path, flatten=True, unnest=True, span_from=self._get_span_from(path, manifest))
597
+
598
+ # Compute the average length of text fields.
599
+ avg_text_length: Optional[int] = None
600
+ if leaf.dtype in (DataType.STRING, DataType.STRING_SPAN):
601
+ avg_length_query = f"""
602
+ SELECT avg(length(val))
603
+ FROM (SELECT {inner_select} AS val FROM t) USING SAMPLE {SAMPLE_AVG_TEXT_LENGTH};
604
+ """
605
+ row = self._query(avg_length_query)[0]
606
+ if row[0] is not None:
607
+ avg_text_length = int(row[0])
608
+
609
+ total_count_query = f'SELECT count(val) FROM (SELECT {inner_select} as val FROM t)'
610
+ total_count = int(self._query(total_count_query)[0][0])
611
+
612
+ # Compute approximate count by sampling the data to avoid OOM.
613
+ if avg_text_length and avg_text_length > MAX_TEXT_LEN_DISTINCT_COUNT:
614
+ # Assume that every text field is unique.
615
+ approx_count_distinct = manifest.num_items
616
+ elif leaf.dtype == DataType.BOOLEAN:
617
+ approx_count_distinct = 2
618
+ else:
619
+ sample_size = TOO_MANY_DISTINCT
620
+ approx_count_query = f"""
621
+ SELECT approx_count_distinct(val) as approxCountDistinct
622
+ FROM (SELECT {inner_select} AS val FROM t) USING SAMPLE {sample_size};
623
+ """
624
+ row = self._query(approx_count_query)[0]
625
+ approx_count_distinct = int(row[0])
626
+
627
+ # Adjust the counts for the sample size.
628
+ factor = max(1, total_count / sample_size)
629
+ approx_count_distinct = round(approx_count_distinct * factor)
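+ # Worked example of the scaling above (numbers hypothetical): with total_count =
+ # 1,000,000 and sample_size = 10,000, a sample estimate of 5,000 distinct values
+ # scales by factor = 100 to an overall estimate of 500,000.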
630
+
631
+ result = StatsResult(
632
+ path=path,
633
+ total_count=total_count,
634
+ approx_count_distinct=approx_count_distinct,
635
+ avg_text_length=avg_text_length)
636
+
637
+ # Compute min/max values for ordinal leafs, without sampling the data.
638
+ if is_ordinal(leaf.dtype):
639
+ min_max_query = f"""
640
+ SELECT MIN(val) AS minVal, MAX(val) AS maxVal
641
+ FROM (SELECT {inner_select} as val FROM t)
642
+ {'WHERE NOT isnan(val)' if is_float(leaf.dtype) else ''}
643
+ """
644
+ row = self._query(min_max_query)[0]
645
+ result.min_val, result.max_val = row
646
+
647
+ return result
648
+
649
+ @override
650
+ def select_groups(
651
+ self,
652
+ leaf_path: Path,
653
+ filters: Optional[Sequence[FilterLike]] = None,
654
+ sort_by: Optional[GroupsSortBy] = GroupsSortBy.COUNT,
655
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
656
+ limit: Optional[int] = None,
657
+ bins: Optional[Union[Sequence[Bin], Sequence[float]]] = None) -> SelectGroupsResult:
658
+ if not leaf_path:
659
+ raise ValueError('leaf_path must be provided')
660
+ path = normalize_path(leaf_path)
661
+ manifest = self.manifest()
662
+ leaf = manifest.data_schema.get_field(path)
663
+ # Find the inner-most leaf in case this field is repeated.
664
+ while leaf.repeated_field:
665
+ leaf = leaf.repeated_field
666
+ path = (*path, PATH_WILDCARD)
667
+
668
+ if not leaf.dtype:
669
+ raise ValueError(f'Leaf "{path}" not found in dataset')
670
+
671
+ inner_val = 'inner_val'
672
+ outer_select = inner_val
673
+ # Normalize the bins to be `list[Bin]`.
674
+ named_bins = _normalize_bins(bins or leaf.bins)
675
+ stats = self.stats(leaf_path)
676
+
677
+ leaf_is_float = is_float(leaf.dtype)
678
+ leaf_is_integer = is_integer(leaf.dtype)
679
+ if not leaf.categorical and (leaf_is_float or leaf_is_integer):
680
+ if named_bins is None:
681
+ # Auto-bin.
682
+ named_bins = _auto_bins(stats, NUM_AUTO_BINS)
683
+
684
+ sql_bounds = []
685
+ for label, start, end in named_bins:
686
+ if start is None:
687
+ start = cast(float, "'-Infinity'")
688
+ if end is None:
689
+ end = cast(float, "'Infinity'")
690
+ sql_bounds.append(f"('{label}', {start}, {end})")
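+ # e.g. the bin ('0', 0.0, 10.0) renders as "('0', 0.0, 10.0)", and an open-ended
+ # bin ('1', 10.0, None) renders as "('1', 10.0, 'Infinity')".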
691
+
692
+ bin_index_col = 'col0'
693
+ bin_min_col = 'col1'
694
+ bin_max_col = 'col2'
695
+ is_nan_filter = f'NOT isnan({inner_val}) AND' if leaf_is_float else ''
696
+
697
+ # We cast the field to `double` so binning works for both `float` and `int` fields.
698
+ outer_select = f"""(
699
+ SELECT {bin_index_col} FROM (
700
+ VALUES {', '.join(sql_bounds)}
701
+ ) WHERE {is_nan_filter}
702
+ {inner_val}::DOUBLE >= {bin_min_col} AND {inner_val}::DOUBLE < {bin_max_col}
703
+ )"""
704
+ else:
705
+ if stats.approx_count_distinct >= TOO_MANY_DISTINCT:
706
+ return SelectGroupsResult(too_many_distinct=True, counts=[], bins=named_bins)
707
+
708
+ count_column = 'count'
709
+ value_column = 'value'
710
+
711
+ limit_query = f'LIMIT {limit}' if limit else ''
712
+ duckdb_path = self._leaf_path_to_duckdb_path(path, manifest.data_schema)
713
+ inner_select = _select_sql(
714
+ duckdb_path, flatten=True, unnest=True, span_from=self._get_span_from(path, manifest))
715
+
716
+ filters, _ = self._normalize_filters(filters, col_aliases={}, udf_aliases={}, manifest=manifest)
717
+ filter_queries = self._create_where(manifest, filters, searches=[])
718
+
719
+ where_query = ''
720
+ if filter_queries:
721
+ where_query = f"WHERE {' AND '.join(filter_queries)}"
722
+
723
+ query = f"""
724
+ SELECT {outer_select} AS {value_column}, COUNT() AS {count_column}
725
+ FROM (SELECT {inner_select} AS {inner_val} FROM t {where_query})
726
+ GROUP BY {value_column}
727
+ ORDER BY {sort_by} {sort_order}
728
+ {limit_query}
729
+ """
730
+ df = self._query_df(query)
731
+ counts = list(df.itertuples(index=False, name=None))
732
+ if is_temporal(leaf.dtype):
733
+ # Replace any NaT with None, and convert pd.Timestamp values to native datetime objects.
734
+ counts = [(None if pd.isnull(val) else val.to_pydatetime(), count) for val, count in counts]
735
+ return SelectGroupsResult(too_many_distinct=False, counts=counts, bins=named_bins)
736
+
737
+ def _topk_udf_to_sort_by(
738
+ self,
739
+ udf_columns: list[Column],
740
+ sort_by: list[PathTuple],
741
+ limit: Optional[int],
742
+ sort_order: Optional[SortOrder],
743
+ ) -> Optional[Column]:
744
+ if (sort_order != SortOrder.DESC) or (not limit) or (not sort_by):
745
+ return None
746
+ if len(sort_by) < 1:
747
+ return None
748
+ primary_sort_by = sort_by[0]
749
+ udf_cols_to_sort_by = [
750
+ udf_col for udf_col in udf_columns if udf_col.alias == primary_sort_by[0] or
751
+ _path_contains(_col_destination_path(udf_col), primary_sort_by)
752
+ ]
753
+ if not udf_cols_to_sort_by:
754
+ return None
755
+ udf_col = udf_cols_to_sort_by[0]
756
+ if udf_col.signal_udf and not isinstance(udf_col.signal_udf, VectorSignal):
757
+ return None
758
+ return udf_col
759
+
760
+ def _normalize_columns(self, columns: Optional[Sequence[ColumnId]], schema: Schema,
761
+ combine_columns: bool) -> list[Column]:
762
+ """Normalizes the columns to a list of `Column` objects."""
763
+ cols = [column_from_identifier(col) for col in columns or []]
764
+ star_in_cols = any(col.path == (PATH_WILDCARD,) for col in cols)
765
+ if not cols or star_in_cols:
766
+ # Select all columns.
767
+ cols.extend([Column((name,)) for name in schema.fields.keys() if name != ROWID])
768
+
769
+ if not combine_columns:
770
+ # Select all the signal top-level fields.
771
+ for path, field in schema.all_fields:
772
+ if field.signal:
773
+ cols.append(Column(path))
774
+
775
+ if star_in_cols:
776
+ cols = [col for col in cols if col.path != (PATH_WILDCARD,)]
777
+ return cols
778
+
779
+ def _merge_sorts(self, search_udfs: list[DuckDBSearchUDF], sort_by: Optional[Sequence[Path]],
780
+ sort_order: Optional[SortOrder]) -> list[SortResult]:
781
+ # True when the user has explicitly sorted by the alias of a search UDF (e.g. in ASC order).
782
+ is_explicit_search_sort = False
783
+ for sort_by_path in sort_by or []:
784
+ for search_udf in search_udfs:
785
+ if column_paths_match(sort_by_path, search_udf.output_path):
786
+ is_explicit_search_sort = True
787
+ break
788
+
789
+ sort_results: list[SortResult] = []
790
+ if sort_by and not is_explicit_search_sort:
791
+ if not sort_order:
792
+ raise ValueError('`sort_order` is required when `sort_by` is specified.')
793
+ # If the user has explicitly set a sort by, and it's not a search UDF alias, override.
794
+ sort_results = [
795
+ SortResult(path=normalize_path(sort_by), order=sort_order) for sort_by in sort_by if sort_by
796
+ ]
797
+ else:
798
+ search_udfs_with_sort = [search_udf for search_udf in search_udfs if search_udf.sort]
799
+ if search_udfs_with_sort:
800
+ # Override the sort by the last search sort order when the user hasn't provided an
801
+ # explicit sort order.
802
+ last_search_udf = search_udfs_with_sort[-1]
803
+ assert last_search_udf.sort, 'Expected search UDFs with sort to have a sort.'
804
+ udf_sort_path, udf_sort_order = last_search_udf.sort
805
+ sort_results = [
806
+ SortResult(
807
+ path=udf_sort_path,
808
+ order=sort_order or udf_sort_order,
809
+ search_index=len(search_udfs_with_sort) - 1)
810
+ ]
811
+
812
+ return sort_results
813
+
814
+ @override
815
+ def select_rows(self,
816
+ columns: Optional[Sequence[ColumnId]] = None,
817
+ searches: Optional[Sequence[Search]] = None,
818
+ filters: Optional[Sequence[FilterLike]] = None,
819
+ sort_by: Optional[Sequence[Path]] = None,
820
+ sort_order: Optional[SortOrder] = SortOrder.DESC,
821
+ limit: Optional[int] = None,
822
+ offset: Optional[int] = 0,
823
+ task_step_id: Optional[TaskStepId] = None,
824
+ resolve_span: bool = False,
825
+ combine_columns: bool = False,
826
+ user: Optional[UserInfo] = None) -> SelectRowsResult:
827
+ manifest = self.manifest()
828
+ cols = self._normalize_columns(columns, manifest.data_schema, combine_columns)
829
+ offset = offset or 0
830
+ schema = manifest.data_schema
831
+
832
+ if combine_columns:
833
+ schema = self.select_rows_schema(
834
+ columns, sort_by, sort_order, searches, combine_columns=True).data_schema
835
+
836
+ self._validate_columns(cols, manifest.data_schema, schema)
837
+ self._normalize_searches(searches, manifest)
838
+ search_udfs = self._search_udfs(searches, manifest)
839
+ cols.extend([search_udf.udf for search_udf in search_udfs])
840
+ udf_columns = [col for col in cols if col.signal_udf]
841
+
842
+ temp_rowid_selected = False
843
+ for col in cols:
844
+ if col.path == (ROWID,):
845
+ temp_rowid_selected = False
846
+ break
847
+ if isinstance(col.signal_udf, VectorSignal):
848
+ temp_rowid_selected = True
849
+ if temp_rowid_selected:
850
+ cols.append(Column(ROWID))
851
+
852
+ # Set extra information on any concept signals.
853
+ for udf_col in udf_columns:
854
+ if isinstance(udf_col.signal_udf, (ConceptSignal, ConceptLabelsSignal)):
855
+ # Concepts are access-controlled, so we tell the signal about the user.
856
+ udf_col.signal_udf.set_user(user)
857
+
858
+ # Decide on the exact sorting order.
859
+ sort_results = self._merge_sorts(search_udfs, sort_by, sort_order)
860
+ sort_by = cast(list[PathTuple],
861
+ [(sort.alias,) if sort.alias else sort.path for sort in sort_results])
862
+ # Choose the first sort order as we only support a single sort order for now.
863
+ sort_order = sort_results[0].order if sort_results else None
864
+
865
+ col_aliases: dict[str, PathTuple] = {col.alias: col.path for col in cols if col.alias}
866
+ udf_aliases: dict[str, PathTuple] = {
867
+ col.alias: col.path for col in cols if col.signal_udf and col.alias
868
+ }
869
+ path_to_udf_col_name: dict[PathTuple, str] = {}
870
+ for col in cols:
871
+ if col.signal_udf:
872
+ alias = col.alias or _unique_alias(col)
873
+ dest_path = _col_destination_path(col)
874
+ path_to_udf_col_name[dest_path] = alias
875
+
876
+ # Filtering and searching.
877
+ where_query = ''
878
+ filters, udf_filters = self._normalize_filters(filters, col_aliases, udf_aliases, manifest)
879
+ filter_queries = self._create_where(manifest, filters, searches)
880
+ if filter_queries:
881
+ where_query = f"WHERE {' AND '.join(filter_queries)}"
882
+
883
+ total_num_rows = manifest.num_items
884
+ con = self.con.cursor()
885
+
886
+ topk_udf_col = self._topk_udf_to_sort_by(udf_columns, sort_by, limit, sort_order)
887
+ if topk_udf_col:
888
+ path_keys: Optional[list[PathKey]] = None
889
+ if where_query:
890
+ # If there are filters, we need to send rowids to the top k query.
891
+ df = con.execute(f'SELECT {ROWID} FROM t {where_query}').df()
892
+ total_num_rows = len(df)
893
+ # Convert rowids to path keys.
894
+ path_keys = [(rowid,) for rowid in df[ROWID]]
895
+
896
+ if path_keys is not None and len(path_keys) == 0:
897
+ where_query = 'WHERE false'
898
+ else:
899
+ topk_signal = cast(VectorSignal, topk_udf_col.signal_udf)
900
+ # The input is an embedding.
901
+ vector_index = self._get_vector_db_index(topk_signal.embedding, topk_udf_col.path)
902
+ k = (limit or 0) + offset
903
+ path_id = f'{self.namespace}/{self.dataset_name}:{topk_udf_col.path}'
904
+ with DebugTimer(f'Computing topk on {path_id} with embedding "{topk_signal.embedding}" '
905
+ f'and vector store "{vector_index._vector_store.name}"'):
906
+ topk = topk_signal.vector_compute_topk(k, vector_index, path_keys)
907
+ topk_rowids = list(dict.fromkeys([cast(str, rowid) for (rowid, *_), _ in topk]))
908
+ # Update the offset to account for the number of unique rowids.
909
+ offset = len(dict.fromkeys([cast(str, rowid) for (rowid, *_), _ in topk[:offset]]))
910
+
911
+ # Ignore all the other filters and filter DuckDB results only by the top k rowids.
912
+ rowid_filter = Filter(path=(ROWID,), op='in', value=topk_rowids)
913
+ filter_query = self._create_where(manifest, [rowid_filter])[0]
914
+ where_query = f'WHERE {filter_query}'
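+ # Note on the offset arithmetic above: `topk` is computed over spans, so a single
+ # row can appear multiple times. Deduping with dict.fromkeys keeps one entry per
+ # row, and the row-level offset is recomputed from the first `offset` span hits.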
915
+
916
+ # Map a final column name to the temporary namespaced column names that need to be merged.
917
+ columns_to_merge: dict[str, dict[str, Column]] = {}
918
+ temp_column_to_offset_column: dict[str, tuple[str, Field]] = {}
919
+ select_queries: list[str] = []
920
+
921
+ for column in cols:
922
+ path = column.path
923
+ # If the signal is vector-based, we don't need to select the actual data, just the rowids
924
+ # plus an arbitrarily nested array of `None`s.
925
+ empty = bool(column.signal_udf and schema.get_field(path).dtype == DataType.EMBEDDING)
926
+
927
+ select_sqls: list[str] = []
928
+ final_col_name = column.alias or _unique_alias(column)
929
+ if final_col_name not in columns_to_merge:
930
+ columns_to_merge[final_col_name] = {}
931
+
932
+ duckdb_paths = self._column_to_duckdb_paths(column, schema, combine_columns)
933
+ span_from = self._get_span_from(path, manifest) if resolve_span or column.signal_udf else None
934
+
935
+ for parquet_id, duckdb_path in duckdb_paths:
936
+ sql = _select_sql(
937
+ duckdb_path, flatten=False, unnest=False, empty=empty, span_from=span_from)
938
+ temp_column_name = (
939
+ final_col_name if len(duckdb_paths) == 1 else f'{final_col_name}/{parquet_id}')
940
+ select_sqls.append(f'{sql} AS {_escape_string_literal(temp_column_name)}')
941
+ columns_to_merge[final_col_name][temp_column_name] = column
942
+
943
+ if column.signal_udf and span_from and _schema_has_spans(column.signal_udf.fields()):
944
+ sql = _select_sql(duckdb_path, flatten=False, unnest=False, empty=empty, span_from=None)
945
+ temp_offset_column_name = f'{temp_column_name}/offset'
946
+ temp_offset_column_name = temp_offset_column_name.replace("'", "\\'")
947
+ select_sqls.append(f'{sql} AS {_escape_string_literal(temp_offset_column_name)}')
948
+ temp_column_to_offset_column[temp_column_name] = (temp_offset_column_name,
949
+ column.signal_udf.fields())
950
+
951
+ # `select_sqls` can be empty if this column points to a path that will be created by a UDF.
952
+ if select_sqls:
953
+ select_queries.append(', '.join(select_sqls))
954
+
955
+ sort_sql_before_udf: list[str] = []
956
+ sort_sql_after_udf: list[str] = []
957
+
958
+ for path in sort_by:
959
+ # We only allow sorting by nodes with a value.
960
+ first_subpath = str(path[0])
961
+ rest_of_path = path[1:]
962
+ signal_alias = '.'.join(map(str, path))
963
+
964
+ udf_path = _path_to_udf_duckdb_path(path, path_to_udf_col_name)
965
+ if not udf_path:
966
+ # Re-route the path if it starts with an alias by pointing it to the actual path.
967
+ if first_subpath in col_aliases:
968
+ path = (*col_aliases[first_subpath], *rest_of_path)
969
+ self._validate_sort_path(path, schema)
970
+ path = self._leaf_path_to_duckdb_path(path, schema)
971
+ else:
972
+ path = udf_path
973
+
974
+ sort_sql = _select_sql(path, flatten=True, unnest=False)
975
+ has_repeated_field = any(subpath == PATH_WILDCARD for subpath in path)
976
+ if has_repeated_field:
977
+ sort_sql = (f'list_min({sort_sql})'
978
+ if sort_order == SortOrder.ASC else f'list_max({sort_sql})')
979
+
980
+ # Separate sort columns into two groups: those that need to be sorted before and after UDFs.
981
+ if udf_path:
982
+ sort_sql_after_udf.append(sort_sql)
983
+ else:
984
+ sort_sql_before_udf.append(sort_sql)
985
+
986
+ order_query = ''
987
+ if sort_sql_before_udf:
988
+ order_query = (f'ORDER BY {", ".join(sort_sql_before_udf)} '
989
+ f'{cast(SortOrder, sort_order).value}')
990
+
991
+ limit_query = ''
992
+ if limit:
993
+ if topk_udf_col:
994
+ limit_query = f'LIMIT {limit + offset}'
995
+ elif sort_sql_after_udf:
996
+ limit_query = ''
997
+ else:
998
+ limit_query = f'LIMIT {limit} OFFSET {offset}'
999
+
1000
+ if not topk_udf_col and where_query:
1001
+ total_num_rows = cast(tuple,
1002
+ con.execute(f'SELECT COUNT(*) FROM t {where_query}').fetchone())[0]
1003
+
1004
+ # Fetch the data from DuckDB.
1005
+ df = con.execute(f"""
1006
+ SELECT {', '.join(select_queries)} FROM t
1007
+ {where_query}
1008
+ {order_query}
1009
+ {limit_query}
1010
+ """).df()
1011
+ df = _replace_nan_with_none(df)
1012
+
1013
+ # Run UDFs on the transformed columns.
1014
+ for udf_col in udf_columns:
1015
+ signal = cast(Signal, udf_col.signal_udf)
1016
+ signal_alias = udf_col.alias or _unique_alias(udf_col)
1017
+ temp_signal_cols = columns_to_merge[signal_alias]
1018
+ if len(temp_signal_cols) != 1:
1019
+ raise ValueError(
1020
+ f'Unable to compute signal {signal.name}. Signal UDFs only operate on leafs, but got '
1021
+ f'{len(temp_signal_cols)} underlying columns that contain data related to {udf_col.path}.'
1022
+ )
1023
+ signal_column = list(temp_signal_cols.keys())[0]
1024
+ input = df[signal_column]
1025
+
1026
+ path_id = f'{self.namespace}/{self.dataset_name}:{udf_col.path}'
1027
+ with DebugTimer(f'Computing signal "{signal.name}" on {path_id}'):
1028
+ signal.setup()
1029
+
1030
+ step_description = f'Computing {signal.key()} on {path_id}'
1031
+
1032
+ if isinstance(signal, VectorSignal):
1033
+ embedding_signal = signal
1034
+ vector_store = self._get_vector_db_index(embedding_signal.embedding, udf_col.path)
1035
+ flat_keys = list(flatten_keys(df[ROWID], input))
1036
+ signal_out = sparse_to_dense_compute(
1037
+ iter(flat_keys), lambda keys: embedding_signal.vector_compute(keys, vector_store))
1038
+ # Add progress.
1039
+ if task_step_id is not None:
1040
+ signal_out = progress(
1041
+ signal_out,
1042
+ task_step_id=task_step_id,
1043
+ estimated_len=len(flat_keys),
1044
+ step_description=step_description)
1045
+ df[signal_column] = deep_unflatten(signal_out, input)
1046
+ else:
1047
+ num_rich_data = count_primitives(input)
1048
+ flat_input = cast(Iterator[Optional[RichData]], deep_flatten(input))
1049
+ signal_out = sparse_to_dense_compute(
1050
+ flat_input, lambda x: signal.compute(cast(Iterable[RichData], x)))
1051
+ # Add progress.
1052
+ if task_step_id is not None:
1053
+ signal_out = progress(
1054
+ signal_out,
1055
+ task_step_id=task_step_id,
1056
+ estimated_len=num_rich_data,
1057
+ step_description=step_description)
1058
+ signal_out_list = list(signal_out)
1059
+ if signal_column in temp_column_to_offset_column:
1060
+ offset_column_name, field = temp_column_to_offset_column[signal_column]
1061
+ nested_spans: Iterable[Item] = df[offset_column_name]
1062
+ flat_spans = deep_flatten(nested_spans)
1063
+ for span, item in zip(flat_spans, signal_out_list):
1064
+ _offset_any_span(cast(int, span[VALUE_KEY][TEXT_SPAN_START_FEATURE]), item, field)
1065
+
1066
+ if len(signal_out_list) != num_rich_data:
1067
+ raise ValueError(
1068
+ f'The signal generated {len(signal_out_list)} values but the input data had '
1069
+ f"{num_rich_data} values. This means the signal either didn't generate a "
1070
+ '"None" for a sparse output, or generated too many items.')
1071
+
1072
+ df[signal_column] = deep_unflatten(signal_out_list, input)
1073
+
1074
+ signal.teardown()
1075
+
1076
+ if not df.empty and (udf_filters or sort_sql_after_udf):
1077
+ # Re-upload the udf outputs to duckdb so we can filter/sort on them.
1078
+ rel = con.from_df(df)
1079
+
1080
+ if udf_filters:
1081
+ udf_filter_queries = self._create_where(manifest, udf_filters)
1082
+ if udf_filter_queries:
1083
+ rel = rel.filter(' AND '.join(udf_filter_queries))
1084
+ total_num_rows = cast(tuple, rel.count('*').fetchone())[0]
1085
+
1086
+ if sort_sql_after_udf:
1087
+ if not sort_order:
1088
+ raise ValueError('`sort_order` is required when `sort_by` is specified.')
1089
+ rel = rel.order(f'{", ".join(sort_sql_after_udf)} {sort_order.value}')
1090
+
1091
+ if limit:
1092
+ rel = rel.limit(limit, offset)
1093
+
1094
+ df = _replace_nan_with_none(rel.df())
1095
+
1096
+ if temp_rowid_selected:
1097
+ del df[ROWID]
1098
+ del columns_to_merge[ROWID]
1099
+
1100
+ if combine_columns:
1101
+ all_columns: dict[str, Column] = {}
1102
+ for col_dict in columns_to_merge.values():
1103
+ all_columns.update(col_dict)
1104
+ columns_to_merge = {'*': all_columns}
1105
+
1106
+ for offset_column, _ in temp_column_to_offset_column.values():
1107
+ del df[offset_column]
1108
+
1109
+ for final_col_name, temp_columns in columns_to_merge.items():
1110
+ for temp_col_name, column in temp_columns.items():
1111
+ if combine_columns:
1112
+ dest_path = _col_destination_path(column)
1113
+ spec = _split_path_into_subpaths_of_lists(dest_path)
1114
+ df[temp_col_name] = wrap_in_dicts(df[temp_col_name], spec)
1115
+
1116
+ # If the temp col name is the same as the final name, we can skip merging. This happens when
1117
+ # we select a source leaf column.
1118
+ if temp_col_name == final_col_name:
1119
+ continue
1120
+
1121
+ if final_col_name not in df:
1122
+ df[final_col_name] = df[temp_col_name]
1123
+ else:
1124
+ df[final_col_name] = merge_series(df[final_col_name], df[temp_col_name])
1125
+ del df[temp_col_name]
1126
+
1127
+ con.close()
1128
+
1129
+ if combine_columns:
1130
+ # Since we aliased every column to `*`, the object will have only '*' as the key. We need to
1131
+ # elevate all the columns under '*'.
1132
+ df = pd.DataFrame.from_records(df['*'])
1133
+
1134
+ return SelectRowsResult(df, total_num_rows)
1135
+
1136
+ @override
1137
+ def select_rows_schema(self,
1138
+ columns: Optional[Sequence[ColumnId]] = None,
1139
+ sort_by: Optional[Sequence[Path]] = None,
1140
+ sort_order: Optional[SortOrder] = None,
1141
+ searches: Optional[Sequence[Search]] = None,
1142
+ combine_columns: bool = False) -> SelectRowsSchemaResult:
1143
+ """Returns the schema of the result of `select_rows` above with the same arguments."""
1144
+ if not combine_columns:
1145
+ raise NotImplementedError(
1146
+ 'select_rows_schema with combine_columns=False is not yet supported.')
1147
+ manifest = self.manifest()
1148
+ cols = self._normalize_columns(columns, manifest.data_schema, combine_columns)
1149
+
1150
+ self._normalize_searches(searches, manifest)
1151
+ search_udfs = self._search_udfs(searches, manifest)
1152
+ cols.extend([search_udf.udf for search_udf in search_udfs])
1153
+
1154
+ udfs: list[SelectRowsSchemaUDF] = []
1155
+ col_schemas: list[Schema] = []
1156
+ for col in cols:
1157
+ dest_path = _col_destination_path(col)
1158
+ if col.signal_udf:
1159
+ udfs.append(SelectRowsSchemaUDF(path=dest_path, alias=col.alias))
1160
+ field = col.signal_udf.fields()
1161
+ field.signal = col.signal_udf.dict()
1162
+ elif manifest.data_schema.has_field(dest_path):
1163
+ field = manifest.data_schema.get_field(dest_path)
1164
+ else:
1165
+ # This column might refer to the output of a UDF. We postpone validation until later.
1166
+ continue
1167
+ col_schemas.append(_make_schema_from_path(dest_path, field))
1168
+
1169
+ sort_results = self._merge_sorts(search_udfs, sort_by, sort_order)
1170
+
1171
+ search_results = [
1172
+ SearchResultInfo(search_path=search_udf.search_path, result_path=search_udf.output_path)
1173
+ for search_udf in search_udfs
1174
+ ]
1175
+
1176
+ new_schema = merge_schemas(col_schemas)
1177
+
1178
+ # Now that we have the new schema, we can validate all the column selections.
1179
+ self._validate_columns(cols, manifest.data_schema, new_schema)
1180
+
1181
+ return SelectRowsSchemaResult(
1182
+ data_schema=new_schema, udfs=udfs, search_results=search_results, sorts=sort_results or None)
1183
+
1184
+ @override
1185
+ def media(self, item_id: str, leaf_path: Path) -> MediaResult:
1186
+ raise NotImplementedError('Media is not yet supported for the DuckDB implementation.')
1187
+
1188
+ def _get_span_from(self, path: PathTuple, manifest: DatasetManifest) -> Optional[PathTuple]:
1189
+ leafs = manifest.data_schema.leafs
1190
+ # Remove the value key so we can check the dtype from leafs.
1191
+ span_path = path[:-1] if path[-1] == VALUE_KEY else path
1192
+ is_span = (span_path in leafs and leafs[span_path].dtype == DataType.STRING_SPAN)
1193
+ return _derived_from_path(path, manifest.data_schema) if is_span else None
1194
+
1195
+ def _leaf_path_to_duckdb_path(self, leaf_path: PathTuple, schema: Schema) -> PathTuple:
1196
+ ((_, duckdb_path),) = self._column_to_duckdb_paths(
1197
+ Column(leaf_path), schema, combine_columns=False, select_leaf=True)
1198
+ return duckdb_path
1199
+
1200
+ def _column_to_duckdb_paths(self,
1201
+ column: Column,
1202
+ schema: Schema,
1203
+ combine_columns: bool,
1204
+ select_leaf: bool = False) -> list[tuple[str, PathTuple]]:
1205
+ path = column.path
1206
+ parquet_manifests: list[Union[SourceManifest, SignalManifest]] = [
1207
+ self._source_manifest, *self._signal_manifests
1208
+ ]
1209
+ duckdb_paths: list[tuple[str, PathTuple]] = []
1210
+ source_has_path = False
1211
+
1212
+ select_leaf = select_leaf or column.signal_udf is not None
1213
+
1214
+ if path == (ROWID,):
1215
+ return [('source', path)]
1216
+
1217
+ for m in parquet_manifests:
1218
+ if not m.files:
1219
+ continue
1220
+ # Skip this parquet file if it doesn't contain the path.
1221
+ if not schema_contains_path(m.data_schema, path):
1222
+ continue
1223
+
1224
+ if isinstance(m, SourceManifest):
1225
+ source_has_path = True
1226
+
1227
+ if isinstance(m, SignalManifest) and source_has_path and not combine_columns:
1228
+ # Skip this signal if the source already has the path and we are not combining columns.
1229
+ continue
1230
+
1231
+ # Skip this parquet file if the path doesn't have a dtype.
1232
+ if select_leaf and not m.data_schema.get_field(path).dtype:
1233
+ continue
1234
+
1235
+ duckdb_path = path
1236
+ parquet_id = 'source'
1237
+
1238
+ if isinstance(m, SignalManifest):
1239
+ duckdb_path = (m.parquet_id, *path[1:])
1240
+ parquet_id = m.parquet_id
1241
+
1242
+ duckdb_paths.append((parquet_id, duckdb_path))
1243
+
1244
+ if not duckdb_paths:
1245
+ # This path is probably the result of a UDF. Make sure the result schema contains it.
1246
+ if not schema.has_field(path):
1247
+ raise ValueError(f'Invalid path "{path}": No manifest contains path. Valid paths: '
1248
+ f'{list(schema.leafs.keys())}')
1249
+
1250
+ return duckdb_paths
1251
+
1252
+ def _normalize_filters(self, filter_likes: Optional[Sequence[FilterLike]],
1253
+ col_aliases: dict[str, PathTuple], udf_aliases: dict[str, PathTuple],
1254
+ manifest: DatasetManifest) -> tuple[list[Filter], list[Filter]]:
1255
+ """Normalize `FilterLike` to `Filter` and split into filters on source and filters on UDFs."""
1256
+ filter_likes = filter_likes or []
1257
+ filters: list[Filter] = []
1258
+ udf_filters: list[Filter] = []
1259
+
1260
+ for filter in filter_likes:
1261
+ # Normalize `FilterLike` to `Filter`.
1262
+ if not isinstance(filter, Filter):
1263
+ if len(filter) == 3:
1264
+ path, op, value = filter # type: ignore
1265
+ elif len(filter) == 2:
1266
+ path, op = filter # type: ignore
1267
+ value = None
1268
+ else:
1269
+ raise ValueError(f'Invalid filter: {filter}. Must be a tuple with 2 or 3 elements.')
1270
+ filter = Filter(path=normalize_path(path), op=op, value=value)
1271
+
1272
+ if str(filter.path[0]) in udf_aliases:
1273
+ udf_filters.append(filter)
1274
+ else:
1275
+ filters.append(filter)
1276
+
1277
+ self._validate_filters(filters, col_aliases, manifest)
1278
+ return filters, udf_filters
1279
+
1280
+ def _normalize_searches(self, searches: Optional[Sequence[Search]],
1281
+ manifest: DatasetManifest) -> None:
1282
+ """Validate searches."""
1283
+ if not searches:
1284
+ return
1285
+
1286
+ for search in searches:
1287
+ search.path = normalize_path(search.path)
1288
+ field = manifest.data_schema.get_field(search.path)
1289
+ if field.dtype != DataType.STRING:
1290
+ raise ValueError(f'Invalid search path: {search.path}. '
1291
+ f'Must be a string field, got dtype {field.dtype}')
1292
+
1293
+ def _search_udfs(self, searches: Optional[Sequence[Search]],
1294
+ manifest: DatasetManifest) -> list[DuckDBSearchUDF]:
1295
+ """Create a UDF for each search that finds the location of the matching text spans."""
1296
+ """Create a UDF for each search for finding the location of the text with spans."""
1297
+ search_udfs: list[DuckDBSearchUDF] = []
1298
+ for search in searches:
1299
+ search_path = normalize_path(search.path)
1300
+ if search.type == 'keyword':
1301
+ udf = Column(path=search_path, signal_udf=SubstringSignal(query=search.query))
1302
+ search_udfs.append(
1303
+ DuckDBSearchUDF(
1304
+ udf=udf,
1305
+ search_path=search_path,
1306
+ output_path=(*_col_destination_path(udf), PATH_WILDCARD)))
1307
+ elif search.type == 'semantic' or search.type == 'concept':
1308
+ embedding = search.embedding
1309
+ if not embedding:
1310
+ raise ValueError(f'Please provide an embedding for semantic search. Got search: {search}')
1311
+
1312
+ try:
1313
+ manifest.data_schema.get_field((*search_path, embedding))
1314
+ except Exception as e:
1315
+ raise ValueError(
1316
+ f'Embedding {embedding} has not been computed. '
1317
+ f'Please compute the embedding index before issuing a {search.type} query.') from e
1318
+
1319
+ search_signal: Optional[Signal] = None
1320
+ if search.type == 'semantic':
1321
+ search_signal = SemanticSimilaritySignal(query=search.query, embedding=search.embedding)
1322
+ elif search.type == 'concept':
1323
+ search_signal = ConceptSignal(
1324
+ namespace=search.concept_namespace,
1325
+ concept_name=search.concept_name,
1326
+ embedding=search.embedding)
1327
+
1328
+ # Add the label UDF.
1329
+ concept_labels_signal = ConceptLabelsSignal(
1330
+ namespace=search.concept_namespace, concept_name=search.concept_name)
1331
+ concept_labels_udf = Column(path=search_path, signal_udf=concept_labels_signal)
1332
+ search_udfs.append(
1333
+ DuckDBSearchUDF(
1334
+ udf=concept_labels_udf,
1335
+ search_path=search_path,
1336
+ output_path=_col_destination_path(concept_labels_udf),
1337
+ sort=None))
1338
+
1339
+ udf = Column(path=search_path, signal_udf=search_signal)
1340
+
1341
+ output_path = _col_destination_path(udf)
1342
+ search_udfs.append(
1343
+ DuckDBSearchUDF(
1344
+ udf=udf,
1345
+ search_path=search_path,
1346
+ output_path=_col_destination_path(udf),
1347
+ sort=((*output_path, PATH_WILDCARD, 'score'), SortOrder.DESC)))
1348
+ else:
1349
+ raise ValueError(f'Unknown search operator {search.type}.')
1350
+
1351
+ return search_udfs
1352
+
1353
+ def _create_where(self,
1354
+ manifest: DatasetManifest,
1355
+ filters: list[Filter],
1356
+ searches: Optional[Sequence[Search]] = []) -> list[str]:
1357
+ if not filters and not searches:
1358
+ return []
1359
+ searches = searches or []
1360
+ sql_filter_queries: list[str] = []
1361
+
1362
+ # Add search where queries.
1363
+ for search in searches:
1364
+ duckdb_path = self._leaf_path_to_duckdb_path(
1365
+ normalize_path(search.path), manifest.data_schema)
1366
+ select_str = _select_sql(duckdb_path, flatten=False, unnest=False)
1367
+ if search.type == 'keyword':
1368
+ sql_op = 'ILIKE'
1369
+ query_val = _escape_like_value(search.query)
1370
+ elif search.type == 'semantic' or search.type == 'concept':
1371
+ # Semantic search and concepts don't yet filter.
1372
+ continue
1373
+ else:
1374
+ raise ValueError(f'Unknown search operator {search.type}.')
1375
+
1376
+ filter_query = f'{select_str} {sql_op} {query_val}'
1377
+
1378
+ sql_filter_queries.append(filter_query)
1379
+
1380
+ # Add filter where queries.
1381
+ for f in filters:
1382
+ duckdb_path = self._leaf_path_to_duckdb_path(f.path, manifest.data_schema)
1383
+ select_str = _select_sql(
1384
+ duckdb_path, flatten=True, unnest=False, span_from=self._get_span_from(f.path, manifest))
1385
+ is_array = any(subpath == PATH_WILDCARD for subpath in f.path)
1386
+
1387
+ nan_filter = ''
1388
+ field = manifest.data_schema.get_field(f.path)
1389
+ filter_nans = field.dtype and is_float(field.dtype)
1390
+
1391
+ if f.op in BINARY_OPS:
1392
+ sql_op = BINARY_OP_TO_SQL[cast(BinaryOp, f.op)]
1393
+ filter_val = cast(FeatureValue, f.value)
1394
+ if isinstance(filter_val, str):
1395
+ filter_val = _escape_string_literal(filter_val)
1396
+ elif isinstance(filter_val, bytes):
1397
+ filter_val = _bytes_to_blob_literal(filter_val)
1398
+ else:
1399
+ filter_val = str(filter_val)
1400
+ if is_array:
1401
+ nan_filter = 'NOT isnan(x) AND' if filter_nans else ''
1402
+ filter_query = (f'len(list_filter({select_str}, '
1403
+ f'x -> {nan_filter} x {sql_op} {filter_val})) > 0')
1404
+ else:
1405
+ nan_filter = f'NOT isnan({select_str}) AND' if filter_nans else ''
1406
+ filter_query = f'{nan_filter} {select_str} {sql_op} {filter_val}'
1407
+ elif f.op in UNARY_OPS:
1408
+ if f.op == 'exists':
1409
+ filter_query = f'len({select_str}) > 0' if is_array else f'{select_str} IS NOT NULL'
1410
+ else:
1411
+ raise ValueError(f'Unary op: {f.op} is not yet supported')
1412
+ elif f.op in LIST_OPS:
1413
+ if f.op == 'in':
1414
+ filter_list_val = cast(FeatureListValue, f.value)
1415
+ if not isinstance(filter_list_val, list):
1416
+ raise ValueError('filter with array value can only use the IN comparison')
1417
+ wrapped_filter_val = [f"'{part}'" for part in filter_list_val]
1418
+ filter_val = f'({", ".join(wrapped_filter_val)})'
1419
+ filter_query = f'{select_str} IN {filter_val}'
1420
+ else:
1421
+ raise ValueError(f'List op: {f.op} is not yet supported')
1422
+ else:
1423
+ raise ValueError(f'Invalid filter op: {f.op}')
1424
+ sql_filter_queries.append(filter_query)
1425
+ return sql_filter_queries
1426
+
1427
+ def _execute(self, query: str) -> duckdb.DuckDBPyConnection:
1428
+ """Execute a query in duckdb."""
1429
+ # FastAPI is multi-threaded so we have to create a thread-specific connection cursor to allow
1430
+ # these queries to be thread-safe.
1431
+ local_con = self.con.cursor()
1432
+ if not env('DEBUG', False):
1433
+ return local_con.execute(query)
1434
+
1435
+ # Debug mode.
1436
+ log('Executing:')
1437
+ log(query)
1438
+ with DebugTimer('Query'):
1439
+ return local_con.execute(query)
1440
+
1441
+ def _query(self, query: str) -> list[tuple]:
1442
+ result = self._execute(query)
1443
+ rows = result.fetchall()
1444
+ result.close()
1445
+ return rows
1446
+
1447
+ def _query_df(self, query: str) -> pd.DataFrame:
1448
+ """Execute a query that returns a data frame."""
1449
+ result = self._execute(query)
1450
+ df = _replace_nan_with_none(result.df())
1451
+ result.close()
1452
+ return df
1453
+
1454
+ def _path_to_col(self, path: Path, quote_each_part: bool = True) -> str:
1455
+ """Convert a path to a column name."""
1456
+ if isinstance(path, str):
1457
+ path = (path,)
1458
+ return '.'.join([
1459
+ f'{_escape_col_name(path_comp)}' if quote_each_part else str(path_comp) for path_comp in path
1460
+ ])
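+ # e.g. _path_to_col(('a', 'b')) returns '"a"."b"'; with quote_each_part=False it
+ # returns 'a.b'.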
1461
+
1462
+ def _get_selection(self, columns: Optional[Sequence[ColumnId]] = None) -> str:
1463
+ """Get the selection clause for download a dataset."""
1464
+ manifest = self.manifest()
1465
+ cols = self._normalize_columns(columns, manifest.data_schema, combine_columns=False)
1466
+ schema = manifest.data_schema
1467
+ self._validate_columns(cols, manifest.data_schema, schema)
1468
+
1469
+ select_queries: list[str] = []
1470
+ for column in cols:
1471
+ col_name = column.alias or _unique_alias(column)
1472
+ duckdb_paths = self._column_to_duckdb_paths(column, schema, combine_columns=False)
1473
+ if not duckdb_paths:
1474
+ raise ValueError(f'Cannot download path {column.path} which does not exist in the dataset.')
1475
+ if len(duckdb_paths) > 1:
1476
+ raise ValueError(
1477
+ f'Cannot download path {column.path} which spans multiple parquet files: {duckdb_paths}')
1478
+ _, duckdb_path = duckdb_paths[0]
1479
+ sql = _select_sql(duckdb_path, flatten=False, unnest=False)
1480
+ select_queries.append(f'{sql} AS {_escape_string_literal(col_name)}')
1481
+ return ', '.join(select_queries)
1482
+
1483
+ @override
1484
+ def to_json(self,
1485
+ filepath: Union[str, pathlib.Path],
1486
+ jsonl: bool = True,
1487
+ columns: Optional[Sequence[ColumnId]] = None) -> None:
1488
+ selection = self._get_selection(columns)
1489
+ self._execute(f"COPY (SELECT {selection} FROM t) TO '{filepath}' "
1490
+ f"(FORMAT JSON, ARRAY {'FALSE' if jsonl else 'TRUE'})")
1491
+ log(f'Dataset exported to {filepath}')
1492
+
1493
+ @override
1494
+ def to_pandas(self, columns: Optional[Sequence[ColumnId]] = None) -> pd.DataFrame:
1495
+ selection = self._get_selection(columns)
1496
+ return self._query_df(f'SELECT {selection} FROM t')
1497
+
1498
+ @override
1499
+ def to_csv(self,
1500
+ filepath: Union[str, pathlib.Path],
1501
+ columns: Optional[Sequence[ColumnId]] = None) -> None:
1502
+ selection = self._get_selection(columns)
1503
+ self._execute(f"COPY (SELECT {selection} FROM t) TO '{filepath}' (FORMAT CSV, HEADER)")
1504
+ log(f'Dataset exported to {filepath}')
1505
+
1506
+ @override
1507
+ def to_parquet(self,
1508
+ filepath: Union[str, pathlib.Path],
1509
+ columns: Optional[Sequence[ColumnId]] = None) -> None:
1510
+ selection = self._get_selection(columns)
1511
+ self._execute(f"COPY (SELECT {selection} FROM t) TO '{filepath}' (FORMAT PARQUET)")
1512
+ log(f'Dataset exported to {filepath}')
1513
+
1514
+
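+ # A minimal usage sketch (namespace and dataset names are hypothetical; assumes the
+ # dataset was previously ingested under the configured data path):
+ #   ds = DatasetDuckDB('local', 'movies')
+ #   rows = ds.select_rows(['title'], filters=[('rating', 'greater', 4.5)], limit=10)
+ #   ds.to_parquet('top_movies.parquet', columns=['title'])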
1515
+ def _escape_string_literal(string: str) -> str:
1516
+ string = string.replace("'", "''")
1517
+ return f"'{string}'"
1518
+
1519
+
1520
+ def _escape_col_name(col_name: str) -> str:
1521
+ col_name = col_name.replace('"', '""')
1522
+ return f'"{col_name}"'
1523
+
1524
+
1525
+ def _escape_like_value(value: str) -> str:
1526
+ value = value.replace('%', '\\%').replace('_', '\\_')
1527
+ return f"'%{value}%' ESCAPE '\\'"
1528
+
1529
+
1530
+ def _inner_select(sub_paths: list[PathTuple],
1531
+ inner_var: Optional[str] = None,
1532
+ empty: bool = False,
1533
+ span_from: Optional[PathTuple] = None) -> str:
1534
+ """Recursively generate the inner select statement for a list of sub paths."""
1535
+ current_sub_path = sub_paths[0]
1536
+ lambda_var = inner_var + 'x' if inner_var else 'x'
1537
+ if not inner_var:
1538
+ lambda_var = 'x'
1539
+ inner_var = _escape_col_name(current_sub_path[0])
1540
+ current_sub_path = current_sub_path[1:]
1541
+ # Select the path inside structs. E.g. x['a']['b']['c'] given current_sub_path = [a, b, c].
1542
+ path_key = inner_var + ''.join([f'[{_escape_string_literal(p)}]' for p in current_sub_path])
1543
+ if len(sub_paths) == 1:
1544
+ if span_from:
1545
+ derived_col = _select_sql(span_from, flatten=False, unnest=False)
1546
+ path_key = (f'{derived_col}[{path_key}.{VALUE_KEY}.{TEXT_SPAN_START_FEATURE}+1:'
1547
+ f'{path_key}.{VALUE_KEY}.{TEXT_SPAN_END_FEATURE}]')
1548
+ return 'NULL' if empty else path_key
1549
+ return (f'list_transform({path_key}, {lambda_var} -> '
1550
+ f'{_inner_select(sub_paths[1:], lambda_var, empty, span_from)})')
1551
+
1552
+
1553
+ def _split_path_into_subpaths_of_lists(leaf_path: PathTuple) -> list[PathTuple]:
1554
+ """Split a path into a subpath of lists.
1555
+
1556
+ E.g. [a, b, c, *, d, *, *] gets splits [[a, b, c], [d], [], []].
1557
+ """
1558
+ sub_paths: list[PathTuple] = []
1559
+ offset = 0
1560
+ while offset <= len(leaf_path):
1561
+ new_offset = leaf_path.index(PATH_WILDCARD,
1562
+ offset) if PATH_WILDCARD in leaf_path[offset:] else len(leaf_path)
1563
+ sub_path = leaf_path[offset:new_offset]
1564
+ sub_paths.append(sub_path)
1565
+ offset = new_offset + 1
1566
+ return sub_paths
1567
+
1568
+
1569
+ def _select_sql(path: PathTuple,
1570
+ flatten: bool,
1571
+ unnest: bool,
1572
+ empty: bool = False,
1573
+ span_from: Optional[PathTuple] = None) -> str:
1574
+ """Create a select column for a path.
1575
+
1576
+ Args:
1577
+ path: A path to a feature. E.g. ['a', 'b', 'c'].
1578
+ flatten: Whether to flatten the result.
1579
+ unnest: Whether to unnest the result.
1580
+ empty: Whether to return an empty list (used for embedding signals that don't need the data).
1581
+ span_from: The path this span is derived from. If specified, the span will be resolved
1582
+ to a substring of the original string.
1583
+ """
1584
+ sub_paths = _split_path_into_subpaths_of_lists(path)
1585
+ selection = _inner_select(sub_paths, None, empty, span_from)
1586
+ # We only flatten when the result of a nested list to avoid segfault.
1587
+ is_result_nested_list = len(sub_paths) >= 3 # E.g. subPaths = [[a, b, c], *, *].
1588
+ if flatten and is_result_nested_list:
1589
+ selection = f'flatten({selection})'
1590
+ # We only unnest when the result is a list. // E.g. subPaths = [[a, b, c], *].
1591
+ is_result_a_list = len(sub_paths) >= 2
1592
+ if unnest and is_result_a_list:
1593
+ selection = f'unnest({selection})'
1594
+ return selection
1595
+
1596
+
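
The wildcard splitting above can be sanity-checked in isolation. A minimal
sketch, assuming PATH_WILDCARD is the '*' sentinel used in the docstrings:

PATH_WILDCARD = '*'

def split_path(leaf_path: tuple) -> list:
  # Mirrors _split_path_into_subpaths_of_lists above.
  sub_paths, offset = [], 0
  while offset <= len(leaf_path):
    new_offset = (leaf_path.index(PATH_WILDCARD, offset)
                  if PATH_WILDCARD in leaf_path[offset:] else len(leaf_path))
    sub_paths.append(tuple(leaf_path[offset:new_offset]))
    offset = new_offset + 1
  return sub_paths

assert split_path(('a', 'b', 'c', '*', 'd', '*', '*')) == [('a', 'b', 'c'), ('d',), (), ()]
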
+def read_source_manifest(dataset_path: str) -> SourceManifest:
+  """Read the manifest file."""
+  with open_file(os.path.join(dataset_path, MANIFEST_FILENAME), 'r') as f:
+    return SourceManifest.parse_raw(f.read())
+
+
+def _signal_dir(enriched_path: PathTuple) -> str:
+  """Get the filename prefix for a signal parquet file."""
+  path_without_wildcards = (p for p in enriched_path if p != PATH_WILDCARD)
+  return os.path.join(*path_without_wildcards)
+
+
+def split_column_name(column: str, split_name: str) -> str:
+  """Get the name of a split column."""
+  return f'{column}.{split_name}'
+
+
+def split_parquet_prefix(column_name: str, splitter_name: str) -> str:
+  """Get the filename prefix for a split parquet file."""
+  return f'{column_name}.{splitter_name}'
+
+
+def _bytes_to_blob_literal(bytes: bytes) -> str:
+  """Convert bytes to a blob literal."""
+  escaped_hex = re.sub(r'(.{2})', r'\\x\1', bytes.hex())
+  return f"'{escaped_hex}'::BLOB"
+
+
+class SignalManifest(BaseModel):
+  """The manifest that describes a signal computation including schema and parquet files."""
+  # List of parquet filepaths storing the data. The paths are relative to the manifest.
+  files: list[str]
+
+  # An identifier for this parquet table. Will be used as the view name in SQL.
+  parquet_id: str
+
+  data_schema: Schema
+  signal: Signal
+
+  # The column path that this signal is derived from.
+  enriched_path: PathTuple
+
+  # The name of the vector store. Present when the signal is an embedding.
+  vector_store: Optional[str] = None
+
+  @validator('signal', pre=True)
+  def parse_signal(cls, signal: dict) -> Signal:
+    """Parse a signal to its specific subclass instance."""
+    return resolve_signal(signal)
+
+
+def _merge_cells(dest_cell: Item, source_cell: Item) -> Item:
+  if source_cell is None or (isinstance(source_cell, float) and math.isnan(source_cell)):
+    # Nothing to merge here (missing value).
+    return dest_cell
+  if isinstance(dest_cell, dict):
+    if isinstance(source_cell, list):
+      raise ValueError(f'Failed to merge cells. Destination is a dict ({dest_cell!r}), '
+                       f'but source is a list ({source_cell!r}).')
+    if isinstance(source_cell, dict):
+      res = {**dest_cell}
+      for key, value in source_cell.items():
+        res[key] = (value if key not in dest_cell else _merge_cells(dest_cell[key], value))
+      return res
+    else:
+      return {VALUE_KEY: source_cell, **dest_cell}
+  elif isinstance(dest_cell, list):
+    if not isinstance(source_cell, list):
+      raise ValueError('Failed to merge cells. Destination is a list, but source is not.')
+    return [
+      _merge_cells(dest_subcell, source_subcell)
+      for dest_subcell, source_subcell in zip(dest_cell, source_cell)
+    ]
+  else:
+    # The destination is a primitive.
+    if isinstance(source_cell, list):
+      raise ValueError(f'Failed to merge cells. Destination is a primitive ({dest_cell!r}), '
+                       f'but source is a list ({source_cell!r}).')
+    if isinstance(source_cell, dict):
+      return {VALUE_KEY: dest_cell, **source_cell}
+    else:
+      # Primitives can be merged together if they are equal. This can happen if a user selects a
+      # column that is the child of another.
+      # NOTE: This can be removed if we fix https://github.com/lilacai/lilac/issues/166.
+      if source_cell != dest_cell:
+        raise ValueError(f'Cannot merge source "{source_cell!r}" into destination "{dest_cell!r}"')
+      return dest_cell
+
+
+def merge_series(destination: pd.Series, source: pd.Series) -> list[Item]:
+  """Merge two series of values recursively."""
+  return _merge_cells(destination.tolist(), source.tolist())
+
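
A small worked example of the merge semantics above (shown as comments since
_merge_cells is private; VALUE_KEY stands for the real constant in lilac.schema):

# Merging a dict enrichment into a primitive destination wraps the primitive
# under VALUE_KEY and keeps the enrichment's keys alongside it:
#   _merge_cells('hello', {'pii': {'emails': []}})
#   == {VALUE_KEY: 'hello', 'pii': {'emails': []}}
# Lists merge elementwise, and equal primitives merge to themselves.
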
+
+def _unique_alias(column: Column) -> str:
+  """Get a unique alias for a selection column."""
+  if column.signal_udf:
+    return make_parquet_id(column.signal_udf, column.path)
+  return '.'.join(map(str, column.path))
+
+
+def _path_contains(parent_path: PathTuple, child_path: PathTuple) -> bool:
+  """Check if a path contains another path."""
+  if len(parent_path) > len(child_path):
+    return False
+  return all(parent_path[i] == child_path[i] for i in range(len(parent_path)))
+
+
+def _path_to_udf_duckdb_path(path: PathTuple,
+                             path_to_udf_col_name: dict[PathTuple, str]) -> Optional[PathTuple]:
+  first_subpath, *rest_of_path = path
+  for parent_path, udf_col_name in path_to_udf_col_name.items():
+    # If the user selected udf(document.*.text) as "udf" and wanted to sort by "udf.len", we need
+    # to sort by "udf.*.len" where the "*" came from the fact that the udf was applied to a list
+    # of "text" fields.
+    wildcards = [x for x in parent_path if x == PATH_WILDCARD]
+    if _path_contains(parent_path, path):
+      return (udf_col_name, *wildcards, *path[len(parent_path):])
+    elif first_subpath == udf_col_name:
+      return (udf_col_name, *wildcards, *rest_of_path)
+
+  return None
+
+
+def _col_destination_path(column: Column, is_computed_signal: Optional[bool] = False) -> PathTuple:
+  """Get the destination path where the output of this selection column will be stored."""
+  source_path = column.path
+
+  if not column.signal_udf:
+    return source_path
+
+  signal_key = column.signal_udf.key(is_computed_signal=is_computed_signal)
+  # If we are enriching a value we should store the signal data in the value's parent.
+  if source_path[-1] == VALUE_KEY:
+    dest_path = (*source_path[:-1], signal_key)
+  else:
+    dest_path = (*source_path, signal_key)
+
+  return dest_path
+
+
+def _root_column(manifest: SignalManifest) -> str:
+  """Returns the root column of a signal manifest."""
+  field_keys = list(manifest.data_schema.fields.keys())
+  if len(field_keys) > 2:
+    raise ValueError('Expected at most two fields in signal manifest, '
+                     f'the rowid and the root this signal is enriching. Got {field_keys}.')
+  return next(filter(lambda field: field != ROWID, manifest.data_schema.fields.keys()))
+
+
+def _derived_from_path(path: PathTuple, schema: Schema) -> PathTuple:
+  # Find the closest parent of `path` that is a signal root.
+  for i in reversed(range(len(path))):
+    sub_path = path[:i]
+    if schema.get_field(sub_path).signal is not None:
+      # Skip the signal name at the end to get the source path that was enriched.
+      return sub_path[:-1]
+  raise ValueError(f'Cannot find the source path for the enriched path: {path}')
+
+
+def _make_schema_from_path(path: PathTuple, field: Field) -> Schema:
+  """Returns a schema that contains only the given path."""
+  for sub_path in reversed(path):
+    if sub_path == PATH_WILDCARD:
+      field = Field(repeated_field=field)
+    else:
+      field = Field(fields={sub_path: field})
+  if not field.fields:
+    raise ValueError(f'Invalid path: {path}. Must contain at least one field name.')
+  return Schema(fields=field.fields)
+
+
+def _replace_nan_with_none(df: pd.DataFrame) -> pd.DataFrame:
+  """DuckDB returns np.nan for missing fields in string columns; replace with None for correctness."""
+  # TODO(https://github.com/duckdb/duckdb/issues/4066): Remove this once duckdb fixes upstream.
+  for col in df.columns:
+    if is_object_dtype(df[col]):
+      df[col].replace(np.nan, None, inplace=True)
+  return df
+
+
+def _offset_any_span(offset: int, item: Item, schema: Field) -> None:
+  """Offsets any spans in place by the given parent offset."""
+  if schema.dtype == DataType.STRING_SPAN:
+    item = cast(dict, item)
+    item[VALUE_KEY][TEXT_SPAN_START_FEATURE] += offset
+    item[VALUE_KEY][TEXT_SPAN_END_FEATURE] += offset
+  if schema.fields:
+    item = cast(dict, item)
+    for key, sub_schema in schema.fields.items():
+      _offset_any_span(offset, item[key], sub_schema)
+  if schema.repeated_field:
+    item = cast(list, item)
+    for sub_item in item:
+      _offset_any_span(offset, sub_item, schema.repeated_field)
+
+
+def _schema_has_spans(field: Field) -> bool:
+  if field.dtype and field.dtype == DataType.STRING_SPAN:
+    return True
+  if field.fields:
+    children_have_spans = any(_schema_has_spans(sub_field) for sub_field in field.fields.values())
+    if children_have_spans:
+      return True
+  if field.repeated_field:
+    return _schema_has_spans(field.repeated_field)
+  return False
+
+
+def _normalize_bins(bins: Optional[Union[Sequence[Bin], Sequence[float]]]) -> Optional[list[Bin]]:
+  if bins is None:
+    return None
+  if not isinstance(bins[0], (float, int)):
+    return cast(list[Bin], bins)
+  named_bins: list[Bin] = []
+  for i in range(len(bins) + 1):
+    start = cast(float, bins[i - 1]) if i > 0 else None
+    end = cast(float, bins[i]) if i < len(bins) else None
+    named_bins.append((str(i), start, end))
+  return named_bins
+
+
+def _auto_bins(stats: StatsResult, num_bins: int) -> list[Bin]:
+  min_val = cast(float, stats.min_val)
+  max_val = cast(float, stats.max_val)
+  bin_width = (max_val - min_val) / num_bins
+  bins: list[Bin] = []
+  for i in range(num_bins):
+    start = None if i == 0 else min_val + i * bin_width
+    end = None if i == num_bins - 1 else min_val + (i + 1) * bin_width
+    bins.append((str(i), start, end))
+  return bins
+
+
+def get_config_filepath(namespace: str, dataset_name: str) -> str:
+  """Gets the config yaml filepath."""
+  return os.path.join(get_dataset_output_dir(data_path(), namespace, dataset_name), CONFIG_FILENAME)
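
A quick check of the bin helpers above, assuming they are imported from this
module: plain float boundaries become named (id, start, end) bins, with the
first and last bins left open-ended.

assert _normalize_bins([0.0, 10.0]) == [
  ('0', None, 0.0),
  ('1', 0.0, 10.0),
  ('2', 10.0, None),
]
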
lilac/data/dataset_test_utils.py ADDED
@@ -0,0 +1,153 @@
+"""Test utils for dataset tests."""
+import os
+import pathlib
+from copy import deepcopy
+from datetime import datetime
+from typing import Optional, Type, cast
+
+import numpy as np
+from typing_extensions import Protocol
+
+from ..config import CONFIG_FILENAME, DatasetConfig
+from ..embeddings.vector_store import VectorDBIndex
+from ..schema import (
+  MANIFEST_FILENAME,
+  PARQUET_FILENAME_PREFIX,
+  ROWID,
+  VALUE_KEY,
+  DataType,
+  Field,
+  Item,
+  PathKey,
+  Schema,
+  SourceManifest,
+)
+from ..sources.source import Source
+from ..utils import get_dataset_output_dir, open_file, to_yaml
+from .dataset import Dataset, default_settings
+from .dataset_utils import is_primitive, write_items_to_parquet
+
+TEST_NAMESPACE = 'test_namespace'
+TEST_DATASET_NAME = 'test_dataset'
+
+
+def _infer_dtype(value: Item) -> DataType:
+  if isinstance(value, str):
+    return DataType.STRING
+  elif isinstance(value, bool):
+    return DataType.BOOLEAN
+  elif isinstance(value, bytes):
+    return DataType.BINARY
+  elif isinstance(value, float):
+    return DataType.FLOAT32
+  elif isinstance(value, int):
+    return DataType.INT32
+  elif isinstance(value, datetime):
+    return DataType.TIMESTAMP
+  else:
+    raise ValueError(f'Cannot infer dtype of primitive value: {value}')
+
+
+def _infer_field(item: Item) -> Field:
+  """Infer the schema from the items."""
+  if isinstance(item, dict):
+    fields: dict[str, Field] = {}
+    for k, v in item.items():
+      fields[k] = _infer_field(cast(Item, v))
+    dtype = None
+    if VALUE_KEY in fields:
+      dtype = fields[VALUE_KEY].dtype
+      del fields[VALUE_KEY]
+    return Field(fields=fields, dtype=dtype)
+  elif is_primitive(item):
+    return Field(dtype=_infer_dtype(item))
+  elif isinstance(item, list):
+    return Field(repeated_field=_infer_field(item[0]))
+  else:
+    raise ValueError(f'Cannot infer schema of item: {item}')
+
+
+def _infer_schema(items: list[Item]) -> Schema:
+  """Infer the schema from the items."""
+  schema = Schema(fields={})
+  for item in items:
+    field = _infer_field(item)
+    if not field.fields:
+      raise ValueError(f'Invalid schema of item. Expected an object, but got: {item}')
+    schema.fields = {**schema.fields, **field.fields}
+  return schema
+
+
+class TestDataMaker(Protocol):
+  """A function that creates a test dataset."""
+
+  def __call__(self, items: list[Item], schema: Optional[Schema] = None) -> Dataset:
+    """Create a test dataset."""
+    ...
+
+
+class TestSource(Source):
+  """Test source that does nothing."""
+  name = 'test_source'
+
+
+def make_dataset(dataset_cls: Type[Dataset],
+                 tmp_path: pathlib.Path,
+                 items: list[Item],
+                 schema: Optional[Schema] = None) -> Dataset:
+  """Create a test dataset."""
+  schema = schema or _infer_schema(items)
+  _write_items(tmp_path, TEST_DATASET_NAME, items, schema)
+  dataset = dataset_cls(TEST_NAMESPACE, TEST_DATASET_NAME)
+
+  config = DatasetConfig(
+    namespace=TEST_NAMESPACE,
+    name=TEST_DATASET_NAME,
+    source=TestSource(),
+    settings=default_settings(dataset))
+  config_filepath = os.path.join(
+    get_dataset_output_dir(str(tmp_path), TEST_NAMESPACE, TEST_DATASET_NAME), CONFIG_FILENAME)
+  with open_file(config_filepath, 'w') as f:
+    f.write(to_yaml(config.dict(exclude_defaults=True, exclude_none=True, exclude_unset=True)))
+
+  return dataset
+
+
+def _write_items(tmpdir: pathlib.Path, dataset_name: str, items: list[Item],
+                 schema: Schema) -> None:
+  """Write the items JSON to the dataset format: manifest.json and parquet files."""
+  source_dir = get_dataset_output_dir(str(tmpdir), TEST_NAMESPACE, dataset_name)
+  os.makedirs(source_dir)
+
+  # Add rowids to the items.
+  items = [deepcopy(item) for item in items]
+  for i, item in enumerate(items):
+    item[ROWID] = str(i + 1)
+
+  simple_parquet_files, _ = write_items_to_parquet(
+    items, source_dir, schema, filename_prefix=PARQUET_FILENAME_PREFIX, shard_index=0, num_shards=1)
+  manifest = SourceManifest(files=[simple_parquet_files], data_schema=schema)
+  with open_file(os.path.join(source_dir, MANIFEST_FILENAME), 'w') as f:
+    f.write(manifest.json(indent=2, exclude_none=True))
+
+
+def enriched_item(value: Optional[Item] = None, metadata: dict[str, Item] = {}) -> Item:
+  """Wrap a value in a dict with the value key."""
+  return {VALUE_KEY: value, **metadata}
+
+
+def make_vector_index(vector_store: str, vector_dict: dict[PathKey,
+                                                           list[list[float]]]) -> VectorDBIndex:
+  """Make a vector index from a dictionary of vector keys to vectors."""
+  embeddings: list[np.ndarray] = []
+  spans: list[tuple[PathKey, list[tuple[int, int]]]] = []
+  for path_key, vectors in vector_dict.items():
+    vector_spans: list[tuple[int, int]] = []
+    for vector in vectors:
+      embeddings.append(np.array(vector))
+      vector_spans.append((0, 0))
+    spans.append((path_key, vector_spans))
+
+  vector_index = VectorDBIndex(vector_store)
+  vector_index.add(spans, np.array(embeddings))
+  return vector_index
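
A hedged sketch of how these helpers are typically wired into a pytest fixture
(the fixture name `make_test_data` is illustrative, and DatasetDuckDB is
assumed to be the concrete Dataset implementation):

import pathlib

import pytest

@pytest.fixture
def make_test_data(tmp_path: pathlib.Path) -> TestDataMaker:
  def _make(items, schema=None):
    return make_dataset(DatasetDuckDB, tmp_path, items, schema)
  return _make

def test_manifest(make_test_data: TestDataMaker) -> None:
  dataset = make_test_data([{'text': 'hello'}, {'text': 'world'}])
  assert dataset.manifest().data_schema is not None
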
lilac/data/dataset_utils.py ADDED
@@ -0,0 +1,313 @@
+"""Utilities for working with datasets."""
+
+import gc
+import json
+import math
+import os
+import pprint
+import secrets
+from collections.abc import Iterable
+from typing import Any, Callable, Iterator, Optional, Sequence, TypeVar, Union, cast
+
+import numpy as np
+import pyarrow as pa
+
+from ..batch_utils import deep_flatten
+from ..embeddings.vector_store import VectorDBIndex
+from ..env import env
+from ..parquet_writer import ParquetWriter
+from ..schema import (
+  EMBEDDING_KEY,
+  PATH_WILDCARD,
+  ROWID,
+  TEXT_SPAN_END_FEATURE,
+  TEXT_SPAN_START_FEATURE,
+  VALUE_KEY,
+  DataType,
+  Field,
+  Item,
+  PathKey,
+  PathTuple,
+  Schema,
+  VectorKey,
+  field,
+  schema,
+  schema_to_arrow_schema,
+)
+from ..signal import Signal
+from ..utils import is_primitive, log, open_file
+
+
+def _replace_embeddings_with_none(input: Item) -> Item:
+  if isinstance(input, np.ndarray):
+    return None
+  if isinstance(input, dict):
+    return {k: _replace_embeddings_with_none(v) for k, v in input.items()}
+  if isinstance(input, list):
+    return [_replace_embeddings_with_none(v) for v in input]
+
+  return input
+
+
+def replace_embeddings_with_none(input: Item) -> Item:
+  """Replaces all embeddings with None."""
+  return cast(Item, _replace_embeddings_with_none(input))
+
+
+def count_primitives(input: Union[Iterable, Iterator]) -> int:
+  """Iterate through each element of the input, flattening each one and computing a count.
+
+  Sums the final set of counts. Note that this consumes the input iterable.
+  """
+  return sum(len(list(deep_flatten(i))) for i in input)
+
+
+def _wrap_value_in_dict(input: Union[object, dict], props: PathTuple) -> Union[object, dict]:
+  # If the signal produced no value, or nan, we should return None so the parquet value is sparse.
+  if isinstance(input, float) and math.isnan(input):
+    input = None
+  for prop in reversed(props):
+    input = {prop: input}
+  return input
+
+
+def _wrap_in_dicts(input: Union[object, Iterable[object]],
+                   spec: list[PathTuple]) -> Union[object, Iterable[object]]:
+  """Wraps an object or iterable in a dict according to the spec."""
+  props = spec[0] if spec else tuple()
+  if len(spec) == 1:
+    return _wrap_value_in_dict(input, props)
+  if input is None or (isinstance(input, float) and math.isnan(input)):
+    # Return an empty dict for missing inputs.
+    return {}
+  res = [_wrap_in_dicts(elem, spec[1:]) for elem in cast(Iterable, input)]
+  return _wrap_value_in_dict(res, props)
+
+
+def wrap_in_dicts(input: Iterable[object], spec: list[PathTuple]) -> Iterable[object]:
+  """Wraps each element of an iterable in dicts according to the spec."""
+  return [_wrap_in_dicts(elem, spec) for elem in input]
+
+
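
A spot check of the wrapping helpers above, assuming wrap_in_dicts is imported
from this module: values of a repeated 'text' field under 'doc' are wrapped
back into nested dicts.

assert wrap_in_dicts([['a', 'b']], [('doc',), ('text',)]) == [
  {'doc': [{'text': 'a'}, {'text': 'b'}]}
]
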
+def _merge_field_into(schema: Field, destination: Field) -> None:
+  if isinstance(schema, Field):
+    destination.signal = destination.signal or schema.signal
+    destination.dtype = destination.dtype or schema.dtype
+  if schema.fields:
+    destination.fields = destination.fields or {}
+    for field_name, subfield in schema.fields.items():
+      if field_name not in destination.fields:
+        destination.fields[field_name] = subfield.copy(deep=True)
+      else:
+        _merge_field_into(subfield, destination.fields[field_name])
+  elif schema.repeated_field:
+    if not destination.repeated_field:
+      raise ValueError('Failed to merge schemas. Origin schema is repeated, but destination is not')
+    _merge_field_into(schema.repeated_field, destination.repeated_field)
+  else:
+    if destination.dtype != schema.dtype:
+      raise ValueError(f'Failed to merge schemas. Origin schema has dtype {schema.dtype}, '
+                       f'but destination has dtype {destination.dtype}')
+
+
+def merge_schemas(schemas: Sequence[Union[Schema, Field]]) -> Schema:
+  """Merge a list of schemas."""
+  merged_schema = Schema(fields={})
+  for s in schemas:
+    _merge_field_into(cast(Field, s), cast(Field, merged_schema))
+  return merged_schema
+
+
+def schema_contains_path(schema: Schema, path: PathTuple) -> bool:
+  """Check if a schema contains a path."""
+  current_field = cast(Field, schema)
+  for path_part in path:
+    if path_part == PATH_WILDCARD:
+      if current_field.repeated_field is None:
+        return False
+      current_field = current_field.repeated_field
+    else:
+      if current_field.fields is None or path_part not in current_field.fields:
+        return False
+      current_field = current_field.fields[str(path_part)]
+  return True
+
+
+def create_signal_schema(signal: Signal, source_path: PathTuple, current_schema: Schema) -> Schema:
+  """Create a schema describing the enriched fields added by an enrichment."""
+  leafs = current_schema.leafs
+  # Validate that the enrich fields are actually a valid leaf path.
+  if source_path not in leafs:
+    raise ValueError(f'"{source_path}" is not a valid leaf path. Leaf paths: {leafs.keys()}')
+
+  signal_schema = signal.fields()
+  signal_schema.signal = signal.dict()
+
+  enriched_schema = field(fields={signal.key(is_computed_signal=True): signal_schema})
+
+  for path_part in reversed(source_path):
+    if path_part == PATH_WILDCARD:
+      enriched_schema = Field(repeated_field=enriched_schema)
+    else:
+      enriched_schema = Field(fields={path_part: enriched_schema})
+
+  if not enriched_schema.fields:
+    raise ValueError('This should not happen since enriched_schema always has fields (see above)')
+
+  return schema(enriched_schema.fields.copy())
+
+
+def write_embeddings_to_disk(vector_store: str, rowids: Iterable[str], signal_items: Iterable[Item],
+                             output_dir: str) -> None:
+  """Write a set of embeddings to disk."""
+
+  def embedding_predicate(input: Any) -> bool:
+    return (isinstance(input, list) and len(input) > 0 and isinstance(input[0], dict) and
+            EMBEDDING_KEY in input[0])
+
+  path_keys = flatten_keys(rowids, signal_items, is_primitive_predicate=embedding_predicate)
+  all_embeddings = cast(Iterable[Item],
+                        deep_flatten(signal_items, is_primitive_predicate=embedding_predicate))
+
+  embedding_vectors: list[np.ndarray] = []
+  all_spans: list[tuple[PathKey, list[tuple[int, int]]]] = []
+  for path_key, embeddings in zip(path_keys, all_embeddings):
+    if not path_key or not embeddings:
+      # Sparse embeddings may not have an embedding for every key.
+      continue
+
+    spans: list[tuple[int, int]] = []
+    for e in embeddings:
+      span = e[VALUE_KEY]
+      vector = e[EMBEDDING_KEY]
+      # We reshape to 1-D here because embedding functions can return outer dimensions of 1.
+      embedding_vectors.append(vector.reshape(-1))
+      spans.append((span[TEXT_SPAN_START_FEATURE], span[TEXT_SPAN_END_FEATURE]))
+    all_spans.append((path_key, spans))
+  embedding_matrix = np.array(embedding_vectors, dtype=np.float32)
+  del path_keys, all_embeddings, embedding_vectors
+  gc.collect()
+
+  # Write to disk.
+  vector_index = VectorDBIndex(vector_store)
+  vector_index.add(all_spans, embedding_matrix)
+  vector_index.save(output_dir)
+
+  del vector_index
+  gc.collect()
+
+
+def write_items_to_parquet(items: Iterable[Item], output_dir: str, schema: Schema,
+                           filename_prefix: str, shard_index: int,
+                           num_shards: int) -> tuple[str, int]:
+  """Write a set of items to a parquet file, in columnar format."""
+  schema = schema.copy(deep=True)
+  # Add a rowid column.
+  schema.fields[ROWID] = Field(dtype=DataType.STRING)
+
+  arrow_schema = schema_to_arrow_schema(schema)
+  out_filename = parquet_filename(filename_prefix, shard_index, num_shards)
+  filepath = os.path.join(output_dir, out_filename)
+  f = open_file(filepath, mode='wb')
+  writer = ParquetWriter(schema)
+  writer.open(f)
+  debug = env('DEBUG', False)
+  num_items = 0
+  for item in items:
+    # Add a rowid if it doesn't exist.
+    if ROWID not in item:
+      item[ROWID] = secrets.token_urlsafe(nbytes=12)  # 16 base64 characters.
+    if debug:
+      try:
+        _validate(item, arrow_schema)
+      except Exception as e:
+        raise ValueError(f'Error validating item: {json.dumps(item)}') from e
+    writer.write(item)
+    num_items += 1
+  writer.close()
+  f.close()
+  return out_filename, num_items
+
+
+def _validate(item: Item, schema: pa.Schema) -> None:
+  # Try to parse the item using the inferred schema.
+  try:
+    pa.RecordBatch.from_pylist([item], schema=schema)
+  except pa.ArrowTypeError:
+    log('Failed to parse arrow item using the arrow schema.')
+    log('Item:')
+    log(pprint.pformat(item, indent=2))
+    log('Arrow schema:')
+    log(schema)
+    raise  # Re-raise the same exception, with the same stacktrace.
+
+
+def parquet_filename(prefix: str, shard_index: int, num_shards: int) -> str:
+  """Return the filename for a parquet file."""
+  return f'{prefix}-{shard_index:05d}-of-{num_shards:05d}.parquet'
+
+
+def _flatten_keys(rowid: str, nested_input: Iterable, location: list[int],
+                  is_primitive_predicate: Callable[[object], bool]) -> Iterator[VectorKey]:
+  if (is_primitive_predicate(nested_input) or is_primitive(nested_input) or
+      isinstance(nested_input, dict)):
+    yield (rowid, *location)
+    return
+
+  for i, input in enumerate(nested_input):
+    yield from _flatten_keys(rowid, input, [*location, i], is_primitive_predicate)
+
+
+def flatten_keys(
+    rowids: Iterable[str],
+    nested_input: Iterable,
+    is_primitive_predicate: Callable[[object],
+                                     bool] = is_primitive) -> Iterator[Optional[VectorKey]]:
+  """Flatten the rowids of a nested input."""
+  for rowid, input in zip(rowids, nested_input):
+    if input is None:
+      yield None
+      continue
+    yield from _flatten_keys(rowid, input, [], is_primitive_predicate)
+
+
+Tin = TypeVar('Tin')
+Tout = TypeVar('Tout')
+
+
+def sparse_to_dense_compute(
+    sparse_input: Iterator[Optional[Tin]],
+    func: Callable[[Iterable[Tin]], Iterable[Tout]]) -> Iterator[Optional[Tout]]:
+  """Densifies the input before calling the provided `func` and sparsifies the output."""
+  locations: list[int] = []
+  total_size: int = 0
+
+  def densify(x: Iterator[Optional[Tin]]) -> Iterator[Tin]:
+    nonlocal locations, total_size
+    for i, value in enumerate(x):
+      total_size += 1
+      if value is not None:
+        locations.append(i)
+        yield value
+
+  dense_input = densify(sparse_input)
+  dense_output = iter(func(dense_input))
+  index = 0
+  location_index = 0
+
+  while True:
+    try:
+      out = next(dense_output)
+      out_index = locations[location_index]
+      while index < out_index:
+        yield None
+        index += 1
+      yield out
+      location_index += 1
+      index += 1
+    except StopIteration:
+      while index < total_size:
+        yield None
+        index += 1
+      return
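
A standalone check of the densify/sparsify round-trip above (pure Python,
assuming sparse_to_dense_compute is imported from this module; `double` is a
hypothetical dense compute function):

def double(xs):
  return (x * 2 for x in xs)

sparse = iter([None, 1, None, 2, 3, None])
assert list(sparse_to_dense_compute(sparse, double)) == [None, 2, None, 4, 6, None]
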
lilac/data_loader.py ADDED
@@ -0,0 +1,110 @@
+"""A standalone data loader binary. This should only be run as a script to load a dataset.
+
+To run the source loader as a binary directly:
+
+poetry run python -m lilac.data_loader \
+  --dataset_name=movies_dataset \
+  --output_dir=./data/ \
+  --config_path=./datasets/the_movies_dataset.json
+"""
+import os
+import pathlib
+import uuid
+from typing import Iterable, Optional, Union
+
+import pandas as pd
+
+from .config import CONFIG_FILENAME, DatasetConfig
+from .data.dataset import Dataset, default_settings
+from .data.dataset_utils import write_items_to_parquet
+from .db_manager import get_dataset
+from .env import data_path
+from .schema import (
+  MANIFEST_FILENAME,
+  PARQUET_FILENAME_PREFIX,
+  ROWID,
+  Field,
+  Item,
+  Schema,
+  SourceManifest,
+  is_float,
+)
+from .tasks import TaskStepId, progress
+from .utils import get_dataset_output_dir, log, open_file, to_yaml
+
+
+def create_dataset(config: DatasetConfig) -> Dataset:
+  """Load a dataset from a given source configuration."""
+  process_source(data_path(), config)
+  return get_dataset(config.namespace, config.name)
+
+
+def process_source(base_dir: Union[str, pathlib.Path],
+                   config: DatasetConfig,
+                   task_step_id: Optional[TaskStepId] = None) -> tuple[str, int]:
+  """Process a source."""
+  output_dir = get_dataset_output_dir(base_dir, config.namespace, config.name)
+
+  config.source.setup()
+  source_schema = config.source.source_schema()
+  items = config.source.process()
+
+  # Add rowids and fix NaN in string columns.
+  items = normalize_items(items, source_schema.fields)
+
+  # Add progress.
+  items = progress(
+    items,
+    task_step_id=task_step_id,
+    estimated_len=source_schema.num_items,
+    step_description=f'Reading from source {config.source.name}...')
+
+  # Filter out the `None`s after progress.
+  items = (item for item in items if item is not None)
+
+  data_schema = Schema(fields=source_schema.fields.copy())
+  filepath, num_items = write_items_to_parquet(
+    items=items,
+    output_dir=output_dir,
+    schema=data_schema,
+    filename_prefix=PARQUET_FILENAME_PREFIX,
+    shard_index=0,
+    num_shards=1)
+
+  filenames = [os.path.basename(filepath)]
+  manifest = SourceManifest(files=filenames, data_schema=data_schema, images=None)
+  with open_file(os.path.join(output_dir, MANIFEST_FILENAME), 'w') as f:
+    f.write(manifest.json(indent=2, exclude_none=True))
+
+  if not config.settings:
+    dataset = get_dataset(config.namespace, config.name)
+    config.settings = default_settings(dataset)
+  with open_file(os.path.join(output_dir, CONFIG_FILENAME), 'w') as f:
+    f.write(to_yaml(config.dict(exclude_defaults=True, exclude_none=True)))
+
+  log(f'Dataset "{config.name}" written to {output_dir}')
+
+  return output_dir, num_items
+
+
+def normalize_items(items: Iterable[Item], fields: dict[str, Field]) -> Iterable[Item]:
+  """Sanitize items by adding rowids and removing NaNs and NaTs."""
+  replace_nan_fields = [
+    field_name for field_name, field in fields.items() if field.dtype and not is_float(field.dtype)
+  ]
+  for item in items:
+    if item is None:
+      yield item
+      continue
+
+    # Add a rowid if it doesn't exist.
+    if ROWID not in item:
+      item[ROWID] = uuid.uuid4().hex
+
+    # Fix NaN values.
+    for field_name in replace_nan_fields:
+      item_value = item.get(field_name)
+      if item_value and pd.isna(item_value):
+        item[field_name] = None
+
+    yield item
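
The NaN guard in normalize_items can be illustrated standalone (pure pandas;
fix_nans below is a simplified stand-in, not the function above):

import pandas as pd

def fix_nans(items, non_float_fields):
  for item in items:
    for f in non_float_fields:
      v = item.get(f)
      # Scalars only: pd.isna on a list or dict would return an array or raise.
      if v is not None and not isinstance(v, (list, dict)) and pd.isna(v):
        item[f] = None
    yield item

assert list(fix_nans([{'name': float('nan')}], ['name'])) == [{'name': None}]
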
lilac/db_manager.py ADDED
@@ -0,0 +1,96 @@
+"""Manages mapping the dataset name to the database instance."""
+import os
+import pathlib
+import threading
+from typing import Optional, Type, Union
+
+import yaml
+from pydantic import BaseModel
+
+from .config import DatasetConfig
+from .data.dataset import Dataset
+from .data.dataset_duckdb import get_config_filepath
+from .utils import get_datasets_dir
+
+# Set via set_default_dataset_cls below; None until then.
+_DEFAULT_DATASET_CLS: Optional[Type[Dataset]] = None
+
+_CACHED_DATASETS: dict[str, Dataset] = {}
+
+_db_lock = threading.Lock()
+
+
+def get_dataset(namespace: str, dataset_name: str) -> Dataset:
+  """Get the dataset instance."""
+  if not _DEFAULT_DATASET_CLS:
+    raise ValueError('Default dataset class not set.')
+  cache_key = f'{namespace}/{dataset_name}'
+  # https://docs.pytest.org/en/latest/example/simple.html#pytest-current-test-environment-variable
+  inside_test = 'PYTEST_CURRENT_TEST' in os.environ
+  with _db_lock:
+    if cache_key not in _CACHED_DATASETS or inside_test:
+      _CACHED_DATASETS[cache_key] = _DEFAULT_DATASET_CLS(
+        namespace=namespace, dataset_name=dataset_name)
+    return _CACHED_DATASETS[cache_key]
+
+
+def remove_dataset_from_cache(namespace: str, dataset_name: str) -> None:
+  """Remove the dataset from the db manager cache."""
+  cache_key = f'{namespace}/{dataset_name}'
+  with _db_lock:
+    if cache_key in _CACHED_DATASETS:
+      del _CACHED_DATASETS[cache_key]
+
+
+class DatasetInfo(BaseModel):
+  """Information about a dataset."""
+  namespace: str
+  dataset_name: str
+  description: Optional[str] = None
+  tags: list[str] = []
+
+
+def list_datasets(base_dir: Union[str, pathlib.Path]) -> list[DatasetInfo]:
+  """List the datasets in a data directory."""
+  datasets_path = get_datasets_dir(base_dir)
+
+  # Skip if 'datasets' doesn't exist.
+  if not os.path.isdir(datasets_path):
+    return []
+
+  dataset_infos: list[DatasetInfo] = []
+  for namespace in os.listdir(datasets_path):
+    dataset_dir = os.path.join(datasets_path, namespace)
+    # Skip if namespace is not a directory.
+    if not os.path.isdir(dataset_dir):
+      continue
+    if namespace.startswith('.'):
+      continue
+
+    for dataset_name in os.listdir(dataset_dir):
+      # Skip if dataset_name is not a directory.
+      dataset_path = os.path.join(dataset_dir, dataset_name)
+      if not os.path.isdir(dataset_path):
+        continue
+      if dataset_name.startswith('.'):
+        continue
+
+      # Open the config file to read the tags. We avoid instantiating a dataset for now to reduce
+      # the overhead of listing datasets.
+      config_filepath = get_config_filepath(namespace, dataset_name)
+      tags = []
+      if os.path.exists(config_filepath):
+        with open(config_filepath) as f:
+          config = DatasetConfig(**yaml.safe_load(f))
+        tags = config.tags
+
+      dataset_infos.append(DatasetInfo(namespace=namespace, dataset_name=dataset_name, tags=tags))
+
+  return dataset_infos
+
+
+# TODO(nsthorat): Make this a registry once we have multiple dataset implementations. This breaks a
+# circular dependency.
+def set_default_dataset_cls(dataset_cls: Type[Dataset]) -> None:
+  """Set the default dataset class."""
+  global _DEFAULT_DATASET_CLS
+  _DEFAULT_DATASET_CLS = dataset_cls
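
The locked, keyed cache above follows a common pattern; a standalone sketch:

import threading

_cache: dict = {}
_lock = threading.Lock()

def get_or_create(namespace: str, name: str, factory):
  key = f'{namespace}/{name}'
  with _lock:
    if key not in _cache:
      _cache[key] = factory(namespace, name)
    return _cache[key]

first = get_or_create('local', 'movies', lambda ns, n: object())
second = get_or_create('local', 'movies', lambda ns, n: object())
assert first is second  # The second factory is never called.
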
lilac/embeddings/__init__.py ADDED
@@ -0,0 +1,7 @@
+"""Embeddings compute a vector for a chunk of a document."""
+
+from .embedding import compute_split_embeddings
+
+__all__ = [
+  'compute_split_embeddings',
+]
lilac/embeddings/cohere.py ADDED
@@ -0,0 +1,59 @@
+"""Cohere embeddings."""
+from typing import TYPE_CHECKING, Iterable, cast
+
+import numpy as np
+from typing_extensions import override
+
+from ..env import env
+from ..schema import Item, RichData
+from ..signal import TextEmbeddingSignal
+from ..splitters.chunk_splitter import split_text
+from .embedding import compute_split_embeddings
+
+if TYPE_CHECKING:
+  from cohere import Client
+
+NUM_PARALLEL_REQUESTS = 10
+COHERE_BATCH_SIZE = 96
+
+
+class Cohere(TextEmbeddingSignal):
+  """Computes embeddings using Cohere's embedding API.
+
+  <br>**Important**: This will send data to an external server!
+
+  <br>To use this signal, you must get a Cohere API key from
+  [cohere.com/embed](https://cohere.com/embed) and add it to your .env.local.
+
+  <br>For details on pricing, see: https://cohere.com/pricing.
+  """
+
+  name = 'cohere'
+  display_name = 'Cohere Embeddings'
+
+  _model: 'Client'
+
+  @override
+  def setup(self) -> None:
+    """Validate that the API key and python package exist in the environment."""
+    api_key = env('COHERE_API_KEY')
+    if not api_key:
+      raise ValueError('`COHERE_API_KEY` environment variable not set.')
+    try:
+      import cohere
+      self._model = cohere.Client(api_key, max_retries=10)
+    except ImportError:
+      raise ImportError('Could not import the "cohere" python package. '
+                        'Please install it with `pip install cohere`.')
+
+  @override
+  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
+    """Compute embeddings for the given documents."""
+
+    def embed_fn(texts: list[str]) -> list[np.ndarray]:
+      return self._model.embed(texts, truncate='END').embeddings
+
+    docs = cast(Iterable[str], docs)
+    split_fn = split_text if self._split else None
+    yield from compute_split_embeddings(
+      docs, COHERE_BATCH_SIZE, embed_fn, split_fn, num_parallel_requests=NUM_PARALLEL_REQUESTS)
lilac/embeddings/default_vector_stores.py ADDED
@@ -0,0 +1,10 @@
+"""Registers all vector stores."""
+from .vector_store import register_vector_store
+from .vector_store_hnsw import HNSWVectorStore
+from .vector_store_numpy import NumpyVectorStore
+
+
+def register_default_vector_stores() -> None:
+  """Register all the default vector stores."""
+  register_vector_store(HNSWVectorStore)
+  register_vector_store(NumpyVectorStore)
lilac/embeddings/embedding.py ADDED
@@ -0,0 +1,110 @@
+"""Embedding registry."""
+from concurrent.futures import ThreadPoolExecutor
+from typing import Callable, Generator, Iterable, Iterator, Optional, Union, cast
+
+import numpy as np
+from pydantic import StrictStr
+from sklearn.preprocessing import normalize
+
+from ..schema import (
+  EMBEDDING_KEY,
+  TEXT_SPAN_END_FEATURE,
+  TEXT_SPAN_START_FEATURE,
+  VALUE_KEY,
+  Item,
+  RichData,
+  SpanVector,
+  lilac_embedding,
+)
+from ..signal import TextEmbeddingSignal, get_signal_by_type
+from ..splitters.chunk_splitter import TextChunk
+from ..utils import chunks
+
+EmbeddingId = Union[StrictStr, TextEmbeddingSignal]
+
+EmbedFn = Callable[[Iterable[RichData]], Iterator[list[SpanVector]]]
+
+
+def get_embed_fn(embedding_name: str, split: bool) -> EmbedFn:
+  """Return a function that returns the embedding matrix for the given embedding signal."""
+  embedding_cls = get_signal_by_type(embedding_name, TextEmbeddingSignal)
+  embedding = embedding_cls(split=split)
+  embedding.setup()
+
+  def _embed_fn(data: Iterable[RichData]) -> Iterator[list[SpanVector]]:
+    items = embedding.compute(data)
+
+    for item in items:
+      if not item:
+        raise ValueError('Embedding signal returned None.')
+
+      yield [{
+        'vector': item_val[EMBEDDING_KEY].reshape(-1),
+        'span':
+          (item_val[VALUE_KEY][TEXT_SPAN_START_FEATURE], item_val[VALUE_KEY][TEXT_SPAN_END_FEATURE])
+      } for item_val in item]
+
+  return _embed_fn
+
+
+def compute_split_embeddings(docs: Iterable[str],
+                             batch_size: int,
+                             embed_fn: Callable[[list[str]], list[np.ndarray]],
+                             split_fn: Optional[Callable[[str], list[TextChunk]]] = None,
+                             num_parallel_requests: int = 1) -> Generator[Item, None, None]:
+  """Compute text embeddings in batches of chunks, using the provided splitter and embedding fn."""
+  pool = ThreadPoolExecutor()
+
+  def _splitter(doc: str) -> list[TextChunk]:
+    if not doc:
+      return []
+    if split_fn:
+      return split_fn(doc)
+    else:
+      # Return a single chunk that spans the entire document.
+      return [(doc, (0, len(doc)))]
+
+  num_docs = 0
+
+  def _flat_split_batch_docs(docs: Iterable[str]) -> Generator[tuple[int, TextChunk], None, None]:
+    """Split a batch of documents into chunks and yield them."""
+    nonlocal num_docs
+    for i, doc in enumerate(docs):
+      num_docs += 1
+      chunks = _splitter(doc)
+      for chunk in chunks:
+        yield (i, chunk)
+
+  doc_chunks = _flat_split_batch_docs(docs)
+  items_to_yield: Optional[list[Item]] = None
+  current_index = 0
+
+  mega_batch_size = batch_size * num_parallel_requests
+
+  for batch in chunks(doc_chunks, mega_batch_size):
+    texts = [text for _, (text, _) in batch]
+    embeddings: list[np.ndarray] = []
+
+    for x in list(pool.map(lambda x: embed_fn(x), chunks(texts, batch_size))):
+      embeddings.extend(x)
+    matrix = cast(np.ndarray, normalize(np.array(embeddings, dtype=np.float32)))
+    # np.split returns a shallow copy of each embedding so we don't increase the mem footprint.
+    embeddings_batch = cast(list[np.ndarray], np.split(matrix, matrix.shape[0]))
+    for (index, (_, (start, end))), embedding in zip(batch, embeddings_batch):
+      embedding = embedding.reshape(-1)
+      if index == current_index:
+        if items_to_yield is None:
+          items_to_yield = []
+        items_to_yield.append(lilac_embedding(start, end, embedding))
+      else:
+        yield items_to_yield
+        current_index += 1
+        while current_index < index:
+          yield None
+          current_index += 1
+        items_to_yield = [lilac_embedding(start, end, embedding)]
+
+  while current_index < num_docs:
+    yield items_to_yield
+    items_to_yield = None
+    current_index += 1
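
A self-contained smoke test of compute_split_embeddings with a dummy embedder
(assuming numpy and scikit-learn are installed, as this module already requires):

import numpy as np

def dummy_embed(texts: list) -> list:
  # One constant nonzero vector per input text.
  return [np.full(4, float(len(t)), dtype=np.float32) for t in texts]

items = list(compute_split_embeddings(['hello', 'world!'], batch_size=2, embed_fn=dummy_embed))
assert len(items) == 2  # One list of span embeddings per document.
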
lilac/embeddings/gte.py ADDED
@@ -0,0 +1,63 @@
+"""General Text Embeddings (GTE) model. Open-source model, designed to run on device."""
+from typing import Iterable, cast
+
+from typing_extensions import override
+
+from ..schema import Item, RichData
+from ..signal import TextEmbeddingSignal
+from ..splitters.chunk_splitter import split_text
+from .embedding import compute_split_embeddings
+from .transformer_utils import get_model
+
+# See https://huggingface.co/spaces/mteb/leaderboard for a leaderboard of models.
+GTE_SMALL = 'thenlper/gte-small'
+GTE_BASE = 'thenlper/gte-base'
+
+# Maps a model name and device to the optimal batch size, found empirically.
+_OPTIMAL_BATCH_SIZES: dict[str, dict[str, int]] = {
+  GTE_SMALL: {
+    '': 64,  # Default batch size.
+    'mps': 256,
+  },
+  GTE_BASE: {
+    '': 64,  # Default batch size.
+    'mps': 128,
+  }
+}
+
+
+class GTESmall(TextEmbeddingSignal):
+  """Computes General Text Embeddings (GTE).
+
+  <br>This embedding runs on-device. See the [model card](https://huggingface.co/thenlper/gte-small)
+  for details.
+  """
+
+  name = 'gte-small'
+  display_name = 'General Text Embeddings (small)'
+
+  _model_name = GTE_SMALL
+
+  @override
+  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
+    """Call the embedding function."""
+    batch_size, model = get_model(self._model_name, _OPTIMAL_BATCH_SIZES[self._model_name])
+    embed_fn = model.encode
+    split_fn = split_text if self._split else None
+    docs = cast(Iterable[str], docs)
+    yield from compute_split_embeddings(docs, batch_size, embed_fn=embed_fn, split_fn=split_fn)
+
+
+class GTEBase(GTESmall):
+  """Computes General Text Embeddings (GTE).
+
+  <br>This embedding runs on-device. See the [model card](https://huggingface.co/thenlper/gte-base)
+  for details.
+  """
+  name = 'gte-base'
+  display_name = 'General Text Embeddings (base)'
+
+  _model_name = GTE_BASE
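
A hedged usage sketch (downloads the gte-small weights on first use and
requires sentence-transformers to be installed; the `split` kwarg is handled
by the TextEmbeddingSignal base class):

signal = GTESmall(split=True)
items = list(signal.compute(['A short document to embed.']))
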
lilac/embeddings/openai.py ADDED
@@ -0,0 +1,68 @@
+"""OpenAI embeddings."""
+from typing import TYPE_CHECKING, Any, Iterable, cast
+
+import numpy as np
+from tenacity import retry, stop_after_attempt, wait_random_exponential
+from typing_extensions import override
+
+from ..env import env
+from ..schema import Item, RichData
+from ..signal import TextEmbeddingSignal
+from ..splitters.chunk_splitter import split_text
+from .embedding import compute_split_embeddings
+
+if TYPE_CHECKING:
+  import openai
+
+NUM_PARALLEL_REQUESTS = 10
+OPENAI_BATCH_SIZE = 128
+EMBEDDING_MODEL = 'text-embedding-ada-002'
+
+
+class OpenAI(TextEmbeddingSignal):
+  """Computes embeddings using OpenAI's embedding API.
+
+  <br>**Important**: This will send data to an external server!
+
+  <br>To use this signal, you must get an OpenAI API key from
+  [platform.openai.com](https://platform.openai.com/) and add it to your .env.local.
+
+  <br>For details on pricing, see: https://openai.com/pricing.
+  """
+
+  name = 'openai'
+  display_name = 'OpenAI Embeddings'
+
+  _model: type['openai.Embedding']
+
+  @override
+  def setup(self) -> None:
+    api_key = env('OPENAI_API_KEY')
+    if not api_key:
+      raise ValueError('`OPENAI_API_KEY` environment variable not set.')
+    try:
+      import openai
+      openai.api_key = api_key
+      self._model = openai.Embedding
+    except ImportError:
+      raise ImportError('Could not import the "openai" python package. '
+                        'Please install it with `pip install openai`.')
+
+  @override
+  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
+    """Compute embeddings for the given documents."""
+
+    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
+    def embed_fn(texts: list[str]) -> list[np.ndarray]:
+      # Replace newlines, which can negatively affect performance.
+      # See https://github.com/search?q=repo%3Aopenai%2Fopenai-python+replace+newlines&type=code
+      texts = [text.replace('\n', ' ') for text in texts]
+
+      response: Any = self._model.create(input=texts, model=EMBEDDING_MODEL)
+      return [np.array(embedding['embedding'], dtype=np.float32) for embedding in response['data']]
+
+    docs = cast(Iterable[str], docs)
+    split_fn = split_text if self._split else None
+    yield from compute_split_embeddings(
+      docs, OPENAI_BATCH_SIZE, embed_fn, split_fn, num_parallel_requests=NUM_PARALLEL_REQUESTS)
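
The same tenacity retry policy, isolated (runnable with `pip install tenacity`):

from tenacity import retry, stop_after_attempt, wait_random_exponential

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def flaky_api_call() -> str:
  # A real call would raise on transient HTTP errors and be retried with
  # exponential backoff, up to 10 attempts.
  return 'ok'

assert flaky_api_call() == 'ok'
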
lilac/embeddings/palm.py ADDED
@@ -0,0 +1,62 @@
+"""PaLM embeddings."""
+from typing import TYPE_CHECKING, Iterable, cast
+
+import numpy as np
+from tenacity import retry, stop_after_attempt, wait_random_exponential
+from typing_extensions import override
+
+from ..env import env
+from ..schema import Item, RichData
+from ..signal import TextEmbeddingSignal
+from ..splitters.chunk_splitter import split_text
+from .embedding import compute_split_embeddings
+
+if TYPE_CHECKING:
+  import google.generativeai as palm
+
+PALM_BATCH_SIZE = 1  # The PaLM API only supports batch size 1.
+NUM_PARALLEL_REQUESTS = 256  # Because the batch size is 1, we can send many requests in parallel.
+EMBEDDING_MODEL = 'models/embedding-gecko-001'
+
+
+class PaLM(TextEmbeddingSignal):
+  """Computes embeddings using PaLM's embedding API.
+
+  <br>**Important**: This will send data to an external server!
+
+  <br>To use this signal, you must get a PaLM API key from
+  [makersuite.google.com](https://makersuite.google.com/app/apikey) and add it to your .env.local.
+  """
+
+  name = 'palm'
+  display_name = 'PaLM Embeddings'
+
+  _model: 'palm.generate_embeddings'
+
+  @override
+  def setup(self) -> None:
+    api_key = env('PALM_API_KEY')
+    if not api_key:
+      raise ValueError('`PALM_API_KEY` environment variable not set.')
+    try:
+      import google.generativeai as palm
+      palm.configure(api_key=api_key)
+      self._model = palm.generate_embeddings
+    except ImportError:
+      raise ImportError('Could not import the "google.generativeai" python package. '
+                        'Please install it with `pip install google-generativeai`.')
+
+  @override
+  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
+    """Compute embeddings for the given documents."""
+
+    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
+    def embed_fn(texts: list[str]) -> list[np.ndarray]:
+      assert len(texts) == 1, 'The PaLM API only supports batch size 1.'
+      response = self._model(model=EMBEDDING_MODEL, text=texts[0])
+      return [np.array(response['embedding'], dtype=np.float32)]
+
+    docs = cast(Iterable[str], docs)
+    split_fn = split_text if self._split else None
+    yield from compute_split_embeddings(
+      docs, PALM_BATCH_SIZE, embed_fn, split_fn, num_parallel_requests=NUM_PARALLEL_REQUESTS)
lilac/embeddings/sbert.py ADDED
@@ -0,0 +1,38 @@
+"""Sentence-BERT embeddings. Open-source models, designed to run on device."""
+from typing import Iterable, cast
+
+from typing_extensions import override
+
+from ..schema import Item, RichData
+from ..signal import TextEmbeddingSignal
+from ..splitters.chunk_splitter import split_text
+from .embedding import compute_split_embeddings
+from .transformer_utils import get_model
+
+# The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2` is 5 times
+# faster and still offers good quality. See https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models/
+MINI_LM_MODEL = 'all-MiniLM-L6-v2'
+
+# Maps a model name and device to the optimal batch size, found empirically.
+_OPTIMAL_BATCH_SIZES: dict[str, dict[str, int]] = {
+  MINI_LM_MODEL: {
+    '': 64,  # Default batch size.
+    'mps': 256,
+  }
+}
+
+
+class SBERT(TextEmbeddingSignal):
+  """Computes embeddings using the Sentence-BERT library."""
+
+  name = 'sbert'
+  display_name = 'SBERT Embeddings'
+
+  @override
+  def compute(self, docs: Iterable[RichData]) -> Iterable[Item]:
+    """Call the embedding function."""
+    batch_size, model = get_model(MINI_LM_MODEL, _OPTIMAL_BATCH_SIZES[MINI_LM_MODEL])
+    embed_fn = model.encode
+    split_fn = split_text if self._split else None
+    docs = cast(Iterable[str], docs)
+    yield from compute_split_embeddings(docs, batch_size, embed_fn=embed_fn, split_fn=split_fn)
lilac/embeddings/transformer_utils.py ADDED
@@ -0,0 +1,35 @@
+"""Utils for transformer embeddings."""
+
+import functools
+import os
+from typing import TYPE_CHECKING, Optional
+
+from ..env import data_path
+from ..utils import log
+
+if TYPE_CHECKING:
+  from sentence_transformers import SentenceTransformer
+
+
+def get_model(model_name: str,
+              optimal_batch_sizes: dict[str, int] = {}) -> tuple[int, 'SentenceTransformer']:
+  """Get a transformer model and the optimal batch size for it."""
+  try:
+    import torch.backends.mps
+    from sentence_transformers import SentenceTransformer
+  except ImportError:
+    raise ImportError('Could not import the "sentence_transformers" python package. '
+                      'Please install it with `pip install sentence-transformers`.')
+  preferred_device: Optional[str] = None
+  if torch.backends.mps.is_available():
+    preferred_device = 'mps'
+  elif not torch.backends.mps.is_built():
+    log('MPS not available because the current PyTorch install was not built with MPS enabled.')
+
+  @functools.cache
+  def _get_model(model_name: str) -> 'SentenceTransformer':
+    return SentenceTransformer(
+      model_name, device=preferred_device, cache_folder=os.path.join(data_path(), '.cache'))
+
+  batch_size = optimal_batch_sizes[preferred_device or '']
+  return batch_size, _get_model(model_name)
lilac/embeddings/vector_store.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ """Interface for storing vectors."""
+
+ import abc
+ import os
+ import pickle
+ from typing import Iterable, Optional, Type
+
+ import numpy as np
+
+ from ..schema import SpanVector, VectorKey
+ from ..utils import open_file
+
+
+ class VectorStore(abc.ABC):
+   """Interface for storing and retrieving vectors."""
+
+   # The global name of the vector store.
+   name: str
+
+   @abc.abstractmethod
+   def save(self, base_path: str) -> None:
+     """Save the store to disk."""
+     pass
+
+   @abc.abstractmethod
+   def load(self, base_path: str) -> None:
+     """Load the store from disk."""
+     pass
+
+   @abc.abstractmethod
+   def size(self) -> int:
+     """Return the number of vectors in the store."""
+     pass
+
+   @abc.abstractmethod
+   def add(self, keys: list[VectorKey], embeddings: np.ndarray) -> None:
+     """Add or edit the given keyed embeddings to the store.
+
+     If the keys already exist they will be overwritten, acting as an "upsert".
+
+     Args:
+       keys: The keys to add the embeddings for.
+       embeddings: The embeddings to add. This should be a 2D matrix with the same length as keys.
+     """
+     pass
+
+   @abc.abstractmethod
+   def get(self, keys: Optional[Iterable[VectorKey]] = None) -> np.ndarray:
+     """Return the embeddings for the given keys.
+
+     Args:
+       keys: The keys to return the embeddings for. If None, return all embeddings.
+
+     Returns:
+       The embeddings for the given keys.
+     """
+     pass
+
+   def topk(self,
+            query: np.ndarray,
+            k: int,
+            keys: Optional[Iterable[VectorKey]] = None) -> list[tuple[VectorKey, float]]:
+     """Return the top k most similar vectors.
+
+     Args:
+       query: The query vector.
+       k: The number of results to return.
+       keys: Optional keys to restrict the search to.
+
+     Returns:
+       A list of (key, score) tuples.
+     """
+     raise NotImplementedError
+
+
+ PathKey = VectorKey
+
+ _SPANS_PICKLE_NAME = 'spans.pkl'
+
+
+ class VectorDBIndex:
+   """Stores and retrieves span vectors.
+
+   This wraps a regular vector store by adding a mapping from path keys, such as (rowid1, 0),
+   to span keys, such as (rowid1, 0, 0), which denotes the first span in the (rowid1, 0) document.
+   """
+
+   def __init__(self, vector_store: str) -> None:
+     self._vector_store: VectorStore = get_vector_store_cls(vector_store)()
+     # Map a path key to spans for that path.
+     self._id_to_spans: dict[PathKey, list[tuple[int, int]]] = {}
+
+   def load(self, base_path: str) -> None:
+     """Load the vector index from disk."""
+     assert not self._id_to_spans, 'Cannot load into a non-empty index.'
+     with open_file(os.path.join(base_path, _SPANS_PICKLE_NAME), 'rb') as f:
+       self._id_to_spans.update(pickle.load(f))
+     self._vector_store.load(os.path.join(base_path, self._vector_store.name))
+
+   def save(self, base_path: str) -> None:
+     """Save the vector index to disk."""
+     assert self._id_to_spans, 'Cannot save an empty index.'
+     with open_file(os.path.join(base_path, _SPANS_PICKLE_NAME), 'wb') as f:
+       pickle.dump(list(self._id_to_spans.items()), f)
+     self._vector_store.save(os.path.join(base_path, self._vector_store.name))
+
+   def add(self, all_spans: list[tuple[PathKey, list[tuple[int, int]]]],
+           embeddings: np.ndarray) -> None:
+     """Add the given spans and embeddings.
+
+     Args:
+       all_spans: The spans to initialize the index with.
+       embeddings: The embeddings to initialize the index with.
+     """
+     assert not self._id_to_spans, 'Cannot add to a non-empty index.'
+     self._id_to_spans.update(all_spans)
+     vector_keys = [(*path_key, i) for path_key, spans in all_spans for i in range(len(spans))]
+     assert len(vector_keys) == len(embeddings), (
+       f'Number of spans ({len(vector_keys)}) and embeddings ({len(embeddings)}) must match.')
+     self._vector_store.add(vector_keys, embeddings)
+
+   def get_vector_store(self) -> VectorStore:
+     """Return the underlying vector store."""
+     return self._vector_store
+
+   def get(self, keys: Iterable[PathKey]) -> Iterable[list[SpanVector]]:
+     """Return the spans with vectors for each key in `keys`.
+
+     Args:
+       keys: The keys to return the vectors for.
+
+     Returns:
+       The span vectors for the given keys.
+     """
+     all_spans: list[list[tuple[int, int]]] = []
+     vector_keys: list[VectorKey] = []
+     for path_key in keys:
+       spans = self._id_to_spans[path_key]
+       all_spans.append(spans)
+       vector_keys.extend([(*path_key, i) for i in range(len(spans))])
+
+     all_vectors = self._vector_store.get(vector_keys)
+     offset = 0
+     for spans in all_spans:
+       vectors = all_vectors[offset:offset + len(spans)]
+       yield [{'span': span, 'vector': vector} for span, vector in zip(spans, vectors)]
+       offset += len(spans)
+
+   def topk(self,
+            query: np.ndarray,
+            k: int,
+            path_keys: Optional[Iterable[PathKey]] = None) -> list[tuple[PathKey, float]]:
+     """Return the top k most similar vectors.
+
+     Args:
+       query: The query vector.
+       k: The number of results to return.
+       path_keys: Optional key prefixes to restrict the search to.
+
+     Returns:
+       A list of (key, score) tuples.
+     """
+     span_keys: Optional[list[VectorKey]] = None
+     if path_keys is not None:
+       span_keys = [
+         (*path_key, i) for path_key in path_keys for i in range(len(self._id_to_spans[path_key]))
+       ]
+     # Start at 0 so the first iteration fetches `k` span keys; each retry widens the search by
+     # `k`. Starting at `k` would skip the loop entirely whenever k >= the store size.
+     span_k = 0
+     path_key_scores: dict[PathKey, float] = {}
+     total_num_span_keys = self._vector_store.size()
+     while (len(path_key_scores) < k and span_k < total_num_span_keys and
+            (not span_keys or span_k < len(span_keys))):
+       span_k += k
+       vector_key_scores = self._vector_store.topk(query, span_k, span_keys)
+       for (*path_key_list, _), score in vector_key_scores:
+         path_key = tuple(path_key_list)
+         if path_key not in path_key_scores:
+           path_key_scores[path_key] = score
+
+     return list(path_key_scores.items())[:k]
+
+
+ VECTOR_STORE_REGISTRY: dict[str, Type[VectorStore]] = {}
+
+
+ def register_vector_store(vector_store_cls: Type[VectorStore]) -> None:
+   """Register a vector store in the global registry."""
+   if vector_store_cls.name in VECTOR_STORE_REGISTRY:
+     raise ValueError(f'Vector store "{vector_store_cls.name}" has already been registered!')
+
+   VECTOR_STORE_REGISTRY[vector_store_cls.name] = vector_store_cls
+
+
+ def get_vector_store_cls(vector_store_name: str) -> Type[VectorStore]:
+   """Return a registered vector store given the name in the registry."""
+   return VECTOR_STORE_REGISTRY[vector_store_name]
+
+
+ def clear_vector_store_registry() -> None:
+   """Clear the vector store registry."""
+   VECTOR_STORE_REGISTRY.clear()
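To make the path-key to span-key mapping concrete, here is a small standalone sketch (it assumes the 'numpy' store has already been registered via `register_vector_store`, which the package does elsewhere, in default_vector_stores.py):

```python
import numpy as np

from lilac.embeddings.vector_store import VectorDBIndex

# Two documents: ('row1', 0) has two text spans, ('row2', 0) has one.
# Embedding rows line up with the spans, in order.
index = VectorDBIndex('numpy')
index.add([(('row1', 0), [(0, 5), (6, 11)]), (('row2', 0), [(0, 7)])],
          np.random.rand(3, 4))

# The store now holds span keys ('row1', 0, 0), ('row1', 0, 1) and
# ('row2', 0, 0). `get` re-groups the vectors per path key:
[span_vectors] = list(index.get([('row1', 0)]))
assert [sv['span'] for sv in span_vectors] == [(0, 5), (6, 11)]
```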
lilac/embeddings/vector_store_hnsw.py ADDED
@@ -0,0 +1,112 @@
+ """HNSW vector store."""
+
+ import multiprocessing
+ from typing import Iterable, Optional, Set, cast
+
+ import hnswlib
+ import numpy as np
+ import pandas as pd
+ from typing_extensions import override
+
+ from ..schema import VectorKey
+ from ..utils import DebugTimer
+ from .vector_store import VectorStore
+
+ _HNSW_SUFFIX = '.hnswlib.bin'
+ _LOOKUP_SUFFIX = '.lookup.pkl'
+
+ # Parameters for the HNSW index: https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
+ QUERY_EF = 50
+ CONSTRUCTION_EF = 100
+ M = 16
+ SPACE = 'ip'
+
+
+ class HNSWVectorStore(VectorStore):
+   """HNSW-backed vector store."""
+
+   name = 'hnsw'
+
+   def __init__(self) -> None:
+     # Maps a `VectorKey` to a label (row index) in the hnswlib index.
+     self._key_to_label: Optional[pd.Series] = None
+     self._index: Optional[hnswlib.Index] = None
+
+   @override
+   def save(self, base_path: str) -> None:
+     assert self._key_to_label is not None and self._index is not None, (
+       'The vector store has no embeddings. Call load() or add() first.')
+     self._index.save_index(base_path + _HNSW_SUFFIX)
+     self._key_to_label.to_pickle(base_path + _LOOKUP_SUFFIX)
+
+   @override
+   def load(self, base_path: str) -> None:
+     self._key_to_label = pd.read_pickle(base_path + _LOOKUP_SUFFIX)
+     dim = int(self._key_to_label.name)
+     index = hnswlib.Index(space=SPACE, dim=dim)
+     index.set_ef(QUERY_EF)
+     index.set_num_threads(multiprocessing.cpu_count())
+     index.load_index(base_path + _HNSW_SUFFIX)
+     self._index = index
+
+   @override
+   def size(self) -> int:
+     assert self._index is not None, (
+       'The vector store has no embeddings. Call load() or add() first.')
+     return self._index.get_current_count()
+
+   @override
+   def add(self, keys: list[VectorKey], embeddings: np.ndarray) -> None:
+     assert self._index is None, (
+       'Embeddings already exist in this store. Upsert is not yet supported.')
+
+     if len(keys) != embeddings.shape[0]:
+       raise ValueError(
+         f'Length of keys ({len(keys)}) does not match number of embeddings {embeddings.shape[0]}.')
+
+     dim = embeddings.shape[1]
+     with DebugTimer('hnswlib index creation'):
+       index = hnswlib.Index(space=SPACE, dim=dim)
+       index.set_ef(QUERY_EF)
+       index.set_num_threads(multiprocessing.cpu_count())
+       index.init_index(max_elements=len(keys), ef_construction=CONSTRUCTION_EF, M=M)
+
+       # Cast to float32 since dot product with float32 is 40-50x faster than float16 and 2.5x
+       # faster than float64.
+       embeddings = embeddings.astype(np.float32)
+       row_indices = np.arange(len(keys), dtype=np.int32)
+       self._key_to_label = pd.Series(row_indices, index=keys, dtype=np.int32)
+       self._key_to_label.name = str(dim)
+       index.add_items(embeddings, row_indices)
+     self._index = index
+
+   @override
+   def get(self, keys: Optional[Iterable[VectorKey]] = None) -> np.ndarray:
+     assert self._index is not None and self._key_to_label is not None, (
+       'No embeddings exist in this store.')
+     if not keys:
+       return np.array(self._index.get_items(self._key_to_label.values), dtype=np.float32)
+     locs = self._key_to_label.loc[cast(list[str], keys)].values
+     return np.array(self._index.get_items(locs), dtype=np.float32)
+
+   @override
+   def topk(self,
+            query: np.ndarray,
+            k: int,
+            keys: Optional[Iterable[VectorKey]] = None) -> list[tuple[VectorKey, float]]:
+     assert self._index is not None and self._key_to_label is not None, (
+       'No embeddings exist in this store.')
+     labels: Set[int] = set()
+     if keys is not None:
+       labels = set(self._key_to_label.loc[cast(list[str], keys)].tolist())
+       k = min(k, len(labels))
+
+     def filter_func(label: int) -> bool:
+       return label in labels
+
+     query = np.expand_dims(query.astype(np.float32), axis=0)
+     locs, dists = self._index.knn_query(query, k=k, filter=filter_func if labels else None)
+     locs = locs[0]
+     dists = dists[0]
+     topk_keys = self._key_to_label.index.values[locs]
+     return [(key, 1 - dist) for key, dist in zip(topk_keys, dists)]
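A note on the final line above: hnswlib's 'ip' space reports distances as 1 minus the inner product, so `1 - dist` converts back to a similarity score. A tiny standalone check of that convention (not part of the diff):

```python
import hnswlib
import numpy as np

# Three orthonormal vectors; with space='ip' the reported distance is
# 1 - <query, x>, so similarity == 1 - distance, as in topk() above.
data = np.eye(3, dtype=np.float32)
index = hnswlib.Index(space='ip', dim=3)
index.init_index(max_elements=3)
index.add_items(data, np.arange(3))
labels, dists = index.knn_query(data[0:1], k=1)
assert labels[0][0] == 0                      # Best match is vector 0 itself.
assert abs((1 - dists[0][0]) - 1.0) < 1e-6    # Recovered similarity is 1.0.
```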
lilac/embeddings/vector_store_numpy.py ADDED
@@ -0,0 +1,92 @@
+ """NumpyVectorStore class for storing vectors in numpy arrays."""
+
+ from typing import Iterable, Optional, cast
+
+ import numpy as np
+ import pandas as pd
+ from typing_extensions import override
+
+ from ..schema import VectorKey
+ from .vector_store import VectorStore
+
+ _EMBEDDINGS_SUFFIX = '.matrix.npy'
+ _LOOKUP_SUFFIX = '.lookup.pkl'
+
+
+ class NumpyVectorStore(VectorStore):
+   """Stores vectors as in-memory np arrays."""
+
+   name = 'numpy'
+
+   def __init__(self) -> None:
+     self._embeddings: Optional[np.ndarray] = None
+     # Maps a `VectorKey` to a row index in `_embeddings`.
+     self._key_to_index: Optional[pd.Series] = None
+
+   @override
+   def size(self) -> int:
+     assert self._embeddings is not None, (
+       'The vector store has no embeddings. Call load() or add() first.')
+     return len(self._embeddings)
+
+   @override
+   def save(self, base_path: str) -> None:
+     assert self._embeddings is not None and self._key_to_index is not None, (
+       'The vector store has no embeddings. Call load() or add() first.')
+     np.save(base_path + _EMBEDDINGS_SUFFIX, self._embeddings, allow_pickle=False)
+     self._key_to_index.to_pickle(base_path + _LOOKUP_SUFFIX)
+
+   @override
+   def load(self, base_path: str) -> None:
+     self._embeddings = np.load(base_path + _EMBEDDINGS_SUFFIX, allow_pickle=False)
+     self._key_to_index = pd.read_pickle(base_path + _LOOKUP_SUFFIX)
+
+   @override
+   def add(self, keys: list[VectorKey], embeddings: np.ndarray) -> None:
+     # Use explicit `is not None` checks: truth-testing a non-empty ndarray or Series raises.
+     if self._embeddings is not None or self._key_to_index is not None:
+       raise ValueError('Embeddings already exist in this store. Upsert is not yet supported.')
+
+     if len(keys) != embeddings.shape[0]:
+       raise ValueError(
+         f'Length of keys ({len(keys)}) does not match number of embeddings {embeddings.shape[0]}.')
+
+     # Cast to float32 since dot product with float32 is 40-50x faster than float16 and 2.5x
+     # faster than float64.
+     self._embeddings = embeddings.astype(np.float32)
+     row_indices = np.arange(len(embeddings), dtype=np.uint32)
+     self._key_to_index = pd.Series(row_indices, index=keys, dtype=np.uint32)
+
+   @override
+   def get(self, keys: Optional[Iterable[VectorKey]] = None) -> np.ndarray:
+     assert self._embeddings is not None and self._key_to_index is not None, (
+       'The vector store has no embeddings. Call load() or add() first.')
+     if not keys:
+       return self._embeddings
+     locs = self._key_to_index.loc[cast(list[str], keys)]
+     return self._embeddings.take(locs, axis=0)
+
+   @override
+   def topk(self,
+            query: np.ndarray,
+            k: int,
+            keys: Optional[Iterable[VectorKey]] = None) -> list[tuple[VectorKey, float]]:
+     assert self._embeddings is not None and self._key_to_index is not None, (
+       'The vector store has no embeddings. Call load() or add() first.')
+     if keys is not None:
+       row_indices = self._key_to_index.loc[cast(list[str], keys)]
+       embeddings = self._embeddings.take(row_indices, axis=0)
+       keys = list(keys)
+     else:
+       keys, embeddings = cast(list[VectorKey], self._key_to_index.index.tolist()), self._embeddings
+
+     query = query.astype(embeddings.dtype)
+     similarities: np.ndarray = np.dot(embeddings, query).reshape(-1)
+     k = min(k, len(similarities))
+
+     # We do a partition + sort of only the top k to save time: O(n + klogk) instead of O(nlogn).
+     indices = np.argpartition(similarities, -k)[-k:]
+     # Indices sorted by value from largest to smallest.
+     indices = indices[np.argsort(similarities[indices])][::-1]
+
+     topk_similarities = similarities[indices]
+     topk_keys = [keys[idx] for idx in indices]
+     return list(zip(topk_keys, topk_similarities))
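The argpartition trick at the end deserves a standalone illustration: `np.argpartition` moves the k largest similarities into the last k slots in O(n), and only those k then get fully sorted.

```python
import numpy as np

similarities = np.array([0.1, 0.9, 0.4, 0.7, 0.2])
k = 2
# O(n): the two largest values land (in arbitrary order) in the last two slots.
indices = np.argpartition(similarities, -k)[-k:]
# O(k log k): sort just those two, largest first.
indices = indices[np.argsort(similarities[indices])][::-1]
print(indices, similarities[indices])  # [1 3] [0.9 0.7]
```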