# Header captured from the Hugging Face file viewer (not part of the config):
#   mikeion / data / lilac.yml
#   nsthorat-lilac's picture
#   Push to HF space
#   d3f0f8f verified
datasets:
# local/OpenHermes-2.5-10k: Hugging Face dataset teknium/OpenHermes-2.5,
# loaded with sample_size 9999.
- namespace: local
  name: OpenHermes-2.5-10k
  source:
    dataset_name: teknium/OpenHermes-2.5
    sample_size: 9999
    source_name: huggingface
  embeddings:
  # gte-small embedding over conversations.*.value.
  - path: [conversations, '*', value]
    embedding: gte-small
  settings:
    ui:
      # NOTE(review): both `test__clusters` and `test__cluster` are listed;
      # confirm that both fields exist in this dataset.
      media_paths:
      - [test__clusters, text]
      - [conversations, '*', value]
      - [test__cluster, text]
      markdown_paths: []
# local/OpenOrca-100k: Hugging Face dataset Open-Orca/OpenOrca,
# loaded with sample_size 100000.
- namespace: local
  name: OpenOrca-100k
  source:
    dataset_name: Open-Orca/OpenOrca
    sample_size: 100000
    source_name: huggingface
  embeddings:
  - path: question
    embedding: openai
  signals:
  # concept_score for concept local/physics (version 21) on `question`,
  # using the openai embedding.
  - path: question
    signal:
      embedding: openai
      namespace: local
      concept_name: physics
      version: 21
      signal_name: concept_score
  - path: question
    signal: {signal_name: text_statistics}
  settings:
    ui:
      media_paths:
      - question
      - response
      markdown_paths: []
# local/glue_ax: Hugging Face dataset `glue`, config `ax`.
- namespace: local
  name: glue_ax
  source:
    dataset_name: glue
    config_name: ax
    source_name: huggingface
  embeddings:
  # gte-small embeddings for both `premise` and `hypothesis`.
  - path: premise
    embedding: gte-small
  - path: hypothesis
    embedding: gte-small
  signals:
  - path: premise
    signal: {signal_name: text_statistics}
  settings:
    ui:
      # Only `premise` is listed as a media path.
      media_paths:
      - premise
      markdown_paths: []
# local/ableton: dataset backed by the `llama_index_docs` source.
- namespace: local
  name: ableton
  source:
    # NOTE(review): only source_name is recorded for this source; confirm
    # the dataset can be reloaded from this config alone.
    source_name: llama_index_docs
  embeddings:
  - path: text
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - text
      markdown_paths: []
# local/Capybara: Hugging Face dataset LDJnr/Capybara.
- namespace: local
  name: Capybara
  source:
    dataset_name: LDJnr/Capybara
    source_name: huggingface
  embeddings:
  - path: [conversation, '*', input]
    embedding: gte-small
  signals:
  # Both signals target conversation.*.input.
  - path: [conversation, '*', input]
    signal: {signal_name: text_statistics}
  - path: [conversation, '*', input]
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  settings:
    ui:
      # NOTE(review): a top-level `input` path is listed next to
      # conversation.*.input; confirm that field exists.
      media_paths:
      - input
      - [conversation, '*', input]
      - [conversation, '*', output]
      markdown_paths: []
# local/OpenOrca-10k: Hugging Face dataset Open-Orca/OpenOrca,
# loaded with sample_size 10000.
- namespace: local
  name: OpenOrca-10k
  source:
    dataset_name: Open-Orca/OpenOrca
    sample_size: 10000
    source_name: huggingface
  embeddings:
  # Only `response` has an embedding configured here.
  - path: response
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - question
      - response
      markdown_paths: []
# local/cpb: Hugging Face dataset LDJnr/Capybara (no embeddings configured).
- namespace: local
  name: cpb
  source:
    dataset_name: LDJnr/Capybara
    source_name: huggingface
  signals:
  - path: [conversation, '*', input]
    signal: {signal_name: text_statistics}
  settings:
    ui:
      # NOTE(review): a top-level `input` path is listed next to
      # conversation.*.input; confirm that field exists.
      media_paths:
      - input
      - [conversation, '*', input]
      - [conversation, '*', output]
      markdown_paths: []
# local/mikeion_dissertation_data_with_split: Hugging Face dataset
# mikeion/dissertation_data_with_split; no embeddings or signals configured.
- namespace: local
  name: mikeion_dissertation_data_with_split
  source:
    dataset_name: mikeion/dissertation_data_with_split
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - content
      markdown_paths: []
# local/mikeion_dissertation_data: Hugging Face dataset
# mikeion/dissertation_data; no embeddings or signals configured.
- namespace: local
  name: mikeion_dissertation_data
  source:
    dataset_name: mikeion/dissertation_data
    source_name: huggingface
  settings:
    ui:
      # Single media path: messages.*.attachments.*.url
      media_paths:
      - [messages, '*', attachments, '*', url]
      markdown_paths: []
# local/test: dataset read by the `json` source from a local file.
- namespace: local
  name: test
  source:
    # NOTE(review): home-relative path; confirm it resolves in the
    # environment where this config is loaded.
    filepaths:
    - '~/Code/lilac_datasets/test.json'
    source_name: json
  settings:
    ui:
      media_paths:
      - json
      markdown_paths: []
# local/OrcaMyles: Hugging Face dataset Open-Orca/OpenOrca with no
# sample_size, embeddings, or signals configured.
- namespace: local
  name: OrcaMyles
  source:
    dataset_name: Open-Orca/OpenOrca
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - question
      - response
      markdown_paths: []
# local/OpenOrca: Hugging Face dataset Open-Orca/OpenOrca with no
# sample_size configured.
- namespace: local
  name: OpenOrca
  source:
    dataset_name: Open-Orca/OpenOrca
    source_name: huggingface
  embeddings:
  - path: question
    embedding: gte-small
  - path: response
    embedding: gte-small
  signals:
  # Signals on `question`.
  - path: question
    signal:
      signal_name: pii
  - path: question
    signal:
      signal_name: text_statistics
  # Signals on `response`.
  - path: response
    signal:
      signal_name: pii
  - path: response
    signal:
      signal_name: markdown_code_block
  - path: response
    signal:
      signal_name: text_statistics
  # concept_score for lilac/non-english on `question` (gte-small).
  # Listed once: identical repeated entries add no information.
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  settings:
    ui:
      media_paths:
      - question
      - response
      markdown_paths: []
# local/imdb: Hugging Face dataset `imdb`.
- namespace: local
  name: imdb
  source:
    dataset_name: imdb
    source_name: huggingface
  embeddings:
  - path: text
    embedding: gte-small
  signals:
  - path: text
    signal: {signal_name: pii}
  # NOTE(review): text_statistics targets `label`, not `text`; confirm
  # this is the intended field.
  - path: label
    signal: {signal_name: text_statistics}
  settings:
    ui:
      media_paths:
      - text
      markdown_paths: []
# local/capybara: Hugging Face source with dataset_name `capybara`.
- namespace: local
  name: capybara
  source:
    # NOTE(review): no org prefix here, while other entries in this file
    # use `LDJnr/Capybara`; confirm this dataset id resolves.
    dataset_name: capybara
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - [conversation, '*', input]
      - [conversation, '*', output]
      markdown_paths: []
# local/db-openorca-10k: Hugging Face dataset Open-Orca/OpenOrca,
# loaded with sample_size 10000.
- namespace: local
  name: db-openorca-10k
  source:
    dataset_name: Open-Orca/OpenOrca
    sample_size: 10000
    source_name: huggingface
  embeddings:
  - path: question
    embedding: gte-small
  signals:
  # concept_score for concept local/physics on `question` (gte-small).
  - path: question
    signal:
      embedding: gte-small
      namespace: local
      concept_name: physics
      signal_name: concept_score
  settings:
    ui:
      media_paths:
      - question
      - response
      - [question__cluster, text]
      markdown_paths: []