---
# Dataset registry.
#
# Each entry under `datasets` describes one dataset:
#   namespace / name  - identifier of the dataset
#   source            - where the data is loaded from (`source_name` selects
#                       the loader; the remaining keys are loader arguments)
#   embeddings        - optional; (path, embedding) pairs
#   signals           - optional; (path, signal) pairs
#   settings.ui       - fields listed as media / markdown paths
#
# A path is either a single field name or a list of path segments; '*' is
# quoted because a bare * would be read as a YAML alias.
datasets:
  - namespace: local
    name: OpenHermes-2.5-10k
    source:
      dataset_name: teknium/OpenHermes-2.5
      sample_size: 9999
      source_name: huggingface
    embeddings:
      - path: [conversations, '*', value]
        embedding: gte-small
    settings:
      ui:
        # NOTE(review): `test__clusters` and `test__cluster` differ only by
        # the trailing "s" - confirm both fields really exist.
        media_paths:
          - [test__clusters, text]
          - [conversations, '*', value]
          - [test__cluster, text]
        markdown_paths: []

  - namespace: local
    name: OpenOrca-100k
    source:
      dataset_name: Open-Orca/OpenOrca
      sample_size: 100000
      source_name: huggingface
    embeddings:
      - path: question
        embedding: openai
    signals:
      - path: question
        signal:
          embedding: openai
          namespace: local
          concept_name: physics
          version: 21
          signal_name: concept_score
      - path: question
        signal:
          signal_name: text_statistics
    settings:
      ui:
        media_paths:
          - question
          - response
        markdown_paths: []

  - namespace: local
    name: glue_ax
    source:
      dataset_name: glue
      config_name: ax
      source_name: huggingface
    embeddings:
      - path: premise
        embedding: gte-small
      - path: hypothesis
        embedding: gte-small
    signals:
      - path: premise
        signal:
          signal_name: text_statistics
    settings:
      ui:
        media_paths:
          - premise
        markdown_paths: []

  - namespace: local
    name: ableton
    source:
      source_name: llama_index_docs
    embeddings:
      - path: text
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - text
        markdown_paths: []

  - namespace: local
    name: Capybara
    source:
      dataset_name: LDJnr/Capybara
      source_name: huggingface
    embeddings:
      - path: [conversation, '*', input]
        embedding: gte-small
    signals:
      - path: [conversation, '*', input]
        signal:
          signal_name: text_statistics
      - path: [conversation, '*', input]
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
    settings:
      ui:
        # NOTE(review): a bare `input` is listed next to the nested
        # conversation.*.input path - confirm a top-level `input` exists.
        media_paths:
          - input
          - [conversation, '*', input]
          - [conversation, '*', output]
        markdown_paths: []

  - namespace: local
    name: OpenOrca-10k
    source:
      dataset_name: Open-Orca/OpenOrca
      sample_size: 10000
      source_name: huggingface
    embeddings:
      - path: response
        embedding: gte-small
    settings:
      ui:
        media_paths:
          - question
          - response
        markdown_paths: []

  - namespace: local
    name: cpb
    source:
      dataset_name: LDJnr/Capybara
      source_name: huggingface
    signals:
      - path: [conversation, '*', input]
        signal:
          signal_name: text_statistics
    settings:
      ui:
        # NOTE(review): same bare `input` entry as the `Capybara` dataset -
        # confirm a top-level `input` field exists.
        media_paths:
          - input
          - [conversation, '*', input]
          - [conversation, '*', output]
        markdown_paths: []

  - namespace: local
    name: mikeion_dissertation_data_with_split
    source:
      dataset_name: mikeion/dissertation_data_with_split
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - content
        markdown_paths: []

  - namespace: local
    name: mikeion_dissertation_data
    source:
      dataset_name: mikeion/dissertation_data
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - [messages, '*', attachments, '*', url]
        markdown_paths: []

  - namespace: local
    name: test
    source:
      filepaths:
        # Quoted because the value starts with `~`; the string is unchanged.
        # This is a per-user home-directory path.
        - '~/Code/lilac_datasets/test.json'
      source_name: json
    settings:
      ui:
        media_paths:
          - json
        markdown_paths: []

  - namespace: local
    name: OrcaMyles
    source:
      dataset_name: Open-Orca/OpenOrca
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - question
          - response
        markdown_paths: []

  - namespace: local
    name: OpenOrca
    source:
      dataset_name: Open-Orca/OpenOrca
      source_name: huggingface
    embeddings:
      - path: question
        embedding: gte-small
      - path: response
        embedding: gte-small
    signals:
      - path: question
        signal:
          signal_name: pii
      - path: question
        signal:
          signal_name: text_statistics
      - path: response
        signal:
          signal_name: pii
      - path: response
        signal:
          signal_name: markdown_code_block
      - path: response
        signal:
          signal_name: text_statistics
      # Listed once: repeating an identical (path, signal) pair adds nothing.
      - path: question
        signal:
          embedding: gte-small
          namespace: lilac
          concept_name: non-english
          signal_name: concept_score
    settings:
      ui:
        media_paths:
          - question
          - response
        markdown_paths: []

  - namespace: local
    name: imdb
    source:
      dataset_name: imdb
      source_name: huggingface
    embeddings:
      - path: text
        embedding: gte-small
    signals:
      - path: text
        signal:
          signal_name: pii
      # NOTE(review): text_statistics targets `label`, while every other
      # signal/embedding here targets `text` - confirm this is intended.
      - path: label
        signal:
          signal_name: text_statistics
    settings:
      ui:
        media_paths:
          - text
        markdown_paths: []

  - namespace: local
    name: capybara
    source:
      # NOTE(review): no owner prefix here, unlike `LDJnr/Capybara` used by
      # the `Capybara` and `cpb` entries - confirm this id resolves.
      dataset_name: capybara
      source_name: huggingface
    settings:
      ui:
        media_paths:
          - [conversation, '*', input]
          - [conversation, '*', output]
        markdown_paths: []

  - namespace: local
    name: db-openorca-10k
    source:
      dataset_name: Open-Orca/OpenOrca
      sample_size: 10000
      source_name: huggingface
    embeddings:
      - path: question
        embedding: gte-small
    signals:
      - path: question
        signal:
          embedding: gte-small
          namespace: local
          concept_name: physics
          signal_name: concept_score
    settings:
      ui:
        media_paths:
          - question
          - response
          - [question__cluster, text]
        markdown_paths: []