# Provenance (Hugging Face file page): nikhil_staging/data/lilac.yml, 25 kB,
# commit be17f2e "Upload data/lilac.yml with huggingface_hub"
# (account shown on the page: nsthorat-lilac).
# Lilac project config.
# See https://lilacml.com/api_reference/index.html#lilac.Config for details.
datasets:
# Dataset local/glue: Hugging Face `glue`, config `ax`.
- namespace: local
  name: glue
  source:
    dataset_name: glue
    config_name: ax
    source_name: huggingface
  # `premise` is listed with two embeddings (gte-small and gte-base);
  # `hypothesis` with gte-small only.
  embeddings:
  - path: premise
    embedding: gte-small
  - path: premise
    embedding: gte-base
  - path: hypothesis
    embedding: gte-small
  signals:
  # PII on both text columns.
  - path: premise
    signal:
      signal_name: pii
  - path: hypothesis
    signal:
      signal_name: pii
  # Text statistics on `premise` only.
  - path: premise
    signal:
      signal_name: text_statistics
  settings:
    ui:
      media_paths:
      - premise
      markdown_paths: []
# Dataset local/glue_ax: Hugging Face `glue`, config `ax`.
- namespace: local
  name: glue_ax
  source:
    dataset_name: glue
    config_name: ax
    source_name: huggingface
  embeddings:
  - path: hypothesis
    embedding: gte-small
  signals:
  # Signals on `premise`.
  - path: premise
    signal:
      signal_name: text_statistics
  - path: premise
    signal:
      signal_name: pii
  - path: premise
    signal:
      signal_name: near_dup
  # Concept scores on `hypothesis` (gte-small embedding).
  # A former concept_score entry whose `namespace` and `concept_name` were both
  # '' was removed here: it did not name any concept.
  - path: hypothesis
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: positive-sentiment
      signal_name: concept_score
  - path: hypothesis
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  settings:
    ui:
      media_paths:
      - hypothesis
      markdown_paths: []
# Dataset local/imdb3: Hugging Face `imdb`; no embeddings or signals listed.
- namespace: local
  name: imdb3
  source:
    dataset_name: imdb
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - text
      markdown_paths: []
# Dataset local/imdb: Hugging Face `imdb`, with a gte-small embedding and
# two signals on `text`.
- namespace: local
  name: imdb
  source:
    dataset_name: imdb
    source_name: huggingface
  embeddings:
  - path: text
    embedding: gte-small
  signals:
  - path: text
    signal:
      signal_name: pii
  - path: text
    signal:
      signal_name: text_statistics
  settings:
    ui:
      media_paths:
      - text
      markdown_paths: []
# Dataset local/imdb2: Hugging Face `imdb`; no embeddings or signals listed.
- namespace: local
  name: imdb2
  source:
    dataset_name: imdb
    source_name: huggingface
  settings:
    ui:
      media_paths:
      - text
      markdown_paths: []
# Dataset lilac/OpenOrca-100k: Hugging Face `Open-Orca/OpenOrca`,
# with `sample_size` set to 100000.
- namespace: lilac
  name: OpenOrca-100k
  source:
    dataset_name: Open-Orca/OpenOrca
    sample_size: 100000
    source_name: huggingface
  embeddings:
  - path: question
    embedding: gte-small
  - path: response
    embedding: gte-small
  signals:
  # ---- Signals on `question` ----
  - path: question
    signal:
      signal_name: near_dup
  - path: question
    signal:
      signal_name: pii
  - path: question
    signal:
      signal_name: lang_detection
  # Concept scores on `question` (namespace `lilac`, gte-small embedding).
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: positive-sentiment
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: question
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: legal-termination
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: question
    signal:
      signal_name: text_statistics
  # ---- Signals on `response` ----
  - path: response
    signal:
      signal_name: near_dup
  - path: response
    signal:
      signal_name: pii
  - path: response
    signal:
      signal_name: lang_detection
  # Concept scores on `response` (namespace `lilac`, gte-small embedding).
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: positive-sentiment
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: question
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: legal-termination
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: response
    signal:
      signal_name: text_statistics
  # ---- Signals on `system_prompt` (PII only) ----
  - path: system_prompt
    signal:
      signal_name: pii
  settings:
    ui:
      media_paths:
      - question
      - response
      markdown_paths: []
# Dataset local/the_movies_dataset: a single CSV file read from a gs:// path.
- namespace: local
  name: the_movies_dataset
  source:
    filepaths:
    - gs://lilac-data/datasets/the_movies_dataset/the_movies_dataset.csv
    # Explicitly empty list, not null.
    names: []
    source_name: csv
  settings:
    ui:
      media_paths:
      - overview
      markdown_paths: []
# Dataset local/glue_ax_parquet: a single Parquet file read from a gs:// path.
- namespace: local
  name: glue_ax_parquet
  source:
    filepaths:
    - gs://lilac-data/datasets/glue_ax_parquet/glue_ax.parquet
    source_name: parquet
  settings:
    ui:
      media_paths:
      - premise
      markdown_paths: []
# Dataset lilac/mmlu_professional_law: Hugging Face `cais/mmlu`,
# config `professional_law`.
# Multi-segment paths are written as flow sequences, e.g. [choices, '*'];
# the '*' segment is quoted so YAML does not read it as an alias.
- namespace: lilac
  name: mmlu_professional_law
  source:
    dataset_name: cais/mmlu
    config_name: professional_law
    source_name: huggingface
  embeddings:
  - path: question
    embedding: gte-small
  - path: [choices, '*']
    embedding: gte-small
  signals:
  # ---- Signals on `question` ----
  - path: question
    signal:
      signal_name: near_dup
  - path: question
    signal:
      signal_name: pii
  - path: question
    signal:
      signal_name: lang_detection
  # Concept scores on `question` (namespace `lilac`, gte-small embedding).
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: positive-sentiment
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: question
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: legal-termination
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: question
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: question
    signal:
      signal_name: text_statistics
  # ---- Signals on `choices.*` ----
  - path: [choices, '*']
    signal:
      signal_name: near_dup
  - path: [choices, '*']
    signal:
      signal_name: pii
  - path: [choices, '*']
    signal:
      signal_name: lang_detection
  # Concept scores on `choices.*` (namespace `lilac`, gte-small embedding).
  - path: [choices, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: positive-sentiment
      signal_name: concept_score
  - path: [choices, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: [choices, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: [choices, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: question
      signal_name: concept_score
  - path: [choices, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: legal-termination
      signal_name: concept_score
  - path: [choices, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: [choices, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: [choices, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: [choices, '*']
    signal:
      signal_name: text_statistics
  settings:
    ui:
      media_paths:
      - question
      - [choices, '*']
      markdown_paths: []
    # NOTE(review): nesting was rebuilt from a flattened copy; confirm that
    # `preferred_embedding` sits under `settings` (sibling of `ui`).
    preferred_embedding: gte-small
# Dataset local/deepset-prompt-inj: Hugging Face `deepset/prompt-injections`;
# gte-small embedding on `text`, no signals listed.
- namespace: local
  name: deepset-prompt-inj
  source:
    dataset_name: deepset/prompt-injections
    source_name: huggingface
  embeddings:
  - path: text
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - text
      markdown_paths: []
# Dataset local/jasper-prompt-inj: Hugging Face `JasperLS/prompt-injections`;
# gte-small embedding on `text`, no signals listed.
- namespace: local
  name: jasper-prompt-inj
  source:
    dataset_name: JasperLS/prompt-injections
    source_name: huggingface
  embeddings:
  - path: text
    embedding: gte-small
  settings:
    ui:
      media_paths:
      - text
      markdown_paths: []
# Dataset local/mosaic-chat-v2: Hugging Face `sam-mosaic/chat-v2`.
- namespace: local
  name: mosaic-chat-v2
  source:
    dataset_name: sam-mosaic/chat-v2
    source_name: huggingface
  embeddings:
  - path: prompt
    embedding: gte-small
  - path: response
    embedding: gte-small
  signals:
  # ---- Signals on `prompt` ----
  - path: prompt
    signal:
      signal_name: near_dup
  - path: prompt
    signal:
      signal_name: pii
  - path: prompt
    signal:
      signal_name: lang_detection
  # Concept scores on `prompt` (namespace `lilac`, gte-small embedding).
  - path: prompt
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: prompt
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: prompt
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: prompt
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: prompt
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: prompt
    signal:
      signal_name: text_statistics
  # ---- Signals on `response` ----
  - path: response
    signal:
      signal_name: near_dup
  - path: response
    signal:
      signal_name: pii
  - path: response
    signal:
      signal_name: lang_detection
  # Concept scores on `response` (namespace `lilac`, gte-small embedding).
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: response
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: response
    signal:
      signal_name: text_statistics
  settings:
    ui:
      media_paths:
      - prompt
      - response
      markdown_paths: []
    # NOTE(review): nesting was rebuilt from a flattened copy; confirm that
    # `preferred_embedding` sits under `settings` (sibling of `ui`).
    preferred_embedding: gte-small
# Dataset local/databricks-dolly-15k-curated-en:
# Hugging Face `argilla/databricks-dolly-15k-curated-en`.
# Multi-segment paths are written as flow sequences, e.g.
# [new-context, value, '*']; the '*' segment is quoted so YAML does not read
# it as an alias.
- namespace: local
  name: databricks-dolly-15k-curated-en
  source:
    dataset_name: argilla/databricks-dolly-15k-curated-en
    source_name: huggingface
  embeddings:
  - path: original-context
    embedding: gte-small
  - path: [new-context, value, '*']
    embedding: gte-small
  - path: original-instruction
    embedding: gte-small
  signals:
  # ---- Signals on `original-instruction` ----
  - path: original-instruction
    signal:
      signal_name: near_dup
  - path: original-instruction
    signal:
      signal_name: pii
  - path: original-instruction
    signal:
      signal_name: lang_detection
  - path: original-instruction
    signal:
      signal_name: text_statistics
  # ---- Signals on `original-context` ----
  - path: original-context
    signal:
      signal_name: near_dup
  - path: original-context
    signal:
      signal_name: pii
  - path: original-context
    signal:
      signal_name: lang_detection
  # Concept scores on `original-context` (namespace `lilac`, gte-small).
  - path: original-context
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: positive-sentiment
      signal_name: concept_score
  - path: original-context
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: original-context
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: original-context
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: question
      signal_name: concept_score
  - path: original-context
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: legal-termination
      signal_name: concept_score
  - path: original-context
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: original-context
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: original-context
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: original-context
    signal:
      signal_name: text_statistics
  # ---- Signals on `original-response` ----
  - path: original-response
    signal:
      signal_name: near_dup
  - path: original-response
    signal:
      signal_name: pii
  - path: original-response
    signal:
      signal_name: lang_detection
  - path: original-response
    signal:
      signal_name: text_statistics
  # ---- Signals on `new-instruction.value.*` ----
  - path: [new-instruction, value, '*']
    signal:
      signal_name: near_dup
  - path: [new-instruction, value, '*']
    signal:
      signal_name: pii
  - path: [new-instruction, value, '*']
    signal:
      signal_name: lang_detection
  - path: [new-instruction, value, '*']
    signal:
      signal_name: text_statistics
  # ---- Signals on `new-context.value.*` ----
  - path: [new-context, value, '*']
    signal:
      signal_name: near_dup
  - path: [new-context, value, '*']
    signal:
      signal_name: pii
  - path: [new-context, value, '*']
    signal:
      signal_name: lang_detection
  # Concept scores on `new-context.value.*` (namespace `lilac`, gte-small).
  - path: [new-context, value, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: positive-sentiment
      signal_name: concept_score
  - path: [new-context, value, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: [new-context, value, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: [new-context, value, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: question
      signal_name: concept_score
  - path: [new-context, value, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: legal-termination
      signal_name: concept_score
  - path: [new-context, value, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: [new-context, value, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: [new-context, value, '*']
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: [new-context, value, '*']
    signal:
      signal_name: text_statistics
  # ---- Signals on `new-response.value.*` ----
  - path: [new-response, value, '*']
    signal:
      signal_name: near_dup
  - path: [new-response, value, '*']
    signal:
      signal_name: pii
  - path: [new-response, value, '*']
    signal:
      signal_name: lang_detection
  - path: [new-response, value, '*']
    signal:
      signal_name: text_statistics
  # Listed last in the original, after all other paths; order kept.
  - path: original-instruction
    signal:
      signal_name: spacy_ner
  settings:
    ui:
      media_paths:
      - original-instruction
      - original-context
      - original-response
      - [new-instruction, value, '*']
      - [new-context, value, '*']
      - [new-response, value, '*']
      markdown_paths: []
    # NOTE(review): nesting was rebuilt from a flattened copy; confirm that
    # `preferred_embedding` sits under `settings` (sibling of `ui`).
    preferred_embedding: gte-small
# Dataset local/open-asssistant-conversations: Hugging Face
# `OpenAssistant/oasst1`.
# NOTE(review): the name looks like a typo for "open-assistant-conversations";
# it is left unchanged because it is presumably how the dataset is referenced
# elsewhere. Confirm before renaming.
- namespace: local
  name: open-asssistant-conversations
  source:
    dataset_name: OpenAssistant/oasst1
    source_name: huggingface
  embeddings:
  - path: text
    embedding: gte-small
  signals:
  - path: text
    signal:
      signal_name: near_dup
  - path: text
    signal:
      signal_name: pii
  - path: text
    signal:
      signal_name: lang_detection
  # Concept scores on `text` (namespace `lilac`, gte-small embedding).
  # The `negative-sentiment` entry was listed twice with identical settings;
  # the duplicate has been removed so each concept appears once.
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: positive-sentiment
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: question
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: legal-termination
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: text
    signal:
      signal_name: text_statistics
  settings:
    ui:
      media_paths:
      - text
      markdown_paths: []
    # NOTE(review): nesting was rebuilt from a flattened copy; confirm that
    # `preferred_embedding` sits under `settings` (sibling of `ui`).
    preferred_embedding: gte-small
# Dataset local/enron-emails: Hugging Face `EleutherAI/pile`, config
# `enron_emails`, with `sample_size` set to 100000.
- namespace: local
  name: enron-emails
  source:
    dataset_name: EleutherAI/pile
    config_name: enron_emails
    sample_size: 100000
    source_name: huggingface
  embeddings:
  - path: text
    embedding: gte-small
  signals:
  - path: text
    signal:
      signal_name: near_dup
  - path: text
    signal:
      signal_name: pii
  - path: text
    signal:
      signal_name: lang_detection
  # Concept scores on `text` (namespace `lilac`, gte-small embedding).
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: positive-sentiment
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: non-english
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: toxicity
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: question
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: legal-termination
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: source-code
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: negative-sentiment
      signal_name: concept_score
  - path: text
    signal:
      embedding: gte-small
      namespace: lilac
      concept_name: profanity
      signal_name: concept_score
  - path: text
    signal:
      signal_name: text_statistics
  settings:
    ui:
      media_paths:
      - text
      markdown_paths: []
    # NOTE(review): nesting was rebuilt from a flattened copy; confirm that
    # `preferred_embedding` sits under `settings` (sibling of `ui`).
    preferred_embedding: gte-small