Add model

Browse files

Files changed (10) hide show

.gitattributes +1 -0
README.md +7 -44
config.json +9 -4
model_head.pkl +1 -1
pytorch_model.bin +2 -2
sentence_bert_config.json +1 -1
sentencepiece.bpe.model +3 -0
special_tokens_map.json +1 -1
tokenizer.json +0 -0
tokenizer_config.json +7 -54

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,31 +1,18 @@
 ---
 tags:
 - setfit
 - sentence-transformers
 - text-classification
 pipeline_tag: text-classification
-datasets:
-- mserras/alpaca-es-hackaton
-- somosnlp/somos-clean-alpaca-es
-language:
-- es
 ---
 # mserras/setfit-alpaca-es-unprocessable-sample-detection
-This is a [SetFit model](https://github.com/huggingface/setfit) that can be used for filtering the Alpaca ES instruction dataset.
-The base model is the multilingual model of [Paraphrase mpnet base v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) from Sentence Transformers.
- This model has been developed during the 2023 Hackathon organized by [SomosNLP](https://somosnlp.org/)/[HF Card](https://huggingface.co/somosnlp) and with the GPUs provided by [Q Blocks](https://www.qblocks.cloud).
-This model has been trained over "unprocessable" samples of the translated [Clean Alpaca Es](https://huggingface.co/datasets/somosnlp/somos-clean-alpaca-es) dataset from
-the HF [Argilla](https://argilla.io) space https://huggingface.co/spaces/mserras/somos-alpaca-es.
-To this end, a custom tag is proposed: "unprocessable", which corresponds to instruction/input/output triplets that require processing images, fetching information from the
-open web, and similar tasks that the LLM has no capability to perform, thus ending in hallucinations or strange outcomes.
-As this model was trained over samples of Alpaca, which were generated using ChatGPT3.5, this model **cannot be used for commercial purposes or to compete against OpenAI**.
 ## Usage
@@ -39,37 +26,13 @@ You can then run inference as follows:
 ```python
 from setfit import SetFitModel
-import argilla as rg
 # Download from Hub and run inference
 model = SetFitModel.from_pretrained("mserras/setfit-alpaca-es-unprocessable-sample-detection")
-def instruct_fields_to_text(field_instruction: str, field_input: str, field_output: str):
-    """Given the instruction, input and output fields, return a text to be used by setfit"""
-    return f"INSTRUCTION:\n{field_instruction}\nINPUT:\n{field_input}\nOUTPUT:\n{field_output}\n"
-def sample_to_text(sample: rg.TextClassificationRecord) -> str:
-    """Converts and Argilla TextClassificationRecord to a text to be used by setfit"""
-    return instruct_fields_to_text(sample.inputs["1-instruction"], sample.inputs["2-input"], sample.inputs["3-output"])
-# For a given Argilla record:
-unprocessable_score = model.predict_proba([sample_to_text(argilla_record)])[0].tolist()[1]
 ```
-## Evaluation
-*Disclaimer*: There was no formal evaluation done, just a bunch of guys looking at the data & the outcomes.
-## Changelog
-- [09/04/2023] SQL code generation, date conversion, percentage discounts and renewable energies no longer detected as unprocessable.
-- [06/04/2023] It no longer detects password generation as unprocessable.
-## Authors
 ## BibTeX entry and citation info
 ```bibtex
@@ -83,4 +46,4 @@ publisher = {arXiv},
 year = {2022},
 copyright = {Creative Commons Attribution 4.0 International}
 }
-```

 ---
+license: apache-2.0
 tags:
 - setfit
 - sentence-transformers
 - text-classification
 pipeline_tag: text-classification
 ---
 # mserras/setfit-alpaca-es-unprocessable-sample-detection
+This is a [SetFit model](https://github.com/huggingface/setfit) that can be used for text classification. The model has been trained using an efficient few-shot learning technique that involves:
+1. Fine-tuning a [Sentence Transformer](https://www.sbert.net) with contrastive learning.
+2. Training a classification head with features from the fine-tuned Sentence Transformer.
 ## Usage
 ```python
 from setfit import SetFitModel
 # Download from Hub and run inference
 model = SetFitModel.from_pretrained("mserras/setfit-alpaca-es-unprocessable-sample-detection")
+# Run inference
+preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
 ```
 ## BibTeX entry and citation info
 ```bibtex
 year = {2022},
 copyright = {Creative Commons Attribution 4.0 International}
 }
+```

config.json CHANGED Viewed

@@ -1,11 +1,13 @@
 {
   "_name_or_path": "/home/mserras/Downloads/setfit-model/backup-model-setfit-unprocessable/",
   "architectures": [
-    "MPNetModel"
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
   "eos_token_id": 2,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
@@ -13,12 +15,15 @@
   "intermediate_size": 3072,
   "layer_norm_eps": 1e-05,
   "max_position_embeddings": 514,
-  "model_type": "mpnet",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
   "pad_token_id": 1,
-  "relative_attention_num_buckets": 32,
   "torch_dtype": "float32",
   "transformers_version": "4.27.4",
-  "vocab_size": 30527
 }

 {
   "_name_or_path": "/home/mserras/Downloads/setfit-model/backup-model-setfit-unprocessable/",
   "architectures": [
+    "XLMRobertaModel"
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
+  "classifier_dropout": null,
   "eos_token_id": 2,
+  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "intermediate_size": 3072,
   "layer_norm_eps": 1e-05,
   "max_position_embeddings": 514,
+  "model_type": "xlm-roberta",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
+  "output_past": true,
   "pad_token_id": 1,
+  "position_embedding_type": "absolute",
   "torch_dtype": "float32",
   "transformers_version": "4.27.4",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
 }

model_head.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:72f98a0af212cef0dcf5978c5f04d30a14a55476e346e877a201354eb4fa2ee6
 size 6991

 version https://git-lfs.github.com/spec/v1
+oid sha256:fad7d4c0ce4f486ba42180d19aa1647bcde1c0847b6b8b29004c86ab4d1b98de
 size 6991

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85771c639e1487f57b71ee7188abdb70135d299bae9ffd53b7f87c8656ff305b
-size 438013677

 version https://git-lfs.github.com/spec/v1
+oid sha256:7130a90b18531f9303ffac78c1a02fbf97be1098c86ae7b584de8a9c425580d8
+size 1112242989

sentence_bert_config.json CHANGED Viewed

@@ -1,4 +1,4 @@
 {
-  "max_seq_length": 512,
   "do_lower_case": false
 }

 {
+  "max_seq_length": 128,
   "do_lower_case": false
 }

sentencepiece.bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051

special_tokens_map.json CHANGED Viewed

@@ -11,5 +11,5 @@
   },
   "pad_token": "<pad>",
   "sep_token": "</s>",
-  "unk_token": "[UNK]"
 }

   },
   "pad_token": "<pad>",
   "sep_token": "</s>",
+  "unk_token": "<unk>"
 }

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json CHANGED Viewed

@@ -1,30 +1,7 @@
 {
-  "bos_token": {
-    "__type": "AddedToken",
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "cls_token": {
-    "__type": "AddedToken",
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
-  "eos_token": {
-    "__type": "AddedToken",
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
   "mask_token": {
     "__type": "AddedToken",
     "content": "<mask>",
@@ -34,33 +11,9 @@
     "single_word": false
   },
   "model_max_length": 512,
-  "never_split": null,
-  "pad_token": {
-    "__type": "AddedToken",
-    "content": "<pad>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "sep_token": {
-    "__type": "AddedToken",
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
   "special_tokens_map_file": null,
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "MPNetTokenizer",
-  "unk_token": {
-    "__type": "AddedToken",
-    "content": "[UNK]",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
 }

 {
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
   "mask_token": {
     "__type": "AddedToken",
     "content": "<mask>",
     "single_word": false
   },
   "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
   "special_tokens_map_file": null,
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
 }