upload

Browse files

Files changed (9) hide show

README.md +57 -0
config.json +28 -0
pytorch_model.bin +3 -0
special_tokens_map.json +1 -0
spiece.model +3 -0
tokenizer.json +0 -0
tokenizer_config.json +1 -0
train_script.py +214 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+language: en
+datasets:
+- sentence-transformers/embedding-training-data
+widget:
+- text: "Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects."
+license: apache-2.0
+---
+# doc2query/msmarco-t5-small-v1
+This is a [doc2query](https://arxiv.org/abs/1904.08375) model based on T5 (also known as [docT5query](https://cs.uwaterloo.ca/~jimmylin/publications/Nogueira_Lin_2019_docTTTTTquery-v2.pdf)).
+It can be used for:
+- **Document expansion**: You generate 20-40 queries for each of your paragraphs and index the paragraphs together with the generated queries in a standard BM25 index like Elasticsearch, OpenSearch, or Lucene. The generated queries help to close the lexical gap of lexical search, as they contain synonyms. Further, the model re-weights words, giving important words a higher weight even if they appear seldom in a paragraph. In our [BEIR](https://arxiv.org/abs/2104.08663) paper we showed that BM25+docT5query is a powerful search engine. In the [BEIR repository](https://github.com/UKPLab/beir) we have an example of how to use docT5query with Pyserini.
+- **Domain Specific Training Data Generation**: It can be used to generate training data to learn an embedding model. On [SBERT.net](https://www.sbert.net/examples/unsupervised_learning/query_generation/README.html) we have an example of how to use the model to generate (query, text) pairs for a given collection of unlabeled texts. These pairs can then be used to train powerful dense embedding models.
+## Usage
+```python
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+model_name = 'doc2query/msmarco-t5-small-v1'
+tokenizer = T5Tokenizer.from_pretrained(model_name)
+model = T5ForConditionalGeneration.from_pretrained(model_name)
+text = "Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects."
+input_ids = tokenizer.encode(text, max_length=320, truncation=True, return_tensors='pt')
+outputs = model.generate(
+    input_ids=input_ids,
+    max_length=64,
+    do_sample=True,
+    top_p=0.95,
+    num_return_sequences=5)
+print("Text:")
+print(text)
+print("\nGenerated Queries:")
+for i in range(len(outputs)):
+    query = tokenizer.decode(outputs[i], skip_special_tokens=True)
+    print(f'{i + 1}: {query}')
+```
+**Note:** `model.generate()` is non-deterministic. It produces different queries each time you run it.
+## Training
+This model is a fine-tuned version of [google/t5-v1_1-small](https://huggingface.co/google/t5-v1_1-small), trained for 31k training steps (about 4 epochs on the 500k training pairs from MS MARCO). For the training script, see `train_script.py` in this repository.
+The input-text was truncated to 320 word pieces. Output text was generated up to 64 word pieces.
+This model was trained on (query, passage) pairs from the [MS MARCO Passage-Ranking dataset](https://github.com/microsoft/MSMARCO-Passage-Ranking).

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_name_or_path": "google/t5-v1_1-small",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "d_ff": 1024,
+  "d_kv": 64,
+  "d_model": 512,
+  "decoder_start_token_id": 0,
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 8,
+  "num_heads": 6,
+  "num_layers": 8,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_num_buckets": 32,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.11.3",
+  "use_cache": true,
+  "vocab_size": 32128
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e70479fde0478b478ba9ba05d071ccf4eea2bfae51215166ef94ee918837f4d0
+size 307934749

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"]}

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+size 791656

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "name_or_path": "google/t5-v1_1-small", "special_tokens_map_file": 
"/root/.cache/huggingface/transformers/3ad6f8335c1b1ef8966245899d47dcf735abd134d21fd7d26f621fe45ac01184.c94798918c92ded6aeef2d2f0e666d2cc4145eca1aa6e1336fde07f2e13e2f46", "sp_model_kwargs": {}, "tokenizer_class": "T5Tokenizer"}

train_script.py ADDED Viewed

	@@ -0,0 +1,214 @@

+import argparse
+import logging
+from torch.utils.data import Dataset, IterableDataset
+import gzip
+import json
+from transformers import Seq2SeqTrainer, AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments
+import sys
+from datetime import datetime
+import torch
+import random
+from shutil import copyfile
+import os
+import wandb
+import random
+import re
# Route log records to stdout using a timestamped, levelled format.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Command-line options as (flag, add_argument keyword arguments) pairs.
# They are registered in exactly this order.
_CLI_OPTIONS = (
    ("--model_name", {"default": "google/t5-v1_1-base"}),
    ("--train_files", {"required": True, "nargs": '+', "default": []}),
    ("--epochs", {"default": 1, "type": int}),
    ("--batch_size", {"default": 32, "type": int}),
    ("--max_source_length", {"default": 320, "type": int}),
    ("--max_target_length", {"default": 64, "type": int}),
    ("--name", {"required": True}),
    ("--train_size", {"default": 10 * 1000 * 1000, "type": int}),
    ("--eval_size", {"default": 10000, "type": int}),
    ("--fp16", {"default": False, "action": 'store_true'}),
)

parser = argparse.ArgumentParser()
for _flag, _options in _CLI_OPTIONS:
    parser.add_argument(_flag, **_options)

# Parsed once at module level; the rest of the script reads the global `args`.
args = parser.parse_args()

# Start the Weights & Biases run, named after the run name and base model.
wandb.init(project="doc2query", name=f"{args.name}-{args.model_name}")
class PairDataset:
    """Endless stream of two-element examples read from a gzipped JSON-lines file.

    The first pass streams the file line by line; every example that is
    yielded is also remembered in ``self.examples``. Once the file is
    exhausted, the remembered examples are reshuffled and replayed, round
    after round.

    ``self.examples`` may be replaced from outside while iteration is in
    progress (``MultiDataset.delete_examples_cache`` does this), which drops
    the examples yielded so far from all later replay rounds.
    """

    def __init__(self, filepath):
        """Remember ``filepath``; the file is only opened when iteration starts."""
        self.filepath = filepath
        self.examples = []

    def __iter__(self):
        """Yield examples from the file, then replay the cached ones shuffled.

        Raises:
            ValueError: if, after the file has been read, there is nothing
                left to replay (empty file, or the cache was cleared after
                the whole file had been consumed). Previously this case spun
                forever without yielding anything.
        """
        print("open", self.filepath)
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with gzip.open(self.filepath, 'rt', encoding='utf-8') as fIn:
            for line in fIn:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not JSON records.
                    continue
                example = self.get_example(json.loads(line))
                if example is not None:
                    self.examples.append(example)
                    yield example

        # Re-read self.examples each round: it may have been rebound to a
        # fresh list by external code between rounds.
        while self.examples:
            random.shuffle(self.examples)
            for ex in self.examples:
                yield ex

        raise ValueError(f"No examples available to replay from {self.filepath}")

    def get_example(self, raw_example):
        """Convert one decoded JSON record into a two-element list.

        A dict record contributes its ``'query'`` and one randomly chosen
        entry of its ``'pos'`` list; any other record contributes its first
        two elements.
        """
        if isinstance(raw_example, dict):
            return [raw_example['query'], random.choice(raw_example['pos'])]
        else:
            return [raw_example[0], raw_example[1]]
class RedditTitleDataset(PairDataset):
    """Pair dataset built from records with ``'title'`` and ``'body'`` keys.

    The title is cleaned with :meth:`clean_title`; the body is used as is.
    """

    # Raw-string patterns compiled once. The original non-raw literals relied
    # on invalid escape sequences (``\[``, ``\.``, ``\-``), which Python warns
    # about; the compiled expressions are unchanged.
    _LEADING_TAG = re.compile(r"^\[[a-zA-Z0-9]+\]")
    _TRAILING_TAG = re.compile(r"\[[a-zA-Z0-9\.]+\]$")
    _LEADING_PATH = re.compile(r"^/[a-zA-Z0-9/]+[;,: \-]+")

    def get_example(self, raw_example):
        """Return ``[cleaned title, body]`` for one decoded record."""
        return [self.clean_title(raw_example['title']), raw_example['body']]

    def clean_title(self, text):
        """Strip markup-like decorations from a title.

        Steps, applied in order (whitespace is trimmed after each):
        1. replace the HTML entity ``&amp;`` with ``&``;
        2. drop a leading alphanumeric ``[tag]``;
        3. drop a trailing ``[tag]`` (alphanumerics and dots);
        4. if the text starts with ``/r``, drop the leading ``/path`` together
           with the separator characters that follow it.
        """
        text = text.replace("&amp;", "&").strip()
        if text.startswith("["):
            text = self._LEADING_TAG.sub("", text).strip()
        if text.endswith("]"):
            text = self._TRAILING_TAG.sub("", text).strip()
        if text.startswith("/r"):
            text = self._LEADING_PATH.sub("", text).strip()
        return text
class StackExchangeTitleBodyDataset(PairDataset):
    """Pair dataset for records that already store the example under ``'texts'``."""

    def get_example(self, raw_example):
        """Return the record's ``'texts'`` entry unchanged."""
        # presumably a [title, body] pair, judging by the class name; verify against the data
        texts = raw_example['texts']
        return texts
class MultiDataset(IterableDataset):
    """Round-robin mixture over several pair datasets.

    One dataset object (and one iterator over it) is created per file path.
    Iteration draws one example from every iterator per round and reshuffles
    the iterator order between rounds.
    """

    def __init__(self, filepaths, num_samples):
        """Create one dataset per path.

        Args:
            filepaths: paths of the data files; the dataset class is chosen
                by substring match on the path.
            num_samples: the value reported by ``len()``. The stream itself
                is endless.
        """
        self.num_samples = num_samples
        self.datasets = []
        self.data_iterators = []
        for filepath in filepaths:
            if 'reddit_title_text' in filepath:
                dataset = RedditTitleDataset(filepath)
            elif 'stackexchange_archive/jsonl' in filepath:
                dataset = StackExchangeTitleBodyDataset(filepath)
            else:
                dataset = PairDataset(filepath)
            self.datasets.append(dataset)
            self.data_iterators.append(iter(dataset))

    def __len__(self):
        """Return the nominal number of samples given at construction."""
        return self.num_samples

    def __iter__(self):
        """Yield one example per dataset per round, forever.

        Every call shares the same underlying per-dataset iterators, so a new
        iteration continues where the previous one stopped.

        Raises:
            ValueError: if no data files were given. Previously this case
                looped forever without yielding anything.
        """
        if not self.data_iterators:
            raise ValueError("MultiDataset was created without any data files")
        while True:
            for dataset in self.data_iterators:
                yield next(dataset)
            # Shuffle only between rounds, never while the list is being walked.
            random.shuffle(self.data_iterators)

    def delete_examples_cache(self):
        """Reset every dataset's replay cache to a fresh empty list."""
        for dataset in self.datasets:
            dataset.examples = []
def main():
    """Fine-tune the seq2seq model named by ``--model_name`` and save it.

    Reads its configuration from the module-level ``args``. Steps, in order:
    load model and tokenizer, create a timestamped output directory and store
    a copy of this script (plus the command line) in it, build the training
    arguments, split off the evaluation examples from the head of the training
    stream, then train and save.
    """
    ############ Model
    model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    checkpoint_every = 1000
    run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_dir = 'output/' + '-'.join([args.name, args.model_name.replace("/", "-"), run_stamp])
    print("Output dir:", output_dir)

    # Keep a copy of this script next to the checkpoints, with the exact
    # command line appended as a trailing comment.
    os.makedirs(output_dir, exist_ok=True)
    script_copy = os.path.join(output_dir, 'train_script.py')
    copyfile(__file__, script_copy)
    invocation_note = "\n\n# Script was called via:\n#python " + " ".join(sys.argv)
    with open(script_copy, 'a') as fOut:
        fOut.write(invocation_note)

    ############ Arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        fp16=args.fp16,
        fp16_backend="amp",
        per_device_train_batch_size=args.batch_size,
        evaluation_strategy="steps",
        save_steps=checkpoint_every,
        logging_steps=100,
        eval_steps=checkpoint_every,  # evaluate whenever a checkpoint is written
        warmup_steps=1000,
        save_total_limit=1,
        num_train_epochs=args.epochs,
        report_to="wandb",
    )

    ############ Load datasets
    train_dataset = MultiDataset(args.train_files, args.train_size)
    sample_stream = iter(train_dataset)
    eval_dataset = []
    for _ in range(args.eval_size):
        eval_dataset.append(next(sample_stream))
    # Drop the examples cached so far so the dev data is not replayed for training.
    train_dataset.delete_examples_cache()

    first_target, first_input = eval_dataset[0][0], eval_dataset[0][1]
    print("Target:", first_target)
    print("Input:", first_input)
    print("Train dataset len:", len(train_dataset))

    def data_collator(examples):
        """Tokenize a batch of ``[target, input]`` rows into model inputs with labels."""
        targets = []
        inputs = []
        for row in examples:
            targets.append(row[0])
            inputs.append(row[1])

        label_pad_token_id = -100
        pad_multiple = 8 if training_args.fp16 else None

        model_inputs = tokenizer(
            inputs,
            max_length=args.max_source_length,
            padding=True,
            truncation=True,
            return_tensors='pt',
            pad_to_multiple_of=pad_multiple,
        )

        # Tokenize the targets in target mode.
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(
                targets,
                max_length=args.max_target_length,
                padding=True,
                truncation=True,
                pad_to_multiple_of=pad_multiple,
            )

        # Replace every pad token id in the labels by -100 so padding is
        # ignored in the loss.
        masked_labels = []
        for label in labels["input_ids"]:
            masked_labels.append(
                [label_pad_token_id if token_id == tokenizer.pad_token_id else token_id for token_id in label]
            )

        model_inputs["labels"] = torch.tensor(masked_labels)
        return model_inputs

    ## Define the trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    ### Train, then save the model
    trainer.train()
    trainer.save_model()


if __name__ == "__main__":
    main()
+# Script was called via:
+#python train_hf_trainer.py --model_name google/t5-v1_1-small --train_files /home/sbert_pretrained_models/datasets/embedding-training-data/msmarco-triplets.jsonl.gz --name msmarco --train_size 2000000

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1dfb7cfff430f81d2d2488d6a1a55dcdaa1fd3b830bc9bcf1697c4dcd8c1498b
+size 2991