shengz committed on
Commit
e3ef0b9
1 Parent(s): cfd4687

Add the example usage.

usage/README.md ADDED
@@ -0,0 +1,43 @@
+ # Knowledge-Rich Self-Supervision (KRISS) for Biomedical Entity Linking
+
+ Usage code for the entity linking approach described in the following paper:
+ ```bibtex
+ @article{kriss,
+     author = {Sheng Zhang and Hao Cheng and Shikhar Vashishth and Cliff Wong and Jinfeng Xiao and Xiaodong Liu and Tristan Naumann and Jianfeng Gao and Hoifung Poon},
+     title = {Knowledge-Rich Self-Supervision for Biomedical Entity Linking},
+     year = {2021},
+     url = {https://arxiv.org/abs/2112.07887},
+     eprinttype = {arXiv},
+     eprint = {2112.07887},
+ }
+ ```
+ [https://arxiv.org/pdf/2112.07887.pdf](https://arxiv.org/pdf/2112.07887.pdf)
+
+ ## Usage of KRISS for Entity Linking
+
+ Here, we use the [MedMentions](https://github.com/chanzuckerberg/MedMentions) data to show you how to 1) generate prototype embeddings, and 2) run entity linking.
+
+ (We are currently unable to release the self-supervised mention examples, because they require UMLS and PubMed licenses.)
+
+
+ ### 1. Create conda environment and install requirements
+ ```bash
+ conda create -n kriss -y python=3.8 && conda activate kriss
+ pip install -r requirements.txt
+ ```
+
+ ### 2. Download the MedMentions dataset
+
+ ```bash
+ git clone https://github.com/chanzuckerberg/MedMentions.git
+ ```
+
+ ### 3. Generate prototype embeddings
+ ```bash
+ python generate_prototypes.py
+ ```
+
+ ### 4. Run entity linking
+ ```bash
+ python run_entity_linking.py
+ ```
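
Both scripts read their settings from `conf/` via Hydra, so any key in the YAML configs can be overridden on the command line using standard Hydra override syntax. The values below are illustrative, not recommended settings:

```bash
# Hypothetical overrides; any key in the YAML configs can be set this way.
python generate_prototypes.py batch_size=128
python run_entity_linking.py batch_size=128 num_retrievals=50
```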
usage/conf/generate_prototypes.yaml ADDED
@@ -0,0 +1,9 @@
+ model_name_or_path: microsoft/BiomedNLP-KRISSBERT-PubMed-UMLS-EL
+ train_data:
+   _target_: utils.MedMentionsDataset
+   dataset_path: MedMentions/full/data/
+   split: train
+ batch_size: 256
+ max_length: 64
+ output_prototypes: prototypes/embeddings
+ output_name_cuis: prototypes/name_cuis
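
For reference, the `_target_` key is standard Hydra instantiation syntax: `hydra.utils.instantiate(cfg.train_data)` imports the dotted path and calls it with the sibling keys as keyword arguments. A minimal sketch of what the `train_data` block above resolves to:

```python
from utils import MedMentionsDataset

# Equivalent of hydra.utils.instantiate(cfg.train_data) for the block above.
ds = MedMentionsDataset(dataset_path="MedMentions/full/data/", split="train")
```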
usage/conf/run_linking.yaml ADDED
@@ -0,0 +1,26 @@
+ # path to pretrained model and tokenizer
+ model_name_or_path: microsoft/BiomedNLP-KRISSBERT-PubMed-UMLS-EL
+
+ test_data:
+   _target_: utils.MedMentionsDataset
+   dataset_path: MedMentions/full/data/
+   split: test
+
+ # paths to encoded data
+ encoded_files: [
+   prototypes/embeddings
+ ]
+
+ encoded_umls_files: []
+
+ entity_list_ids:
+
+ entity_list_names: prototypes/name_cuis
+
+ index_path:
+
+ seed: 12345
+ batch_size: 256
+ max_length: 64
+ num_retrievals: 100
+ top_ks: [1, 5, 50, 100]
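
`entity_list_ids` is left empty above. When it is set, `run_entity_linking.py` reads the file and builds `set(f.read().split('\n'))`, i.e. a newline-separated list of CUIs that restricts the candidate set. A minimal sketch of writing such a file (the path and CUIs are hypothetical):

```python
# Write a newline-separated CUI list in the format run_entity_linking.py expects.
cuis = ["C0002871", "C0011849"]  # hypothetical candidate CUIs
with open("prototypes/candidate_cuis.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(cuis))
```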
usage/generate_prototypes.py ADDED
@@ -0,0 +1,61 @@
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Command line tool that produces embeddings for a large set of entity mentions
+ based on the pretrained mention encoder.
+ """
+ import logging
+ import os
+ import pathlib
+ import pickle
+
+ import hydra
+ from omegaconf import DictConfig, OmegaConf
+ from transformers import AutoConfig, AutoTokenizer, AutoModel
+
+ from utils import generate_vectors
+
+
+ # Setup logger
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+ log_formatter = logging.Formatter(
+     "[%(thread)s] %(asctime)s [%(levelname)s] %(name)s: %(message)s"
+ )
+ console = logging.StreamHandler()
+ console.setFormatter(log_formatter)
+ logger.addHandler(console)
+
+
+ @hydra.main(config_path="conf", config_name="generate_prototypes", version_base=None)
+ def main(cfg: DictConfig):
+     logger.info("Configuration:")
+     logger.info("%s", OmegaConf.to_yaml(cfg))
+
+     config = AutoConfig.from_pretrained(cfg.model_name_or_path)
+     tokenizer = AutoTokenizer.from_pretrained(
+         cfg.model_name_or_path,
+         use_fast=True,
+     )
+     encoder = AutoModel.from_pretrained(
+         cfg.model_name_or_path,
+         config=config
+     )
+     encoder.cuda()
+     encoder.eval()
+
+     ds = hydra.utils.instantiate(cfg.train_data)
+     data = generate_vectors(encoder, tokenizer, ds, cfg.batch_size, cfg.max_length, is_prototype=True)
+     pathlib.Path(os.path.dirname(cfg.output_prototypes)).mkdir(parents=True, exist_ok=True)
+     logger.info("Writing results to %s", cfg.output_prototypes)
+     with open(cfg.output_prototypes, mode="wb") as f:
+         pickle.dump(data, f)
+     with open(cfg.output_name_cuis, 'w') as f:
+         for name, cuis in ds.name_to_cuis.items():
+             f.write('|'.join(cuis) + '||' + name + '\n')
+     logger.info("Total data processed %d. Written to %s", len(data), cfg.output_prototypes)
+
+
+ if __name__ == "__main__":
+     main()
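
With `is_prototype=True`, `generate_vectors` returns a list of `(meta, vector)` pairs, where `meta` is `{'cuis': [...]}` and `vector` is a 1-D numpy array; the script pickles that list as-is. A minimal sketch of inspecting the output (assuming the default config paths):

```python
import pickle

with open("prototypes/embeddings", "rb") as f:
    prototypes = pickle.load(f)  # list of (meta, vec) pairs

meta, vec = prototypes[0]
print(meta["cuis"], vec.shape)  # e.g. ['C0002871'] and (768,) for a BERT-base encoder
```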
usage/requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers==4.17.0
+ torch==1.11
+ hydra-core==1.2.0
+ faiss-gpu==1.7.0
usage/run_entity_linking.py ADDED
@@ -0,0 +1,440 @@
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Run entity linking
+ """
+
+ import os
+ import glob
+ import logging
+ import pathlib
+ import pickle
+ import time
+ import math
+ import multiprocessing
+ from typing import List, Tuple, Dict, Iterator, Set
+ from functools import partial
+ from multiprocessing.dummy import Pool
+
+ import hydra
+ import numpy as np
+ import torch
+ from omegaconf import DictConfig, OmegaConf
+ from torch import Tensor as T
+ from torch import nn
+ import faiss
+
+ from transformers import (
+     set_seed,
+     AutoConfig,
+     AutoTokenizer,
+     AutoModel,
+     PreTrainedTokenizer,
+ )
+ from utils import generate_vectors
+
+
+ # Setup logger
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+ log_formatter = logging.Formatter(
+     "[%(thread)s] %(asctime)s [%(levelname)s] %(name)s: %(message)s"
+ )
+ console = logging.StreamHandler()
+ console.setFormatter(log_formatter)
+ logger.addHandler(console)
+
+
+ class DenseIndexer(object):
+     def __init__(self, buffer_size: int = 50000):
+         self.buffer_size = buffer_size
+         self.index_id_to_db_id = []
+         self.index = None
+
+     def init_index(self, vector_sz: int):
+         raise NotImplementedError
+
+     def index_data(self, data: List[Tuple[object, np.array]]):
+         raise NotImplementedError
+
+     def get_index_name(self):
+         raise NotImplementedError
+
+     def search_knn(
+         self, query_vectors: np.array, top_docs: int
+     ) -> List[Tuple[List[object], List[float]]]:
+         raise NotImplementedError
+
+     def serialize(self, file: str):
+         logger.info("Serializing index to %s", file)
+
+         if os.path.isdir(file):
+             index_file = os.path.join(file, "index.dpr")
+             meta_file = os.path.join(file, "index_meta.dpr")
+         else:
+             index_file = file + ".index.dpr"
+             meta_file = file + ".index_meta.dpr"
+
+         faiss.write_index(self.index, index_file)
+         with open(meta_file, mode="wb") as f:
+             pickle.dump(self.index_id_to_db_id, f)
+
+     def get_files(self, path: str):
+         if os.path.isdir(path):
+             index_file = os.path.join(path, "index.dpr")
+             meta_file = os.path.join(path, "index_meta.dpr")
+         else:
+             index_file = path + ".index.dpr"
+             meta_file = path + ".index_meta.dpr"
+         return index_file, meta_file
+
+     def index_exists(self, path: str):
+         index_file, meta_file = self.get_files(path)
+         return os.path.isfile(index_file) and os.path.isfile(meta_file)
+
+     def deserialize(self, path: str):
+         logger.info("Loading index from %s", path)
+         index_file, meta_file = self.get_files(path)
+
+         self.index = faiss.read_index(index_file)
+         logger.info(
+             "Loaded index of type %s and size %d", type(self.index), self.index.ntotal
+         )
+
+         with open(meta_file, "rb") as reader:
+             self.index_id_to_db_id = pickle.load(reader)
+         assert (
+             len(self.index_id_to_db_id) == self.index.ntotal
+         ), "Deserialized index_id_to_db_id should match faiss index size"
+
+     def _update_id_mapping(self, db_ids: List) -> int:
+         self.index_id_to_db_id.extend(db_ids)
+         return len(self.index_id_to_db_id)
+
+
+ class DenseFlatIndexer(DenseIndexer):
+     def __init__(self, buffer_size: int = 50000):
+         super(DenseFlatIndexer, self).__init__(buffer_size=buffer_size)
+
+     def init_index(self, vector_sz: int):
+         self.index = faiss.IndexFlatIP(vector_sz)
+
+     def index_data(self, data: List[Tuple[object, np.array]]):
+         n = len(data)
+         # indexing in batches is beneficial for many faiss index types
+         for i in range(0, n, self.buffer_size):
+             db_ids = [t[0] for t in data[i : i + self.buffer_size]]
+             vectors = [
+                 np.reshape(t[1], (1, -1)) for t in data[i : i + self.buffer_size]
+             ]
+             vectors = np.concatenate(vectors, axis=0)
+             total_data = self._update_id_mapping(db_ids)
+             self.index.add(vectors)
+             logger.info("data indexed %d", total_data)
+
+         indexed_cnt = len(self.index_id_to_db_id)
+         logger.info("Total data indexed %d", indexed_cnt)
+
+     def search_knn(
+         self, query_vectors: np.array, top_docs: int, batch_size: int = 4096,
+     ) -> List[Tuple[List[object], List[float]]]:
+         num_queries = query_vectors.shape[0]
+         scores, indexes = [], []
+         for start in range(0, num_queries, batch_size):
+             logger.info(f"Searched {start} queries.")
+             batch_vectors = query_vectors[start:start + batch_size]
+             batch_scores, batch_indexes = self.index.search(batch_vectors, top_docs)
+             scores.extend(batch_scores)
+             indexes.extend(batch_indexes)
+         # convert to external ids
+         db_ids = [
+             [self.index_id_to_db_id[i] for i in query_top_idxs]
+             for query_top_idxs in indexes
+         ]
+         result = [(db_ids[i], scores[i]) for i in range(len(db_ids))]
+         return result
+
+     def get_index_name(self):
+         return "flat_index"
+
+
+ def load_umls_data(files_patterns: List[str], candidate_ids: Dict = None) -> Dict:
+     input_paths = []
+     for pattern in files_patterns:
+         pattern_files = glob.glob(pattern)
+         input_paths.extend(pattern_files)
+     umls_data = {}
+     for file in sorted(input_paths):
+         logger.info("Reading encoded UMLS data from file %s", file)
+         with open(file, "rb") as reader:
+             for meta, vec in pickle.load(reader):
+                 assert len(meta['cuis']) == 1, f"Expected a single CUI, got {meta['cuis']}"
+                 cui = meta['cuis'][0]
+                 if candidate_ids and cui not in candidate_ids:
+                     continue
+                 umls_data[cui] = (meta, vec)
+     logger.info(f"Loaded UMLS data = {len(umls_data)}.")
+     return umls_data
+
+
+ def iterate_encoded_files(
+     vector_files: list,
+     candidate_ids: Set = None,
+     umls_data: Dict = None,
+ ) -> Iterator:
+     logger.info("Loading encoded prototype embeddings...")
+     proto_data = {}
+     for file in vector_files:
+         logger.info("Reading file %s", file)
+         with open(file, "rb") as reader:
+             for meta, vec in pickle.load(reader):
+                 cuis = meta['cuis']
+                 if candidate_ids and all(c not in candidate_ids for c in cuis):
+                     continue
+                 for cui in cuis:
+                     proto_data.setdefault(cui, []).append((meta, vec))
+     # Concatenate prototype embs with additional knowledge embs from UMLS.
+     if umls_data is not None:
+         for cui, (meta, vec) in umls_data.items():
+             if cui in proto_data:
+                 for _, _vec in proto_data.pop(cui):
+                     extended_vec = np.concatenate((vec, _vec), axis=0)
+                     yield (meta, extended_vec)
+             else:
+                 extended_vec = np.concatenate((vec, np.zeros_like(vec)), axis=0)
+                 yield (meta, extended_vec)
+     for cui in list(proto_data.keys()):
+         for meta, vec in proto_data.pop(cui):
+             extended_vec = np.concatenate((np.zeros_like(vec), vec), axis=0)
+             yield (meta, extended_vec)
+     assert len(proto_data) == 0
+
+
+ class DenseRetriever:
+     def __init__(
+         self,
+         encoder: nn.Module,
+         tokenizer: PreTrainedTokenizer,
+         batch_size: int,
+         max_length: int,
+     ):
+         self.encoder = encoder
+         self.tokenizer = tokenizer
+         self.batch_size = batch_size
+         self.max_length = max_length
+
+     def generate_mention_vectors(self, ds: torch.utils.data.Dataset) -> T:
+         self.encoder.eval()
+         return generate_vectors(
+             encoder=self.encoder,
+             tokenizer=self.tokenizer,
+             dataset=ds,
+             batch_size=self.batch_size,
+             max_length=self.max_length,
+         )
+
+
+ class FaissRetriever(DenseRetriever):
+     """
+     Does entity retrieval over the provided index and encoder.
+     """
+
+     def __init__(
+         self,
+         encoder: nn.Module,
+         tokenizer: PreTrainedTokenizer,
+         batch_size: int,
+         max_length: int,
+         index: DenseIndexer,
+     ):
+         super().__init__(encoder, tokenizer, batch_size, max_length)
+         self.index = index
+
+     def index_encoded_data(
+         self,
+         vector_files: List[str],
+         buffer_size: int,
+         candidate_ids: Set = None,
+         umls_data: Dict = None,
+     ):
+         """
+         Indexes encoded data that takes the form of a list of files.
+         :param vector_files: a list of files
+         :param buffer_size: size of a buffer to send for the indexing at once
+         :return:
+         """
+         buffer = []
+         for i, item in enumerate(
+             iterate_encoded_files(vector_files, candidate_ids, umls_data)
+         ):
+             buffer.append(item)
+             if 0 < buffer_size == len(buffer):
+                 self.index.index_data(buffer)
+                 buffer = []
+         self.index.index_data(buffer)
+         logger.info("Data indexing completed.")
+
+     def get_top_hits(
+         self, mention_vectors: np.array, top_k: int = 100
+     ) -> List[Tuple[List[object], List[float]]]:
+         """
+         Retrieves the best matches for the given batch of mention vectors.
+         """
+         time0 = time.time()
+         search = partial(
+             self.index.search_knn,
+             top_docs=top_k,
+         )
+         results = []
+         num_processes = multiprocessing.cpu_count()
+         # Shard the queries and search the shards in parallel threads.
+         shard_size = math.ceil(mention_vectors.shape[0] / num_processes)
+         shards = []
+         for i in range(0, mention_vectors.shape[0], shard_size):
+             shards.append(mention_vectors[i:i + shard_size])
+         with Pool(processes=num_processes) as pool:
+             it = pool.map(search, shards)
+             for ret in it:
+                 results += ret
+         # results = self.index.search_knn(mention_vectors, top_k)
+         logger.info("index search time: %f sec.", time.time() - time0)
+         self.index = None
+         return results
+
+
+ def hit(pred: List[str], gold: List[str]) -> bool:
+     # A prediction counts as a hit if every predicted CUI is in the gold set.
+     return all(p in gold for p in pred)
+
+
+ def dedup_ids(ids: List[Dict]) -> List[Dict]:
+     deduped_ids = []
+     seen_cuis = set()
+     for d in ids:
+         if all(cui in seen_cuis for cui in d['cuis']):
+             continue
+         seen_cuis.update(d['cuis'])
+         deduped_ids.append(d)
+     return deduped_ids
+
+
+ def evaluate(
+     ds: torch.utils.data.Dataset,
+     result_ent_ids: List[Tuple[List[object], List[float]]],
+     lookup_table: str,
+     top_ks: List[int] = (1, 5, 50, 100),
+ ) -> None:
+     lut = {}
+     with open(lookup_table, encoding='utf-8') as f:
+         for ln in f:
+             cuis, name = ln.strip().split('||')
+             cuis = cuis.split('|')
+             lut[name] = cuis
+
+     n = len(ds)
+     top_k_hits = {top_k: 0 for top_k in top_ks}
+     for i in range(len(result_ent_ids)):
+         d = ds[i]
+         ids, _ = result_ent_ids[i]
+         ids = dedup_ids(ids)
+         ids = ids[:max(top_ks)]
+         candidates = [
+             {'cuis': eid['cuis'], 'hit': int(hit(pred=eid['cuis'], gold=d.cuis))}
+             for eid in ids
+         ]
+         lut_cuis = lut.get(d.mention, [])
+         if len(lut_cuis) == 1:
+             # If the mention only has one ID in the lookup table,
+             # we use that ID as the top prediction.
+             candidates.insert(
+                 0,
+                 {'cuis': lut_cuis, 'hit': int(hit(pred=lut_cuis, gold=d.cuis))}
+             )
+         for top_k in top_k_hits:
+             if any(c['hit'] for c in candidates[:top_k]):
+                 top_k_hits[top_k] += 1
+
+     top_k_acc = {top_k: v / n for top_k, v in top_k_hits.items()}
+     logger.info("Top-k accuracy %s", top_k_acc)
+
+
+ @hydra.main(config_path="conf", config_name="run_linking", version_base=None)
+ def main(cfg: DictConfig):
+     set_seed(cfg.seed)
+
+     logger.info("Configuration:")
+     logger.info("%s", OmegaConf.to_yaml(cfg))
+
+     # Load pretrained.
+     config = AutoConfig.from_pretrained(cfg.model_name_or_path)
+     tokenizer = AutoTokenizer.from_pretrained(
+         cfg.model_name_or_path,
+         use_fast=True,
+     )
+     encoder = AutoModel.from_pretrained(
+         cfg.model_name_or_path,
+         config=config
+     )
+     encoder.cuda()
+     encoder.eval()
+     vector_size = config.hidden_size
+     logger.info("Encoder vector_size=%d", vector_size)
+
+     # Load test data.
+     ds = hydra.utils.instantiate(cfg.test_data)
+
+     # Init indexer. Indexed entries concatenate a UMLS-knowledge half and a
+     # prototype half, hence twice the encoder vector size.
+     index = DenseFlatIndexer()
+     index_buffer_sz = index.buffer_size
+     index.init_index(vector_size * 2)
+
+     # candidate ids
+     candidate_ids = None
+     if cfg.entity_list_ids:
+         with open(cfg.entity_list_ids, encoding='utf-8') as f:
+             candidate_ids = set(f.read().split('\n'))
+
+     # Start indexing
+     input_paths = []
+     for pattern in cfg.encoded_files:
+         pattern_files = glob.glob(pattern)
+         input_paths.extend(pattern_files)
+     input_paths = sorted(set(input_paths))
+
+     retriever = FaissRetriever(
+         encoder, tokenizer, cfg.batch_size, cfg.max_length, index)
+     mentions_tensor = retriever.generate_mention_vectors(ds)
+
+     # Load UMLS knowledge
+     umls_data = None
+     if cfg.encoded_umls_files:
+         umls_data = load_umls_data(cfg.encoded_umls_files, candidate_ids)
+
+     index_path = cfg.index_path
+     if index_path and index.index_exists(index_path):
+         logger.info("Index path: %s", index_path)
+         retriever.index.deserialize(index_path)
+     else:
+         logger.info("Indexing encoded data from files: %s", input_paths)
+         retriever.index_encoded_data(
+             vector_files=input_paths,
+             buffer_size=index_buffer_sz,
+             candidate_ids=candidate_ids,
+             umls_data=umls_data,
+         )
+         if index_path:
+             pathlib.Path(os.path.dirname(index_path)).mkdir(
+                 parents=True, exist_ok=True)
+             retriever.index.serialize(index_path)
+
+     # Duplicate the mention vectors so a single inner product scores both the
+     # UMLS half and the prototype half of each indexed entry.
+     mentions_tensor = torch.cat([mentions_tensor, mentions_tensor], dim=1)
+
+     # To get k different entities, we retrieve 32 * k mentions and then dedup.
+     top_ids_and_scores = retriever.get_top_hits(
+         mentions_tensor.numpy(), cfg.num_retrievals * 32)
+
+     evaluate(ds, top_ids_and_scores, cfg.entity_list_names)
+
+
+ if __name__ == "__main__":
+     main()
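
A note on the doubled dimensions above: each indexed entry produced by `iterate_encoded_files` is the concatenation of a UMLS-knowledge half and a prototype half (one half zeroed out when missing), and the mention vector is duplicated before search, so a single inner product scores both halves at once: `[q, q] · [u, p] = q·u + q·p`. A small numeric check of that identity (illustrative values only):

```python
import numpy as np

q = np.array([1.0, 2.0])    # mention (query) vector
u = np.array([0.5, -1.0])   # UMLS-knowledge embedding
p = np.array([0.25, 0.75])  # prototype embedding

doubled_query = np.concatenate([q, q])  # as done with mentions_tensor
entry = np.concatenate([u, p])          # as done in iterate_encoded_files
assert np.isclose(np.dot(doubled_query, entry), np.dot(q, u) + np.dot(q, p))
```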
usage/utils.py ADDED
@@ -0,0 +1,289 @@
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ from typing import List, Dict
+ import os
+ import time
+ import logging
+ import json
+ import gzip
+ from dataclasses import dataclass, field
+
+ import torch
+ from torch import Tensor as T
+ from transformers import PreTrainedTokenizer
+
+
+ logger = logging.getLogger()
+
+
+
21
+ @dataclass
22
+ class Mention:
23
+ cui: str
24
+ start: int
25
+ end: int
26
+ text: str
27
+ types: str
28
+
29
+
30
+ @dataclass
31
+ class ContextualMention:
32
+ mention: str
33
+ cuis: List[str]
34
+ ctx_l: str
35
+ ctx_r: str
36
+
37
+ def to_tensor(self, tokenizer: PreTrainedTokenizer, max_length: int) -> T:
38
+ ctx_l_ids = tokenizer.encode(
39
+ text=self.ctx_l,
40
+ add_special_tokens=False,
41
+ max_length=max_length,
42
+ truncation=True,
43
+ )
44
+ ctx_r_ids = tokenizer.encode(
45
+ text=self.ctx_r,
46
+ add_special_tokens=False,
47
+ max_length=max_length,
48
+ truncation=True,
49
+ )
50
+ mention_ids = tokenizer.encode(
51
+ text=self.mention,
52
+ add_special_tokens=False,
53
+ max_length=max_length,
54
+ truncation=True,
55
+ )
56
+
57
+ # Concatenate context and mention to the max length.
58
+ token_ids = tokenizer.convert_tokens_to_ids(['<ENT>']) + mention_ids \
59
+ + tokenizer.convert_tokens_to_ids(['</ENT>'])
60
+ max_ctx_len = max_length - len(token_ids) - 2 # Exclude [CLS] and [SEP]
61
+ max_ctx_l_len = max_ctx_len // 2
62
+ max_ctx_r_len = max_ctx_len - max_ctx_l_len
63
+ if len(ctx_l_ids) < max_ctx_l_len and len(ctx_r_ids) < max_ctx_r_len:
64
+ token_ids = ctx_l_ids + token_ids + ctx_r_ids
65
+ elif len(ctx_l_ids) >= max_ctx_l_len and len(ctx_r_ids) >= max_ctx_r_len:
66
+ token_ids = ctx_l_ids[-max_ctx_l_len:] + token_ids \
67
+ + ctx_r_ids[:max_ctx_r_len]
68
+ elif len(ctx_l_ids) >= max_ctx_l_len:
69
+ ctx_l_len = max_ctx_len - len(ctx_r_ids)
70
+ token_ids = ctx_l_ids[-ctx_l_len:] + token_ids + ctx_r_ids
71
+ else:
72
+ ctx_r_len = max_ctx_len - len(ctx_l_ids)
73
+ token_ids = ctx_l_ids + token_ids + ctx_r_ids[:ctx_r_len]
74
+
75
+ token_ids = [tokenizer.cls_token_id] + token_ids
76
+
77
+ # The above snippet doesn't guarantee the max length limit.
78
+ token_ids = token_ids[:max_length - 1] + [tokenizer.sep_token_id]
79
+
80
+ if len(token_ids) < max_length:
81
+ token_ids = token_ids + [tokenizer.pad_token_id] * (max_length - len(token_ids))
82
+
83
+ return torch.tensor(token_ids)
84
+
85
+
+ @dataclass
+ class Document:
+     id: str = None
+     title: str = None
+     abstract: str = None
+     mentions: List[Mention] = field(default_factory=list)
+
+     def concatenate_text(self) -> str:
+         return ' '.join([self.title, self.abstract])
+
+     @classmethod
+     def from_PubTator(cls, path: str, split_path_prefix: str) -> Dict[str, List]:
+         docs = []
+         with gzip.open(path, 'rb') as f:
+             for b in f.read().decode().strip().split('\n\n'):
+                 d = cls()
+                 s = ''
+                 for i, ln in enumerate(b.split('\n')):
+                     if i == 0:
+                         id, type, text = ln.strip().split('|', 2)
+                         assert type == 't'
+                         d.id, d.title = id, text
+                     elif i == 1:
+                         id, type, text = ln.strip().split('|', 2)
+                         assert type == 'a'
+                         assert d.id == id
+                         d.abstract = text
+                         s = d.concatenate_text()
+                     else:
+                         items = ln.strip().split('\t')
+                         assert d.id == items[0]
+                         cui = items[5].split('UMLS:')[-1]
+                         assert len(cui) == 8, f"Unexpected CUI format: {cui}"
+                         m = Mention(
+                             cui=cui,
+                             start=int(items[1]),
+                             end=int(items[2]),
+                             text=items[3],
+                             types=items[4].split(',')
+                         )
+                         assert m.text == s[m.start: m.end]
+                         d.mentions.append(m)
+                 docs.append(d)
+         dataset = split_dataset(docs, split_path_prefix)
+         print_dataset_stats(dataset)
+         return dataset
+
+     def to_contextual_mentions(self, max_length: int = 64) -> List[ContextualMention]:
+         text = self.concatenate_text()
+         mentions = []
+         for m in self.mentions:
+             assert m.text == text[m.start:m.end]
+             # Take up to max_length whitespace tokens on each side as context.
+             ctx_l, ctx_r = text[:m.start].strip().split(), text[m.end:].strip().split()
+             ctx_l, ctx_r = ' '.join(ctx_l[-max_length:]), ' '.join(ctx_r[:max_length])
+             cm = ContextualMention(
+                 mention=m.text,
+                 cuis=[m.cui],
+                 ctx_l=ctx_l,
+                 ctx_r=ctx_r,
+             )
+             mentions.append(cm)
+         return mentions
+
+
+ def split_dataset(docs: List, split_path_prefix: str) -> Dict[str, List]:
152
+ split_kv = {'train': 'trng', 'dev': 'dev', 'test': 'test'}
153
+ id_to_split = {}
154
+ dataset = {}
155
+ for k, v in split_kv.items():
156
+ dataset[k] = []
157
+ path = split_path_prefix + v + '.txt'
158
+ for i in open(path, encoding='utf-8').read().strip().split('\n'):
159
+ assert i not in id_to_split, breakpoint()
160
+ id_to_split[i] = k
161
+ for doc in docs:
162
+ split = id_to_split[doc.id]
163
+ dataset[split].append(doc)
164
+ return dataset
165
+
166
+
167
+ def print_dataset_stats(dataset: Dict[str, List[Document]]) -> None:
168
+ all_docs = []
169
+ for v in dataset.values():
170
+ all_docs.extend(v)
171
+ for split, docs in {'all': all_docs, **dataset}.items():
172
+ logger.info(f"***** {split} *****")
173
+ logger.info(f"Documents: {len(docs)}")
174
+ logger.info(f"Mentions: {sum(len(d.mentions) for d in docs)}")
175
+ cuis = set()
176
+ for d in docs:
177
+ for m in d.mentions:
178
+ cuis.add(m.cui)
179
+ logger.info(f"Mentioned concepts: {len(cuis)}")
180
+
181
+
182
+ class MedMentionsDataset(torch.utils.data.Dataset):
183
+
184
+ def __init__(self, dataset_path: str, split: str) -> None:
185
+ super().__init__()
186
+ self.dataset_path = dataset_path
187
+ self.docs = Document.from_PubTator(
188
+ path=os.path.join(self.dataset_path, 'corpus_pubtator.txt.gz'),
189
+ split_path_prefix=os.path.join(self.dataset_path, 'corpus_pubtator_pmids_')
190
+ )[split]
191
+ self.mentions = []
192
+ self.name_to_cuis = {}
193
+ self._post_init()
194
+
195
+ def _post_init(self):
196
+ for d in self.docs:
197
+ self.mentions.extend(d.to_contextual_mentions())
198
+ for m in self.mentions:
199
+ if m.mention not in self.name_to_cuis:
200
+ self.name_to_cuis[m.mention] = set()
201
+ self.name_to_cuis[m.mention].update(m.cuis)
202
+
203
+ def __getitem__(self, index: int) -> ContextualMention:
204
+ return self.mentions[index]
205
+
206
+ def __len__(self) -> int:
207
+ return len(self.mentions)
208
+
209
+
+ class PreprocessedDataset(torch.utils.data.Dataset):
+
+     def __init__(self, dataset_path: str) -> None:
+         super().__init__()
+         self.file = dataset_path
+         self.data = []
+         self.load_data()
+
+     def load_data(self) -> None:
+         with open(self.file, encoding='utf-8') as f:
+             logger.info("Reading file %s", self.file)
+             for ln in f:
+                 if ln.strip():
+                     self.data.append(json.loads(ln))
+         logger.info("Loaded data size: %d", len(self.data))
+
+     def __getitem__(self, index: int) -> ContextualMention:
+         d = self.data[index]
+         return ContextualMention(
+             ctx_l=d['context_left'],
+             ctx_r=d['context_right'],
+             mention=d['mention'],
+             cuis=d['cuis'],
+         )
+
+     def __len__(self) -> int:
+         return len(self.data)
+
+
+ def generate_vectors(
+     encoder: torch.nn.Module,
+     tokenizer: PreTrainedTokenizer,
+     dataset: torch.utils.data.Dataset,
+     batch_size: int,
+     max_length: int,
+     is_prototype: bool = False,
+ ):
+     n = len(dataset)
+     total = 0
+     results = []
+     start_time = time.time()
+     logger.info("Start encoding...")
+     for i, batch_start in enumerate(range(0, n, batch_size)):
+         batch = [dataset[i] for i in range(batch_start, min(n, batch_start + batch_size))]
+         batch_token_tensors = [m.to_tensor(tokenizer, max_length) for m in batch]
+
+         ids_batch = torch.stack(batch_token_tensors, dim=0).cuda()
+         seg_batch = torch.zeros_like(ids_batch)
+         attn_mask = (ids_batch != tokenizer.pad_token_id)
+
+         with torch.inference_mode():
+             out = encoder(
+                 input_ids=ids_batch,
+                 token_type_ids=seg_batch,
+                 attention_mask=attn_mask
+             )
+             out = out[0][:, 0, :]
+         out = out.cpu()
+
+         num_mentions = out.size(0)
+         total += num_mentions
+
+         if is_prototype:
+             meta_batch = [{'cuis': m.cuis} for m in batch]
+             assert len(meta_batch) == num_mentions
+             results.extend([(meta_batch[i], out[i].view(-1).numpy()) for i in range(num_mentions)])
+         else:
+             results.extend(out.split(1, dim=0))
+
+         if (i + 1) % 10 == 0:
+             eta = (n - total) * (time.time() - start_time) / 60 / total
+             logger.info(f"Batch={i + 1}, Encoded mentions={total}, ETA={eta:.1f}m")
+
+     assert len(results) == n
+     logger.info(f"Total encoded mentions={n}")
+     if not is_prototype:
+         results = torch.cat(results, dim=0)
+
+     return results
+
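
A minimal sketch tying the utilities in this file together (assumes the MedMentions repo has been cloned as in the README and that a CUDA device is available, since `generate_vectors` moves batches to the GPU):

```python
from transformers import AutoModel, AutoTokenizer

from utils import MedMentionsDataset, generate_vectors

name = "microsoft/BiomedNLP-KRISSBERT-PubMed-UMLS-EL"
tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True)
encoder = AutoModel.from_pretrained(name).cuda().eval()

ds = MedMentionsDataset(dataset_path="MedMentions/full/data/", split="dev")
vectors = generate_vectors(encoder, tokenizer, ds, batch_size=256, max_length=64)
print(vectors.shape)  # (num_mentions, hidden_size) when is_prototype=False
```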