CarlosMalaga
committed on
Delete examples
- examples/explore_faiss.md +0 -8
- examples/explore_faiss.py +0 -163
- examples/train_retriever.py +0 -45
examples/explore_faiss.md
DELETED
@@ -1,8 +0,0 @@
-# table to store results
-
-| Index | nprobe | Recall | Time |
-|----------------|--------|--------|-------|
-| Flat | 1 | 98.7 | 38.64 |
-| IVFx,Flat | 1 | 42.5 | 23.46 |
-| IVFx,Flat | 14 | 88.5 | 133 |
-| IVFx_HNSW,Flat | 1 | 88.5 | 133 |
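The deleted table compares FAISS index types by recall and retrieval time. As a point of reference only (not part of the deleted files), below is a minimal sketch of how such index variants can be built and timed with FAISS directly; the dimensionality, the random vectors, and the concrete factory strings ("IVF1024,Flat", "IVF1024_HNSW32,Flat" in place of the "IVFx" shorthand) are assumptions.

```python
# Sketch: build and time the index variants from the table above with FAISS.
# Assumptions: 768-dim embeddings and random vectors standing in for the real
# passage/query embeddings; "IVF1024" stands in for the "IVFx" shorthand.
import time

import faiss
import numpy as np

d = 768
xb = np.random.rand(100_000, d).astype("float32")  # passage embeddings
xq = np.random.rand(1_000, d).astype("float32")    # query embeddings

for factory_string, nprobe in [
    ("Flat", 1),
    ("IVF1024,Flat", 1),
    ("IVF1024,Flat", 14),
    ("IVF1024_HNSW32,Flat", 1),
]:
    index = faiss.index_factory(d, factory_string)
    if not index.is_trained:
        index.train(xb)  # IVF indexes need a training pass on the corpus
    index.add(xb)
    if "IVF" in factory_string:
        # inverted lists visited per query: higher = better recall, slower search
        faiss.extract_index_ivf(index).nprobe = nprobe
    start = time.time()
    distances, ids = index.search(xq, 100)  # top-100 neighbours per query
    print(f"{factory_string} nprobe={nprobe}: {time.time() - start:.2f}s")
```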
examples/explore_faiss.py
DELETED
@@ -1,163 +0,0 @@
-import argparse
-import json
-import logging
-import os
-from pathlib import Path
-import time
-from typing import Union
-
-import torch
-import tqdm
-
-from relik.retriever import GoldenRetriever
-from relik.common.log import get_logger
-from relik.retriever.common.model_inputs import ModelInputs
-from relik.retriever.data.base.datasets import BaseDataset
-from relik.retriever.indexers.base import BaseDocumentIndex
-from relik.retriever.indexers.faiss import FaissDocumentIndex
-
-logger = get_logger(level=logging.INFO)
-
-
-def compute_retriever_stats(dataset) -> None:
-    correct, total = 0, 0
-    for sample in dataset:
-        window_candidates = sample["window_candidates"]
-        window_candidates = [c.replace("_", " ").lower() for c in window_candidates]
-
-        for ss, se, label in sample["window_labels"]:
-            if label == "--NME--":
-                continue
-            if label.replace("_", " ").lower() in window_candidates:
-                correct += 1
-            total += 1
-
-    recall = correct / total
-    print("Recall:", recall)
-
-
-@torch.no_grad()
-def add_candidates(
-    retriever_name_or_path: Union[str, os.PathLike],
-    document_index_name_or_path: Union[str, os.PathLike],
-    input_path: Union[str, os.PathLike],
-    batch_size: int = 128,
-    num_workers: int = 4,
-    index_type: str = "Flat",
-    nprobe: int = 1,
-    device: str = "cpu",
-    precision: str = "fp32",
-    topics: bool = False,
-):
-    document_index = BaseDocumentIndex.from_pretrained(
-        document_index_name_or_path,
-        # config_kwargs={
-        #     "_target_": "relik.retriever.indexers.faiss.FaissDocumentIndex",
-        #     "index_type": index_type,
-        #     "nprobe": nprobe,
-        # },
-        device=device,
-        precision=precision,
-    )
-
-    retriever = GoldenRetriever(
-        question_encoder=retriever_name_or_path,
-        document_index=document_index,
-        device=device,
-        precision=precision,
-        index_device=device,
-        index_precision=precision,
-    )
-    retriever.eval()
-
-    logger.info(f"Loading from {input_path}")
-    with open(input_path) as f:
-        samples = [json.loads(line) for line in f.readlines()]
-
-    topics = topics and "doc_topic" in samples[0]
-
-    # get tokenizer
-    tokenizer = retriever.question_tokenizer
-    collate_fn = lambda batch: ModelInputs(
-        tokenizer(
-            [b["text"] for b in batch],
-            text_pair=[b["doc_topic"] for b in batch] if topics else None,
-            padding=True,
-            return_tensors="pt",
-            truncation=True,
-        )
-    )
-    logger.info(f"Creating dataloader with batch size {batch_size}")
-    dataloader = torch.utils.data.DataLoader(
-        BaseDataset(name="passage", data=samples),
-        batch_size=batch_size,
-        shuffle=False,
-        num_workers=num_workers,
-        pin_memory=False,
-        collate_fn=collate_fn,
-    )
-
-    # we also dump the candidates to a file after a while
-    retrieved_accumulator = []
-    with torch.inference_mode():
-        start = time.time()
-        num_completed_docs = 0
-
-        for documents_batch in tqdm.tqdm(dataloader):
-            retrieve_kwargs = {
-                **documents_batch,
-                "k": 100,
-                "precision": precision,
-            }
-            batch_out = retriever.retrieve(**retrieve_kwargs)
-            retrieved_accumulator.extend(batch_out)
-
-        end = time.time()
-
-        output_data = []
-        # get the correct document from the original dataset
-        # the dataloader is not shuffled, so we can just count the number of
-        # documents we have seen so far
-        for sample, retrieved in zip(
-            samples[
-                num_completed_docs : num_completed_docs + len(retrieved_accumulator)
-            ],
-            retrieved_accumulator,
-        ):
-            candidate_titles = [c.label.split(" <def>", 1)[0] for c in retrieved]
-            sample["window_candidates"] = candidate_titles
-            sample["window_candidates_scores"] = [c.score for c in retrieved]
-            output_data.append(sample)
-
-        # for sample in output_data:
-        #     f_out.write(json.dumps(sample) + "\n")
-
-        num_completed_docs += len(retrieved_accumulator)
-        retrieved_accumulator = []
-
-    compute_retriever_stats(output_data)
-    print(f"Retrieval took {end - start:.2f} seconds")
-
-
-if __name__ == "__main__":
-    # arg_parser = argparse.ArgumentParser()
-    # arg_parser.add_argument("--retriever_name_or_path", type=str, required=True)
-    # arg_parser.add_argument("--document_index_name_or_path", type=str, required=True)
-    # arg_parser.add_argument("--input_path", type=str, required=True)
-    # arg_parser.add_argument("--output_path", type=str, required=True)
-    # arg_parser.add_argument("--batch_size", type=int, default=128)
-    # arg_parser.add_argument("--device", type=str, default="cuda")
-    # arg_parser.add_argument("--index_device", type=str, default="cpu")
-    # arg_parser.add_argument("--precision", type=str, default="fp32")
-
-    # add_candidates(**vars(arg_parser.parse_args()))
-    add_candidates(
-        "/root/relik-spaces/models/relik-retriever-small-aida-blink-pretrain-omniencoder/question_encoder",
-        "/root/relik-spaces/models/relik-retriever-small-aida-blink-pretrain-omniencoder/document_index_filtered",
-        "/root/relik-spaces/data/reader/aida/testa_windowed.jsonl",
-        # index_type="HNSW32",
-        # index_type="IVF1024,PQ8",
-        # nprobe=1,
-        topics=True,
-        device="cuda",
-    )
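The deleted script loads the pretrained index with its default indexer; the commented-out config_kwargs show how the same call was meant to be switched to a FAISS-backed index. Below is a minimal sketch along those lines, with placeholder paths and with the override keys copied from the commented code (not verified against the current relik API).

```python
# Sketch: load the pretrained document index as a FaissDocumentIndex, following
# the commented-out config_kwargs in the deleted script. Paths are placeholders;
# the override keys are assumptions taken from that comment.
from relik.retriever import GoldenRetriever
from relik.retriever.indexers.base import BaseDocumentIndex

document_index = BaseDocumentIndex.from_pretrained(
    "path/to/document_index",  # e.g. the document_index_filtered directory above
    config_kwargs={
        "_target_": "relik.retriever.indexers.faiss.FaissDocumentIndex",
        "index_type": "IVF1024,PQ8",  # or "Flat", "HNSW32", ...
        "nprobe": 14,
    },
    device="cuda",
    precision="fp32",
)

retriever = GoldenRetriever(
    question_encoder="path/to/question_encoder",  # placeholder
    document_index=document_index,
    device="cuda",
    precision="fp32",
)
retriever.eval()
```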
examples/train_retriever.py
DELETED
@@ -1,45 +0,0 @@
-from relik.retriever.trainer import RetrieverTrainer
-from relik import GoldenRetriever
-from relik.retriever.indexers.inmemory import InMemoryDocumentIndex
-from relik.retriever.data.datasets import AidaInBatchNegativesDataset
-
-if __name__ == "__main__":
-    # instantiate retriever
-    document_index = InMemoryDocumentIndex(
-        documents="/root/golden-retriever-v2/data/dpr-like/el/definitions.txt",
-        device="cuda",
-        precision="16",
-    )
-    retriever = GoldenRetriever(
-        question_encoder="intfloat/e5-small-v2", document_index=document_index
-    )
-
-    train_dataset = AidaInBatchNegativesDataset(
-        name="aida_train",
-        path="/root/golden-retriever-v2/data/dpr-like/el/aida_32_tokens_topic/train.jsonl",
-        tokenizer=retriever.question_tokenizer,
-        question_batch_size=64,
-        passage_batch_size=400,
-        max_passage_length=64,
-        use_topics=True,
-        shuffle=True,
-    )
-    val_dataset = AidaInBatchNegativesDataset(
-        name="aida_val",
-        path="/root/golden-retriever-v2/data/dpr-like/el/aida_32_tokens_topic/val.jsonl",
-        tokenizer=retriever.question_tokenizer,
-        question_batch_size=64,
-        passage_batch_size=400,
-        max_passage_length=64,
-        use_topics=True,
-    )
-
-    trainer = RetrieverTrainer(
-        retriever=retriever,
-        train_dataset=train_dataset,
-        val_dataset=val_dataset,
-        max_steps=25_000,
-        wandb_offline_mode=True,
-    )
-
-    trainer.train()