Spaces:

VagoX1
/

NLP1

Sleeping

App Files Files Community

VagoX1 committed on Nov 10, 2024

Commit

417b39a

•

1 Parent(s): 6daaa3f

Update app.py

Browse files

Files changed (1) hide show

app.py +214 -179

app.py CHANGED Viewed

@@ -1,134 +1,21 @@
-import joblib
-import gradio as gr
-from collections import Counter
-from typing import TypedDict
-from abc import ABC, abstractmethod
-from typing import Any, Dict, Type
-from scipy.sparse._csc import csc_matrix
-from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar
-import pickle
 from dataclasses import dataclass
 import tqdm
 import re
-import os
 import nltk
 nltk.download("stopwords", quiet=True)
 from nltk.corpus import stopwords as nltk_stopwords
-import math
-from dataclasses import dataclass
-from typing import Optional
-from datasets import load_dataset
-from enum import Enum
-import numpy as np
-@dataclass
-class Document:
-    collection_id: str
-    text: str
-@dataclass
-class Query:
-    query_id: str
-    text: str
-@dataclass
-class QRel:
-    query_id: str
-    collection_id: str
-    relevance: int
-    answer: Optional[str] = None
-class Split(str, Enum):
-    train = "train"
-    dev = "dev"
-    test = "test"
-@dataclass
-class IRDataset:
-    corpus: List[Document]
-    queries: List[Query]
-    split2qrels: Dict[Split, List[QRel]]
-    def get_stats(self) -> Dict[str, int]:
-        stats = {"|corpus|": len(self.corpus), "|queries|": len(self.queries)}
-        for split, qrels in self.split2qrels.items():
-            stats[f"|qrels-{split}|"] = len(qrels)
-        return stats
-    def get_qrels_dict(self, split: Split) -> Dict[str, Dict[str, int]]:
-        qrels_dict = {}
-        for qrel in self.split2qrels[split]:
-            qrels_dict.setdefault(qrel.query_id, {})
-            qrels_dict[qrel.query_id][qrel.collection_id] = qrel.relevance
-        return qrels_dict
-    def get_split_queries(self, split: Split) -> List[Query]:
-        qrels = self.split2qrels[split]
-        qids = {qrel.query_id for qrel in qrels}
-        return list(filter(lambda query: query.query_id in qids, self.queries))
-@(joblib.Memory(".cache").cache)
-def load_sciq(verbose: bool = False) -> IRDataset:
-    train = load_dataset("allenai/sciq", split="train")
-    validation = load_dataset("allenai/sciq", split="validation")
-    test = load_dataset("allenai/sciq", split="test")
-    data = {Split.train: train, Split.dev: validation, Split.test: test}
-    # Each duplicated record is the same to each other:
-    df = train.to_pandas() + validation.to_pandas() + test.to_pandas()
-    for question, group in df.groupby("question"):
-        assert len(set(group["support"].tolist())) == len(group)
-        assert len(set(group["correct_answer"].tolist())) == len(group)
-    # Build:
-    corpus = []
-    queries = []
-    split2qrels: Dict[str, List[dict]] = {}
-    question2id = {}
-    support2id = {}
-    for split, rows in data.items():
-        if verbose:
-            print(f"|raw_{split}|", len(rows))
-        split2qrels[split] = []
-        for i, row in enumerate(rows):
-            example_id = f"{split}-{i}"
-            support: str = row["support"]
-            if len(support.strip()) == 0:
-                continue
-            question = row["question"]
-            if len(support.strip()) == 0:
-                continue
-            if support in support2id:
-                continue
-            else:
-                support2id[support] = example_id
-            if question in question2id:
-                continue
-            else:
-                question2id[question] = example_id
-            doc = {"collection_id": example_id, "text": support}
-            query = {"query_id": example_id, "text": row["question"]}
-            qrel = {
-                "query_id": example_id,
-                "collection_id": example_id,
-                "relevance": 1,
-                "answer": row["correct_answer"],
-            }
-            corpus.append(Document(**doc))
-            queries.append(Query(**query))
-            split2qrels[split].append(QRel(**qrel))
-    # Assembly and return:
-    return IRDataset(corpus=corpus, queries=queries, split2qrels=split2qrels)
 LANGUAGE = "english"
 word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
 stopwords = set(nltk_stopwords.words(LANGUAGE))
 def word_splitting(text: str) -> List[str]:
     return word_splitter(text.lower())
@@ -149,6 +36,7 @@ class PostingList:
     docid_postings: List[int]  # docid_postings[i] means the docid (int) of the i-th associated posting
     tweight_postings: List[float]  # tweight_postings[i] means the term weight (float) of the i-th associated posting
 @dataclass
 class InvertedIndex:
     posting_lists: List[PostingList]  # docid -> posting_list
@@ -171,24 +59,8 @@ class InvertedIndex:
             index = pickle.load(f)
         return index
-class BaseRetriever(ABC):
-    @property
-    @abstractmethod
-    def index_class(self) -> Type[Any]:
-        pass
-    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
-        raise NotImplementedError
-    @abstractmethod
-    def score(self, query: str, cid: str) -> float:
-        pass
-    @abstractmethod
-    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
-        pass
 @dataclass
 class Counting:
     posting_lists: List[PostingList]
@@ -260,6 +132,17 @@ def run_counting(
         doc_texts=doc_texts,
     )
 @dataclass
 class BM25Index(InvertedIndex):
@@ -305,7 +188,7 @@ class BM25Index(InvertedIndex):
     @classmethod
     def build_from_documents(
-        cls: Type["BM25Index"],
         documents: Iterable[Document],
         store_raw: bool = True,
         output_dir: Optional[str] = None,
@@ -313,7 +196,7 @@ class BM25Index(InvertedIndex):
         show_progress_bar: bool = True,
         k1: float = 0.9,
         b: float = 0.4,
-    ) -> "BM25Index":
         # Counting TFs, DFs, doc_lengths, etc.:
         counting = run_counting(
             documents=documents,
@@ -346,6 +229,147 @@ class BM25Index(InvertedIndex):
         )
         return index
 @dataclass
 class CSCInvertedIndex:
@@ -369,6 +393,7 @@ class CSCInvertedIndex:
             index = pickle.load(f)
         return index
 @dataclass
 class CSCBM25Index(CSCInvertedIndex):
@@ -388,7 +413,6 @@ class CSCBM25Index(CSCInvertedIndex):
     ) -> csc_matrix:
         """Compute term weights and caching"""
-        ## YOUR_CODE_STARTS_HERE
         data = []
         indices = []
         indptr = [0]
@@ -431,7 +455,7 @@ class CSCBM25Index(CSCInvertedIndex):
     @classmethod
     def build_from_documents(
-        cls: Type["CSCBM25Index"],
         documents: Iterable[Document],
         store_raw: bool = True,
         output_dir: Optional[str] = None,
@@ -439,7 +463,7 @@ class CSCBM25Index(CSCInvertedIndex):
         show_progress_bar: bool = True,
         k1: float = 0.9,
         b: float = 0.4,
-    ) -> "CSCBM25Index":
         # Counting TFs, DFs, doc_lengths, etc.:
         counting = run_counting(
             documents=documents,
@@ -472,6 +496,15 @@ class CSCBM25Index(CSCInvertedIndex):
         )
         return index
 class BaseCSCInvertedIndexRetriever(BaseRetriever):
     @property
@@ -503,28 +536,29 @@ class BaseCSCInvertedIndexRetriever(BaseRetriever):
     def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
         ## YOUR_CODE_STARTS_HERE
         toks = self.index.tokenize(query)
         docid2score: Dict[int, float] = {}
         for tok in toks:
-           if tok not in self.index.vocab:
-               continue
-           tid = self.index.vocab[tok]
-           # Get weights for all documents for the current term
-           weights_for_term = self.index.posting_lists_matrix.getcol(tid).toarray()[:, 0]
-           for docid, weight in enumerate(weights_for_term):
-               docid2score.setdefault(docid, 0)
-               docid2score[docid] += weight  # Accumulate scores for each document
-       # Sort and get topk documents
         docid2score = dict(
-           sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
-       )
-        return {
-           self.index.collection_ids[docid]: score
-           for docid, score in docid2score.items()
-       }
         ## YOUR_CODE_ENDS_HERE
 class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):
@@ -533,6 +567,9 @@ class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):
     def index_class(self) -> Type[CSCBM25Index]:
         return CSCBM25Index
 class Hit(TypedDict):
   cid: str
   score: float
@@ -542,28 +579,26 @@ demo: Optional[gr.Interface] = None  # Assign your gradio demo to this variable
 return_type = List[Hit]
 ## YOUR_CODE_STARTS_HERE
-def search(query: str) -> List[Hit]:
-    bm25_index = BM25Index.build_from_documents(
-        documents=iter(sciq.corpus),
-        ndocs=12160,
-        show_progress_bar=True
-    )
-    bm25_index.save("output/bm25_index")
-    bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
-    ranking = bm25_retriever.retrieve(query=query)
-    hits = []
-    for cid, score in ranking.items():
-        doc = next((doc for doc in sciq.corpus if doc.collection_id == cid), None)
-        if doc:
-            hits.append({"cid": cid, "score": score, "text": doc.text})
-    return hits
 demo = gr.Interface(
     fn=search,
-    inputs=gr.Textbox(lines=2, placeholder="Enter your query here..."),
-    outputs=gr.JSON(label="Search Results"),
-    title="SciQ Search Engine",
-    description="Enter a query to search the SciQ dataset using BM25.",
 )
 ## YOUR_CODE_ENDS_HERE
-demo.launch()

+from __future__ import annotations
 from dataclasses import dataclass
+import pickle
+import os
+from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar
+from nlp4web_codebase.ir.data_loaders.dm import Document
+from collections import Counter
 import tqdm
 import re
 import nltk
 nltk.download("stopwords", quiet=True)
 from nltk.corpus import stopwords as nltk_stopwords
 LANGUAGE = "english"
 word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
 stopwords = set(nltk_stopwords.words(LANGUAGE))
 def word_splitting(text: str) -> List[str]:
     return word_splitter(text.lower())
     docid_postings: List[int]  # docid_postings[i] means the docid (int) of the i-th associated posting
     tweight_postings: List[float]  # tweight_postings[i] means the term weight (float) of the i-th associated posting
 @dataclass
 class InvertedIndex:
     posting_lists: List[PostingList]  # docid -> posting_list
             index = pickle.load(f)
         return index
+# The output of the counting function:
 @dataclass
 class Counting:
     posting_lists: List[PostingList]
         doc_texts=doc_texts,
     )
+from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+# Load SciQ at import time and run the counting pass over its whole corpus.
+sciq = load_sciq()
+# ndocs is taken from the corpus length here (other calls in this file hard-code 12160).
+counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
+from dataclasses import asdict, dataclass
+import math
+import os
+from typing import Iterable, List, Optional, Type
+import tqdm
+from nlp4web_codebase.ir.data_loaders.dm import Document
 @dataclass
 class BM25Index(InvertedIndex):
     @classmethod
     def build_from_documents(
+        cls: Type[BM25Index],
         documents: Iterable[Document],
         store_raw: bool = True,
         output_dir: Optional[str] = None,
         show_progress_bar: bool = True,
         k1: float = 0.9,
         b: float = 0.4,
+    ) -> BM25Index:
         # Counting TFs, DFs, doc_lengths, etc.:
         counting = run_counting(
             documents=documents,
         )
         return index
+# Build a BM25 index over the SciQ corpus and persist it under output/bm25_index.
+# k1 and b are not passed, so build_from_documents' defaults (k1=0.9, b=0.4) apply.
+bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    # NOTE(review): presumably the corpus size — TODO confirm it equals len(sciq.corpus).
+    ndocs=12160,
+    show_progress_bar=True,
+)
+bm25_index.save("output/bm25_index")
+from nlp4web_codebase.ir.models import BaseRetriever
+from typing import Type
+from abc import abstractmethod
+class BaseInvertedIndexRetriever(BaseRetriever):
+    """Retriever that scores documents by summing the term weights stored in posting lists.
+
+    Subclasses pick the index type through ``index_class``; the constructor
+    restores an index of that type from disk via ``index_class.from_saved``.
+    """
+    @property
+    @abstractmethod
+    def index_class(self) -> Type[InvertedIndex]:
+        """Index type whose ``from_saved`` loads the index for this retriever."""
+        pass
+    def __init__(self, index_dir: str) -> None:
+        """Load the index saved under ``index_dir``."""
+        self.index = self.index_class.from_saved(index_dir)
+    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
+        """Map every query token that occurs in document ``cid`` to its stored weight.
+
+        Tokens missing from the vocabulary, or absent from the document's
+        postings, are left out of the result. Raises ``KeyError`` when ``cid``
+        is not in ``cid2docid``.
+        """
+        tokens = self.index.tokenize(query)
+        wanted_docid = self.index.cid2docid[cid]
+        weights = {}
+        for token in tokens:
+            if token in self.index.vocab:
+                postings = self.index.posting_lists[self.index.vocab[token]]
+                pairs = zip(postings.docid_postings, postings.tweight_postings)
+                for docid, weight in pairs:
+                    if docid != wanted_docid:
+                        continue
+                    # Only the first posting for the target document is used.
+                    weights[token] = weight
+                    break
+        return weights
+    def score(self, query: str, cid: str) -> float:
+        """Score document ``cid`` for ``query`` as the sum of its matching term weights."""
+        per_term = self.get_term_weights(query=query, cid=cid)
+        return sum(per_term.values())
+    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+        """Return the ``topk`` highest-scoring collection ids for ``query`` with their scores.
+
+        The result dict is ordered from highest to lowest score.
+        """
+        scores: Dict[int, float] = {}
+        for token in self.index.tokenize(query):
+            if token in self.index.vocab:
+                postings = self.index.posting_lists[self.index.vocab[token]]
+                pairs = zip(postings.docid_postings, postings.tweight_postings)
+                for docid, weight in pairs:
+                    # Accumulate the contribution of this term for the document.
+                    scores[docid] = scores.get(docid, 0) + weight
+        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
+        return {
+            self.index.collection_ids[docid]: total
+            for docid, total in ranked[:topk]
+        }
+class BM25Retriever(BaseInvertedIndexRetriever):
+    """Inverted-index retriever whose index type is ``BM25Index``."""
+    @property
+    def index_class(self) -> Type[BM25Index]:
+        # Tells the base class which index type to load with from_saved.
+        return BM25Index
+from nlp4web_codebase.ir.data_loaders import Split
+import pytrec_eval
+import numpy as np
+def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> float:
+    """Return the mean ``map_cut_10`` of ``rankings`` against the SciQ qrels of ``split``.
+
+    Args:
+        rankings: query id -> {collection id -> score}, as produced by a retriever.
+        split: Which split's relevance judgements to evaluate against.
+
+    Returns:
+        The ``map_cut_10`` value averaged over the queries pytrec_eval reports on.
+    """
+    metric = "map_cut_10"
+    # Build the qrels mapping once and hand that same object to the evaluator.
+    qrels = sciq.get_qrels_dict(split)
+    evaluator = pytrec_eval.RelevanceEvaluator(qrels, (metric,))
+    per_query = evaluator.evaluate(rankings)
+    return float(np.mean([scores[metric] for scores in per_query.values()]))
+# Load the dataset and run the counting pass.
+# NOTE(review): the same load / count / build / save statements already appear earlier in this file;
+# this section repeats them — confirm whether the repetition is intended.
+from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
+sciq = load_sciq()
+counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
+# Build the BM25 index (default k1/b) and save it:
+bm25_index = BM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True
+)
+bm25_index.save("output/bm25_index")
+# Sweep containers: "X" holds the candidate values for b; "Y" receives one dev-split MAP score
+# per candidate, appended in the same order by the tuning code that follows.
+plots_b: Dict[str, List[float]] = {
+    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+    "Y": []
+}
+# Same layout for the k1 sweep.
+plots_k1: Dict[str, List[float]] = {
+    "X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
+    "Y": []
+}
+## YOUR_CODE_STARTS_HERE
+def _dev_map_for(k1: float, b: float, dev_queries) -> float:
+    """Build and save a BM25 index for ``k1``/``b`` and return its MAP on ``dev_queries``.
+
+    The index is written to ``output/bm25_index`` (replacing whatever was
+    there), reloaded through ``BM25Retriever``, and each query is ranked with
+    the retriever's default ``topk`` before ``evaluate_map`` scores the run.
+    """
+    index = BM25Index.build_from_documents(
+        documents=iter(sciq.corpus),
+        ndocs=12160,
+        show_progress_bar=True,
+        k1=k1,
+        b=b,
+    )
+    index.save("output/bm25_index")
+    retriever = BM25Retriever(index_dir="output/bm25_index")
+    rankings = {
+        query.query_id: retriever.retrieve(query=query.text)
+        for query in dev_queries
+    }
+    return evaluate_map(rankings)
+# The dev queries do not depend on k1 or b, so fetch them once for both sweeps.
+_dev_queries = sciq.get_split_queries(Split.dev)
+# Step 1: Tune b (with fixed k1=0.9)
+for b_val in plots_b["X"]:
+    plots_b["Y"].append(_dev_map_for(k1=0.9, b=b_val, dev_queries=_dev_queries))
+best_b = plots_b["X"][np.argmax(plots_b["Y"])]  # Get best b
+# Step 2: Tune k1 (with the best b from step 1)
+for k1_val in plots_k1["X"]:
+    plots_k1["Y"].append(_dev_map_for(k1=k1_val, b=best_b, dev_queries=_dev_queries))
+# best_k1 is read further down when the CSC index is built, so it has to be bound here.
+best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])]  # Get best k1
+# NOTE: output/bm25_index now holds the last configuration tried (k1=1.0, b=best_b),
+# which is not necessarily the best-scoring one.
+from scipy.sparse._csc import csc_matrix
 @dataclass
 class CSCInvertedIndex:
             index = pickle.load(f)
         return index
 @dataclass
 class CSCBM25Index(CSCInvertedIndex):
     ) -> csc_matrix:
         """Compute term weights and caching"""
         data = []
         indices = []
         indptr = [0]
     @classmethod
     def build_from_documents(
+        cls: Type[CSCBM25Index],
         documents: Iterable[Document],
         store_raw: bool = True,
         output_dir: Optional[str] = None,
         show_progress_bar: bool = True,
         k1: float = 0.9,
         b: float = 0.4,
+    ) -> CSCBM25Index:
         # Counting TFs, DFs, doc_lengths, etc.:
         counting = run_counting(
             documents=documents,
         )
         return index
+# Build the CSC-matrix BM25 index with the tuned hyperparameters and save it.
+# NOTE(review): best_k1 and best_b must both be bound at module level before this statement
+# runs — confirm the tuning section assigns each of them.
+csc_bm25_index = CSCBM25Index.build_from_documents(
+    documents=iter(sciq.corpus),
+    ndocs=12160,
+    show_progress_bar=True,
+    k1=best_k1,
+    b=best_b
+)
+csc_bm25_index.save("output/csc_bm25_index")
 class BaseCSCInvertedIndexRetriever(BaseRetriever):
     @property
     def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+        """Return the ``topk`` highest-scoring collection ids for ``query`` with their scores.
+
+        Scores are the per-document sums of the term weights read from
+        ``posting_lists_matrix``; the result is ordered from highest to lowest.
+        """
         ## YOUR_CODE_STARTS_HERE
+        # Placeholder only; rebound to the final mapping before returning.
+        ranking: Dict[str, float] = {}
         toks = self.index.tokenize(query)
         docid2score: Dict[int, float] = {}
         for tok in toks:
+          # Tokens outside the vocabulary contribute nothing.
+          if tok not in self.index.vocab:
+            continue
+          tid = self.index.vocab[tok]
+          # Column ``tid`` of the matrix; its stored entries are read as (docid, weight) pairs.
+          tid2documents = self.index.posting_lists_matrix.getcol(tid)
+          for docid, tweight in zip(tid2documents.indices, tid2documents.data):
+            docid2score.setdefault(docid, 0)
+            docid2score[docid] += tweight
+        # Keep only the topk best-scoring docids, best first.
         docid2score = dict(
+            sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
+        )
+        # Translate internal docids back to collection ids.
+        ranking = {
+            self.index.collection_ids[docid]: score
+            for docid, score in docid2score.items()
+        }
+        return ranking
         ## YOUR_CODE_ENDS_HERE
 class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):
     def index_class(self) -> Type[CSCBM25Index]:
         return CSCBM25Index
+import gradio as gr
+from typing import TypedDict
 class Hit(TypedDict):
   cid: str
   score: float
 return_type = List[Hit]
 ## YOUR_CODE_STARTS_HERE
+_bm25_retriever: Optional[BM25Retriever] = None  # created on the first search() call
+def search(query: str) -> List[Hit]:
+    """Return the BM25 hits for ``query`` from the index saved at ``output/bm25_index``.
+
+    Args:
+        query: Free-text query.
+
+    Returns:
+        One dict per retrieved document with keys ``cid``, ``score`` and
+        ``text``, in the order produced by the retriever (highest score first).
+    """
+    global _bm25_retriever
+    if _bm25_retriever is None:
+        # Loading the index from disk is the costly step; do it once, not per request.
+        _bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
+    index = _bm25_retriever.index
+    # Named ``hits`` because ``return_type`` is the module-level alias for List[Hit].
+    hits: List[Hit] = [
+        {
+            "cid": cid,
+            "score": score,
+            "text": index.doc_texts[index.cid2docid[cid]],
+        }
+        for cid, score in _bm25_retriever.retrieve(query).items()
+    ]
+    return hits
 demo = gr.Interface(
+    # The submitted text is passed to search(); whatever it returns is shown in a Textbox.
     fn=search,
+    inputs=["text"],
+    outputs=gr.Textbox()
 )
 ## YOUR_CODE_ENDS_HERE
+# NOTE(review): share=True asks Gradio for a public share link — confirm that is wanted
+# in the environment where this app is hosted.
+demo.launch(share=True)