lreining committed
Commit b8fd633
1 Parent(s): 152b8a0
Files changed (2)
  1. app.py +454 -0
  2. requirements.txt +109 -0
app.py ADDED
@@ -0,0 +1,454 @@
+ from __future__ import annotations
+
+ import math
+ import os
+ import pickle
+ import re
+ from abc import abstractmethod
+ from collections import Counter
+ from dataclasses import dataclass
+ from typing import Callable, Dict, Iterable, List, Optional, Type, TypedDict, TypeVar
+
+ import gradio as gr
+ import nltk
+ import numpy as np
+ import tqdm
+ from nlp4web_codebase.ir.data_loaders.dm import Document
+ from nlp4web_codebase.ir.models import BaseRetriever
+ from scipy.sparse._csc import csc_matrix
+
+
+ class Hit(TypedDict):
+     cid: str
+     score: float
+     text: str
+
+
+ demo: Optional[gr.Interface] = None  # Assign your gradio demo to this variable
+ return_type = List[Hit]
+
+ LANGUAGE = "english"
+ nltk.download("stopwords", quiet=True)
+ from nltk.corpus import stopwords as nltk_stopwords
+
+ word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
+ stopwords = set(nltk_stopwords.words(LANGUAGE))
+
+
+ def word_splitting(text: str) -> List[str]:
+     return word_splitter(text.lower())
+
+
+ def lemmatization(words: List[str]) -> List[str]:
+     return words  # We ignore lemmatization here for simplicity
+
+
+ def simple_tokenize(text: str) -> List[str]:
+     words = word_splitting(text)
+     tokenized = list(filter(lambda w: w not in stopwords, words))
+     tokenized = lemmatization(tokenized)
+     return tokenized
+
+
+ @dataclass
+ class PostingList:
+     term: str  # The term
+     docid_postings: List[
+         int
+     ]  # docid_postings[i] means the docid (int) of the i-th associated posting
+     tweight_postings: List[
+         float
+     ]  # tweight_postings[i] means the term weight (float) of the i-th associated posting
+
+
+ @dataclass
+ class InvertedIndex:
+     posting_lists: List[PostingList]  # tid -> posting_list
+     vocab: Dict[str, int]
+     cid2docid: Dict[str, int]  # collection_id -> docid
+     collection_ids: List[str]  # docid -> collection_id
+     doc_texts: Optional[List[str]] = None  # docid -> document text
+
+     def save(self, output_dir: str) -> None:
+         os.makedirs(output_dir, exist_ok=True)
+         with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
+             pickle.dump(self, f)
+
+     @classmethod
+     def from_saved(cls: Type[T], saved_dir: str) -> T:
+         index = cls(
+             posting_lists=[], vocab={}, cid2docid={}, collection_ids=[], doc_texts=None
+         )
+         with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
+             index = pickle.load(f)
+         return index
+
+
+ T = TypeVar("T", bound="InvertedIndex")
+
+
+ # The output of the counting function:
+ @dataclass
+ class Counting:
+     posting_lists: List[PostingList]
+     vocab: Dict[str, int]
+     cid2docid: Dict[str, int]
+     collection_ids: List[str]
+     dfs: List[int]  # tid -> df
+     dls: List[int]  # docid -> doc length
+     avgdl: float
+     nterms: int
+     doc_texts: Optional[List[str]] = None
+
+
+ def run_counting(
+     documents: Iterable[Document],
+     tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
+     store_raw: bool = True,  # store the document text in doc_texts
+     ndocs: Optional[int] = None,
+     show_progress_bar: bool = True,
+ ) -> Counting:
+     """Counting TFs, DFs, doc_lengths, etc."""
+     posting_lists: List[PostingList] = []
+     vocab: Dict[str, int] = {}
+     cid2docid: Dict[str, int] = {}
+     collection_ids: List[str] = []
+     dfs: List[int] = []  # tid -> df
+     dls: List[int] = []  # docid -> doc length
+     nterms: int = 0
+     doc_texts: Optional[List[str]] = []
+     for doc in tqdm.tqdm(
+         documents,
+         desc="Counting",
+         total=ndocs,
+         disable=not show_progress_bar,
+     ):
+         if doc.collection_id in cid2docid:
+             continue
+         collection_ids.append(doc.collection_id)
+         docid = cid2docid.setdefault(doc.collection_id, len(cid2docid))
+         toks = tokenize_fn(doc.text)
+         tok2tf = Counter(toks)
+         dls.append(sum(tok2tf.values()))
+         for tok, tf in tok2tf.items():
+             nterms += tf
+             tid = vocab.get(tok, None)
+             if tid is None:
+                 posting_lists.append(
+                     PostingList(term=tok, docid_postings=[], tweight_postings=[])
+                 )
+                 tid = vocab.setdefault(tok, len(vocab))
+             posting_lists[tid].docid_postings.append(docid)
+             posting_lists[tid].tweight_postings.append(tf)
+             if tid < len(dfs):
+                 dfs[tid] += 1
+             else:
+                 dfs.append(0)
+         if store_raw:
+             doc_texts.append(doc.text)
+         else:
+             doc_texts = None
+     return Counting(
+         posting_lists=posting_lists,
+         vocab=vocab,
+         cid2docid=cid2docid,
+         collection_ids=collection_ids,
+         dfs=dfs,
+         dls=dls,
+         avgdl=sum(dls) / len(dls),
+         nterms=nterms,
+         doc_texts=doc_texts,
+     )
+
+
+ @dataclass
+ class BM25Index(InvertedIndex):
+
+     @staticmethod
+     def tokenize(text: str) -> List[str]:
+         return simple_tokenize(text)
+
+     @staticmethod
+     def cache_term_weights(
+         posting_lists: List[PostingList],
+         total_docs: int,
+         avgdl: float,
+         dfs: List[int],
+         dls: List[int],
+         k1: float,
+         b: float,
+     ) -> None:
+         """Compute term weights and caching"""
+
+         N = total_docs
+         for tid, posting_list in enumerate(
+             tqdm.tqdm(posting_lists, desc="Regularizing TFs")
+         ):
+             idf = BM25Index.calc_idf(df=dfs[tid], N=N)
+             for i in range(len(posting_list.docid_postings)):
+                 docid = posting_list.docid_postings[i]
+                 tf = posting_list.tweight_postings[i]
+                 dl = dls[docid]
+                 regularized_tf = BM25Index.calc_regularized_tf(
+                     tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
+                 )
+                 posting_list.tweight_postings[i] = regularized_tf * idf
+
+     @staticmethod
+     def calc_regularized_tf(
+         tf: int, dl: float, avgdl: float, k1: float, b: float
+     ) -> float:
+         return tf / (tf + k1 * (1 - b + b * dl / avgdl))
+
+     @staticmethod
+     def calc_idf(df: int, N: int):
+         return math.log(1 + (N - df + 0.5) / (df + 0.5))
+
+     @classmethod
+     def build_from_documents(
+         cls: Type[BM25Index],
+         documents: Iterable[Document],
+         store_raw: bool = True,
+         output_dir: Optional[str] = None,
+         ndocs: Optional[int] = None,
+         show_progress_bar: bool = True,
+         k1: float = 0.9,
+         b: float = 0.4,
+     ) -> BM25Index:
+         # Counting TFs, DFs, doc_lengths, etc.:
+         counting = run_counting(
+             documents=documents,
+             tokenize_fn=BM25Index.tokenize,
+             store_raw=store_raw,
+             ndocs=ndocs,
+             show_progress_bar=show_progress_bar,
+         )
+
+         # Compute term weights and caching:
+         posting_lists = counting.posting_lists
+         total_docs = len(counting.cid2docid)
+         BM25Index.cache_term_weights(
+             posting_lists=posting_lists,
+             total_docs=total_docs,
+             avgdl=counting.avgdl,
+             dfs=counting.dfs,
+             dls=counting.dls,
+             k1=k1,
+             b=b,
+         )
+
+         # Assembly and save:
+         index = BM25Index(
+             posting_lists=posting_lists,
+             vocab=counting.vocab,
+             cid2docid=counting.cid2docid,
+             collection_ids=counting.collection_ids,
+             doc_texts=counting.doc_texts,
+         )
+         return index
+
+
+ @dataclass
+ class CSCInvertedIndex:
+     posting_lists_matrix: csc_matrix  # term weights; [docid, tid] -> term weight
+     vocab: Dict[str, int]
+     cid2docid: Dict[str, int]  # collection_id -> docid
+     collection_ids: List[str]  # docid -> collection_id
+     doc_texts: Optional[List[str]] = None  # docid -> document text
+
+     def save(self, output_dir: str) -> None:
+         os.makedirs(output_dir, exist_ok=True)
+         with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
+             pickle.dump(self, f)
+
+     @classmethod
+     def from_saved(cls: Type[T], saved_dir: str) -> T:
+         index = cls(
+             posting_lists_matrix=None,
+             vocab={},
+             cid2docid={},
+             collection_ids=[],
+             doc_texts=None,
+         )
+         with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
+             index = pickle.load(f)
+         return index
+
+
+ @dataclass
+ class CSCBM25Index(CSCInvertedIndex):
+
+     @staticmethod
+     def tokenize(text: str) -> List[str]:
+         return simple_tokenize(text)
+
+     @staticmethod
+     def cache_term_weights(
+         posting_lists: List[PostingList],
+         total_docs: int,
+         avgdl: float,
+         dfs: List[int],
+         dls: List[int],
+         k1: float,
+         b: float,
+     ) -> csc_matrix:
+         """Compute term weights and caching"""
+         data = []
+         indices = []
+         indptr = [0]
+         max_docid = 0
+         for tid, posting_list in enumerate(
+             tqdm.tqdm(posting_lists, desc="Regularizing TFs")
+         ):
+             idf = CSCBM25Index.calc_idf(df=dfs[tid], N=total_docs)
+             for i in range(len(posting_list.docid_postings)):
+                 docid = posting_list.docid_postings[i]
+                 if docid > max_docid:
+                     max_docid = docid
+                 tf = posting_list.tweight_postings[i]
+                 dl = dls[docid]
+                 regularized_tf = CSCBM25Index.calc_regularized_tf(
+                     tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
+                 )
+                 result = regularized_tf * idf
+                 posting_list.tweight_postings[i] = result  # TODO?
+                 if result != 0:
+                     data.append(result)
+                     indices.append(docid)
+             indptr.append(len(data))
+
+         # Docids are 0-based, so the matrix needs max_docid + 1 rows.
+         shape = (max_docid + 1, len(posting_lists))
+         return csc_matrix((data, indices, indptr), shape=shape)
+
+     @staticmethod
+     def calc_regularized_tf(
+         tf: int, dl: float, avgdl: float, k1: float, b: float
+     ) -> float:
+         return tf / (tf + k1 * (1 - b + b * dl / avgdl))
+
+     @staticmethod
+     def calc_idf(df: int, N: int):
+         return math.log(1 + (N - df + 0.5) / (df + 0.5))
+
+     @classmethod
+     def build_from_documents(
+         cls: Type[CSCBM25Index],
+         documents: Iterable[Document],
+         store_raw: bool = True,
+         output_dir: Optional[str] = None,
+         ndocs: Optional[int] = None,
+         show_progress_bar: bool = True,
+         k1: float = 0.9,
+         b: float = 0.4,
+     ) -> CSCBM25Index:
+         # Counting TFs, DFs, doc_lengths, etc.:
+         counting = run_counting(
+             documents=documents,
+             tokenize_fn=CSCBM25Index.tokenize,
+             store_raw=store_raw,
+             ndocs=ndocs,
+             show_progress_bar=show_progress_bar,
+         )
+
+         # Compute term weights and caching:
+         posting_lists = counting.posting_lists
+         total_docs = len(counting.cid2docid)
+         posting_lists_matrix = CSCBM25Index.cache_term_weights(
+             posting_lists=posting_lists,
+             total_docs=total_docs,
+             avgdl=counting.avgdl,
+             dfs=counting.dfs,
+             dls=counting.dls,
+             k1=k1,
+             b=b,
+         )
+
+         # Assembly and save:
+         index = CSCBM25Index(
+             posting_lists_matrix=posting_lists_matrix,
+             vocab=counting.vocab,
+             cid2docid=counting.cid2docid,
+             collection_ids=counting.collection_ids,
+             doc_texts=counting.doc_texts,
+         )
+         return index
+
+
+ class BaseCSCInvertedIndexRetriever(BaseRetriever):
+
+     @property
+     @abstractmethod
+     def index_class(self) -> Type[CSCInvertedIndex]:
+         pass
+
+     def __init__(self, index_dir: str) -> None:
+         self.index = self.index_class.from_saved(index_dir)
+
+     def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
+         toks = self.index.tokenize(query)
+         target_docid = self.index.cid2docid[cid]
+         term_weights = {}
+         for tok in toks:
+             if tok not in self.index.vocab:
+                 continue
+             tid = self.index.vocab[tok]
+             weight = self.index.posting_lists_matrix[target_docid, tid]
+             if weight != 0:
+                 term_weights[tok] = weight
+         return term_weights
+
+     def score(self, query: str, cid: str) -> float:
+         return sum(self.get_term_weights(query=query, cid=cid).values())
+
+     def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
+         toks = self.index.tokenize(query)
+         scores = np.zeros(self.index.posting_lists_matrix.shape[0])
+         for tok in toks:
+             if tok not in self.index.vocab:
+                 continue
+             tid = self.index.vocab[tok]
+             col = self.index.posting_lists_matrix[:, tid].toarray().flatten()
+             scores += col
+
+         docids = np.argsort(scores)[::-1][:topk]
+         scores = scores[docids]
+         return {
+             self.index.collection_ids[docid]: score
+             for docid, score in zip(docids, scores)
+         }
+
+
+ class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):
+
+     @property
+     def index_class(self) -> Type[CSCBM25Index]:
+         return CSCBM25Index
+
+
+ if __name__ == "__main__":
+     top_k = 10
+     csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index")
+
+     def query(query: str) -> List[Hit]:
+         hits = []
+         for cid, score in csc_bm25_retriever.retrieve(query).items():
+             hit = Hit(
+                 cid=cid,
+                 score=score,
+                 text=csc_bm25_retriever.index.doc_texts[
+                     csc_bm25_retriever.index.cid2docid[cid]
+                 ],
+             )
+             hits.append(hit)
+         return hits
+
+     demo = gr.Interface(
+         fn=query,
+         inputs=gr.Textbox(lines=1, label="Query"),
+         # outputs=["text" for _ in range(top_k)],
+         outputs=[gr.Textbox(label=f"Result {i+1}") for i in range(top_k)],
+         title="BM25 Retriever",
+         description="Enter a query to retrieve the top BM25-ranked documents.",
+     )
+
+     demo.launch()
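
app.py only loads a prebuilt index from output/csc_bm25_index at startup; nothing in this commit creates that directory. A minimal sketch of the offline build step, reusing the classes above and assuming that Document takes exactly the collection_id and text fields that run_counting reads (the file name and toy corpus are illustrative only):

# build_index.py -- illustrative offline step, not part of this commit
from nlp4web_codebase.ir.data_loaders.dm import Document

from app import CSCBM25Index

# Toy corpus; the real index would be built from the actual document collection.
docs = [
    Document(
        collection_id="doc1",
        text="BM25 is a bag-of-words ranking function used by search engines.",
    ),
    Document(
        collection_id="doc2",
        text="An inverted index maps each term to the documents that contain it.",
    ),
]

index = CSCBM25Index.build_from_documents(documents=docs, ndocs=len(docs))
index.save("output/csc_bm25_index")  # the directory app.py loads at startup

Once output/csc_bm25_index is committed next to app.py, the CSCBM25Retriever in the __main__ block can unpickle it and serve queries through the Gradio interface.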
requirements.txt ADDED
@@ -0,0 +1,109 @@
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.4.3
+ aiohttp==3.10.10
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyio==4.6.2.post1
+ asttokens==2.4.1
+ attrs==24.2.0
+ audioop-lts==0.2.1
+ certifi==2024.8.30
+ charset-normalizer==3.4.0
+ click==8.1.7
+ comm==0.2.2
+ contourpy==1.3.0
+ cycler==0.12.1
+ datasets==3.0.1
+ debugpy==1.8.7
+ decorator==5.1.1
+ dill==0.3.8
+ exceptiongroup==1.2.2
+ executing==2.1.0
+ fastapi==0.115.4
+ ffmpy==0.4.0
+ filelock==3.16.1
+ fonttools==4.54.1
+ frozenlist==1.5.0
+ fsspec==2024.6.1
+ gradio==5.5.0
+ gradio_client==1.4.2
+ h11==0.14.0
+ httpcore==1.0.6
+ httpx==0.27.2
+ huggingface-hub==0.26.2
+ idna==3.10
+ importlib_metadata==8.5.0
+ ipykernel==6.29.5
+ ipython==8.29.0
+ jedi==0.19.1
+ Jinja2==3.1.4
+ joblib==1.4.2
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ kiwisolver==1.4.7
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ multidict==6.1.0
+ multiprocess==0.70.16
+ nest_asyncio==1.6.0
+ nlp4web-codebase @ git+https://github.com/kwang2049/nlp4web-codebase.git@83f9afbbf7e372c116fdd04997a96449007f861f
+ nltk==3.8.1
+ numpy==1.26.4
+ orjson==3.10.11
+ packaging==24.1
+ pandas==2.2.2
+ parso==0.8.4
+ pexpect==4.9.0
+ pickleshare==0.7.5
+ pillow==11.0.0
+ pip==24.2
+ platformdirs==4.3.6
+ prompt_toolkit==3.0.48
+ propcache==0.2.0
+ psutil==6.1.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==18.0.0
+ pydantic==2.9.2
+ pydantic_core==2.23.4
+ pydub==0.25.1
+ Pygments==2.18.0
+ pyparsing==3.2.0
+ python-dateutil==2.9.0
+ python-multipart==0.0.12
+ pytrec_eval==0.5
+ pytz==2024.2
+ PyYAML==6.0.2
+ pyzmq==26.2.0
+ regex==2024.9.11
+ requests==2.32.3
+ rich==13.9.4
+ ruff==0.7.2
+ safehttpx==0.1.1
+ scipy==1.13.1
+ semantic-version==2.10.0
+ setuptools==75.1.0
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ stack-data==0.6.2
+ starlette==0.41.2
+ tomlkit==0.12.0
+ tornado==6.4.1
+ tqdm==4.66.5
+ traitlets==5.14.3
+ typer==0.12.5
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ ujson==5.10.0
+ urllib3==2.2.3
+ uvicorn==0.32.0
+ wcwidth==0.2.13
+ websockets==12.0
+ wheel==0.44.0
+ xxhash==3.5.0
+ yarl==1.17.1
+ zipp==3.20.2