Spaces:
Build error
Build error
Commit
·
8e97619
1
Parent(s):
1c973d4
import am anfang
Browse files
app.py
CHANGED
|
@@ -25,6 +25,36 @@ import re
|
|
| 25 |
import nltk
|
| 26 |
nltk.download("stopwords", quiet=True)
|
| 27 |
from nltk.corpus import stopwords as nltk_stopwords
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
LANGUAGE = "english"
|
| 30 |
word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
|
|
@@ -147,20 +177,11 @@ def run_counting(
|
|
| 147 |
doc_texts=doc_texts,
|
| 148 |
)
|
| 149 |
|
| 150 |
-
from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
|
| 151 |
sciq = load_sciq()
|
| 152 |
counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
|
| 153 |
|
| 154 |
"""### BM25 Index"""
|
| 155 |
|
| 156 |
-
from __future__ import annotations
|
| 157 |
-
from dataclasses import asdict, dataclass
|
| 158 |
-
import math
|
| 159 |
-
import os
|
| 160 |
-
from typing import Iterable, List, Optional, Type
|
| 161 |
-
import tqdm
|
| 162 |
-
from nlp4web_codebase.ir.data_loaders.dm import Document
|
| 163 |
-
|
| 164 |
|
| 165 |
@dataclass
|
| 166 |
class BM25Index(InvertedIndex):
|
|
@@ -257,9 +278,7 @@ bm25_index.save("output/bm25_index")
|
|
| 257 |
|
| 258 |
"""### BM25 Retriever"""
|
| 259 |
|
| 260 |
-
|
| 261 |
-
from typing import Type
|
| 262 |
-
from abc import abstractmethod
|
| 263 |
|
| 264 |
|
| 265 |
class BaseInvertedIndexRetriever(BaseRetriever):
|
|
@@ -330,10 +349,6 @@ Tune b and k1 on the **dev** split of SciQ using the metric MAP@10. The evaluati
|
|
| 330 |
$${\displaystyle {\text{score}}(D,Q)=\sum _{i=1}^{n}{\text{IDF}}(q_{i})\cdot {\frac {f(q_{i},D)\cdot (k_{1}+1)}{f(q_{i},D)+k_{1}\cdot \left(1-b+b\cdot {\frac {|D|}{\text{avgdl}}}\right)}}}$$
|
| 331 |
"""
|
| 332 |
|
| 333 |
-
from nlp4web_codebase.ir.data_loaders import Split
|
| 334 |
-
import pytrec_eval
|
| 335 |
-
import numpy as np
|
| 336 |
-
|
| 337 |
|
| 338 |
|
| 339 |
def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> float:
|
|
@@ -346,7 +361,6 @@ def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> floa
|
|
| 346 |
"""Example of using the pre-requisite code:"""
|
| 347 |
|
| 348 |
# Loading dataset:
|
| 349 |
-
from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
|
| 350 |
sciq = load_sciq()
|
| 351 |
counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
|
| 352 |
|
|
@@ -444,7 +458,6 @@ print(plots_b["Y"][1])
|
|
| 444 |
print(plots_k1)
|
| 445 |
print(plots_b)
|
| 446 |
|
| 447 |
-
from matplotlib import pyplot as plt
|
| 448 |
plt.plot(plots_b["X"], plots_b["Y"], label="b")
|
| 449 |
plt.plot(plots_k1["X"], plots_k1["Y"], label="k1")
|
| 450 |
plt.ylabel("MAP")
|
|
@@ -486,7 +499,6 @@ Convert the matrix \begin{bmatrix}
|
|
| 486 |
\end{bmatrix} to a `csc_matrix` by specifying `data`, `indices`, `indptr` and `shape`.
|
| 487 |
"""
|
| 488 |
|
| 489 |
-
from scipy.sparse._csc import csc_matrix
|
| 490 |
input_matrix = [[0, 1, 0, 3], [10, 2, 1, 0], [0, 0, 0, 9]]
|
| 491 |
data = None
|
| 492 |
indices = None
|
|
@@ -760,9 +772,6 @@ def search(query: str) -> List[Hit]:
|
|
| 760 |
```
|
| 761 |
"""
|
| 762 |
|
| 763 |
-
import gradio as gr
|
| 764 |
-
from typing import TypedDict
|
| 765 |
-
|
| 766 |
class Hit(TypedDict):
|
| 767 |
cid: str
|
| 768 |
score: float
|
|
|
|
| 25 |
import nltk
|
| 26 |
nltk.download("stopwords", quiet=True)
|
| 27 |
from nltk.corpus import stopwords as nltk_stopwords
|
| 28 |
+
from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
import gradio as gr
|
| 32 |
+
from typing import TypedDict
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
from __future__ import annotations
|
| 36 |
+
from dataclasses import asdict, dataclass
|
| 37 |
+
import math
|
| 38 |
+
import os
|
| 39 |
+
from typing import Iterable, List, Optional, Type
|
| 40 |
+
import tqdm
|
| 41 |
+
from nlp4web_codebase.ir.data_loaders.dm import Document
|
| 42 |
+
|
| 43 |
+
from nlp4web_codebase.ir.models import BaseRetriever
|
| 44 |
+
from typing import Type
|
| 45 |
+
from abc import abstractmethod
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
from nlp4web_codebase.ir.data_loaders import Split
|
| 49 |
+
import pytrec_eval
|
| 50 |
+
import numpy as np
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
from matplotlib import pyplot as plt
|
| 54 |
+
|
| 55 |
+
from scipy.sparse._csc import csc_matrix
|
| 56 |
+
|
| 57 |
+
|
| 58 |
|
| 59 |
LANGUAGE = "english"
|
| 60 |
word_splitter = re.compile(r"(?u)\b\w\w+\b").findall
|
|
|
|
| 177 |
doc_texts=doc_texts,
|
| 178 |
)
|
| 179 |
|
|
|
|
| 180 |
sciq = load_sciq()
|
| 181 |
counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
|
| 182 |
|
| 183 |
"""### BM25 Index"""
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
@dataclass
|
| 187 |
class BM25Index(InvertedIndex):
|
|
|
|
| 278 |
|
| 279 |
"""### BM25 Retriever"""
|
| 280 |
|
| 281 |
+
|
|
|
|
|
|
|
| 282 |
|
| 283 |
|
| 284 |
class BaseInvertedIndexRetriever(BaseRetriever):
|
|
|
|
| 349 |
$${\displaystyle {\text{score}}(D,Q)=\sum _{i=1}^{n}{\text{IDF}}(q_{i})\cdot {\frac {f(q_{i},D)\cdot (k_{1}+1)}{f(q_{i},D)+k_{1}\cdot \left(1-b+b\cdot {\frac {|D|}{\text{avgdl}}}\right)}}}$$
|
| 350 |
"""
|
| 351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
|
| 353 |
|
| 354 |
def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> float:
|
|
|
|
| 361 |
"""Example of using the pre-requisite code:"""
|
| 362 |
|
| 363 |
# Loading dataset:
|
|
|
|
| 364 |
sciq = load_sciq()
|
| 365 |
counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus))
|
| 366 |
|
|
|
|
| 458 |
print(plots_k1)
|
| 459 |
print(plots_b)
|
| 460 |
|
|
|
|
| 461 |
plt.plot(plots_b["X"], plots_b["Y"], label="b")
|
| 462 |
plt.plot(plots_k1["X"], plots_k1["Y"], label="k1")
|
| 463 |
plt.ylabel("MAP")
|
|
|
|
| 499 |
\end{bmatrix} to a `csc_matrix` by specifying `data`, `indices`, `indptr` and `shape`.
|
| 500 |
"""
|
| 501 |
|
|
|
|
| 502 |
input_matrix = [[0, 1, 0, 3], [10, 2, 1, 0], [0, 0, 0, 9]]
|
| 503 |
data = None
|
| 504 |
indices = None
|
|
|
|
| 772 |
```
|
| 773 |
"""
|
| 774 |
|
|
|
|
|
|
|
|
|
|
| 775 |
class Hit(TypedDict):
|
| 776 |
cid: str
|
| 777 |
score: float
|