File size: 3,818 Bytes
ad38c8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import arxiv
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from cachetools import TTLCache, cached
from setfit import SetFitModel
from tqdm.auto import tqdm

CACHE_TIME = 60 * 60 * 12
MAX_RESULTS = 30_000


@cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
def get_arxiv_result():
    search = arxiv.Search(
        query="ti:dataset AND abs:machine learning",
        max_results=MAX_RESULTS,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    return [
        {
            "title": result.title,
            "abstract": result.summary,
            "url": result.entry_id,
            "category": result.primary_category,
            "updated": result.updated,
        }
        for result in tqdm(search.results(), total=MAX_RESULTS)
    ]


def load_model():
    return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")


def format_row_for_model(row):
    return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}"


int2label = {0: "new_dataset", 1: "not_new_dataset"}


def get_predictions(data: list[dict], model=None, batch_size=32):
    if model is None:
        model = load_model()
    predictions = []
    for i in tqdm(range(0, len(data), batch_size)):
        batch = data[i : i + batch_size]
        text_inputs = [format_row_for_model(row) for row in batch]
        batch_predictions = model.predict_proba(text_inputs)
        for j, row in enumerate(batch):
            prediction = batch_predictions[j]
            row["prediction"] = int2label[int(prediction.argmax())]
            row["probability"] = float(prediction.max())
            predictions.append(row)
    return predictions


def create_markdown(row):
    title = row["title"]
    abstract = row["abstract"]
    arxiv_id = row["arxiv_id"]
    hub_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
    updated = row["updated"]
    updated = updated.strftime("%Y-%m-%d")
    broad_category = row["broad_category"]
    category = row["category"]
    return f""" <h1> {title} </h1> updated: {updated} 
    | category: {broad_category}  | subcategory: {category} |
\n\n{abstract}
\n\n [Hugging Face Papers page]({hub_paper_url})
    """


@cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
def prepare_data():
    print("Downloading arxiv results...")
    arxiv_results = get_arxiv_result()
    print("loading model...")
    model = load_model()
    print("Making predictions...")
    predictions = get_predictions(arxiv_results, model=model)
    df = pd.DataFrame(predictions)
    df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
    df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
    df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
    return df


all_possible_arxiv_categories = prepare_data().category.unique().tolist()
broad_categories = prepare_data().broad_category.unique().tolist()


def create_markdown_summary(categories=broad_categories, all_categories=None):
    df = prepare_data()
    if categories is not None:
        df = df[df["broad_category"].isin(categories)]
    return "\n\n".join(df["markdown"].tolist())


scheduler = BackgroundScheduler()
scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
scheduler.start()

with gr.Blocks() as demo:
    gr.Markdown("## New Datasets in Machine Learning")
    gr.Markdown(
        "This Space attempts to show new papers on arXiv that are *likely* to be papers"
        " introducing new datasets. \n\n"
    )
    broad_categories = gr.Dropdown(
        choices=broad_categories,
        label="Categories",
        multiselect=True,
        value=broad_categories,
    )
    results = gr.Markdown(create_markdown_summary())
    broad_categories.change(create_markdown_summary, broad_categories, results)

demo.launch()