Spaces:

librarian-bots
/

new-datasets-in-machine-learning

Running

File size: 3,818 Bytes

ad38c8f

import arxiv
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from cachetools import TTLCache, cached
from setfit import SetFitModel
from tqdm.auto import tqdm

CACHE_TIME = 60 * 60 * 12
MAX_RESULTS = 30_000


@cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
def get_arxiv_result():
    search = arxiv.Search(
        query="ti:dataset AND abs:machine learning",
        max_results=MAX_RESULTS,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    return [
        {
            "title": result.title,
            "abstract": result.summary,
            "url": result.entry_id,
            "category": result.primary_category,
            "updated": result.updated,
        }
        for result in tqdm(search.results(), total=MAX_RESULTS)
    ]


def load_model():
    return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")


def format_row_for_model(row):
    return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}"


int2label = {0: "new_dataset", 1: "not_new_dataset"}


def get_predictions(data: list[dict], model=None, batch_size=32):
    if model is None:
        model = load_model()
    predictions = []
    for i in tqdm(range(0, len(data), batch_size)):
        batch = data[i : i + batch_size]
        text_inputs = [format_row_for_model(row) for row in batch]
        batch_predictions = model.predict_proba(text_inputs)
        for j, row in enumerate(batch):
            prediction = batch_predictions[j]
            row["prediction"] = int2label[int(prediction.argmax())]
            row["probability"] = float(prediction.max())
            predictions.append(row)
    return predictions


def create_markdown(row):
    title = row["title"]
    abstract = row["abstract"]
    arxiv_id = row["arxiv_id"]
    hub_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
    updated = row["updated"]
    updated = updated.strftime("%Y-%m-%d")
    broad_category = row["broad_category"]
    category = row["category"]
    return f""" <h1> {title} </h1> updated: {updated} 
    | category: {broad_category}  | subcategory: {category} |
\n\n{abstract}
\n\n [Hugging Face Papers page]({hub_paper_url})
    """


@cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
def prepare_data():
    print("Downloading arxiv results...")
    arxiv_results = get_arxiv_result()
    print("loading model...")
    model = load_model()
    print("Making predictions...")
    predictions = get_predictions(arxiv_results, model=model)
    df = pd.DataFrame(predictions)
    df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
    df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
    df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
    return df


all_possible_arxiv_categories = prepare_data().category.unique().tolist()
broad_categories = prepare_data().broad_category.unique().tolist()


def create_markdown_summary(categories=broad_categories, all_categories=None):
    df = prepare_data()
    if categories is not None:
        df = df[df["broad_category"].isin(categories)]
    return "\n\n".join(df["markdown"].tolist())


scheduler = BackgroundScheduler()
scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
scheduler.start()

with gr.Blocks() as demo:
    gr.Markdown("## New Datasets in Machine Learning")
    gr.Markdown(
        "This Space attempts to show new papers on arXiv that are *likely* to be papers"
        " introducing new datasets. \n\n"
    )
    broad_categories = gr.Dropdown(
        choices=broad_categories,
        label="Categories",
        multiselect=True,
        value=broad_categories,
    )
    results = gr.Markdown(create_markdown_summary())
    broad_categories.change(create_markdown_summary, broad_categories, results)

demo.launch()