import arxiv
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from cachetools import TTLCache, cached
from setfit import SetFitModel
from tqdm.auto import tqdm

# Cache expensive calls for 12 hours so repeated requests don't re-query arXiv.
CACHE_TIME = 60 * 60 * 12
MAX_RESULTS = 30_000


@cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
def get_arxiv_result():
    # Fetch recent papers whose title mentions "dataset" and whose abstract
    # mentions machine learning, newest first.
    search = arxiv.Search(
        query="ti:dataset AND abs:machine learning",
        max_results=MAX_RESULTS,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    return [
        {
            "title": result.title,
            "abstract": result.summary,
            "url": result.entry_id,
            "category": result.primary_category,
            "updated": result.updated,
        }
        for result in tqdm(search.results(), total=MAX_RESULTS)
    ]


def load_model():
    return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")


def format_row_for_model(row):
    return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}"


int2label = {0: "new_dataset", 1: "not_new_dataset"}


def get_predictions(data: list[dict], model=None, batch_size=32):
    if model is None:
        model = load_model()
    predictions = []
    # Score papers in batches; predict_proba returns one probability row per input.
    for i in tqdm(range(0, len(data), batch_size)):
        batch = data[i : i + batch_size]
        text_inputs = [format_row_for_model(row) for row in batch]
        batch_predictions = model.predict_proba(text_inputs)
        for j, row in enumerate(batch):
            prediction = batch_predictions[j]
            row["prediction"] = int2label[int(prediction.argmax())]
            row["probability"] = float(prediction.max())
            predictions.append(row)
    return predictions


def create_markdown(row):
    # Render one paper as a markdown card linking to its Hugging Face Papers page.
    title = row["title"]
    abstract = row["abstract"]
    arxiv_id = row["arxiv_id"]
    hub_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
    updated = row["updated"].strftime("%Y-%m-%d")
    broad_category = row["broad_category"]
    category = row["category"]
    return f"""

### {title}

updated: {updated} | category: {broad_category} | subcategory: {category}

{abstract}

[Hugging Face Papers page]({hub_paper_url})
"""


@cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
def prepare_data():
    print("Downloading arXiv results...")
    arxiv_results = get_arxiv_result()
    print("Loading model...")
    model = load_model()
    print("Making predictions...")
    predictions = get_predictions(arxiv_results, model=model)
    df = pd.DataFrame(predictions)
    # The numeric arXiv ID (e.g. "2401.12345") is extracted from the entry URL.
    df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
    # "cs.CL" -> "cs", "stat.ML" -> "stat", and so on.
    df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
    df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
    return df


# Warm the cache at startup and collect the category choices for the UI.
all_possible_arxiv_categories = prepare_data().category.unique().tolist()
broad_categories = prepare_data().broad_category.unique().tolist()


def create_markdown_summary(categories=broad_categories):
    df = prepare_data()
    if categories is not None:
        df = df[df["broad_category"].isin(categories)]
    return "\n\n".join(df["markdown"].tolist())


# Refresh the data daily at 03:30; the call recomputes once the 12-hour TTL
# on prepare_data has expired, otherwise it is a cheap cache hit.
scheduler = BackgroundScheduler()
scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
scheduler.start()

with gr.Blocks() as demo:
    gr.Markdown("## New Datasets in Machine Learning")
    gr.Markdown(
        "This Space attempts to surface new arXiv papers that are *likely* to"
        " introduce new datasets."
    )
    broad_categories_dropdown = gr.Dropdown(
        choices=broad_categories,
        label="Categories",
        multiselect=True,
        value=broad_categories,
    )
    results = gr.Markdown(create_markdown_summary())
    broad_categories_dropdown.change(
        create_markdown_summary, broad_categories_dropdown, results
    )

demo.launch()
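
# A small cleanup addition (a sketch, not in the original Space): in a local run,
# `demo.launch()` blocks until the server exits, so stopping the APScheduler
# background thread here lets the process terminate cleanly afterwards.
scheduler.shutdown()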