davanstrien (HF staff) committed
Commit
ad38c8f
1 Parent(s): 3c378cf
Files changed (3)
  1. app.py +120 -0
  2. requirements.in +6 -0
  3. requirements.txt +329 -0
app.py ADDED
@@ -0,0 +1,120 @@
+import arxiv
+import gradio as gr
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from cachetools import TTLCache, cached
+from setfit import SetFitModel
+from tqdm.auto import tqdm
+
+CACHE_TIME = 60 * 60 * 12
+MAX_RESULTS = 30_000
+
+
+@cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
+def get_arxiv_result():
+    search = arxiv.Search(
+        query="ti:dataset AND abs:machine learning",
+        max_results=MAX_RESULTS,
+        sort_by=arxiv.SortCriterion.SubmittedDate,
+    )
+    return [
+        {
+            "title": result.title,
+            "abstract": result.summary,
+            "url": result.entry_id,
+            "category": result.primary_category,
+            "updated": result.updated,
+        }
+        for result in tqdm(search.results(), total=MAX_RESULTS)
+    ]
+
+
+def load_model():
+    return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")
+
+
+def format_row_for_model(row):
+    return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}"
+
+
+int2label = {0: "new_dataset", 1: "not_new_dataset"}
+
+
+def get_predictions(data: list[dict], model=None, batch_size=32):
+    if model is None:
+        model = load_model()
+    predictions = []
+    for i in tqdm(range(0, len(data), batch_size)):
+        batch = data[i : i + batch_size]
+        text_inputs = [format_row_for_model(row) for row in batch]
+        batch_predictions = model.predict_proba(text_inputs)
+        for j, row in enumerate(batch):
+            prediction = batch_predictions[j]
+            row["prediction"] = int2label[int(prediction.argmax())]
+            row["probability"] = float(prediction.max())
+            predictions.append(row)
+    return predictions
+
+
+def create_markdown(row):
+    title = row["title"]
+    abstract = row["abstract"]
+    arxiv_id = row["arxiv_id"]
+    hub_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
+    updated = row["updated"]
+    updated = updated.strftime("%Y-%m-%d")
+    broad_category = row["broad_category"]
+    category = row["category"]
+    return f""" <h1> {title} </h1> updated: {updated}
+    | category: {broad_category} | subcategory: {category} |
+    \n\n{abstract}
+    \n\n [Hugging Face Papers page]({hub_paper_url})
+    """
+
+
+@cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
+def prepare_data():
+    print("Downloading arxiv results...")
+    arxiv_results = get_arxiv_result()
+    print("loading model...")
+    model = load_model()
+    print("Making predictions...")
+    predictions = get_predictions(arxiv_results, model=model)
+    df = pd.DataFrame(predictions)
+    df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
+    df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
+    df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
+    return df
+
+
+all_possible_arxiv_categories = prepare_data().category.unique().tolist()
+broad_categories = prepare_data().broad_category.unique().tolist()
+
+
+def create_markdown_summary(categories=broad_categories, all_categories=None):
+    df = prepare_data()
+    if categories is not None:
+        df = df[df["broad_category"].isin(categories)]
+    return "\n\n".join(df["markdown"].tolist())
+
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
+scheduler.start()
+
+with gr.Blocks() as demo:
+    gr.Markdown("## New Datasets in Machine Learning")
+    gr.Markdown(
+        "This Space attempts to show new papers on arXiv that are *likely* to be papers"
+        " introducing new datasets. \n\n"
+    )
+    broad_categories = gr.Dropdown(
+        choices=broad_categories,
+        label="Categories",
+        multiselect=True,
+        value=broad_categories,
+    )
+    results = gr.Markdown(create_markdown_summary())
+    broad_categories.change(create_markdown_summary, broad_categories, results)
+
+demo.launch()
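
For reference, a minimal sketch (not part of the commit) of how the same SetFit checkpoint, label map, and text formatting used in app.py above can be exercised on a single record outside the Gradio UI; the title and abstract are made-up placeholders.

# Minimal sketch: score one made-up record with the checkpoint and formatting
# that app.py uses above; not part of the committed code.
from setfit import SetFitModel

model = SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")
int2label = {0: "new_dataset", 1: "not_new_dataset"}

# Hypothetical title/abstract, joined the same way as format_row_for_model.
text = "TITLE: A Benchmark Dataset for Example Tasks \n\nABSTRACT: We introduce ..."
proba = model.predict_proba([text])[0]
print(int2label[int(proba.argmax())], float(proba.max()))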
requirements.in ADDED
@@ -0,0 +1,6 @@
+apscheduler
+arxiv
+cachetools
+gradio
+scikit-learn==1.2.2
+setfit
requirements.txt ADDED
@@ -0,0 +1,329 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile
+#
+aiofiles==23.2.1
+    # via gradio
+aiohttp==3.8.5
+    # via
+    #   datasets
+    #   fsspec
+aiosignal==1.3.1
+    # via aiohttp
+altair==5.1.2
+    # via gradio
+annotated-types==0.5.0
+    # via pydantic
+anyio==3.7.1
+    # via
+    #   fastapi
+    #   httpcore
+    #   starlette
+apscheduler==3.10.4
+    # via -r requirements.in
+arxiv==1.4.8
+    # via -r requirements.in
+async-timeout==4.0.3
+    # via aiohttp
+attrs==23.1.0
+    # via
+    #   aiohttp
+    #   jsonschema
+    #   referencing
+cachetools==5.3.1
+    # via -r requirements.in
+certifi==2023.7.22
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+charset-normalizer==3.3.0
+    # via
+    #   aiohttp
+    #   requests
+click==8.1.7
+    # via
+    #   nltk
+    #   uvicorn
+contourpy==1.1.1
+    # via matplotlib
+cycler==0.12.0
+    # via matplotlib
+datasets==2.14.5
+    # via
+    #   evaluate
+    #   setfit
+dill==0.3.7
+    # via
+    #   datasets
+    #   evaluate
+    #   multiprocess
+evaluate==0.4.0
+    # via setfit
+fastapi==0.103.2
+    # via gradio
+feedparser==6.0.10
+    # via arxiv
+ffmpy==0.3.1
+    # via gradio
+filelock==3.12.4
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
+fonttools==4.43.0
+    # via matplotlib
+frozenlist==1.4.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec[http]==2023.6.0
+    # via
+    #   datasets
+    #   evaluate
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+gradio==3.46.1
+    # via -r requirements.in
+gradio-client==0.5.3
+    # via gradio
+h11==0.14.0
+    # via
+    #   httpcore
+    #   uvicorn
+httpcore==0.18.0
+    # via httpx
+httpx==0.25.0
+    # via
+    #   gradio
+    #   gradio-client
+huggingface-hub==0.16.4
+    # via
+    #   datasets
+    #   evaluate
+    #   gradio
+    #   gradio-client
+    #   sentence-transformers
+    #   tokenizers
+    #   transformers
+idna==3.4
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+importlib-resources==6.1.0
+    # via gradio
+jinja2==3.1.2
+    # via
+    #   altair
+    #   gradio
+    #   torch
+joblib==1.3.2
+    # via
+    #   nltk
+    #   scikit-learn
+jsonschema==4.19.1
+    # via altair
+jsonschema-specifications==2023.7.1
+    # via jsonschema
+kiwisolver==1.4.5
+    # via matplotlib
+markupsafe==2.1.3
+    # via
+    #   gradio
+    #   jinja2
+matplotlib==3.8.0
+    # via gradio
+mpmath==1.3.0
+    # via sympy
+multidict==6.0.4
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.15
+    # via
+    #   datasets
+    #   evaluate
+networkx==3.1
+    # via torch
+nltk==3.8.1
+    # via sentence-transformers
+numpy==1.26.0
+    # via
+    #   altair
+    #   contourpy
+    #   datasets
+    #   evaluate
+    #   gradio
+    #   matplotlib
+    #   pandas
+    #   pyarrow
+    #   scikit-learn
+    #   scipy
+    #   sentence-transformers
+    #   torchvision
+    #   transformers
+orjson==3.9.7
+    # via gradio
+packaging==23.2
+    # via
+    #   altair
+    #   datasets
+    #   evaluate
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   matplotlib
+    #   transformers
+pandas==2.1.1
+    # via
+    #   altair
+    #   datasets
+    #   evaluate
+    #   gradio
+pillow==10.0.1
+    # via
+    #   gradio
+    #   matplotlib
+    #   torchvision
+pyarrow==13.0.0
+    # via datasets
+pydantic==2.4.2
+    # via
+    #   fastapi
+    #   gradio
+pydantic-core==2.10.1
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pyparsing==3.1.1
+    # via matplotlib
+python-dateutil==2.8.2
+    # via
+    #   matplotlib
+    #   pandas
+python-multipart==0.0.6
+    # via gradio
+pytz==2023.3.post1
+    # via
+    #   apscheduler
+    #   pandas
+pyyaml==6.0.1
+    # via
+    #   datasets
+    #   gradio
+    #   huggingface-hub
+    #   transformers
+referencing==0.30.2
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+regex==2023.10.3
+    # via
+    #   nltk
+    #   transformers
+requests==2.31.0
+    # via
+    #   datasets
+    #   evaluate
+    #   fsspec
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   responses
+    #   torchvision
+    #   transformers
+responses==0.18.0
+    # via evaluate
+rpds-py==0.10.4
+    # via
+    #   jsonschema
+    #   referencing
+safetensors==0.3.3
+    # via transformers
+scikit-learn==1.2.2
+    # via
+    #   -r requirements.in
+    #   sentence-transformers
+scipy==1.11.3
+    # via
+    #   scikit-learn
+    #   sentence-transformers
+semantic-version==2.10.0
+    # via gradio
+sentence-transformers==2.2.2
+    # via setfit
+sentencepiece==0.1.99
+    # via sentence-transformers
+setfit==0.7.0
+    # via -r requirements.in
+sgmllib3k==1.0.0
+    # via feedparser
+six==1.16.0
+    # via
+    #   apscheduler
+    #   python-dateutil
+sniffio==1.3.0
+    # via
+    #   anyio
+    #   httpcore
+    #   httpx
+starlette==0.27.0
+    # via fastapi
+sympy==1.12
+    # via torch
+threadpoolctl==3.2.0
+    # via scikit-learn
+tokenizers==0.14.0
+    # via transformers
+toolz==0.12.0
+    # via altair
+torch==2.1.0
+    # via
+    #   sentence-transformers
+    #   torchvision
+torchvision==0.16.0
+    # via sentence-transformers
+tqdm==4.66.1
+    # via
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   nltk
+    #   sentence-transformers
+    #   transformers
+transformers==4.34.0
+    # via sentence-transformers
+typing-extensions==4.8.0
+    # via
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   pydantic
+    #   pydantic-core
+    #   torch
+tzdata==2023.3
+    # via pandas
+tzlocal==5.1
+    # via apscheduler
+urllib3==2.0.6
+    # via
+    #   requests
+    #   responses
+uvicorn==0.23.2
+    # via gradio
+websockets==11.0.3
+    # via
+    #   gradio
+    #   gradio-client
+xxhash==3.4.1
+    # via
+    #   datasets
+    #   evaluate
+yarl==1.9.2
+    # via aiohttp