# Hugging Face Space: nielsr/community-science-progress (status: Sleeping)
# File size: 6,104 bytes

import dataclasses
from multiprocessing import cpu_count
import tqdm
import requests
import streamlit as st

import pandas as pd
from datasets import Dataset, load_dataset
from paperswithcode import PapersWithCodeClient


@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """
    Immutable record describing one paper.

    Instances are built in `get_df` from a merged dataframe row plus a
    computed `paper_page`, then converted back to dicts with
    `dataclasses.asdict`, so the field order below is also the column order
    of the resulting dataframe.
    """

    # Used as the dataframe index and parsed with `pd.to_datetime` in `get_df`.
    date: str
    # Identifier used to build `paper_page` and to query the Hugging Face API.
    arxiv_id: str
    # Github repository URL; the rest of the file treats "" as "no repository".
    github: str
    # Paper title; used as the PapersWithCode search query.
    title: str
    # Built in `get_df` as https://huggingface.co/papers/<arxiv_id>.
    paper_page: str
    # NOTE(review): presumably counts from the daily-papers-stats dataset - confirm.
    upvotes: int
    num_comments: int


def get_df(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Load the daily papers and their stats as one date-indexed Pandas dataframe.

    The two source datasets are merged on `arxiv_id`, each row is validated by
    constructing a `PaperInfo` (which also adds the `paper_page` URL), and the
    result is restricted to the requested date range.

    Args:
        start_date: Inclusive lower bound, compared against the datetime index.
            Use an unambiguous format such as YYYY-MM-DD.
        end_date: Inclusive upper bound, same format as `start_date`.

    Returns:
        A dataframe indexed by `date` (as datetimes) whose columns are the
        remaining `PaperInfo` fields. It is empty, but still has those
        columns, when no paper matches.
    """
    # Imported explicitly: a bare `import tqdm` does not load the `tqdm.auto`
    # submodule, so `tqdm.auto.tqdm` only resolves when some other package
    # happens to have imported it already.
    from tqdm.auto import tqdm as progress_bar

    df = pd.merge(
        left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
        right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
        on="arxiv_id",
    )
    # reverse the row order
    df = df[::-1].reset_index(drop=True)

    # Building a PaperInfo per row fails loudly if the merged columns do not
    # match the expected schema.
    records = [
        dataclasses.asdict(
            PaperInfo(
                **row,
                paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
            )
        )
        for _, row in progress_bar(df.iterrows(), total=len(df))
    ]

    # Passing the column names explicitly keeps the schema (and therefore the
    # 'date' column used below) even when there are no records at all.
    columns = [field.name for field in dataclasses.fields(PaperInfo)]
    df = pd.DataFrame(records, columns=columns)

    # set date as index
    df = df.set_index('date')
    df.index = pd.to_datetime(df.index)
    # only include data between start_date and end_date
    df = df[(df.index >= start_date) & (df.index <= end_date)]

    return df


def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:
    """
    Get the official Github URL for a paper via the PapersWithCode API.

    The lookup is best-effort: any API or lookup error results in an empty
    string rather than an exception.

    Args:
        client: PapersWithCode client used for all requests.
        paper_title: Title used as the search query; the first search hit is
            taken to be the paper.

    Returns:
        The URL of the repository flagged as official (the last one if several
        are flagged), or "" if none is found or the lookup fails.
    """

    repo_url = ""
    try:
        # get paper ID (first search hit; raises IndexError when there is none)
        results = client.paper_list(q=paper_title).results
        paper_id = results[0].id

        # get paper
        paper = client.paper_get(paper_id=paper_id)

        # get repositories
        repositories = client.paper_repository_list(paper_id=paper.id).results

        for repo in repositories:
            if repo.is_official:
                repo_url = repo.url

    except Exception:
        # Deliberately best-effort, but only for ordinary errors: unlike a bare
        # `except:`, this lets KeyboardInterrupt and SystemExit propagate so a
        # long enrichment run can still be interrupted.
        pass

    return repo_url


def add_metadata_batch(batch, client: PapersWithCodeClient):
    """
    Replace the Github links of a batch of papers with PapersWithCode lookups.

    Each entry of `batch["title"]` is looked up with `get_github_url`, and the
    results are written to `batch["github"]` in the same order. The batch is
    modified in place and also returned.
    """

    # one lookup per title; this overwrites any existing Github links
    batch["github"] = [get_github_url(client, title) for title in batch["title"]]

    return batch


def _count_hf_assets(arxiv_id):
    """
    Return (num_models, num_datasets, num_spaces) linked to one arXiv ID.

    The lookup is best-effort: a missing ID, a network error, a non-2xx
    response or an unexpected payload all count as zero assets.
    """
    if not arxiv_id:
        return 0, 0, 0

    try:
        # a timeout keeps one stalled connection from hanging the whole run
        response = requests.get(
            f"https://huggingface.co/api/arxiv/{arxiv_id}/repos", timeout=30
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding errors on older requests versions
        return 0, 0, 0

    if not isinstance(result, dict):
        return 0, 0, 0

    return tuple(len(result.get(key) or []) for key in ("models", "datasets", "spaces"))


def add_hf_assets(batch):
    """
    Add Hugging Face assets to a batch of papers.

    For every entry of `batch["arxiv_id"]` the Hugging Face API is queried for
    the repositories linked to that paper. Three columns are added, in batch
    order: `num_models`, `num_datasets` and `num_spaces`. Papers without an
    arXiv ID, or whose lookup fails, get zero for all three.

    The batch is modified in place and also returned.
    """
    counts = [_count_hf_assets(arxiv_id) for arxiv_id in batch["arxiv_id"]]

    batch["num_models"] = [models for models, _, _ in counts]
    batch["num_datasets"] = [datasets for _, datasets, _ in counts]
    batch["num_spaces"] = [spaces for _, _, spaces in counts]

    return batch


def _fetch_readme(owner, repo):
    """
    Return the README.md text of a Github repository, or None if unavailable.

    The `main` branch is tried first, then `master`. Network errors and
    non-200 responses are treated as "no README on this branch".
    """
    for branch in ("main", "master"):
        url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
        try:
            # a timeout keeps one stalled connection from hanging the whole run
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            continue
        if response.status_code == 200:
            return response.text
    return None


def check_hf_mention(batch):
    """
    Check if a paper mentions Hugging Face in the README.

    For every entry of `batch["github"]` the repository's README.md is
    fetched and searched, case-insensitively, for "huggingface" or
    "hugging face". The result is stored in `batch["hf_mention"]` as 1
    (mentioned) or 0 (not mentioned, no repository, or README unavailable).

    The batch is modified in place and also returned.
    """

    hf_mentions = []
    for github_url in batch["github"]:
        hf_mention = 0
        if github_url:
            # the last two path segments are owner and repository name;
            # a trailing slash would otherwise yield an empty repo name
            parts = github_url.rstrip("/").split("/")
            if len(parts) >= 2:
                text = _fetch_readme(parts[-2], parts[-1])
                # Only inspect a README that was actually fetched for THIS
                # repository; never fall through to an unset or stale text.
                if text is not None:
                    lowered = text.lower()
                    if "huggingface" in lowered or "hugging face" in lowered:
                        hf_mention = 1

        hf_mentions.append(hf_mention)

    # add the Hugging Face mention flags
    batch["hf_mention"] = hf_mentions

    return batch


def process_data(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Build the enriched papers dataframe for the given date range.

    The papers returned by `get_df` are converted to a Hugging Face dataset
    and passed through three batched enrichment steps (Github links, Hugging
    Face assets, README mention check) before being converted back to Pandas
    with `date` parsed as datetimes.
    """
    # step 1. load as HF dataset
    dataset = Dataset.from_pandas(get_df(start_date, end_date))

    # every enrichment pass uses the same batching and parallelism settings
    map_options = {"batched": True, "batch_size": 4, "num_proc": cpu_count()}

    # step 2. enrich using PapersWithCode API
    dataset = dataset.map(
        add_metadata_batch,
        fn_kwargs={"client": PapersWithCodeClient()},
        **map_options,
    )

    # step 3. enrich using Hugging Face API
    dataset = dataset.map(add_hf_assets, **map_options)

    # step 4. check if Hugging Face is mentioned in the README
    dataset = dataset.map(check_hf_mention, **map_options)

    # back to Pandas, with the date column as datetimes
    dataframe = dataset.to_pandas()
    dataframe['date'] = pd.to_datetime(dataframe['date'])

    print("First few rows of the dataset:")
    print(dataframe.head())

    return dataframe


@st.cache_data
def get_data() -> pd.DataFrame:
    """
    Return the enriched papers dataframe, extended up to today.

    The pre-processed dataset is loaded first; if its most recent day is
    before today, the missing days are processed with `process_data` and
    appended.

    Returns:
        A dataframe indexed by `date` (as datetimes), sorted for the
        pre-processed part, followed by any newly processed rows.
    """

    # step 1: load pre-processed data
    df = load_dataset("nielsr/daily-papers-enriched", split="train").to_pandas()
    df = df.set_index('date')
    # convert before sorting so the order is chronological, not textual
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()

    # step 2: check how much extra data we need to process
    # Dates are compared as timestamps: comparing '%d-%m-%Y' strings is
    # lexicographic (e.g. "31-01-2024" > "01-02-2024") and gives wrong answers.
    latest_day = df.index[-1].normalize()
    today = pd.Timestamp.today().normalize()

    # step 3: process the missing data
    if latest_day < today:
        # ISO format avoids day/month ambiguity when the strings are parsed
        start_date = latest_day.strftime('%Y-%m-%d')
        end_date = today.strftime('%Y-%m-%d')
        print(f"Processing data from {start_date} to {end_date}")
        new_df = process_data(start_date=start_date, end_date=end_date)
        # process_data returns `date` as a regular column; index by it so the
        # new rows line up with the pre-processed frame
        if 'date' in new_df.columns:
            new_df = new_df.set_index('date')
        # the latest day is already present in the pre-processed data
        new_df = new_df[new_df.index > latest_day]
        df = pd.concat([df, new_df])

    return df