Spaces:

librarian-bots
/

recommend_similar_papers

Running

File size: 7,281 Bytes

import gradio as gr
import requests
from cachetools import cached, TTLCache
from bs4 import BeautifulSoup
from httpx import Client
import json
from pathlib import Path
from huggingface_hub import CommitScheduler
from dotenv import load_dotenv
import os
from functools import lru_cache

# Module-level configuration shared by the functions below.
load_dotenv()

# Token passed to the CommitScheduler and used when posting comments;
# None when the HF_TOKEN environment variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# TTL applied to the TTLCache-backed lookups in this module.
CACHE_TIME = 60 * 60 * 6  # 6 hours

# Shared HTTP client used to fetch paper pages when checking for existing
# bot comments.
client = Client()

# Dataset repository that receives the logged comments.
REPO_ID = "librarian-bots/paper-recommendations-v2"

# Syncs the local "comments" folder to "data" in the dataset repo.
# NOTE(review): `every=5` is presumably minutes per the huggingface_hub
# CommitScheduler documentation -- confirm. `scheduler.lock` is held while
# writing comment files in log_comments.
scheduler = CommitScheduler(
    repo_id=REPO_ID,
    repo_type="dataset",
    folder_path="comments",
    path_in_repo="data",
    every=5,
    token=HF_TOKEN,
)


def parse_arxiv_id_from_paper_url(url):
    """
    Returns the paper ID, i.e. the last path segment of a paper URL.

    Surrounding whitespace, a query string or fragment, and trailing slashes
    are ignored, so "https://huggingface.co/papers/2309.12307/" yields
    "2309.12307" rather than an empty string.

    Args:
        url (str): The paper URL (a bare ID is returned unchanged).

    Returns:
        str: The final non-empty path segment, or "" if there is none.
    """
    # Drop "#fragment" and "?query" first so they cannot end up in the ID.
    path = url.strip().split("#", 1)[0].split("?", 1)[0]
    return path.rstrip("/").split("/")[-1]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(semantic_scholar_id: str):
    """
    Fetches paper recommendations for one paper from Semantic Scholar.

    Successful results are cached per ID (up to 500 entries, each for
    CACHE_TIME); calls that raise are not cached and are retried next time.

    Args:
        semantic_scholar_id (str): The paper identifier sent as the single
            positive example, e.g. "ArXiv:2309.12307".

    Returns:
        The value of "recommendedPapers" in the API response; each entry is
        requested with the fields externalIds, title and year (limit 10).

    Raises:
        gr.Error: If the request fails or times out, the response is not
            valid JSON, or it has no "recommendedPapers" key.
    """
    try:
        r = requests.post(
            "https://api.semanticscholar.org/recommendations/v1/papers/",
            json={
                "positivePaperIds": [semantic_scholar_id],
            },
            params={"fields": "externalIds,title,year", "limit": 10},
            # requests never times out by default; bound the wait so a stalled
            # connection cannot hang the request handler forever.
            timeout=30,
        )
        return r.json()["recommendedPapers"]
    except (requests.RequestException, KeyError, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions where
        # the decode error is not a RequestException subclass.
        raise gr.Error(
            "Error getting recommendations, if this is a new paper it may not yet have"
            " been indexed by Semantic Scholar."
        ) from e


def filter_recommendations(recommendations, max_paper_count=5):
    """
    Keeps only recommendations that carry an arXiv ID, capped in number.

    Args:
        recommendations: Iterable of dicts, each with an "externalIds" mapping.
        max_paper_count (int): Maximum number of papers to return.

    Returns:
        list: The first max_paper_count entries whose externalIds["ArXiv"]
            is present and not None, in their original order.
    """
    with_arxiv_id = []
    for paper in recommendations:
        if paper["externalIds"].get("ArXiv") is None:
            continue
        with_arxiv_id.append(paper)
    if len(with_arxiv_id) <= max_paper_count:
        return with_arxiv_id
    return with_arxiv_id[:max_paper_count]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_paper_title_from_arxiv_id(arxiv_id):
    """
    Looks up a paper's title through the Hugging Face papers API.

    Successful lookups are cached per ID (up to 500 entries, each for
    CACHE_TIME).

    Args:
        arxiv_id: The paper ID appended to the /api/papers/ endpoint.

    Returns:
        The "title" field of the JSON response.

    Raises:
        gr.Error: If the request fails or the response has no "title".
    """
    try:
        response = requests.get(
            f"https://huggingface.co/api/papers/{arxiv_id}",
            # requests never times out by default; bound the wait.
            timeout=30,
        )
        return response.json()["title"]
    except Exception as e:
        print(f"Error getting paper title for {arxiv_id}: {e}")
        # The f-prefix matters: without it users see the literal placeholders.
        raise gr.Error(f"Error getting paper title for {arxiv_id}: {e}") from e


def format_recommendation_into_markdown(arxiv_id, recommendations):
    """
    Renders recommendations as a Markdown bullet list of Hub paper links.

    Args:
        arxiv_id: Not used in the output; accepted because callers pass it.
        recommendations: Iterable of dicts with "title", "year" and
            externalIds["ArXiv"] entries.

    Returns:
        str: A header line followed by one "* [title](url) (year)" bullet
            per recommendation.
    """
    header = "The following papers were recommended by the Semantic Scholar API \n\n"
    bullets = []
    for paper in recommendations:
        link = f"https://huggingface.co/papers/{paper['externalIds']['ArXiv']}"
        bullets.append(f"* [{paper['title']}]({link}) ({paper['year']})\n")
    return header + "".join(bullets)


def format_comment(result: str):
    """
    Wraps recommendation Markdown in the bot's introduction and closing notes.

    Args:
        result (str): The Markdown list of recommended papers.

    Returns:
        str: The introduction, then result, then the three closing notes.
    """
    intro = "This is an automated message from the [Librarian Bot](https://huggingface.co/librarian-bots). I found the following papers similar to this paper. \n\n"
    closing_notes = (
        "\n\n Please give a thumbs up to this comment if you found it helpful!",
        "\n\n If you want recommendations for any Paper on Hugging Face checkout [this](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) Space",
        "\n\n You can directly ask Librarian Bot for paper recommendations by tagging it in a comment: `@librarian-bot recommend`",
    )
    return intro + result + "".join(closing_notes)


def post_comment(
    paper_url: str, comment: str, token: str | None = None, base_url: str | None = None
) -> bool:
    """
    Posts a comment to a paper through the papers comment API.

    Args:
        paper_url (str): The URL of the paper; its last path segment is used
            as the paper ID.
        comment (str): The comment text to post.
        token (str | None): Bearer token for the request. When empty, no
            Authorization header is sent.
        base_url (str | None): API host; defaults to "https://huggingface.co".

    Returns:
        bool: True if the server answered 201, False on any other status or
            when the request itself failed.
    """
    if not base_url:
        base_url = "https://huggingface.co"
    # Tolerate a trailing slash so the ID is not parsed as an empty string.
    paper_id = paper_url.rstrip("/").split("/")[-1]
    url = f"{base_url}/api/papers/{paper_id}/comment"
    comment_data = {"comment": comment}
    headers = {"Content-Type": "application/json"}
    if token:
        # Avoid sending the literal credential "Bearer None".
        headers["Authorization"] = f"Bearer {token}"
    try:
        # requests never times out by default; bound the wait.
        response = requests.post(url, json=comment_data, headers=headers, timeout=30)
    except requests.RequestException as e:
        # Report network failures through the bool contract instead of raising.
        print(f"Failed to post comment! ({e})")
        return False
    if response.status_code == 201:
        print(f"Comment posted successfully for {paper_url}!")
        return True
    print(f"Failed to post comment! (Status Code: {response.status_code})")
    print(response.text)
    return False


@lru_cache(maxsize=500)
def is_comment_from_librarian_bot(html: str) -> bool:
    """
    Reports whether a page contains a link whose text is "librarian-bot".

    Results are memoized per HTML string (up to 500 entries).

    Args:
        html (str): The HTML of the page to inspect.

    Returns:
        bool: True if at least one matching <a> element is found, False
            otherwise.
    """
    parsed_page = BeautifulSoup(html, "lxml")
    bot_links = parsed_page.find_all("a", string="librarian-bot")
    return len(bot_links) > 0


def check_if_lib_bot_comment_exists(paper_url: str) -> bool:
    """
    Fetches a paper page and reports whether the librarian bot has commented.

    Args:
        paper_url (str): The URL of the paper page to fetch.

    Returns:
        bool: True if a librarian-bot comment is detected, and also True when
            the check itself fails, so callers err on the side of not posting.
    """
    try:
        page_html = client.get(paper_url).text
        found = is_comment_from_librarian_bot(page_html)
    except Exception as e:
        print(f"Error checking if comment exists for {paper_url}: {e}")
        found = True  # treat a failed check as "already commented"
    return found


def log_comments(paper_url: str, comment: str):
    """
    Writes a posted comment to comments/<paper_id>.json, once per paper.

    An existing file for the same paper is left untouched.

    Args:
        paper_url (str): The URL of the paper; its last path segment names
            the file.
        comment (str): The comment to be logged.

    Returns:
        None
    """
    # Tolerate a trailing slash so the file is not named ".json".
    paper_id = paper_url.rstrip("/").split("/")[-1]
    file_path = Path("comments") / f"{paper_id}.json"
    data = {"paper_url": paper_url, "comment": comment}
    with scheduler.lock:
        file_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            # Mode "x" creates the file only if it is absent, so the
            # existence check and the write are one step under the lock.
            with open(file_path, "x", encoding="utf-8") as f:
                json.dump(data, f)
        except FileExistsError:
            # Already logged for this paper; keep the first record.
            pass


def return_recommendations(url: str, post_to_paper: bool = True) -> str:
    """
    Builds the recommendation Markdown for a paper and optionally posts it.

    Args:
        url (str): The URL of the paper on Hugging Face Papers.
        post_to_paper (bool): When True, also post the recommendations as a
            comment on the paper page, unless the bot-comment check reports
            an existing comment or there is nothing to recommend.

    Returns:
        str: Markdown listing the filtered recommendations.

    Raises:
        gr.Error: Propagated from the Semantic Scholar lookup.
    """
    arxiv_id = parse_arxiv_id_from_paper_url(url)
    recommendations = get_recommendations_from_semantic_scholar(f"ArXiv:{arxiv_id}")
    filtered_recommendations = filter_recommendations(recommendations)
    # Render once; the same Markdown is returned to the UI and embedded in
    # the posted comment.
    markdown = format_recommendation_into_markdown(arxiv_id, filtered_recommendations)
    if not post_to_paper:
        return markdown
    if not filtered_recommendations:
        # Do not post a comment that contains no papers.
        gr.Info("No recommendations to post...skipping posting comment")
        return markdown
    if comment_already_exists := check_if_lib_bot_comment_exists(url):
        gr.Info(
            f"Existing comment: {comment_already_exists}...skipping posting comment"
        )
        return markdown
    comment = format_comment(markdown)
    if comment_status := post_comment(url, comment, token=HF_TOKEN):
        log_comments(url, comment)
        gr.Info(f"Comment status: {comment_status}")
    else:
        gr.Info("Failed to post comment")
    return markdown


# Text shown at the top of the Gradio page.
title = "Semantic Scholar Paper Recommender"
description = (
    "Paste a link to a paper on Hugging Face Papers and get recommendations for similar"
    " papers from Semantic Scholar. **Note**: Some papers may not have recommendations"
    " yet if they are new or have not been indexed by Semantic Scholar."
)
# Each example is [paper URL, post_to_paper]; examples never post comments.
examples = [
    ["https://huggingface.co/papers/2309.12307", False],
    ["https://huggingface.co/papers/2211.10086", False],
]
# Inputs map positionally onto return_recommendations(url, post_to_paper).
interface = gr.Interface(
    return_recommendations,
    [
        gr.Textbox(lines=1),
        # The initial state is set with `value`; `default` is not a
        # documented Checkbox parameter.
        gr.Checkbox(label="Post recommendations to Paper page?", value=False),
    ],
    gr.Markdown(),
    examples=examples,
    title=title,
    description=description,
)
interface.queue()
interface.launch()