"""Gradio Space: recommend similar papers (via Semantic Scholar) for a
Hugging Face Papers URL, optionally posting the recommendations as a
comment on the paper page as "librarian-bot"."""

import json
import os
from functools import lru_cache
from pathlib import Path

import gradio as gr
import requests
from bs4 import BeautifulSoup
from cachetools import TTLCache, cached
from dotenv import load_dotenv
from httpx import Client
from huggingface_hub import CommitScheduler

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")

CACHE_TIME = 60 * 60 * 6  # 6 hours

# fix: every outbound HTTP call previously had no timeout, so a stalled API
# could hang the app thread forever.
REQUEST_TIMEOUT = 30  # seconds

client = Client()

REPO_ID = "librarian-bots/paper-recommendations-v2"

# Periodically (every 5 minutes) commits files written under ./comments
# to the dataset repo, under data/ in the repo.
scheduler = CommitScheduler(
    repo_id=REPO_ID,
    repo_type="dataset",
    folder_path="comments",
    path_in_repo="data",
    every=5,
    token=HF_TOKEN,
)


def parse_arxiv_id_from_paper_url(url: str) -> str:
    """Return the arXiv id, i.e. the last path segment of a paper URL."""
    return url.split("/")[-1]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(semantic_scholar_id: str):
    """Fetch up to 10 recommended papers for ``semantic_scholar_id``.

    Args:
        semantic_scholar_id: Semantic Scholar paper id, e.g. ``"ArXiv:2309.12307"``.

    Returns:
        list[dict]: recommended papers with ``externalIds``, ``title`` and ``year``.

    Raises:
        gr.Error: if the response has no ``recommendedPapers`` key (typically a
            new paper that Semantic Scholar has not indexed yet).
    """
    try:
        r = requests.post(
            "https://api.semanticscholar.org/recommendations/v1/papers/",
            json={
                "positivePaperIds": [semantic_scholar_id],
            },
            params={"fields": "externalIds,title,year", "limit": 10},
            timeout=REQUEST_TIMEOUT,
        )
        return r.json()["recommendedPapers"]
    except KeyError as e:
        raise gr.Error(
            "Error getting recommendations, if this is a new paper it may not yet have"
            " been indexed by Semantic Scholar."
        ) from e


def filter_recommendations(recommendations, max_paper_count: int = 5):
    """Keep only recommendations that have an arXiv id, capped at ``max_paper_count``."""
    arxiv_papers = [
        r for r in recommendations if r["externalIds"].get("ArXiv", None) is not None
    ]
    # Slicing handles both the under- and over-limit cases.
    return arxiv_papers[:max_paper_count]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_paper_title_from_arxiv_id(arxiv_id: str) -> str:
    """Look up a paper's title from the Hugging Face papers API.

    Raises:
        gr.Error: if the request fails or the response has no ``title``.
    """
    try:
        return requests.get(
            f"https://huggingface.co/api/papers/{arxiv_id}", timeout=REQUEST_TIMEOUT
        ).json()["title"]
    except Exception as e:
        print(f"Error getting paper title for {arxiv_id}: {e}")
        # fix: original string lacked the f prefix, so the user saw literal
        # "{arxiv_id}: {e}" placeholders instead of the values.
        raise gr.Error(f"Error getting paper title for {arxiv_id}: {e}") from e


def format_recommendation_into_markdown(arxiv_id, recommendations) -> str:
    """Render recommendations as a Markdown bullet list of HF paper links."""
    # title = get_paper_title_from_arxiv_id(arxiv_id)
    # url = f"https://huggingface.co/papers/{arxiv_id}"
    # comment = f"Recommended papers for [{title}]({url})\n\n"
    comment = "The following papers were recommended by the Semantic Scholar API \n\n"
    for r in recommendations:
        hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
        comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
    return comment


def format_comment(result: str) -> str:
    """Wrap the recommendation Markdown in the librarian-bot boilerplate."""
    result = (
        "This is an automated message from the [Librarian Bot](https://huggingface.co/librarian-bots). I found the following papers similar to this paper. \n\n"
        + result
    )
    result += "\n\n Please give a thumbs up to this comment if you found it helpful!"
    result += "\n\n If you want recommendations for any Paper on Hugging Face checkout [this](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) Space"
    result += "\n\n You can directly ask Librarian Bot for paper recommendations by tagging it in a comment: `@librarian-bot recommend`"
    return result


def post_comment(
    paper_url: str, comment: str, token: str | None = None, base_url: str | None = None
) -> bool:
    """Post ``comment`` on the paper page for ``paper_url``.

    Args:
        paper_url: URL of the paper page (the id is its last path segment).
        token: Hugging Face API token used as a Bearer credential.
        base_url: override for the hub origin; defaults to ``https://huggingface.co``.

    Returns:
        True if the hub responded 201 Created, False otherwise.
    """
    if not base_url:
        base_url = "https://huggingface.co"
    paper_id = paper_url.split("/")[-1]

    url = f"{base_url}/api/papers/{paper_id}/comment"
    comment_data = {"comment": comment}
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    response = requests.post(url, json=comment_data, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code == 201:
        print(f"Comment posted successfully for {paper_url}!")
        return True
    print(f"Failed to post comment! (Status Code: {response.status_code})")
    print(response.text)
    return False


@lru_cache(maxsize=500)
def is_comment_from_librarian_bot(html: str) -> bool:
    """
    Checks if the given HTML contains a comment from the librarian-bot.

    Args:
        html (str): The HTML content to check.

    Returns:
        bool: True if a comment from the librarian-bot is found, False otherwise.
    """
    soup = BeautifulSoup(html, "lxml")
    librarian_bot_links = soup.find_all("a", string="librarian-bot")
    return any(librarian_bot_links)


def check_if_lib_bot_comment_exists(paper_url: str) -> bool:
    """
    Checks if a comment from the librarian bot exists for a given paper URL.

    Args:
        paper_url (str): The URL of the paper.

    Returns:
        bool: True if a comment from the librarian bot exists, False otherwise.
        On any fetch error this returns True so the bot errs on the side of
        NOT posting a (possibly duplicate) comment.
    """
    try:
        resp = client.get(paper_url)
        return is_comment_from_librarian_bot(resp.text)
    except Exception as e:
        print(f"Error checking if comment exists for {paper_url}: {e}")
        return True  # default to not posting comment


def log_comments(paper_url: str, comment: str) -> None:
    """
    Logs comments for a given paper URL.

    Writes ``comments/<paper_id>.json`` (picked up by the CommitScheduler)
    unless a log for that paper already exists.

    Args:
        paper_url (str): The URL of the paper.
        comment (str): The comment to be logged.

    Returns:
        None
    """
    paper_id = paper_url.split("/")[-1]
    file_path = Path(f"comments/{paper_id}.json")
    if not file_path.exists():
        # fix: ensure the comments/ directory exists before writing to it.
        file_path.parent.mkdir(parents=True, exist_ok=True)
        # Hold the scheduler lock so a background commit never sees a
        # half-written file.
        with scheduler.lock:
            with open(file_path, "w") as f:
                data = {"paper_url": paper_url, "comment": comment}
                json.dump(data, f)


def return_recommendations(url: str, post_to_paper: bool = True) -> str:
    """Top-level handler: fetch, filter, optionally post, and return Markdown.

    Args:
        url: Hugging Face paper URL pasted by the user.
        post_to_paper: if True, also post the recommendations as a comment on
            the paper page (skipped when a librarian-bot comment already exists).

    Returns:
        Markdown string listing the recommended papers.
    """
    arxiv_id = parse_arxiv_id_from_paper_url(url)
    recommendations = get_recommendations_from_semantic_scholar(f"ArXiv:{arxiv_id}")
    filtered_recommendations = filter_recommendations(recommendations)
    if post_to_paper:
        if comment_already_exists := check_if_lib_bot_comment_exists(url):
            gr.Info(
                f"Existing comment: {comment_already_exists}...skipping posting comment"
            )
        else:
            comment = format_comment(
                format_recommendation_into_markdown(arxiv_id, filtered_recommendations)
            )
            if comment_status := post_comment(url, comment, token=HF_TOKEN):
                log_comments(url, comment)
                gr.Info(f"Comment status: {comment_status}")
            else:
                gr.Info("Failed to post comment")
    return format_recommendation_into_markdown(arxiv_id, filtered_recommendations)


title = "Semantic Scholar Paper Recommender"
description = (
    "Paste a link to a paper on Hugging Face Papers and get recommendations for similar"
    " papers from Semantic Scholar. **Note**: Some papers may not have recommendations"
    " yet if they are new or have not been indexed by Semantic Scholar."
)
examples = [
    ["https://huggingface.co/papers/2309.12307", False],
    ["https://huggingface.co/papers/2211.10086", False],
]

interface = gr.Interface(
    return_recommendations,
    [
        gr.Textbox(lines=1),
        # NOTE(review): `default=` is the legacy Gradio keyword; newer Gradio
        # versions use `value=` — confirm against the pinned gradio version.
        gr.Checkbox(label="Post recommendations to Paper page?", default=False),
    ],
    gr.Markdown(),
    examples=examples,
    title=title,
    description=description,
)
interface.queue()
interface.launch()