hf_daily_papers / get_hf_daily_papers.py
from datetime import datetime, timedelta
import json
import time
from typing import List, Dict

from bs4 import BeautifulSoup
import requests


def get_hf_daily_papers_info(date: str) -> List[Dict]:
    """Scrape the Hugging Face daily papers page for a given date (YYYY-MM-DD)."""
    url = 'https://huggingface.co/papers?date=' + date
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        papers = []
        articles = soup.find_all('article', class_='relative flex flex-col overflow-hidden rounded-xl border')
        for article in articles:
            title_tag = article.find('h3')
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            authors = [author_tag['title'] for author_tag in article.find_all('li', title=True)]
            submitter_tag = article.find('div', class_='pointer-events-none absolute right-2 top-56 -mt-8 flex h-6 items-center gap-1 self-end whitespace-nowrap rounded-md border bg-white px-2 text-xs leading-none text-gray-700 shadow-sm dark:bg-gray-900 dark:text-gray-400 sm:text-sm md:top-64')
            submitter = submitter_tag.get_text(strip=True).replace('Submitted by', '').strip()
            paper_info = {
                'title': title,
                'link': "https://huggingface.co" + link,
                'authors': authors,
                'submitter': submitter,
                'date': date
            }
            papers.append(paper_info)
        return papers
    else:
        print(f"No papers, date: {date}")
        return []


def get_abstract(url: str) -> str:
    """Fetch a paper page and return its abstract as a single line of text."""
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url}")
    soup = BeautifulSoup(response.text, 'html.parser')
    abstract_section = soup.find('div', class_='pb-8 pr-4 md:pr-16')
    if not abstract_section:
        raise Exception("Abstract section not found")
    abstract_header = abstract_section.find('h2', string='Abstract')
    if not abstract_header:
        raise Exception("Abstract header not found")
    abstract_paragraph = abstract_header.find_next('p')
    if not abstract_paragraph:
        raise Exception("Abstract paragraph not found")
    return abstract_paragraph.get_text(strip=True).replace('\n', ' ')


def fetch_and_save_papers(start_date: str, end_date: str, output_file: str):
    """Collect daily papers (with abstracts) for every date from start_date to end_date and save them to a JSON file."""
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    date_list = [(start_date + timedelta(days=x)).strftime("%Y-%m-%d") for x in range((end_date - start_date).days + 1)]

    all_papers = []
    for date in date_list:
        papers = get_hf_daily_papers_info(date)
        for paper in papers:
            try:
                paper['abstract'] = get_abstract(paper['link'])
            except Exception as e:
                print(f"Failed to get abstract for {paper['link']}: {e}")
            all_papers.append(paper)
            time.sleep(1.0)  # sleep for 1 second between requests to avoid overloading the website

    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(all_papers, json_file, ensure_ascii=False, indent=4)

    print(f"Data saved to {output_file}")


if __name__ == '__main__':
    # Example usage
    args = {
        'start_date': '2023-05-04',
        'end_date': '2024-06-27',
        'output_file': 'hf_daily_papers_2023-05-04_2024-06-27.json'
    }
    fetch_and_save_papers(**args)