hf_daily_papers / get_hf_daily_papers.py
from datetime import datetime, timedelta
import json
import time
from typing import List, Dict

from bs4 import BeautifulSoup
import requests


def get_hf_daily_papers_info(date: str) -> List[Dict]:
    """Scrape the Hugging Face daily papers page for a given date (YYYY-MM-DD)."""
    url = 'https://huggingface.co/papers?date=' + date
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        papers = []
        articles = soup.find_all('article', class_='relative flex flex-col overflow-hidden rounded-xl border')
        for article in articles:
            title_tag = article.find('h3')
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            authors = [author_tag['title'] for author_tag in article.find_all('li', title=True)]
            submitter_tag = article.find('div', class_='pointer-events-none absolute right-2 top-56 -mt-8 flex h-6 items-center gap-1 self-end whitespace-nowrap rounded-md border bg-white px-2 text-xs leading-none text-gray-700 shadow-sm dark:bg-gray-900 dark:text-gray-400 sm:text-sm md:top-64')
            submitter = submitter_tag.get_text(strip=True).replace('Submitted by', '').strip()
            paper_info = {
                'title': title,
                'link': "https://huggingface.co" + link,
                'authors': authors,
                'submitter': submitter,
                'date': date
            }
            papers.append(paper_info)
        return papers
    else:
        print(f"No papers, date: {date}")
        return []


def get_abstract(url: str) -> str:
    """Fetch a paper page and return its abstract as a single line of text."""
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url}")
    soup = BeautifulSoup(response.text, 'html.parser')
    abstract_section = soup.find('div', class_='pb-8 pr-4 md:pr-16')
    if not abstract_section:
        raise Exception("Abstract section not found")
    abstract_header = abstract_section.find('h2', string='Abstract')
    if not abstract_header:
        raise Exception("Abstract header not found")
    abstract_paragraph = abstract_header.find_next('p')
    if not abstract_paragraph:
        raise Exception("Abstract paragraph not found")
    return abstract_paragraph.get_text(strip=True).replace('\n', ' ')


def fetch_and_save_papers(start_date: str, end_date: str, output_file: str):
    """Collect daily papers (with abstracts) for every date from start_date to end_date and save them to a JSON file."""
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    date_list = [(start_date + timedelta(days=x)).strftime("%Y-%m-%d") for x in range((end_date - start_date).days + 1)]

    all_papers = []
    for date in date_list:
        papers = get_hf_daily_papers_info(date)
        for paper in papers:
            try:
                paper['abstract'] = get_abstract(paper['link'])
            except Exception as e:
                print(f"Failed to get abstract for {paper['link']}: {e}")
            all_papers.append(paper)
            time.sleep(1.0)  # sleep for 1 second between requests to avoid overloading the website

    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(all_papers, json_file, ensure_ascii=False, indent=4)

    print(f"Data saved to {output_file}")


if __name__ == '__main__':
    # Example usage
    args = {
        'start_date': '2023-05-04',
        'end_date': '2024-06-27',
        'output_file': 'hf_daily_papers_2023-05-04_2024-06-27.json'
    }
    fetch_and_save_papers(**args)