Spaces:

rbiswasfc
/

zotero-refresh-pipeline

Sleeping

App Files Files Community

zotero-refresh-pipeline / main.py

rbiswasfc

refresh

9afacec 3 months ago

raw

history blame

17.5 kB

	import os
	import re
	import time

	import dotenv
	import fitz # PyMuPDF
	import pandas as pd
	import requests
	import schedule
	import srsly
	from bs4 import BeautifulSoup
	from datasets import Dataset, Image, load_dataset
	from huggingface_hub import create_repo, login, whoami
	from PIL import Image as PILImage
	from retry import retry
	from tqdm.auto import tqdm

	dotenv.load_dotenv()
	login(token=os.environ.get("HF_TOKEN"))

	hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
	HF_REPO_ID = f"{hf_user}/zotero-articles"


	########################################################
	### GET ZOTERO ITEMS
	########################################################


	@retry(tries=3, delay=8)
	def _fetch_one_zotero_batch(url, headers, params):
	"""
	Fetch articles from Zotero API
	"""
	response = requests.get(url, headers=headers, params=params)
	response.raise_for_status()
	return response.json()


	def get_zotero_items(debug=False):
	"""
	fetch items from zotero library
	"""

	GROUP_ID = os.getenv("GROUP_ID")
	API_KEY = os.getenv("API_KEY")
	BASE_URL = f"https://api.zotero.org/groups/{GROUP_ID}/items"
	LIMIT = 100

	headers = {"Zotero-API-Key": API_KEY, "Content-Type": "application/json"}

	items = []
	start = 0

	i = 1
	while True:
	i += 1
	params = {"limit": LIMIT, "start": start}
	page_items = _fetch_one_zotero_batch(BASE_URL, headers, params)

	if not page_items:
	break

	items.extend(page_items)
	start += LIMIT
	print(f"# items fetched {len(items)}")

	if debug:
	if len(items) > 500:
	break

	return items


	########################################################
	### EXTRACT ARXIV LINKS AND PDFs
	########################################################


	def get_arxiv_items(items):
	visited = set()

	arxiv_items = []
	arxiv_pattern = re.compile(r"arxiv.org/abs/(\d+\.\d+)")

	for item in items:
	data = item.get("data", {})
	attachments = item.get("links", {}).get("attachment", {})

	arxiv_url = None
	pdf_url = None

	if "url" in data and "arxiv.org" in data["url"]:
	arxiv_match = arxiv_pattern.search(data["url"])
	if arxiv_match:
	arxiv_url = data["url"]

	if attachments:
	pdf_url = attachments["href"]

	if arxiv_url:
	arxiv_id = arxiv_url.split("/")[-1]
	if arxiv_id in visited:
	continue

	arxiv_items.append(
	{
	"arxiv_id": arxiv_id,
	"arxiv_url": arxiv_url,
	"pdf_url": pdf_url,
	"added_by": item["meta"]["createdByUser"]["username"],
	"date_added": data.get("dateAdded", ""),
	}
	)

	visited.add(arxiv_id)

	return arxiv_items


	@retry(tries=3, delay=15, backoff=2)
	def fetch_arxiv_html(arxiv_id):
	url = f"https://ar5iv.labs.arxiv.org/html/{arxiv_id.split('v')[0]}"
	response = requests.get(url)
	return response.text if response.status_code == 200 else None


	def fetch_arxiv_htmls(arxiv_items):
	for item in tqdm(arxiv_items):
	html = fetch_arxiv_html(item["arxiv_id"])
	if html:
	item["raw_html"] = html
	else:
	print(f"failed to fetch html for {item['arxiv_id']}")
	item["raw_html"] = "Error"

	return arxiv_items


	########################################################
	### PARSE CONTENT FROM ARXIV HTML #
	########################################################


	def parse_html_content(html):
	"""
	Parse content from arxiv html
	"""
	arxiv_id_match = re.search(r"\[(\d+\.\d+(v\d+)?)\]", html)
	arxiv_id = arxiv_id_match.group(1) if arxiv_id_match else None
	soup = BeautifulSoup(html, "html.parser")
	result = []

	# Extract paper title
	try:
	paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(strip=True)
	except Exception:
	paper_title = soup.find("title").get_text(strip=True)
	paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)

	for math in soup.find_all("math"):
	math.decompose()
	for cite in soup.find_all("cite"):
	cite.decompose()

	# Extract abstract
	abstract = soup.find("div", class_="ltx_abstract")
	if abstract:
	result.append(
	{
	"content": " ".join(p.get_text(strip=True) for p in abstract.find_all("p")).replace(")", ") "),
	"title": "Abstract",
	"paper_title": paper_title,
	"content_type": "abstract",
	}
	)
	# Extract sections
	sections = soup.find_all("section", class_="ltx_section")
	for index, section in enumerate(sections):
	section_title = section.find("h2", class_="ltx_title ltx_title_section")
	section_title = section_title.get_text(strip=True) if section_title else f"Section {index + 1}"
	section_content = section.get_text(strip=True).replace(")", ") ")

	content_type = "body"
	if index == 0:
	content_type = "introduction"
	elif index == len(sections) - 1:
	content_type = "conclusion"

	result.append(
	{
	"content": section_content,
	"title": section_title,
	"paper_title": paper_title,
	"content_type": content_type,
	}
	)

	for c in result:
	c["arxiv_id"] = arxiv_id

	return result


	########################################################
	### GET TEXTS FROM PDF & PARSE
	########################################################


	def get_pdf_text(arxiv_id):
	url = "http://147.189.194.113:80/extract" # fix: currently down

	try:
	response = requests.get(url, params={"arxiv_id": arxiv_id})
	response = response.json()
	if "text" in response:
	return response["text"]
	return None
	except Exception as e:
	print(e)
	return None


	def get_content_type(section_type, section_count):
	"""Determine the content type based on the section type and count"""
	if section_type == "abstract":
	return "abstract"
	elif section_type == "introduction" or section_count == 1:
	return "introduction"
	elif section_type == "conclusion" or section_type == "references":
	return section_type
	else:
	return "body"


	def get_section_type(title):
	"""Determine the section type based on the title"""
	title_lower = title.lower()
	if "abstract" in title_lower:
	return "abstract"
	elif "introduction" in title_lower:
	return "introduction"
	elif "conclusion" in title_lower:
	return "conclusion"
	elif "reference" in title_lower:
	return "references"
	else:
	return "body"


	def parse_markdown_content(md_content, arxiv_id):
	"""
	Parses markdown content to identify and extract sections based on headers.
	"""

	lines = md_content.split("\n")
	parsed = []
	current_section = None
	content = []
	paper_title = None
	current_title = None

	# identify sections based on headers
	for line in lines:
	if line.startswith("#"):
	if paper_title is None:
	paper_title = line.lstrip("#").strip()
	continue
	if content:
	if current_title:
	parsed.append(
	{
	"content": " ".join(content),
	"title": current_title,
	"paper_title": paper_title,
	"content_type": get_content_type(current_section, len(parsed)),
	"arxiv_id": arxiv_id,
	}
	)
	content = []

	current_title = line.lstrip("#").lstrip("#").lstrip()
	if "bit" not in current_title:
	current_title = (
	current_title.lstrip("123456789")
	.lstrip()
	.lstrip(".")
	.lstrip()
	.lstrip("123456789")
	.lstrip()
	.lstrip(".")
	.lstrip()
	)
	current_section = get_section_type(current_title)

	else:
	content.append(line)

	# Add the last section
	if content and current_title:
	parsed.append(
	{
	"content": " ".join(content).replace(")", ") "),
	"title": current_title,
	"paper_title": paper_title,
	"content_type": get_content_type(current_section, len(parsed)),
	"arxiv_id": arxiv_id,
	}
	)

	return parsed


	########################################################
	### Image Dataset
	########################################################


	def download_arxiv_pdf(arxiv_id):
	arxiv_id = arxiv_id.split("v")[0]
	url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
	response = requests.get(url)
	if response.status_code == 200:
	return response.content
	else:
	raise Exception(f"Failed to download PDF. Status code: {response.status_code}")


	def pdf_to_jpegs(pdf_content, output_folder):
	# Create output folder if it doesn't exist
	os.makedirs(output_folder, exist_ok=True)

	# Open the PDF
	doc = fitz.open(stream=pdf_content, filetype="pdf")

	# Iterate through pages
	for page_num in range(len(doc)):
	page = doc.load_page(page_num)

	# Convert page to image
	pix = page.get_pixmap()

	# Save image as JPEG
	image_path = os.path.join(output_folder, f"page_{page_num + 1}.jpg")
	pix.save(image_path)
	# print(f"Saved {image_path}")

	doc.close()


	def save_arxiv_article_images(arxiv_id):
	output_folder = os.path.join("data", "arxiv_images", arxiv_id)
	try:
	pdf_content = download_arxiv_pdf(arxiv_id)
	pdf_to_jpegs(pdf_content, output_folder)
	except Exception as e:
	print(f"An error occurred: {str(e)}")


	def create_hf_image_dataset(base_dir):
	data = []

	# Walk through the directory
	for root, dirs, files in os.walk(base_dir):
	for file in files:
	if file.endswith(".jpg"):
	# Extract arxiv_id from the path
	arxiv_id = os.path.basename(root)

	# Extract page number from the filename
	match = re.search(r"page_(\d+)", file)
	if match:
	page_number = int(match.group(1))
	else:
	continue # Skip if page number can't be extracted

	# Full path to the image
	image_path = os.path.join(root, file)

	# Open the image to get its size
	with PILImage.open(image_path) as img:
	width, height = img.size

	# Add the data
	data.append(
	{"image": image_path, "arxiv_id": arxiv_id, "page_number": page_number, "width": width, "height": height}
	)

	# Create the dataset
	dataset = Dataset.from_dict(
	{
	"image": [d["image"] for d in data],
	"arxiv_id": [d["arxiv_id"] for d in data],
	"page_number": [d["page_number"] for d in data],
	"width": [d["width"] for d in data],
	"height": [d["height"] for d in data],
	}
	)

	# Cast the image column to Image
	dataset = dataset.cast_column("image", Image())

	return dataset


	########################################################
	### HF UPLOAD
	########################################################


	def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
	repo_id = HF_REPO_ID
	create_repo(
	repo_id=repo_id,
	token=os.environ.get("HF_TOKEN"),
	private=True,
	repo_type="dataset",
	exist_ok=True,
	)

	# upload image dataset
	try:
	img_ds = create_hf_image_dataset("data/arxiv_images")
	img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))

	# push id_to_abstract
	abstract_ds = Dataset.from_pandas(abstract_df)
	abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))

	# push arxiv_items
	arxiv_ds = Dataset.from_pandas(contents_df)
	arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))

	# push processed_arxiv_ids
	processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
	processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
	processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
	except Exception as e:
	print(e)


	########################################################
	### MAIN
	########################################################


	def main():
	items = get_zotero_items(debug=True)
	print(f"# of items fetched from zotero: {len(items)}")
	arxiv_items = get_arxiv_items(items)
	print(f"# of arxiv papers: {len(arxiv_items)}")

	# get already processed arxiv ids from HF
	try:
	existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"]["arxiv_id"]
	except Exception as e:
	print(e)
	try:
	existing_arxiv_ids = srsly.read_json("data/processed_arxiv_ids.json")
	except Exception as e:
	print(e)
	existing_arxiv_ids = []
	existing_arxiv_ids = set(existing_arxiv_ids)
	print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")

	# new arxiv items
	arxiv_items = [item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids]
	arxiv_items = fetch_arxiv_htmls(arxiv_items)
	print(f"# of new arxiv items: {len(arxiv_items)}")

	processed_arxiv_ids = set()
	for item in arxiv_items:
	# download images --
	save_arxiv_article_images(item["arxiv_id"])

	# parse html
	try:
	item["contents"] = parse_html_content(item["raw_html"])
	processed_arxiv_ids.add(item["arxiv_id"])
	except Exception as e:
	print(f"Failed to parse html for {item['arxiv_id']}: {e}")
	item["contents"] = []

	if len(item["contents"]) == 0:
	print("Extracting from pdf...")
	md_content = get_pdf_text(item["arxiv_id"]) # fix this
	if md_content:
	item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
	processed_arxiv_ids.add(item["arxiv_id"])
	else:
	item["contents"] = []

	# save contents ---
	processed_arxiv_ids = list(processed_arxiv_ids)
	print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")

	# save abstracts ---
	id_to_abstract = {}
	for item in arxiv_items:
	for entry in item["contents"]:
	if entry["content_type"] == "abstract":
	id_to_abstract[item["arxiv_id"]] = entry["content"]
	break
	print(f"# of abstracts: {len(id_to_abstract)}")
	abstract_df = pd.Series(id_to_abstract).reset_index().rename(columns={"index": "arxiv_id", 0: "abstract"})
	print(abstract_df.head())

	# add to existing dataset
	try:
	old_abstract_df = load_dataset(HF_REPO_ID, "abstracts")["train"].to_pandas()
	except Exception as e:
	print(e)
	old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
	print(old_abstract_df.head())

	abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
	abstract_df = abstract_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)

	# contents
	contents_df = pd.DataFrame(arxiv_items)
	print(contents_df.head())
	try:
	old_contents_df = load_dataset(HF_REPO_ID, "articles")["train"].to_pandas()
	except Exception as e:
	print(e)
	old_contents_df = pd.DataFrame(columns=contents_df.columns)
	if len(old_contents_df) > 0:
	print(old_contents_df.sample().T)

	contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
	contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)

	# upload to hf
	processed_arxiv_ids = list(set(processed_arxiv_ids + list(processed_arxiv_ids)))
	upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)

	# save as local copy
	os.makedirs("data", exist_ok=True)
	abstract_df.to_parquet("data/abstracts.parquet")
	contents_df.to_parquet("data/contents.parquet")
	srsly.write_json("data/processed_arxiv_ids.json", processed_arxiv_ids)


	def schedule_periodic_task():
	"""
	Schedule the main task to run at the user-defined frequency
	"""
	main() # run once initially

	frequency = "daily" # TODO: env
	if frequency == "hourly":
	print("Scheduling tasks to run every hour at the top of the hour")
	schedule.every().hour.at(":00").do(main)
	elif frequency == "daily":
	start_time = "10:00"
	print("Scheduling tasks to run every day at: {start_time} UTC+00")
	schedule.every().day.at(start_time).do(main)

	while True:
	schedule.run_pending()
	time.sleep(1)


	if __name__ == "__main__":
	schedule_periodic_task()