rbiswasfc committed
Commit f0c7f30
1 Parent(s): b5fea25
Files changed (7)
  1. .gitignore +5 -0
  2. Dockerfile +31 -0
  3. app.py +156 -0
  4. main.py +592 -0
  5. requirements.txt +15 -0
  6. ruff.toml +3 -0
  7. supervisord.conf +20 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .env
+ *.json
+ data
+ .ipynb_checkpoints
+ __pycache__
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ FROM python:3.10
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory
+ WORKDIR $HOME/app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ RUN git config --global credential.helper store
+
+ COPY . .
+ COPY supervisord.conf .
+
+ # Set permissions on the log files
+ USER root
+ RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
+
+ RUN mkdir -p /tmp/cache/
+ RUN mkdir -p /.cache
+ RUN chmod a+rwx -R /tmp/cache/
+ RUN chmod a+rwx -R /.cache
+ ENV HF_HUB_CACHE=/tmp/cache/
+
+ ENV PYTHONUNBUFFERED=1 PORT=7860
+
+ # Run supervisord
+ CMD ["supervisord", "-c", "supervisord.conf"]
app.py ADDED
@@ -0,0 +1,156 @@
+ import base64
+ import os
+ from collections import defaultdict
+ from datetime import date, datetime, timedelta
+ from io import BytesIO
+
+ import dotenv
+ from datasets import load_dataset
+ from dateutil.parser import parse
+ from dateutil.tz import tzutc
+ from fasthtml.common import *
+ from huggingface_hub import login, whoami
+
+ dotenv.load_dotenv()
+
+ style = Style("""
+     .grid { margin-bottom: 1rem; }
+     .card { display: flex; flex-direction: column; }
+     .card img { margin-bottom: 0.5rem; }
+     .card h5 { margin: 0; font-size: 0.9rem; line-height: 1.2; }
+     .card a { color: inherit; text-decoration: none; }
+     .card a:hover { text-decoration: underline; }
+ """)
+
+ app, rt = fast_app(html_style=(style,))
+
+ login(token=os.environ.get("HF_TOKEN"))
+
+ hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
+ HF_REPO_ID = f"{hf_user}/zotero-articles"
+
+ abstract_ds = load_dataset(HF_REPO_ID, "abstracts", split="train")
+ article_ds = load_dataset(HF_REPO_ID, "articles", split="train")
+
+ image_ds = load_dataset(HF_REPO_ID, "images", split="train")
+ image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
+
+
+ def parse_date(date_string):
+     try:
+         return parse(date_string).astimezone(tzutc()).date()
+     except ValueError:
+         return date.today()
+
+
+ def get_week_start(date_obj):
+     return date_obj - timedelta(days=date_obj.weekday())
+
+
+ week2articles = defaultdict(list)
+ for article in article_ds:
+     date_added = parse_date(article["date_added"])
+     week_start = get_week_start(date_added)
+     week2articles[week_start].append(article["arxiv_id"])
+
+ weeks = sorted(week2articles.keys(), reverse=True)
+
+
+ def get_article_details(arxiv_id):
+     article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
+     abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
+     image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
+     return article, abstract, image
+
+
+ def generate_week_content(current_week):
+     week_index = weeks.index(current_week)
+     prev_week = weeks[week_index + 1] if week_index < len(weeks) - 1 else None
+     next_week = weeks[week_index - 1] if week_index > 0 else None
+
+     nav_buttons = Group(
+         Button(
+             "← Previous Week",
+             hx_get=f"/week/{prev_week}" if prev_week else "#",
+             hx_target="#content",
+             hx_swap="innerHTML",
+             disabled=not prev_week,
+         ),
+         Button(
+             "Next Week →",
+             hx_get=f"/week/{next_week}" if next_week else "#",
+             hx_target="#content",
+             hx_swap="innerHTML",
+             disabled=not next_week,
+         ),
+     )
+
+     articles = week2articles[current_week]
+     article_cards = []
+     for arxiv_id in articles:
+         article, abstract, image = get_article_details(arxiv_id)
+         article_title = (
+             article["contents"][0].get("paper_title", "article")
+             if article["contents"]
+             else "article"
+         )
+
+         card_content = [
+             H5(
+                 A(
+                     article_title,
+                     href=f"https://arxiv.org/abs/{arxiv_id}",
+                     target="_blank",
+                 )
+             )
+         ]
+
+         if image:
+             pil_image = image[0]["image"]
+             img_byte_arr = BytesIO()
+             pil_image.save(img_byte_arr, format="JPEG")
+             img_byte_arr = img_byte_arr.getvalue()
+             image_url = f"data:image/jpeg;base64,{base64.b64encode(img_byte_arr).decode('utf-8')}"
+             card_content.insert(
+                 0,
+                 Img(
+                     src=image_url,
+                     alt="Article image",
+                     style="max-width: 100%; height: auto; margin-bottom: 15px;",
+                 ),
+             )
+
+         article_cards.append(Card(*card_content, cls="mb-4"))
+
+     grid = Grid(
+         *article_cards,
+         style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 1rem;",
+     )
+
+     week_end = current_week + timedelta(days=6)
+     return Div(
+         nav_buttons,
+         H3(
+             f"Week of {current_week.strftime('%B %d')} - {week_end.strftime('%B %d, %Y')} ({len(articles)} articles)"
+         ),
+         grid,
+         nav_buttons,
+         id="content",
+     )
+
+
+ @rt("/")
+ def get():
+     return Titled("AnswerAI Zotero Weekly", generate_week_content(weeks[0]))
+
+
+ @rt("/week/{date}")
+ def get(date: str):
+     try:
+         current_week = datetime.strptime(date, "%Y-%m-%d").date()
+         return generate_week_content(current_week)
+     except Exception as e:
+         return Div(f"Error displaying articles: {str(e)}")
+
+
+ serve()
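A minimal smoke test for the dataset layout app.py expects from the Hub repo (the "articles" and "images" configs that main.py pushes). This checker is only a sketch and not part of the commit; it assumes HF_TOKEN is available via .env, exactly as app.py does:

import os

import dotenv
from datasets import load_dataset
from huggingface_hub import whoami

dotenv.load_dotenv()

# same repo naming convention as app.py / main.py
repo_id = f'{whoami(os.environ.get("HF_TOKEN"))["name"]}/zotero-articles'

articles = load_dataset(repo_id, "articles", split="train")
images = load_dataset(repo_id, "images", split="train")

# columns app.py relies on when grouping by week and rendering cards
assert {"arxiv_id", "date_added", "contents"} <= set(articles.column_names)
assert {"arxiv_id", "page_number", "image"} <= set(images.column_names)
print(f"{len(articles)} articles, {len(images)} page images")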
main.py ADDED
@@ -0,0 +1,592 @@
+ import os
+ import re
+ import time
+
+ import dotenv
+ import fitz  # PyMuPDF
+ import pandas as pd
+ import requests
+ import schedule
+ import srsly
+ from bs4 import BeautifulSoup
+ from datasets import Dataset, Image, load_dataset
+ from huggingface_hub import create_repo, login, whoami
+ from PIL import Image as PILImage
+ from retry import retry
+ from tqdm.auto import tqdm
+
+ dotenv.load_dotenv()
+ login(token=os.environ.get("HF_TOKEN"))
+
+ hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
+ HF_REPO_ID = f"{hf_user}/zotero-articles"
+
+
+ ########################################################
+ ### GET ZOTERO ITEMS
+ ########################################################
+
+
+ @retry(tries=3, delay=8)
+ def _fetch_one_zotero_batch(url, headers, params):
+     """
+     Fetch one batch of articles from the Zotero API
+     """
+     response = requests.get(url, headers=headers, params=params)
+     response.raise_for_status()
+     return response.json()
+
+
+ def get_zotero_items(debug=False):
+     """
+     Fetch items from the Zotero group library
+     """
+
+     GROUP_ID = os.getenv("GROUP_ID")
+     API_KEY = os.getenv("API_KEY")
+     BASE_URL = f"https://api.zotero.org/groups/{GROUP_ID}/items"
+     LIMIT = 100
+
+     headers = {"Zotero-API-Key": API_KEY, "Content-Type": "application/json"}
+
+     items = []
+     start = 0
+
+     i = 1
+     while True:
+         i += 1
+         params = {"limit": LIMIT, "start": start}
+         page_items = _fetch_one_zotero_batch(BASE_URL, headers, params)
+
+         if not page_items:
+             break
+
+         items.extend(page_items)
+         start += LIMIT
+         print(f"# items fetched {len(items)}")
+
+         if debug:
+             if len(items) > 200:
+                 break
+
+     return items
+
+
+ ########################################################
+ ### EXTRACT ARXIV LINKS AND PDFs
+ ########################################################
+
+
+ def get_arxiv_items(items):
+     visited = set()
+
+     arxiv_items = []
+     arxiv_pattern = re.compile(r"arxiv.org/abs/(\d+\.\d+)")
+
+     for item in items:
+         data = item.get("data", {})
+         attachments = item.get("links", {}).get("attachment", {})
+
+         arxiv_url = None
+         pdf_url = None
+
+         if "url" in data and "arxiv.org" in data["url"]:
+             arxiv_match = arxiv_pattern.search(data["url"])
+             if arxiv_match:
+                 arxiv_url = data["url"]
+
+         if attachments:
+             pdf_url = attachments["href"]
+
+         if arxiv_url:
+             arxiv_id = arxiv_url.split("/")[-1]
+             if arxiv_id in visited:
+                 continue
+
+             arxiv_items.append(
+                 {
+                     "arxiv_id": arxiv_id,
+                     "arxiv_url": arxiv_url,
+                     "pdf_url": pdf_url,
+                     "added_by": item["meta"]["createdByUser"]["username"],
+                     "date_added": data.get("dateAdded", ""),
+                 }
+             )
+
+             visited.add(arxiv_id)
+
+     return arxiv_items
+
+
+ @retry(tries=3, delay=15, backoff=2)
+ def fetch_arxiv_html(arxiv_id):
+     url = f"https://ar5iv.labs.arxiv.org/html/{arxiv_id.split('v')[0]}"
+     response = requests.get(url)
+     return response.text if response.status_code == 200 else None
+
+
+ def fetch_arxiv_htmls(arxiv_items):
+     for item in tqdm(arxiv_items):
+         html = fetch_arxiv_html(item["arxiv_id"])
+         if html:
+             item["raw_html"] = html
+         else:
+             print(f"failed to fetch html for {item['arxiv_id']}")
+             item["raw_html"] = "Error"
+
+     return arxiv_items
+
+
+ ########################################################
+ ### PARSE CONTENT FROM ARXIV HTML
+ ########################################################
+
+
+ def parse_html_content(html):
+     """
+     Parse content from arxiv html
+     """
+     arxiv_id_match = re.search(r"\[(\d+\.\d+(v\d+)?)\]", html)
+     arxiv_id = arxiv_id_match.group(1) if arxiv_id_match else None
+     soup = BeautifulSoup(html, "html.parser")
+     result = []
+
+     # Extract paper title
+     try:
+         paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(
+             strip=True
+         )
+     except Exception:
+         paper_title = soup.find("title").get_text(strip=True)
+         paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)
+
+     for math in soup.find_all("math"):
+         math.decompose()
+     for cite in soup.find_all("cite"):
+         cite.decompose()
+
+     # Extract abstract
+     abstract = soup.find("div", class_="ltx_abstract")
+     if abstract:
+         result.append(
+             {
+                 "content": " ".join(
+                     p.get_text(strip=True) for p in abstract.find_all("p")
+                 ).replace(")", ") "),
+                 "title": "Abstract",
+                 "paper_title": paper_title,
+                 "content_type": "abstract",
+             }
+         )
+     # Extract sections
+     sections = soup.find_all("section", class_="ltx_section")
+     for index, section in enumerate(sections):
+         section_title = section.find("h2", class_="ltx_title ltx_title_section")
+         section_title = (
+             section_title.get_text(strip=True)
+             if section_title
+             else f"Section {index + 1}"
+         )
+         section_content = section.get_text(strip=True).replace(")", ") ")
+
+         content_type = "body"
+         if index == 0:
+             content_type = "introduction"
+         elif index == len(sections) - 1:
+             content_type = "conclusion"
+
+         result.append(
+             {
+                 "content": section_content,
+                 "title": section_title,
+                 "paper_title": paper_title,
+                 "content_type": content_type,
+             }
+         )
+
+     for c in result:
+         c["arxiv_id"] = arxiv_id
+
+     return result
+
+
+ ########################################################
+ ### GET TEXTS FROM PDF & PARSE
+ ########################################################
+
+
+ def get_pdf_text(arxiv_id):
+     url = "http://147.189.194.113:80/extract"  # fix: currently down
+
+     try:
+         response = requests.get(url, params={"arxiv_id": arxiv_id})
+         response = response.json()
+         if "text" in response:
+             return response["text"]
+         return None
+     except Exception as e:
+         print(e)
+         return None
+
+
+ def get_content_type(section_type, section_count):
+     """Determine the content type based on the section type and count"""
+     if section_type == "abstract":
+         return "abstract"
+     elif section_type == "introduction" or section_count == 1:
+         return "introduction"
+     elif section_type == "conclusion" or section_type == "references":
+         return section_type
+     else:
+         return "body"
+
+
+ def get_section_type(title):
+     """Determine the section type based on the title"""
+     title_lower = title.lower()
+     if "abstract" in title_lower:
+         return "abstract"
+     elif "introduction" in title_lower:
+         return "introduction"
+     elif "conclusion" in title_lower:
+         return "conclusion"
+     elif "reference" in title_lower:
+         return "references"
+     else:
+         return "body"
+
+
+ def parse_markdown_content(md_content, arxiv_id):
+     """
+     Parses markdown content to identify and extract sections based on headers.
+     """
+
+     lines = md_content.split("\n")
+     parsed = []
+     current_section = None
+     content = []
+     paper_title = None
+     current_title = None
+
+     # identify sections based on headers
+     for line in lines:
+         if line.startswith("#"):
+             if paper_title is None:
+                 paper_title = line.lstrip("#").strip()
+                 continue
+             if content:
+                 if current_title:
+                     parsed.append(
+                         {
+                             "content": " ".join(content),
+                             "title": current_title,
+                             "paper_title": paper_title,
+                             "content_type": get_content_type(
+                                 current_section, len(parsed)
+                             ),
+                             "arxiv_id": arxiv_id,
+                         }
+                     )
+                 content = []
+
+             current_title = line.lstrip("#").lstrip()
+             if "bit" not in current_title:
+                 current_title = (
+                     current_title.lstrip("123456789")
+                     .lstrip()
+                     .lstrip(".")
+                     .lstrip()
+                     .lstrip("123456789")
+                     .lstrip()
+                     .lstrip(".")
+                     .lstrip()
+                 )
+             current_section = get_section_type(current_title)
+
+         else:
+             content.append(line)
+
+     # Add the last section
+     if content and current_title:
+         parsed.append(
+             {
+                 "content": " ".join(content).replace(")", ") "),
+                 "title": current_title,
+                 "paper_title": paper_title,
+                 "content_type": get_content_type(current_section, len(parsed)),
+                 "arxiv_id": arxiv_id,
+             }
+         )
+
+     return parsed
+
+
+ ########################################################
+ ### Image Dataset
+ ########################################################
+
+
+ def download_arxiv_pdf(arxiv_id):
+     arxiv_id = arxiv_id.split("v")[0]
+     url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+     response = requests.get(url)
+     if response.status_code == 200:
+         return response.content
+     else:
+         raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
+
+
+ def pdf_to_jpegs(pdf_content, output_folder):
+     # Create output folder if it doesn't exist
+     os.makedirs(output_folder, exist_ok=True)
+
+     # Open the PDF
+     doc = fitz.open(stream=pdf_content, filetype="pdf")
+
+     # Iterate through pages
+     for page_num in range(len(doc)):
+         page = doc.load_page(page_num)
+
+         # Convert page to image
+         pix = page.get_pixmap()
+
+         # Save image as JPEG
+         image_path = os.path.join(output_folder, f"page_{page_num + 1}.jpg")
+         pix.save(image_path)
+         # print(f"Saved {image_path}")
+
+     doc.close()
+
+
+ def save_arxiv_article_images(arxiv_id):
+     output_folder = os.path.join("data", "arxiv_images", arxiv_id)
+     try:
+         pdf_content = download_arxiv_pdf(arxiv_id)
+         pdf_to_jpegs(pdf_content, output_folder)
+     except Exception as e:
+         print(f"An error occurred: {str(e)}")
+
+
+ def create_hf_image_dataset(base_dir):
+     data = []
+
+     # Walk through the directory
+     for root, dirs, files in os.walk(base_dir):
+         for file in files:
+             if file.endswith(".jpg"):
+                 # Extract arxiv_id from the path
+                 arxiv_id = os.path.basename(root)
+
+                 # Extract page number from the filename
+                 match = re.search(r"page_(\d+)", file)
+                 if match:
+                     page_number = int(match.group(1))
+                 else:
+                     continue  # Skip if page number can't be extracted
+
+                 # Full path to the image
+                 image_path = os.path.join(root, file)
+
+                 # Open the image to get its size
+                 with PILImage.open(image_path) as img:
+                     width, height = img.size
+
+                 # Add the data
+                 data.append(
+                     {
+                         "image": image_path,
+                         "arxiv_id": arxiv_id,
+                         "page_number": page_number,
+                         "width": width,
+                         "height": height,
+                     }
+                 )
+
+     # Create the dataset
+     dataset = Dataset.from_dict(
+         {
+             "image": [d["image"] for d in data],
+             "arxiv_id": [d["arxiv_id"] for d in data],
+             "page_number": [d["page_number"] for d in data],
+             "width": [d["width"] for d in data],
+             "height": [d["height"] for d in data],
+         }
+     )
+
+     # Cast the image column to Image
+     dataset = dataset.cast_column("image", Image())
+
+     return dataset
+
+
+ ########################################################
+ ### HF UPLOAD
+ ########################################################
+
+
+ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
+     repo_id = HF_REPO_ID
+     create_repo(
+         repo_id=repo_id,
+         token=os.environ.get("HF_TOKEN"),
+         private=True,
+         repo_type="dataset",
+         exist_ok=True,
+     )
+
+     # upload image dataset
+     img_ds = create_hf_image_dataset("data/arxiv_images")
+     img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
+
+     # push id_to_abstract
+     abstract_ds = Dataset.from_pandas(abstract_df)
+     abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
+
+     # push arxiv_items
+     arxiv_ds = Dataset.from_pandas(contents_df)
+     arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
+
+     # push processed_arxiv_ids
+     processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
+     processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
+     processed_arxiv_ids_ds.push_to_hub(
+         repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN")
+     )
+
+
+ ########################################################
+ ### MAIN
+ ########################################################
+
+
+ def main():
+     items = get_zotero_items(debug=True)
+     print(f"# of items fetched from zotero: {len(items)}")
+     arxiv_items = get_arxiv_items(items)
+     print(f"# of arxiv papers: {len(arxiv_items)}")
+
+     # get already processed arxiv ids from HF
+     try:
+         existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"][
+             "arxiv_id"
+         ]
+     except Exception as e:
+         print(e)
+         try:
+             existing_arxiv_ids = srsly.read_json("data/processed_arxiv_ids.json")
+         except Exception as e:
+             print(e)
+             existing_arxiv_ids = []
+     existing_arxiv_ids = set(existing_arxiv_ids)
+     print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
+
+     # new arxiv items
+     arxiv_items = [
+         item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids
+     ]
+     arxiv_items = fetch_arxiv_htmls(arxiv_items)
+     print(f"# of new arxiv items: {len(arxiv_items)}")
+
+     processed_arxiv_ids = set()
+     for item in arxiv_items:
+         # download images --
+         save_arxiv_article_images(item["arxiv_id"])
+
+         # parse html
+         try:
+             item["contents"] = parse_html_content(item["raw_html"])
+             processed_arxiv_ids.add(item["arxiv_id"])
+         except Exception as e:
+             print(f"Failed to parse html for {item['arxiv_id']}: {e}")
+             item["contents"] = []
+
+         if len(item["contents"]) == 0:
+             print("Extracting from pdf...")
+             md_content = get_pdf_text(item["arxiv_id"])  # fix this
+             if md_content:
+                 item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
+                 processed_arxiv_ids.add(item["arxiv_id"])
+             else:
+                 item["contents"] = []
+
+     # save contents ---
+     processed_arxiv_ids = list(processed_arxiv_ids)
+     print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
+
+     # save abstracts ---
+     id_to_abstract = {}
+     for item in arxiv_items:
+         for entry in item["contents"]:
+             if entry["content_type"] == "abstract":
+                 id_to_abstract[item["arxiv_id"]] = entry["content"]
+                 break
+     print(f"# of abstracts: {len(id_to_abstract)}")
+     abstract_df = (
+         pd.Series(id_to_abstract)
+         .reset_index()
+         .rename(columns={"index": "arxiv_id", 0: "abstract"})
+     )
+     print(abstract_df.head())
+
+     # add to existing dataset
+     try:
+         old_abstract_df = load_dataset(HF_REPO_ID, "abstracts")["train"].to_pandas()
+     except Exception as e:
+         print(e)
+         old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
+     print(old_abstract_df.head())
+
+     abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
+     abstract_df = abstract_df.drop_duplicates(
+         subset=["arxiv_id"], keep="last"
+     ).reset_index(drop=True)
+
+     # contents
+     contents_df = pd.DataFrame(arxiv_items)
+     print(contents_df.head())
+     try:
+         old_contents_df = load_dataset(HF_REPO_ID, "articles")["train"].to_pandas()
+     except Exception as e:
+         print(e)
+         old_contents_df = pd.DataFrame(columns=contents_df.columns)
+     if len(old_contents_df) > 0:
+         print(old_contents_df.sample().T)
+
+     contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
+     contents_df = contents_df.drop_duplicates(
+         subset=["arxiv_id"], keep="last"
+     ).reset_index(drop=True)
+
+     # upload to hf (carry over previously processed ids so future runs keep skipping them)
+     processed_arxiv_ids = list(set(processed_arxiv_ids + list(existing_arxiv_ids)))
+     upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)
+
+     # save as local copy
+     os.makedirs("data", exist_ok=True)
+     abstract_df.to_parquet("data/abstracts.parquet")
+     contents_df.to_parquet("data/contents.parquet")
+     srsly.write_json("data/processed_arxiv_ids.json", processed_arxiv_ids)
+
+
+ def schedule_periodic_task():
+     """
+     Schedule the main task to run at the user-defined frequency
+     """
+     main()  # run once initially
+
+     frequency = "daily"  # TODO: env
+     if frequency == "hourly":
+         print("Scheduling tasks to run every hour at the top of the hour")
+         schedule.every().hour.at(":00").do(main)
+     elif frequency == "daily":
+         start_time = "10:00"
+         print(f"Scheduling tasks to run every day at: {start_time} UTC+00")
+         schedule.every().day.at(start_time).do(main)
+
+     while True:
+         schedule.run_pending()
+         time.sleep(1)
+
+
+ if __name__ == "__main__":
+     schedule_periodic_task()
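For debugging the ingestion path it can be handy to exercise the ar5iv fetch and HTML parser on a single paper instead of running the full scheduler. A minimal sketch, assuming HF_TOKEN is set (importing main runs its module-level login()) and using an arbitrary example arXiv id that is not referenced by this commit:

# quick check of the fetch_arxiv_html + parse_html_content path for one paper
from main import fetch_arxiv_html, parse_html_content

html = fetch_arxiv_html("2106.09685")  # example id, for illustration only
if html:
    sections = parse_html_content(html)
    for s in sections:
        print(s["content_type"], "|", s["title"])
else:
    print("ar5iv returned no HTML for this id")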
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ fasthtml-hf>=0.1.1
+ python-fasthtml>=0.0.8
+ huggingface-hub>=0.20.0
+ uvicorn>=0.29
+ schedule==1.2.0
+ supervisor==4.2.5
+ requests
+ srsly
+ python-dotenv
+ beautifulsoup4
+ retry
+ pandas
+ datasets
+ PyMuPDF
+ pillow
ruff.toml ADDED
@@ -0,0 +1,3 @@
+ line-length = 128
+ target-version = "py311"
+ ignore = ["E402"]
supervisord.conf ADDED
@@ -0,0 +1,20 @@
+ [supervisord]
+ nodaemon=true
+
+ [program:main]
+ command=python main.py
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ autostart=true
+ # autorestart=true
+
+ [program:app]
+ command=python app.py
+ stdout_logfile=/dev/null
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ autostart=true
+ autorestart=true