rbiswasfc committed
Commit 0889949
1 parent: 9afacec
Files changed (3)
  1. app copy.py +0 -134
  2. app.py +19 -117
  3. main.py +89 -29
app copy.py DELETED
@@ -1,134 +0,0 @@
- import base64
- import os
- from collections import defaultdict
- from datetime import date, datetime, timedelta
- from io import BytesIO
-
- import dotenv
- from datasets import load_dataset
- from dateutil.parser import parse
- from dateutil.tz import tzutc
- from fasthtml.common import *
- from huggingface_hub import login, whoami
-
- dotenv.load_dotenv()
-
- style = Style("""
- .grid { margin-bottom: 1rem; }
- .card { display: flex; flex-direction: column; }
- .card img { margin-bottom: 0.5rem; }
- .card h5 { margin: 0; font-size: 0.9rem; line-height: 1.2; }
- .card a { color: inherit; text-decoration: none; }
- .card a:hover { text-decoration: underline; }
- """)
-
- app, rt = fast_app(html_style=(style,))
-
- login(token=os.environ.get("HF_TOKEN"))
-
- hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
- HF_REPO_ID = f"{hf_user}/zotero-articles"
-
- abstract_ds = load_dataset(HF_REPO_ID, "abstracts", split="train")
- article_ds = load_dataset(HF_REPO_ID, "articles", split="train")
-
- image_ds = load_dataset(HF_REPO_ID, "images", split="train")
- image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
-
-
- def parse_date(date_string):
-     try:
-         return parse(date_string).astimezone(tzutc()).date()
-     except ValueError:
-         return date.today()
-
-
- def get_week_start(date_obj):
-     return date_obj - timedelta(days=date_obj.weekday())
-
-
- week2articles = defaultdict(list)
- for article in article_ds:
-     date_added = parse_date(article["date_added"])
-     week_start = get_week_start(date_added)
-     week2articles[week_start].append(article["arxiv_id"])
-
- weeks = sorted(week2articles.keys(), reverse=True)
-
-
- def get_article_details(arxiv_id):
-     article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
-     abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
-     image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
-     return article, abstract, image
-
-
- def generate_week_content(current_week):
-     week_index = weeks.index(current_week)
-     prev_week = weeks[week_index + 1] if week_index < len(weeks) - 1 else None
-     next_week = weeks[week_index - 1] if week_index > 0 else None
-
-     nav_buttons = Group(
-         Button(
-             "← Previous Week",
-             hx_get=f"/week/{prev_week}" if prev_week else "#",
-             hx_target="#content",
-             hx_swap="innerHTML",
-             disabled=not prev_week,
-         ),
-         Button(
-             "Next Week →",
-             hx_get=f"/week/{next_week}" if next_week else "#",
-             hx_target="#content",
-             hx_swap="innerHTML",
-             disabled=not next_week,
-         ),
-     )
-
-     articles = week2articles[current_week]
-     article_cards = []
-     for arxiv_id in articles:
-         article, abstract, image = get_article_details(arxiv_id)
-         article_title = article["contents"][0].get("paper_title", "article") if article["contents"] else "article"
-
-         card_content = [H5(A(article_title, href=f"https://arxiv.org/abs/{arxiv_id}", target="_blank"))]
-
-         if image:
-             pil_image = image[0]["image"]
-             img_byte_arr = BytesIO()
-             pil_image.save(img_byte_arr, format="JPEG")
-             img_byte_arr = img_byte_arr.getvalue()
-             image_url = f"data:image/jpeg;base64,{base64.b64encode(img_byte_arr).decode('utf-8')}"
-             card_content.insert(
-                 1, Img(src=image_url, alt="Article image", style="max-width: 100%; height: auto; margin-bottom: 15px;")
-             )
-
-         article_cards.append(Card(*card_content, cls="mb-4"))
-
-     grid = Grid(*article_cards, style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 1rem;")
-
-     week_end = current_week + timedelta(days=6)
-     return Div(
-         nav_buttons,
-         H3(f"Week of {current_week.strftime('%B %d')} - {week_end.strftime('%B %d, %Y')} ({len(articles)} articles)"),
-         grid,
-         nav_buttons,
-         id="content",
-     )
-
-
- @rt("/")
- def get():
-     return Titled("AnswerAI Zotero Weekly", generate_week_content(weeks[0]))
-
-
- @rt("/week/{date}")
- def get(date: str):
-     try:
-         current_week = datetime.strptime(date, "%Y-%m-%d").date()
-         return generate_week_content(current_week)
-     except Exception as e:
-         return Div(f"Error displaying articles: {str(e)}")
-
-
- serve()
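For reference, the deleted viewer's inline-thumbnail trick in isolation: render a PIL image to JPEG in memory and wrap the bytes in a base64 data URI. A minimal sketch; the blank image is a stand-in for a real page render.

```python
import base64
from io import BytesIO

from PIL import Image

pil_image = Image.new("RGB", (64, 64), "white")  # stand-in for a rendered PDF page

buf = BytesIO()
pil_image.save(buf, format="JPEG")  # encode in memory, no temp file needed
image_url = f"data:image/jpeg;base64,{base64.b64encode(buf.getvalue()).decode('utf-8')}"
# image_url drops straight into an <img src=...> attribute
```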
app.py CHANGED
@@ -1,134 +1,36 @@
- import base64
  import os
- from collections import defaultdict
- from datetime import date, datetime, timedelta
- from io import BytesIO
 
  import dotenv
- from datasets import load_dataset
- from dateutil.parser import parse
- from dateutil.tz import tzutc
  from fasthtml.common import *
- from huggingface_hub import login, whoami
+ from huggingface_hub import HfApi, login, whoami
 
  dotenv.load_dotenv()
-
- style = Style("""
- .grid { margin-bottom: 1rem; }
- .card { display: flex; flex-direction: column; }
- .card img { margin-bottom: 0.5rem; }
- .card h5 { margin: 0; font-size: 0.9rem; line-height: 1.2; }
- .card a { color: inherit; text-decoration: none; }
- .card a:hover { text-decoration: underline; }
- """)
-
- app, rt = fast_app(html_style=(style,))
-
  login(token=os.environ.get("HF_TOKEN"))
+ api = HfApi()
 
  hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
- HF_REPO_ID = f"{hf_user}/zotero-articles"
-
- abstract_ds = load_dataset(HF_REPO_ID, "abstracts", split="train")
- article_ds = load_dataset(HF_REPO_ID, "articles", split="train")
-
- image_ds = load_dataset(HF_REPO_ID, "images", split="train")
- image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
-
-
- def parse_date(date_string):
-     try:
-         return parse(date_string).astimezone(tzutc()).date()
-     except ValueError:
-         return date.today()
-
-
- def get_week_start(date_obj):
-     return date_obj - timedelta(days=date_obj.weekday())
-
-
- week2articles = defaultdict(list)
- for article in article_ds:
-     date_added = parse_date(article["date_added"])
-     week_start = get_week_start(date_added)
-     week2articles[week_start].append(article["arxiv_id"])
-
- weeks = sorted(week2articles.keys(), reverse=True)
-
-
- def get_article_details(arxiv_id):
-     article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
-     abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
-     image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
-     return article, abstract, image
+ HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
+ HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
 
- def generate_week_content(current_week):
-     week_index = weeks.index(current_week)
-     prev_week = weeks[week_index + 1] if week_index < len(weeks) - 1 else None
-     next_week = weeks[week_index - 1] if week_index > 0 else None
-
-     nav_buttons = Group(
-         Button(
-             "← Previous Week",
-             hx_get=f"/week/{prev_week}" if prev_week else "#",
-             hx_target="#content",
-             hx_swap="innerHTML",
-             disabled=not prev_week,
-         ),
-         Button(
-             "Next Week →",
-             hx_get=f"/week/{next_week}" if next_week else "#",
-             hx_target="#content",
-             hx_swap="innerHTML",
-             disabled=not next_week,
-         ),
-     )
-
-     articles = week2articles[current_week]
-     article_cards = []
-     for arxiv_id in articles:
-         article, abstract, image = get_article_details(arxiv_id)
-         article_title = article["contents"][0].get("paper_title", "article") if article["contents"] else "article"
-
-         card_content = [H5(A(article_title, href=f"https://arxiv.org/abs/{arxiv_id}", target="_blank"))]
-
-         if image:
-             pil_image = image[0]["image"]
-             img_byte_arr = BytesIO()
-             pil_image.save(img_byte_arr, format="JPEG")
-             img_byte_arr = img_byte_arr.getvalue()
-             image_url = f"data:image/jpeg;base64,{base64.b64encode(img_byte_arr).decode('utf-8')}"
-             card_content.insert(
-                 0, Img(src=image_url, alt="Article image", style="max-width: 100%; height: auto; margin-bottom: 15px;")
-             )
-
-         article_cards.append(Card(*card_content, cls="mb-4"))
-
-     grid = Grid(*article_cards, style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 1rem;")
-
-     week_end = current_week + timedelta(days=6)
-     return Div(
-         nav_buttons,
-         H3(f"Week of {current_week.strftime('%B %d')} - {week_end.strftime('%B %d, %Y')} ({len(articles)} articles)"),
-         grid,
-         nav_buttons,
-         id="content",
-     )
+ app, rt = fast_app()
 
 
  @rt("/")
  def get():
-     return Titled("AnswerAI Zotero Weekly", generate_week_content(weeks[0]))
-
-
- @rt("/week/{date}")
- def get(date: str):
-     try:
-         current_week = datetime.strptime(date, "%Y-%m-%d").date()
-         return generate_week_content(current_week)
-     except Exception as e:
-         return Div(f"Error displaying articles: {str(e)}")
+     info = api.dataset_info(HF_REPO_ID_TXT)
+     text_last_modified = info.last_modified.strftime("%d-%b-%y at %H:%M:%S")
+
+     info = api.dataset_info(HF_REPO_ID_IMG)
+     img_last_modified = info.last_modified.strftime("%d-%b-%y at %H:%M:%S")
+
+     return Titled(
+         "Zotero Refresh Pipeline",
+         Div(
+             H3("Status"),
+             P(f"{HF_REPO_ID_TXT} : {text_last_modified} (last updated)"),
+             P(f"{HF_REPO_ID_IMG}: {img_last_modified} (last updated)"),
+         ),
+     )
 
 
  serve()
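The rewritten app.py reduces the UI to a status page driven by HfApi.dataset_info. A minimal sketch of the same check, assuming a public dataset (the repo id is a placeholder; pass token= for private repos):

```python
from huggingface_hub import HfApi

api = HfApi()
info = api.dataset_info("user/some-dataset")  # placeholder repo id
# last_modified is a datetime on the returned DatasetInfo
print(info.last_modified.strftime("%d-%b-%y at %H:%M:%S"))
```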
main.py CHANGED
@@ -1,5 +1,6 @@
  import os
  import re
+ import shutil
  import time
 
  import dotenv
@@ -9,17 +10,19 @@ import requests
  import schedule
  import srsly
  from bs4 import BeautifulSoup
- from datasets import Dataset, Image, load_dataset
- from huggingface_hub import create_repo, login, whoami
+ from datasets import Dataset, Image, concatenate_datasets, load_dataset
+ from huggingface_hub import HfApi, create_repo, login, whoami
  from PIL import Image as PILImage
  from retry import retry
  from tqdm.auto import tqdm
 
  dotenv.load_dotenv()
  login(token=os.environ.get("HF_TOKEN"))
+ api = HfApi()
 
  hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
- HF_REPO_ID = f"{hf_user}/zotero-articles"
+ HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-texts"
+ HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-images"
 
 
  ########################################################
@@ -66,7 +69,7 @@ def get_zotero_items(debug=False):
      print(f"# items fetched {len(items)}")
 
      if debug:
-         if len(items) > 500:
+         if len(items) > 1600:
              break
 
      return items
@@ -103,11 +106,18 @@ def get_arxiv_items(items):
          if arxiv_id in visited:
              continue
 
+         authors = []
+         for author in data.get("creators", []):
+             authors.append(f"{author.get('firstName', '')} {author.get('lastName', '')}")
+
          arxiv_items.append(
              {
                  "arxiv_id": arxiv_id,
                  "arxiv_url": arxiv_url,
+                 "title": data.get("title", ""),
+                 "authors": authors,
                  "pdf_url": pdf_url,
+                 "date_published": data.get("date", ""),
                  "added_by": item["meta"]["createdByUser"]["username"],
                  "date_added": data.get("dateAdded", ""),
              }
@@ -129,10 +139,10 @@ def fetch_arxiv_htmls(arxiv_items):
      for item in tqdm(arxiv_items):
          html = fetch_arxiv_html(item["arxiv_id"])
          if html:
-             item["raw_html"] = html
+             item["raw_content"] = html
          else:
              print(f"failed to fetch html for {item['arxiv_id']}")
-             item["raw_html"] = "Error"
+             item["raw_content"] = "Error"
 
      return arxiv_items
 
@@ -326,7 +336,7 @@ def download_arxiv_pdf(arxiv_id):
          raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
 
 
- def pdf_to_jpegs(pdf_content, output_folder):
+ def pdf_to_jpegs(pdf_content, output_folder, max_pages=128):
      # Create output folder if it doesn't exist
      os.makedirs(output_folder, exist_ok=True)
 
@@ -345,6 +355,9 @@ def pdf_to_jpegs(pdf_content, output_folder):
          pix.save(image_path)
          # print(f"Saved {image_path}")
 
+         if page_num >= max_pages:
+             break
+
      doc.close()
 
 
@@ -392,8 +405,6 @@ def create_hf_image_dataset(base_dir):
              "image": [d["image"] for d in data],
              "arxiv_id": [d["arxiv_id"] for d in data],
              "page_number": [d["page_number"] for d in data],
-             "width": [d["width"] for d in data],
-             "height": [d["height"] for d in data],
          }
      )
 
@@ -409,9 +420,17 @@
 
 
  def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
-     repo_id = HF_REPO_ID
+     # repo_id = HF_REPO_ID
+     create_repo(
+         repo_id=HF_REPO_ID_TXT,
+         token=os.environ.get("HF_TOKEN"),
+         private=True,
+         repo_type="dataset",
+         exist_ok=True,
+     )
+
      create_repo(
-         repo_id=repo_id,
+         repo_id=HF_REPO_ID_IMG,
          token=os.environ.get("HF_TOKEN"),
          private=True,
          repo_type="dataset",
@@ -421,23 +440,44 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
      # upload image dataset
      try:
          img_ds = create_hf_image_dataset("data/arxiv_images")
-         img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
+         try:
+             old_img_ds = load_dataset(HF_REPO_ID_IMG, "images")["train"]
+             img_ds = concatenate_datasets([old_img_ds, img_ds])
+         except Exception as e:
+             print(e)
+         img_ds.push_to_hub(HF_REPO_ID_IMG, "images", token=os.environ.get("HF_TOKEN"))
+     except Exception as e:
+         print(e)
+
+     # upload first pages only
+     try:
+         img_ds = img_ds.filter(lambda x: x["page_number"] == 1)
+         img_ds.push_to_hub(HF_REPO_ID_IMG, "images_first_page", token=os.environ.get("HF_TOKEN"))
+     except Exception as e:
+         print(e)
 
+     try:
          # push id_to_abstract
          abstract_ds = Dataset.from_pandas(abstract_df)
-         abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
+         abstract_ds.push_to_hub(HF_REPO_ID_TXT, "abstracts", token=os.environ.get("HF_TOKEN"))
 
          # push arxiv_items
          arxiv_ds = Dataset.from_pandas(contents_df)
-         arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
+         arxiv_ds.push_to_hub(HF_REPO_ID_TXT, "articles", token=os.environ.get("HF_TOKEN"))
 
          # push processed_arxiv_ids
          processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
          processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
-         processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
+         processed_arxiv_ids_ds.push_to_hub(HF_REPO_ID_TXT, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
      except Exception as e:
          print(e)
 
+     # trigger refresh of connected datasets
+     print("==" * 40)
+     print("Triggering refresh of connected datasets")
+     api.restart_space(repo_id="answerdotai/zotero-weekly")
+     print("==" * 40)
+
 
  ########################################################
  ### MAIN
@@ -445,21 +485,20 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
 
 
  def main():
-     items = get_zotero_items(debug=True)
+     # items = get_zotero_items(debug=True)
+     items = get_zotero_items(debug=False)
+
      print(f"# of items fetched from zotero: {len(items)}")
      arxiv_items = get_arxiv_items(items)
      print(f"# of arxiv papers: {len(arxiv_items)}")
 
      # get already processed arxiv ids from HF
      try:
-         existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"]["arxiv_id"]
+         existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
      except Exception as e:
          print(e)
-         try:
-             existing_arxiv_ids = srsly.read_json("data/processed_arxiv_ids.json")
-         except Exception as e:
-             print(e)
-             existing_arxiv_ids = []
+         existing_arxiv_ids = []
+
      existing_arxiv_ids = set(existing_arxiv_ids)
      print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
 
@@ -468,15 +507,27 @@ def main():
      arxiv_items = fetch_arxiv_htmls(arxiv_items)
      print(f"# of new arxiv items: {len(arxiv_items)}")
 
+     if len(arxiv_items) == 0:
+         print("No new arxiv items to process")
+         return
+
      processed_arxiv_ids = set()
+     pbar = tqdm(range(len(arxiv_items)))
+
+     # remove "data" directory if it exists
+     if os.path.exists("data"):
+         try:
+             shutil.rmtree("data")
+         except Exception as e:
+             print(e)
+
      for item in arxiv_items:
          # download images --
         save_arxiv_article_images(item["arxiv_id"])
 
          # parse html
          try:
-             item["contents"] = parse_html_content(item["raw_html"])
-             processed_arxiv_ids.add(item["arxiv_id"])
+             item["contents"] = parse_html_content(item["raw_content"])
          except Exception as e:
              print(f"Failed to parse html for {item['arxiv_id']}: {e}")
              item["contents"] = []
@@ -484,12 +535,21 @@
          if len(item["contents"]) == 0:
              print("Extracting from pdf...")
              md_content = get_pdf_text(item["arxiv_id"]) # fix this
+             item["raw_content"] = md_content
+
              if md_content:
                  item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
-                 processed_arxiv_ids.add(item["arxiv_id"])
              else:
                  item["contents"] = []
 
+         if len(item["contents"]) > 0:
+             processed_arxiv_ids.add(item["arxiv_id"])
+             if len(item["authors"]) == 0:
+                 item["authors"] = [] # ["unknown"]
+                 item["title"] = item["contents"][0]["paper_title"]
+         pbar.update(1)
+     pbar.close()
+
      # save contents ---
      processed_arxiv_ids = list(processed_arxiv_ids)
      print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
@@ -507,7 +567,7 @@
 
      # add to existing dataset
      try:
-         old_abstract_df = load_dataset(HF_REPO_ID, "abstracts")["train"].to_pandas()
+         old_abstract_df = load_dataset(HF_REPO_ID_TXT, "abstracts")["train"].to_pandas()
      except Exception as e:
          print(e)
          old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
@@ -520,7 +580,7 @@
      contents_df = pd.DataFrame(arxiv_items)
      print(contents_df.head())
      try:
-         old_contents_df = load_dataset(HF_REPO_ID, "articles")["train"].to_pandas()
+         old_contents_df = load_dataset(HF_REPO_ID_TXT, "articles")["train"].to_pandas()
      except Exception as e:
          print(e)
          old_contents_df = pd.DataFrame(columns=contents_df.columns)
@@ -531,7 +591,7 @@
      contents_df = contents_df.drop_duplicates(subset=["arxiv_id"], keep="last").reset_index(drop=True)
 
      # upload to hf
-     processed_arxiv_ids = list(set(processed_arxiv_ids + list(processed_arxiv_ids)))
+     processed_arxiv_ids = list(set(processed_arxiv_ids + list(existing_arxiv_ids)))
      upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)
 
      # save as local copy
@@ -545,7 +605,7 @@ def schedule_periodic_task():
      """
      Schedule the main task to run at the user-defined frequency
      """
-     main() # run once initially
+     # main() # run once initially
 
      frequency = "daily" # TODO: env
      if frequency == "hourly":
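The core change in upload_to_hf is the switch from overwriting configs to appending: the existing split is pulled down, concatenated with the new rows, and pushed back. A minimal sketch of that pattern, with placeholder repo id, config name, and rows:

```python
from datasets import Dataset, concatenate_datasets, load_dataset

new_ds = Dataset.from_dict({"arxiv_id": ["2401.00001"]})  # placeholder rows

try:
    # pull the existing config and append the new rows to it
    old_ds = load_dataset("user/some-dataset", "articles")["train"]
    new_ds = concatenate_datasets([old_ds, new_ds])
except Exception as e:
    print(e)  # first run: nothing to append to yet

new_ds.push_to_hub("user/some-dataset", "articles", token="hf_...")  # placeholder token
```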