rbiswasfc committed on
Commit
9afacec
1 Parent(s): 748a8f9
Files changed (1) hide show
  1. main.py +16 -13
main.py CHANGED
@@ -66,7 +66,7 @@ def get_zotero_items(debug=False):
66
  print(f"# items fetched {len(items)}")
67
 
68
  if debug:
69
- if len(items) > 200:
70
  break
71
 
72
  return items
@@ -419,21 +419,24 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
419
  )
420
 
421
  # upload image dataset
422
- img_ds = create_hf_image_dataset("data/arxiv_images")
423
- img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
 
424
 
425
- # push id_to_abstract
426
- abstract_ds = Dataset.from_pandas(abstract_df)
427
- abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
428
 
429
- # push arxiv_items
430
- arxiv_ds = Dataset.from_pandas(contents_df)
431
- arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
432
 
433
- # push processed_arxiv_ids
434
- processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
435
- processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
436
- processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
 
 
437
 
438
 
439
  ########################################################
 
66
  print(f"# items fetched {len(items)}")
67
 
68
  if debug:
69
+ if len(items) > 500:
70
  break
71
 
72
  return items
 
419
  )
420
 
421
  # upload image dataset
422
+ try:
423
+ img_ds = create_hf_image_dataset("data/arxiv_images")
424
+ img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
425
 
426
+ # push id_to_abstract
427
+ abstract_ds = Dataset.from_pandas(abstract_df)
428
+ abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
429
 
430
+ # push arxiv_items
431
+ arxiv_ds = Dataset.from_pandas(contents_df)
432
+ arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
433
 
434
+ # push processed_arxiv_ids
435
+ processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
436
+ processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
437
+ processed_arxiv_ids_ds.push_to_hub(repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
438
+ except Exception as e:
439
+ print(e)
440
 
441
 
442
  ########################################################