rbiswasfc committed
Commit c43dfe6
1 Parent(s): bf0041a

added images

Files changed (2)
  1. main.py +105 -3
  2. requirements.txt +3 -1
main.py CHANGED
@@ -3,13 +3,15 @@ import re
 import time

 import dotenv
+import fitz  # PyMuPDF
 import pandas as pd
 import requests
 import schedule
 import srsly
 from bs4 import BeautifulSoup
-from datasets import Dataset, load_dataset
+from datasets import Dataset, Image, load_dataset
 from huggingface_hub import create_repo, login, whoami
+from PIL import Image as PILImage
 from retry import retry
 from tqdm.auto import tqdm

@@ -17,7 +19,7 @@ dotenv.load_dotenv()
 login(token=os.environ.get("HF_TOKEN"))

 hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-HF_REPO_ID = f"{hf_user}/zotero-answer-ai-articles"
+HF_REPO_ID = f"{hf_user}/zotero-articles"


 ########################################################
@@ -64,7 +66,7 @@ def get_zotero_items(debug=False):
         print(f"# items fetched {len(items)}")

         if debug:
-            if len(items) > 300:
+            if len(items) > 200:
                 break

     return items
@@ -309,6 +311,98 @@ def parse_markdown_content(md_content, arxiv_id):
     return parsed


+########################################################
+### Image Dataset
+########################################################
+
+
+def download_arxiv_pdf(arxiv_id):
+    arxiv_id = arxiv_id.split("v")[0]
+    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+    response = requests.get(url)
+    if response.status_code == 200:
+        return response.content
+    else:
+        raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
+
+
+def pdf_to_jpegs(pdf_content, output_folder):
+    # Create output folder if it doesn't exist
+    os.makedirs(output_folder, exist_ok=True)
+
+    # Open the PDF
+    doc = fitz.open(stream=pdf_content, filetype="pdf")
+
+    # Iterate through pages
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+
+        # Convert page to image
+        pix = page.get_pixmap()
+
+        # Save image as JPEG
+        image_path = os.path.join(output_folder, f"page_{page_num + 1}.jpg")
+        pix.save(image_path)
+        # print(f"Saved {image_path}")
+
+    doc.close()
+
+
+def save_arxiv_article_images(arxiv_id):
+    output_folder = os.path.join("data", "arxiv_images", arxiv_id)
+    try:
+        pdf_content = download_arxiv_pdf(arxiv_id)
+        pdf_to_jpegs(pdf_content, output_folder)
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+
+
+def create_hf_image_dataset(base_dir):
+    data = []
+
+    # Walk through the directory
+    for root, dirs, files in os.walk(base_dir):
+        for file in files:
+            if file.endswith(".jpg"):
+                # Extract arxiv_id from the path
+                arxiv_id = os.path.basename(root)
+
+                # Extract page number from the filename
+                match = re.search(r"page_(\d+)", file)
+                if match:
+                    page_number = int(match.group(1))
+                else:
+                    continue  # Skip if page number can't be extracted
+
+                # Full path to the image
+                image_path = os.path.join(root, file)
+
+                # Open the image to get its size
+                with PILImage.open(image_path) as img:
+                    width, height = img.size
+
+                # Add the data
+                data.append(
+                    {"image": image_path, "arxiv_id": arxiv_id, "page_number": page_number, "width": width, "height": height}
+                )
+
+    # Create the dataset
+    dataset = Dataset.from_dict(
+        {
+            "image": [d["image"] for d in data],
+            "arxiv_id": [d["arxiv_id"] for d in data],
+            "page_number": [d["page_number"] for d in data],
+            "width": [d["width"] for d in data],
+            "height": [d["height"] for d in data],
+        }
+    )
+
+    # Cast the image column to Image
+    dataset = dataset.cast_column("image", Image())
+
+    return dataset
+
+
 ########################################################
 ### HF UPLOAD
 ########################################################
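For orientation, a minimal usage sketch of the helpers added above (illustrative only, not part of this commit; it assumes the context of main.py where these functions live, and the arXiv id is just a placeholder):

# placeholder id; any valid arXiv id would do
arxiv_id = "2106.00001"

# fetch the PDF and render every page to data/arxiv_images/<arxiv_id>/page_<n>.jpg
save_arxiv_article_images(arxiv_id)

# gather all rendered pages into a Dataset with image / arxiv_id / page_number / width / height columns
img_ds = create_hf_image_dataset("data/arxiv_images")
print(img_ds)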
@@ -324,6 +418,10 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
         exist_ok=True,
     )

+    # upload image dataset
+    img_ds = create_hf_image_dataset("data/arxiv_images")
+    img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
+
     # push id_to_abstract
     abstract_ds = Dataset.from_pandas(abstract_df)
     abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
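Once pushed, the new "images" config should be readable next to the existing "abstracts" config. A small sketch, assuming push_to_hub's default "train" split and that HF_REPO_ID points at the same repo the script pushes to:

from datasets import load_dataset

# "images" is the config name used in the push above; "train" is push_to_hub's default split
images = load_dataset(HF_REPO_ID, "images", split="train")
sample = images[0]
print(sample["arxiv_id"], sample["page_number"], sample["image"].size)  # "image" decodes to a PIL image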
@@ -369,6 +467,10 @@ def main():

     processed_arxiv_ids = set()
     for item in arxiv_items:
+        # download images --
+        save_arxiv_article_images(item["arxiv_id"])
+
+        # parse html
         try:
             item["contents"] = parse_html_content(item["raw_html"])
             processed_arxiv_ids.add(item["arxiv_id"])
requirements.txt CHANGED
@@ -10,4 +10,6 @@ python-dotenv
 beautifulsoup4
 retry
 pandas
-datasets
+datasets
+PyMuPDF
+pillow
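Note that the two new requirements install under different import names than the package names: PyMuPDF is imported as fitz and pillow as PIL, matching the imports added to main.py. A quick, purely illustrative check:

import fitz  # provided by the PyMuPDF package
import PIL   # provided by the pillow package

print(fitz.__doc__)      # e.g. "PyMuPDF <version>: Python bindings for the MuPDF ... library"
print(PIL.__version__)   # pillow version string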