rbiswasfc committed
Commit f0c7f30
1 Parent(s): b5fea25
Files changed (7)
  1. .gitignore +5 -0
  2. Dockerfile +31 -0
  3. app.py +156 -0
  4. main.py +592 -0
  5. requirements.txt +15 -0
  6. ruff.toml +3 -0
  7. supervisord.conf +20 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .env
+ *.json
+ data
+ .ipynb_checkpoints
+ __pycache__
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ FROM python:3.10
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory
+ WORKDIR $HOME/app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ RUN git config --global credential.helper store
+
+ COPY . .
+ COPY supervisord.conf .
+
+ # Set permissions on the log files
+ USER root
+ RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
+
+ RUN mkdir -p /tmp/cache/
+ RUN mkdir -p /.cache
+ RUN chmod a+rwx -R /tmp/cache/
+ RUN chmod a+rwx -R /.cache
+ ENV HF_HUB_CACHE=/tmp/cache/
+
+ ENV PYTHONUNBUFFERED=1 PORT=7860
+
+ # Run supervisord
+ CMD ["supervisord", "-c", "supervisord.conf"]
app.py ADDED
@@ -0,0 +1,156 @@
+ import base64
+ import os
+ from collections import defaultdict
+ from datetime import date, datetime, timedelta
+ from io import BytesIO
+
+ import dotenv
+ from datasets import load_dataset
+ from dateutil.parser import parse
+ from dateutil.tz import tzutc
+ from fasthtml.common import *
+ from huggingface_hub import login, whoami
+
+ dotenv.load_dotenv()
+
+ style = Style("""
+     .grid { margin-bottom: 1rem; }
+     .card { display: flex; flex-direction: column; }
+     .card img { margin-bottom: 0.5rem; }
+     .card h5 { margin: 0; font-size: 0.9rem; line-height: 1.2; }
+     .card a { color: inherit; text-decoration: none; }
+     .card a:hover { text-decoration: underline; }
+ """)
+
+ app, rt = fast_app(html_style=(style,))
+
+ login(token=os.environ.get("HF_TOKEN"))
+
+ hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
+ HF_REPO_ID = f"{hf_user}/zotero-articles"
+
+ abstract_ds = load_dataset(HF_REPO_ID, "abstracts", split="train")
+ article_ds = load_dataset(HF_REPO_ID, "articles", split="train")
+
+ image_ds = load_dataset(HF_REPO_ID, "images", split="train")
+ image_ds = image_ds.filter(lambda x: x["page_number"] == 1)
+
+
+ def parse_date(date_string):
+     try:
+         return parse(date_string).astimezone(tzutc()).date()
+     except ValueError:
+         return date.today()
+
+
+ def get_week_start(date_obj):
+     return date_obj - timedelta(days=date_obj.weekday())
+
+
+ week2articles = defaultdict(list)
+ for article in article_ds:
+     date_added = parse_date(article["date_added"])
+     week_start = get_week_start(date_added)
+     week2articles[week_start].append(article["arxiv_id"])
+
+ weeks = sorted(week2articles.keys(), reverse=True)
+
+
+ def get_article_details(arxiv_id):
+     article = article_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)[0]
+     abstract = abstract_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
+     image = image_ds.filter(lambda x: x["arxiv_id"] == arxiv_id)
+     return article, abstract, image
+
+
+ def generate_week_content(current_week):
+     week_index = weeks.index(current_week)
+     prev_week = weeks[week_index + 1] if week_index < len(weeks) - 1 else None
+     next_week = weeks[week_index - 1] if week_index > 0 else None
+
+     nav_buttons = Group(
+         Button(
+             "← Previous Week",
+             hx_get=f"/week/{prev_week}" if prev_week else "#",
+             hx_target="#content",
+             hx_swap="innerHTML",
+             disabled=not prev_week,
+         ),
+         Button(
+             "Next Week →",
+             hx_get=f"/week/{next_week}" if next_week else "#",
+             hx_target="#content",
+             hx_swap="innerHTML",
+             disabled=not next_week,
+         ),
+     )
+
+     articles = week2articles[current_week]
+     article_cards = []
+     for arxiv_id in articles:
+         article, abstract, image = get_article_details(arxiv_id)
+         article_title = (
+             article["contents"][0].get("paper_title", "article")
+             if article["contents"]
+             else "article"
+         )
+
+         card_content = [
+             H5(
+                 A(
+                     article_title,
+                     href=f"https://arxiv.org/abs/{arxiv_id}",
+                     target="_blank",
+                 )
+             )
+         ]
+
+         if image:
+             pil_image = image[0]["image"]
+             img_byte_arr = BytesIO()
+             pil_image.save(img_byte_arr, format="JPEG")
+             img_byte_arr = img_byte_arr.getvalue()
+             image_url = f"data:image/jpeg;base64,{base64.b64encode(img_byte_arr).decode('utf-8')}"
+             card_content.insert(
+                 0,
+                 Img(
+                     src=image_url,
+                     alt="Article image",
+                     style="max-width: 100%; height: auto; margin-bottom: 15px;",
+                 ),
+             )
+
+         article_cards.append(Card(*card_content, cls="mb-4"))
+
+     grid = Grid(
+         *article_cards,
+         style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 1rem;",
+     )
+
+     week_end = current_week + timedelta(days=6)
+     return Div(
+         nav_buttons,
+         H3(
+             f"Week of {current_week.strftime('%B %d')} - {week_end.strftime('%B %d, %Y')} ({len(articles)} articles)"
+         ),
+         grid,
+         nav_buttons,
+         id="content",
+     )
+
+
+ @rt("/")
+ def get():
+     return Titled("AnswerAI Zotero Weekly", generate_week_content(weeks[0]))
+
+
+ @rt("/week/{date}")
+ def get(date: str):
+     try:
+         current_week = datetime.strptime(date, "%Y-%m-%d").date()
+         return generate_week_content(current_week)
+     except Exception as e:
+         return Div(f"Error displaying articles: {str(e)}")
+
+
+ serve()
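A minimal smoke test for the dataset layout app.py expects from the Hub repo (the "articles" and "images" configs that main.py pushes). This checker is only a sketch and not part of the commit; it assumes HF_TOKEN is available via .env, exactly as app.py does:

import os

import dotenv
from datasets import load_dataset
from huggingface_hub import whoami

dotenv.load_dotenv()

# same repo naming convention as app.py / main.py
repo_id = f'{whoami(os.environ.get("HF_TOKEN"))["name"]}/zotero-articles'

articles = load_dataset(repo_id, "articles", split="train")
images = load_dataset(repo_id, "images", split="train")

# columns app.py relies on when grouping by week and rendering cards
assert {"arxiv_id", "date_added", "contents"} <= set(articles.column_names)
assert {"arxiv_id", "page_number", "image"} <= set(images.column_names)
print(f"{len(articles)} articles, {len(images)} page images")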
main.py ADDED
@@ -0,0 +1,592 @@
+ import os
+ import re
+ import time
+
+ import dotenv
+ import fitz  # PyMuPDF
+ import pandas as pd
+ import requests
+ import schedule
+ import srsly
+ from bs4 import BeautifulSoup
+ from datasets import Dataset, Image, load_dataset
+ from huggingface_hub import create_repo, login, whoami
+ from PIL import Image as PILImage
+ from retry import retry
+ from tqdm.auto import tqdm
+
+ dotenv.load_dotenv()
+ login(token=os.environ.get("HF_TOKEN"))
+
+ hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
+ HF_REPO_ID = f"{hf_user}/zotero-articles"
+
+
+ ########################################################
+ ### GET ZOTERO ITEMS
+ ########################################################
+
+
+ @retry(tries=3, delay=8)
+ def _fetch_one_zotero_batch(url, headers, params):
+     """
+     Fetch one batch of articles from the Zotero API
+     """
+     response = requests.get(url, headers=headers, params=params)
+     response.raise_for_status()
+     return response.json()
+
+
+ def get_zotero_items(debug=False):
+     """
+     Fetch items from the Zotero group library
+     """
+
+     GROUP_ID = os.getenv("GROUP_ID")
+     API_KEY = os.getenv("API_KEY")
+     BASE_URL = f"https://api.zotero.org/groups/{GROUP_ID}/items"
+     LIMIT = 100
+
+     headers = {"Zotero-API-Key": API_KEY, "Content-Type": "application/json"}
+
+     items = []
+     start = 0
+
+     i = 1
+     while True:
+         i += 1
+         params = {"limit": LIMIT, "start": start}
+         page_items = _fetch_one_zotero_batch(BASE_URL, headers, params)
+
+         if not page_items:
+             break
+
+         items.extend(page_items)
+         start += LIMIT
+         print(f"# items fetched {len(items)}")
+
+         if debug:
+             if len(items) > 200:
+                 break
+
+     return items
+
+
+ ########################################################
+ ### EXTRACT ARXIV LINKS AND PDFs
+ ########################################################
+
+
+ def get_arxiv_items(items):
+     visited = set()
+
+     arxiv_items = []
+     arxiv_pattern = re.compile(r"arxiv.org/abs/(\d+\.\d+)")
+
+     for item in items:
+         data = item.get("data", {})
+         attachments = item.get("links", {}).get("attachment", {})
+
+         arxiv_url = None
+         pdf_url = None
+
+         if "url" in data and "arxiv.org" in data["url"]:
+             arxiv_match = arxiv_pattern.search(data["url"])
+             if arxiv_match:
+                 arxiv_url = data["url"]
+
+         if attachments:
+             pdf_url = attachments["href"]
+
+         if arxiv_url:
+             arxiv_id = arxiv_url.split("/")[-1]
+             if arxiv_id in visited:
+                 continue
+
+             arxiv_items.append(
+                 {
+                     "arxiv_id": arxiv_id,
+                     "arxiv_url": arxiv_url,
+                     "pdf_url": pdf_url,
+                     "added_by": item["meta"]["createdByUser"]["username"],
+                     "date_added": data.get("dateAdded", ""),
+                 }
+             )
+
+             visited.add(arxiv_id)
+
+     return arxiv_items
+
+
+ @retry(tries=3, delay=15, backoff=2)
+ def fetch_arxiv_html(arxiv_id):
+     url = f"https://ar5iv.labs.arxiv.org/html/{arxiv_id.split('v')[0]}"
+     response = requests.get(url)
+     return response.text if response.status_code == 200 else None
+
+
+ def fetch_arxiv_htmls(arxiv_items):
+     for item in tqdm(arxiv_items):
+         html = fetch_arxiv_html(item["arxiv_id"])
+         if html:
+             item["raw_html"] = html
+         else:
+             print(f"failed to fetch html for {item['arxiv_id']}")
+             item["raw_html"] = "Error"
+
+     return arxiv_items
+
+
+ ########################################################
+ ### PARSE CONTENT FROM ARXIV HTML
+ ########################################################
+
+
+ def parse_html_content(html):
+     """
+     Parse content from arxiv html
+     """
+     arxiv_id_match = re.search(r"\[(\d+\.\d+(v\d+)?)\]", html)
+     arxiv_id = arxiv_id_match.group(1) if arxiv_id_match else None
+     soup = BeautifulSoup(html, "html.parser")
+     result = []
+
+     # Extract paper title
+     try:
+         paper_title = soup.find("h1", class_="ltx_title ltx_title_document").get_text(
+             strip=True
+         )
+     except Exception:
+         paper_title = soup.find("title").get_text(strip=True)
+         paper_title = re.sub(r"^\[\d+\.\d+(v\d+)?\]\s*", "", paper_title)
+
+     for math in soup.find_all("math"):
+         math.decompose()
+     for cite in soup.find_all("cite"):
+         cite.decompose()
+
+     # Extract abstract
+     abstract = soup.find("div", class_="ltx_abstract")
+     if abstract:
+         result.append(
+             {
+                 "content": " ".join(
+                     p.get_text(strip=True) for p in abstract.find_all("p")
+                 ).replace(")", ") "),
+                 "title": "Abstract",
+                 "paper_title": paper_title,
+                 "content_type": "abstract",
+             }
+         )
+     # Extract sections
+     sections = soup.find_all("section", class_="ltx_section")
+     for index, section in enumerate(sections):
+         section_title = section.find("h2", class_="ltx_title ltx_title_section")
+         section_title = (
+             section_title.get_text(strip=True)
+             if section_title
+             else f"Section {index + 1}"
+         )
+         section_content = section.get_text(strip=True).replace(")", ") ")
+
+         content_type = "body"
+         if index == 0:
+             content_type = "introduction"
+         elif index == len(sections) - 1:
+             content_type = "conclusion"
+
+         result.append(
+             {
+                 "content": section_content,
+                 "title": section_title,
+                 "paper_title": paper_title,
+                 "content_type": content_type,
+             }
+         )
+
+     for c in result:
+         c["arxiv_id"] = arxiv_id
+
+     return result
+
+
+ ########################################################
+ ### GET TEXTS FROM PDF & PARSE
+ ########################################################
+
+
+ def get_pdf_text(arxiv_id):
+     url = "http://147.189.194.113:80/extract"  # fix: currently down
+
+     try:
+         response = requests.get(url, params={"arxiv_id": arxiv_id})
+         response = response.json()
+         if "text" in response:
+             return response["text"]
+         return None
+     except Exception as e:
+         print(e)
+         return None
+
+
+ def get_content_type(section_type, section_count):
+     """Determine the content type based on the section type and count"""
+     if section_type == "abstract":
+         return "abstract"
+     elif section_type == "introduction" or section_count == 1:
+         return "introduction"
+     elif section_type == "conclusion" or section_type == "references":
+         return section_type
+     else:
+         return "body"
+
+
+ def get_section_type(title):
+     """Determine the section type based on the title"""
+     title_lower = title.lower()
+     if "abstract" in title_lower:
+         return "abstract"
+     elif "introduction" in title_lower:
+         return "introduction"
+     elif "conclusion" in title_lower:
+         return "conclusion"
+     elif "reference" in title_lower:
+         return "references"
+     else:
+         return "body"
+
+
+ def parse_markdown_content(md_content, arxiv_id):
+     """
+     Parses markdown content to identify and extract sections based on headers.
+     """
+
+     lines = md_content.split("\n")
+     parsed = []
+     current_section = None
+     content = []
+     paper_title = None
+     current_title = None
+
+     # identify sections based on headers
+     for line in lines:
+         if line.startswith("#"):
+             if paper_title is None:
+                 paper_title = line.lstrip("#").strip()
+                 continue
+             if content:
+                 if current_title:
+                     parsed.append(
+                         {
+                             "content": " ".join(content),
+                             "title": current_title,
+                             "paper_title": paper_title,
+                             "content_type": get_content_type(
+                                 current_section, len(parsed)
+                             ),
+                             "arxiv_id": arxiv_id,
+                         }
+                     )
+                 content = []
+
+             current_title = line.lstrip("#").lstrip()
+             if "bit" not in current_title:
+                 current_title = (
+                     current_title.lstrip("123456789")
+                     .lstrip()
+                     .lstrip(".")
+                     .lstrip()
+                     .lstrip("123456789")
+                     .lstrip()
+                     .lstrip(".")
+                     .lstrip()
+                 )
+             current_section = get_section_type(current_title)
+
+         else:
+             content.append(line)
+
+     # Add the last section
+     if content and current_title:
+         parsed.append(
+             {
+                 "content": " ".join(content).replace(")", ") "),
+                 "title": current_title,
+                 "paper_title": paper_title,
+                 "content_type": get_content_type(current_section, len(parsed)),
+                 "arxiv_id": arxiv_id,
+             }
+         )
+
+     return parsed
+
+
+ ########################################################
+ ### Image Dataset
+ ########################################################
+
+
+ def download_arxiv_pdf(arxiv_id):
+     arxiv_id = arxiv_id.split("v")[0]
+     url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+     response = requests.get(url)
+     if response.status_code == 200:
+         return response.content
+     else:
+         raise Exception(f"Failed to download PDF. Status code: {response.status_code}")
+
+
+ def pdf_to_jpegs(pdf_content, output_folder):
+     # Create output folder if it doesn't exist
+     os.makedirs(output_folder, exist_ok=True)
+
+     # Open the PDF
+     doc = fitz.open(stream=pdf_content, filetype="pdf")
+
+     # Iterate through pages
+     for page_num in range(len(doc)):
+         page = doc.load_page(page_num)
+
+         # Convert page to image
+         pix = page.get_pixmap()
+
+         # Save image as JPEG
+         image_path = os.path.join(output_folder, f"page_{page_num + 1}.jpg")
+         pix.save(image_path)
+         # print(f"Saved {image_path}")
+
+     doc.close()
+
+
+ def save_arxiv_article_images(arxiv_id):
+     output_folder = os.path.join("data", "arxiv_images", arxiv_id)
+     try:
+         pdf_content = download_arxiv_pdf(arxiv_id)
+         pdf_to_jpegs(pdf_content, output_folder)
+     except Exception as e:
+         print(f"An error occurred: {str(e)}")
+
+
+ def create_hf_image_dataset(base_dir):
+     data = []
+
+     # Walk through the directory
+     for root, dirs, files in os.walk(base_dir):
+         for file in files:
+             if file.endswith(".jpg"):
+                 # Extract arxiv_id from the path
+                 arxiv_id = os.path.basename(root)
+
+                 # Extract page number from the filename
+                 match = re.search(r"page_(\d+)", file)
+                 if match:
+                     page_number = int(match.group(1))
+                 else:
+                     continue  # Skip if page number can't be extracted
+
+                 # Full path to the image
+                 image_path = os.path.join(root, file)
+
+                 # Open the image to get its size
+                 with PILImage.open(image_path) as img:
+                     width, height = img.size
+
+                 # Add the data
+                 data.append(
+                     {
+                         "image": image_path,
+                         "arxiv_id": arxiv_id,
+                         "page_number": page_number,
+                         "width": width,
+                         "height": height,
+                     }
+                 )
+
+     # Create the dataset
+     dataset = Dataset.from_dict(
+         {
+             "image": [d["image"] for d in data],
+             "arxiv_id": [d["arxiv_id"] for d in data],
+             "page_number": [d["page_number"] for d in data],
+             "width": [d["width"] for d in data],
+             "height": [d["height"] for d in data],
+         }
+     )
+
+     # Cast the image column to Image
+     dataset = dataset.cast_column("image", Image())
+
+     return dataset
+
+
+ ########################################################
+ ### HF UPLOAD
+ ########################################################
+
+
+ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
+     repo_id = HF_REPO_ID
+     create_repo(
+         repo_id=repo_id,
+         token=os.environ.get("HF_TOKEN"),
+         private=True,
+         repo_type="dataset",
+         exist_ok=True,
+     )
+
+     # upload image dataset
+     img_ds = create_hf_image_dataset("data/arxiv_images")
+     img_ds.push_to_hub(repo_id, "images", token=os.environ.get("HF_TOKEN"))
+
+     # push id_to_abstract
+     abstract_ds = Dataset.from_pandas(abstract_df)
+     abstract_ds.push_to_hub(repo_id, "abstracts", token=os.environ.get("HF_TOKEN"))
+
+     # push arxiv_items
+     arxiv_ds = Dataset.from_pandas(contents_df)
+     arxiv_ds.push_to_hub(repo_id, "articles", token=os.environ.get("HF_TOKEN"))
+
+     # push processed_arxiv_ids
+     processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
+     processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
+     processed_arxiv_ids_ds.push_to_hub(
+         repo_id, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN")
+     )
+
+
+ ########################################################
+ ### MAIN
+ ########################################################
+
+
+ def main():
+     items = get_zotero_items(debug=True)
+     print(f"# of items fetched from zotero: {len(items)}")
+     arxiv_items = get_arxiv_items(items)
+     print(f"# of arxiv papers: {len(arxiv_items)}")
+
+     # get already processed arxiv ids from HF
+     try:
+         existing_arxiv_ids = load_dataset(HF_REPO_ID, "processed_arxiv_ids")["train"][
+             "arxiv_id"
+         ]
+     except Exception as e:
+         print(e)
+         try:
+             existing_arxiv_ids = srsly.read_json("data/processed_arxiv_ids.json")
+         except Exception as e:
+             print(e)
+             existing_arxiv_ids = []
+     existing_arxiv_ids = set(existing_arxiv_ids)
+     print(f"# of existing arxiv ids: {len(existing_arxiv_ids)}")
+
+     # new arxiv items
+     arxiv_items = [
+         item for item in arxiv_items if item["arxiv_id"] not in existing_arxiv_ids
+     ]
+     arxiv_items = fetch_arxiv_htmls(arxiv_items)
+     print(f"# of new arxiv items: {len(arxiv_items)}")
+
+     processed_arxiv_ids = set()
+     for item in arxiv_items:
+         # download images --
+         save_arxiv_article_images(item["arxiv_id"])
+
+         # parse html
+         try:
+             item["contents"] = parse_html_content(item["raw_html"])
+             processed_arxiv_ids.add(item["arxiv_id"])
+         except Exception as e:
+             print(f"Failed to parse html for {item['arxiv_id']}: {e}")
+             item["contents"] = []
+
+         if len(item["contents"]) == 0:
+             print("Extracting from pdf...")
+             md_content = get_pdf_text(item["arxiv_id"])  # fix this
+             if md_content:
+                 item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
+                 processed_arxiv_ids.add(item["arxiv_id"])
+             else:
+                 item["contents"] = []
+
+     # save contents ---
+     processed_arxiv_ids = list(processed_arxiv_ids)
+     print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
+
+     # save abstracts ---
+     id_to_abstract = {}
+     for item in arxiv_items:
+         for entry in item["contents"]:
+             if entry["content_type"] == "abstract":
+                 id_to_abstract[item["arxiv_id"]] = entry["content"]
+                 break
+     print(f"# of abstracts: {len(id_to_abstract)}")
+     abstract_df = (
+         pd.Series(id_to_abstract)
+         .reset_index()
+         .rename(columns={"index": "arxiv_id", 0: "abstract"})
+     )
+     print(abstract_df.head())
+
+     # add to existing dataset
+     try:
+         old_abstract_df = load_dataset(HF_REPO_ID, "abstracts")["train"].to_pandas()
+     except Exception as e:
+         print(e)
+         old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
+     print(old_abstract_df.head())
+
+     abstract_df = pd.concat([old_abstract_df, abstract_df]).reset_index(drop=True)
+     abstract_df = abstract_df.drop_duplicates(
+         subset=["arxiv_id"], keep="last"
+     ).reset_index(drop=True)
+
+     # contents
+     contents_df = pd.DataFrame(arxiv_items)
+     print(contents_df.head())
+     try:
+         old_contents_df = load_dataset(HF_REPO_ID, "articles")["train"].to_pandas()
+     except Exception as e:
+         print(e)
+         old_contents_df = pd.DataFrame(columns=contents_df.columns)
+     if len(old_contents_df) > 0:
+         print(old_contents_df.sample().T)
+
+     contents_df = pd.concat([old_contents_df, contents_df]).reset_index(drop=True)
+     contents_df = contents_df.drop_duplicates(
+         subset=["arxiv_id"], keep="last"
+     ).reset_index(drop=True)
+
+     # upload to hf (carry over previously processed ids so future runs keep skipping them)
+     processed_arxiv_ids = list(set(processed_arxiv_ids + list(existing_arxiv_ids)))
+     upload_to_hf(abstract_df, contents_df, processed_arxiv_ids)
+
+     # save as local copy
+     os.makedirs("data", exist_ok=True)
+     abstract_df.to_parquet("data/abstracts.parquet")
+     contents_df.to_parquet("data/contents.parquet")
+     srsly.write_json("data/processed_arxiv_ids.json", processed_arxiv_ids)
+
+
+ def schedule_periodic_task():
+     """
+     Schedule the main task to run at the user-defined frequency
+     """
+     main()  # run once initially
+
+     frequency = "daily"  # TODO: env
+     if frequency == "hourly":
+         print("Scheduling tasks to run every hour at the top of the hour")
+         schedule.every().hour.at(":00").do(main)
+     elif frequency == "daily":
+         start_time = "10:00"
+         print(f"Scheduling tasks to run every day at: {start_time} UTC+00")
+         schedule.every().day.at(start_time).do(main)
+
+     while True:
+         schedule.run_pending()
+         time.sleep(1)
+
+
+ if __name__ == "__main__":
+     schedule_periodic_task()
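For debugging the ingestion path it can be handy to exercise the ar5iv fetch and HTML parser on a single paper instead of running the full scheduler. A minimal sketch, assuming HF_TOKEN is set (importing main runs its module-level login()) and using an arbitrary example arXiv id that is not referenced by this commit:

# quick check of the fetch_arxiv_html + parse_html_content path for one paper
from main import fetch_arxiv_html, parse_html_content

html = fetch_arxiv_html("2106.09685")  # example id, for illustration only
if html:
    sections = parse_html_content(html)
    for s in sections:
        print(s["content_type"], "|", s["title"])
else:
    print("ar5iv returned no HTML for this id")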
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ fasthtml-hf>=0.1.1
+ python-fasthtml>=0.0.8
+ huggingface-hub>=0.20.0
+ uvicorn>=0.29
+ schedule==1.2.0
+ supervisor==4.2.5
+ requests
+ srsly
+ python-dotenv
+ beautifulsoup4
+ retry
+ pandas
+ datasets
+ PyMuPDF
+ pillow
ruff.toml ADDED
@@ -0,0 +1,3 @@
+ line-length = 128
+ target-version = "py311"
+ ignore = ["E402"]
supervisord.conf ADDED
@@ -0,0 +1,20 @@
+ [supervisord]
+ nodaemon=true
+
+ [program:main]
+ command=python main.py
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ autostart=true
+ # autorestart=true
+
+ [program:app]
+ command=python app.py
+ stdout_logfile=/dev/null
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ autostart=true
+ autorestart=true