Files changed (2) hide show
  1. app.py +100 -15
  2. arxiv_categories.json +238 -0
app.py CHANGED
@@ -3,21 +3,51 @@ from google import genai
3
  from google.genai import types
4
  import os
5
  import requests
 
6
 
7
  # setting api keys
8
  genai_api_key = os.environ.get('GENAI_API_KEY')
9
  client = genai.Client(api_key=genai_api_key)
10
 
11
- # reading arxiv category list
12
- with open("./arxiv_categories.txt", "r") as f:
13
- cat_list = [line for line in f]
14
 
15
- def launch(temp, cat, iteration):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  yield "⏳ Generating summaries... please wait."
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # arxiv announcement page
19
- cat_txt = cat.split()
20
- url = f"https://arxiv.org/list/{cat_txt[-1][1:-1]}/new"
21
  url = "https://r.jina.ai/" + url
22
  scraped_text = requests.get(url)
23
 
@@ -26,12 +56,13 @@ def launch(temp, cat, iteration):
26
  ind2 = scraped_text.text.index("Cross submissions (")
27
  filtered = scraped_text.text[ind1:ind2]
28
 
29
-
30
- prompt = f"""You are an expert in { " ".join(cat_txt[:-1]) } and a scientific assistant.
 
31
  You will be given a list of new paper entries from the arXiv daily announcement page.
32
 
33
  Each entry includes the title and abstract.
34
- For each entry, extract insights to help readers understand the paper’s purpose, methods, and findings.
35
  If any information is not clearly mentioned in the abstract, write `"Not specified."`
36
 
37
  For each paper, briefly answer:
@@ -93,8 +124,6 @@ Please do NOT generate any HTML, only return answers in plain structured text.
93
  if 'Article link:' in paper:
94
  article_link = paper.split('Article link:')[1].strip()
95
 
96
-
97
-
98
  html_output += f"""
99
  <div style="border:1px solid #ddd; padding:20px; margin:15px; border-radius:15px; background: linear-gradient(to right, #f7f7f7, #e1e1e1); box-shadow: 0px 8px 15px rgba(0,0,0,0.1); transition: transform 0.3s ease-in-out, box-shadow 0.3s ease-in-out; font-family: 'Verdana', sans-serif; color: #34495e;">
100
  <h3 style="font-size: 22px; font-weight: bold; margin-bottom: 12px; color: #2c3e50; text-shadow: 1px 1px 3px rgba(0,0,0,0.2);">{[idx+1]} {title[:-2]}</h3>
@@ -130,15 +159,71 @@ css = """
130
  """
131
  # font-family: Arial, sans-serif; /* Apply Arial font to all text */
132
  with gr.Blocks(theme=gr.themes.Glass(), css=css) as demo:
133
- gr.Markdown("## Hey, I’m skimarXiv 👋 I summarize today’s arXiv papers from your favorite fields, saving you time so you don't have to!")
134
  gr.Markdown("SkimArXiv provides daily AI-generated summaries of new arXiv papers (using Gemini) — customized by the research fields you select. Stay up-to-date with the latest discoveries across science, technology, math, and more.")
 
135
  with gr.Row():
136
  slider_temp = gr.Slider(minimum=0, maximum=2, value=0.95, label="temperature", info="0 for reproducibility and > 0 for creativity. 0.95 is the default value for gemini-2.0-flash.")
137
  iterations = gr.Number(value=2, label="iterations", info="larger number of papers requires larger iterations (e.g., 1 iteration ~ 40-50 papers).")
138
- category = gr.Dropdown(cat_list, label="arXiv category", info="All subjects are included from Physics to Economics.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  generate_button = gr.Button("Generate Today's Summaries")
140
 
141
  output_html = gr.HTML()
142
- generate_button.click(fn=launch, inputs=[slider_temp, category, iterations], outputs=output_html)
143
 
144
- demo.launch()
 
3
  from google.genai import types
4
  import os
5
  import requests
6
+ import json
7
 
8
  # setting api keys
9
  genai_api_key = os.environ.get('GENAI_API_KEY')
10
  client = genai.Client(api_key=genai_api_key)
11
 
12
# Load the arXiv taxonomy shipped next to the app. The JSON maps each main
# category code (e.g. "astro-ph") to an object with a "description" string
# and a "subcategories" object of {sub-category code: description}.
# The encoding is pinned because JSON text is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding.
with open("./arxiv_categories.json", "r", encoding="utf-8") as f:
    arxiv_categories = json.load(f)

# Main category codes, in the order they appear in the file.
cat_list = list(arxiv_categories)

# Main category code -> list of its sub-category codes (empty list when the
# category has none, e.g. "gr-qc").
sub_categories = {
    cat: list(arxiv_categories[cat]["subcategories"])
    for cat in cat_list
}

# Main category code -> {sub-category code: human-readable description}.
# dict(...) makes a shallow copy so this table is independent of the raw JSON.
sub_category_descriptions = {
    cat: dict(arxiv_categories[cat]["subcategories"])
    for cat in cat_list
}
31
+
32
+ def launch(temp, cat, sub_cat, iteration):
33
  yield "⏳ Generating summaries... please wait."
34
 
35
+ # Extract the category code from the full string (e.g., "astro-ph - Astrophysics" -> "astro-ph")
36
+ category_code = cat.split(" - ")[0]
37
+
38
+ # Extract sub-category codes from the full strings
39
+ sub_category_codes = [sub.split(" - ")[0] for sub in sub_cat] if sub_cat else []
40
+
41
+ # Construct the category string for arXiv API
42
+ if sub_category_codes: # If sub-categories are selected
43
+ # Format: main_category.sub_category
44
+ categories = [f"{category_code}.{sub}" for sub in sub_category_codes]
45
+ cat_str = " OR ".join(categories)
46
+ else: # If no sub-categories selected, use main category
47
+ cat_str = category_code
48
+
49
  # arxiv announcement page
50
+ url = f"https://arxiv.org/list/{cat_str}/new"
 
51
  url = "https://r.jina.ai/" + url
52
  scraped_text = requests.get(url)
53
 
 
56
  ind2 = scraped_text.text.index("Cross submissions (")
57
  filtered = scraped_text.text[ind1:ind2]
58
 
59
+ # Update the prompt to include the selected categories
60
+ category_description = f"{category_code}" + (f" ({', '.join(sub_category_codes)})" if sub_category_codes else "")
61
+ prompt = f"""You are an expert in {category_description} and a scientific assistant.
62
  You will be given a list of new paper entries from the arXiv daily announcement page.
63
 
64
  Each entry includes the title and abstract.
65
+ For each entry, extract insights to help readers understand the paper's purpose, methods, and findings.
66
  If any information is not clearly mentioned in the abstract, write `"Not specified."`
67
 
68
  For each paper, briefly answer:
 
124
  if 'Article link:' in paper:
125
  article_link = paper.split('Article link:')[1].strip()
126
 
 
 
127
  html_output += f"""
128
  <div style="border:1px solid #ddd; padding:20px; margin:15px; border-radius:15px; background: linear-gradient(to right, #f7f7f7, #e1e1e1); box-shadow: 0px 8px 15px rgba(0,0,0,0.1); transition: transform 0.3s ease-in-out, box-shadow 0.3s ease-in-out; font-family: 'Verdana', sans-serif; color: #34495e;">
129
  <h3 style="font-size: 22px; font-weight: bold; margin-bottom: 12px; color: #2c3e50; text-shadow: 1px 1px 3px rgba(0,0,0,0.2);">{[idx+1]} {title[:-2]}</h3>
 
159
  """
160
  # font-family: Arial, sans-serif; /* Apply Arial font to all text */
161
  with gr.Blocks(theme=gr.themes.Glass(), css=css) as demo:
162
+ gr.Markdown("## Hey, I'm skimarXiv 👋 I summarize today's arXiv papers from your favorite fields, saving you time so you don't have to!")
163
  gr.Markdown("SkimArXiv provides daily AI-generated summaries of new arXiv papers (using Gemini) — customized by the research fields you select. Stay up-to-date with the latest discoveries across science, technology, math, and more.")
164
+
165
  with gr.Row():
166
  slider_temp = gr.Slider(minimum=0, maximum=2, value=0.95, label="temperature", info="0 for reproducibility and > 0 for creativity. 0.95 is the default value for gemini-2.0-flash.")
167
  iterations = gr.Number(value=2, label="iterations", info="larger number of papers requires larger iterations (e.g., 1 iteration ~ 40-50 papers).")
168
+
169
+ with gr.Column():
170
+ # Set default category to astro-ph and initialize its sub-categories
171
+ default_category = 'astro-ph'
172
+
173
+ # Create a mapping of category codes to their descriptions for the dropdown
174
+ category_choices = {
175
+ cat: f"{cat} - {arxiv_categories[cat]['description']}"
176
+ for cat in cat_list
177
+ }
178
+
179
+ category = gr.Dropdown(
180
+ choices=list(category_choices.values()),
181
+ value=category_choices[default_category],
182
+ label="arXiv category",
183
+ info="Select a main category"
184
+ )
185
+
186
+ # Create a mapping of sub-category codes to their descriptions
187
def get_sub_category_choices(category_code):
    """Build the checkbox labels for one main category.

    Args:
        category_code: Main arXiv category code, e.g. "astro-ph".

    Returns:
        A dict mapping each sub-category code to its
        "<code> - <description>" label, in the order listed in
        ``sub_categories``. Empty when the category is unknown or has no
        sub-categories.
    """
    # .get() with empty defaults folds the "unknown category" and
    # "category without sub-categories" cases into the same empty result.
    codes = sub_categories.get(category_code, [])
    descriptions = sub_category_descriptions.get(category_code, {})
    return {code: f"{code} - {descriptions[code]}" for code in codes}
194
+
195
+ sub_category = gr.CheckboxGroup(
196
+ choices=list(get_sub_category_choices(default_category).values()),
197
+ value=[],
198
+ label="Sub-categories",
199
+ info="Select specific sub-categories (optional)",
200
+ interactive=True,
201
+ visible=bool(sub_categories[default_category])
202
+ )
203
+
204
+ # Add change event to update sub-categories when main category changes
205
def update_sub_categories(category_full):
    """Refresh the sub-category checkboxes after the main category changes.

    Args:
        category_full: The selected dropdown label in the form
            "<code> - <description>" (e.g. "astro-ph - Astrophysics").
            ``None`` or an empty string is treated as "nothing selected".

    Returns:
        A dict keyed by the ``sub_category`` component whose value is a
        ``gr.CheckboxGroup`` carrying the new choices, a cleared selection,
        and a visibility flag that is on only when there are choices.
    """
    # Only the text before the first " - " is the code; descriptions may
    # themselves contain " - " (e.g. "hep-ex - High Energy Physics - Experiment").
    category_code = (category_full or "").partition(" - ")[0]

    # An unknown code or a category without sub-categories yields no labels,
    # in which case the group is emptied and hidden.
    labels = list(get_sub_category_choices(category_code).values())
    return {
        sub_category: gr.CheckboxGroup(
            choices=labels,
            value=[],
            visible=bool(labels),
        )
    }
221
+
222
+ category.change(fn=update_sub_categories, inputs=category, outputs=[sub_category])
223
+
224
  generate_button = gr.Button("Generate Today's Summaries")
225
 
226
  output_html = gr.HTML()
227
+ generate_button.click(fn=launch, inputs=[slider_temp, category, sub_category, iterations], outputs=output_html)
228
 
229
+ demo.launch(share=True)
arxiv_categories.json ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "astro-ph": {
3
+ "description": "Astrophysics",
4
+ "subcategories": {
5
+ "GA": "Astrophysics of Galaxies",
6
+ "CO": "Cosmology and Nongalactic Astrophysics",
7
+ "EP": "Earth and Planetary Astrophysics",
8
+ "IM": "Instrumentation and Methods for Astrophysics",
9
+ "SR": "Solar and Stellar Astrophysics",
10
+ "HE": "High Energy Astrophysical Phenomena"
11
+ }
12
+ },
13
+ "cond-mat": {
14
+ "description": "Condensed Matter",
15
+ "subcategories": {
16
+ "dis-nn": "Disordered Systems and Neural Networks",
17
+ "mes-hall": "Mesoscale and Nanoscale Physics",
18
+ "mtrl-sci": "Materials Science",
19
+ "other": "Other Condensed Matter",
20
+ "quant-gas": "Quantum Gases",
21
+ "soft": "Soft Condensed Matter",
22
+ "stat-mech": "Statistical Mechanics",
23
+ "str-el": "Strongly Correlated Electrons",
24
+ "supr-con": "Superconductivity"
25
+ }
26
+ },
27
+ "gr-qc": {
28
+ "description": "General Relativity and Quantum Cosmology",
29
+ "subcategories": {}
30
+ },
31
+ "hep-ex": {
32
+ "description": "High Energy Physics - Experiment",
33
+ "subcategories": {}
34
+ },
35
+ "hep-lat": {
36
+ "description": "High Energy Physics - Lattice",
37
+ "subcategories": {}
38
+ },
39
+ "hep-ph": {
40
+ "description": "High Energy Physics - Phenomenology",
41
+ "subcategories": {}
42
+ },
43
+ "hep-th": {
44
+ "description": "High Energy Physics - Theory",
45
+ "subcategories": {}
46
+ },
47
+ "math-ph": {
48
+ "description": "Mathematical Physics",
49
+ "subcategories": {}
50
+ },
51
+ "nlin": {
52
+ "description": "Nonlinear Sciences",
53
+ "subcategories": {
54
+ "AO": "Adaptation and Self-Organizing Systems",
55
+ "CD": "Chaotic Dynamics",
56
+ "CG": "Cellular Automata and Lattice Gases",
57
+ "PS": "Pattern Formation and Solitons",
58
+ "SI": "Exactly Solvable and Integrable Systems"
59
+ }
60
+ },
61
+ "nucl-ex": {
62
+ "description": "Nuclear Experiment",
63
+ "subcategories": {}
64
+ },
65
+ "nucl-th": {
66
+ "description": "Nuclear Theory",
67
+ "subcategories": {}
68
+ },
69
+ "physics": {
70
+ "description": "Physics",
71
+ "subcategories": {
72
+ "acc-ph": "Accelerator Physics",
73
+ "app-ph": "Applied Physics",
74
+ "ao-ph": "Atmospheric and Oceanic Physics",
75
+ "atom-ph": "Atomic Physics",
76
+ "atm-clus": "Atomic and Molecular Clusters",
77
+ "bio-ph": "Biological Physics",
78
+ "chem-ph": "Chemical Physics",
79
+ "class-ph": "Classical Physics",
80
+ "comp-ph": "Computational Physics",
81
+ "data-an": "Data Analysis, Statistics and Probability",
82
+ "flu-dyn": "Fluid Dynamics",
83
+ "gen-ph": "General Physics",
84
+ "geo-ph": "Geophysics",
85
+ "hist-ph": "History and Philosophy of Physics",
86
+ "ins-det": "Instrumentation and Detectors",
87
+ "med-ph": "Medical Physics",
88
+ "optics": "Optics",
89
+ "plasm-ph": "Plasma Physics",
90
+ "pop-ph": "Popular Physics",
91
+ "soc-ph": "Physics and Society",
92
+ "space-ph": "Space Physics"
93
+ }
94
+ },
95
+ "quant-ph": {
96
+ "description": "Quantum Physics",
97
+ "subcategories": {}
98
+ },
99
+ "math": {
100
+ "description": "Mathematics",
101
+ "subcategories": {
102
+ "AC": "Commutative Algebra",
103
+ "AG": "Algebraic Geometry",
104
+ "AP": "Analysis of PDEs",
105
+ "AT": "Algebraic Topology",
106
+ "CA": "Classical Analysis and ODEs",
107
+ "CO": "Combinatorics",
108
+ "CT": "Category Theory",
109
+ "CV": "Complex Variables",
110
+ "DG": "Differential Geometry",
111
+ "DS": "Dynamical Systems",
112
+ "FA": "Functional Analysis",
113
+ "GM": "General Mathematics",
114
+ "GN": "General Topology",
115
+ "GR": "Group Theory",
116
+ "GT": "Geometric Topology",
117
+ "HO": "History and Overview",
118
+ "IT": "Information Theory",
119
+ "KT": "K-Theory and Homology",
120
+ "LO": "Logic",
121
+ "MG": "Metric Geometry",
122
+ "MP": "Mathematical Physics",
123
+ "NA": "Numerical Analysis",
124
+ "NT": "Number Theory",
125
+ "OA": "Operator Algebras",
126
+ "OC": "Optimization and Control",
127
+ "PR": "Probability",
128
+ "QA": "Quantum Algebra",
129
+ "RA": "Rings and Algebras",
130
+ "RT": "Representation Theory",
131
+ "SG": "Symplectic Geometry",
132
+ "SP": "Spectral Theory",
133
+ "ST": "Statistics Theory"
134
+ }
135
+ },
136
+ "cs": {
137
+ "description": "Computing Research Repository",
138
+ "subcategories": {
139
+ "AI": "Artificial Intelligence",
140
+ "AR": "Hardware Architecture",
141
+ "CC": "Computational Complexity",
142
+ "CE": "Computational Engineering, Finance, and Science",
143
+ "CG": "Computational Geometry",
144
+ "CL": "Computation and Language",
145
+ "CR": "Cryptography and Security",
146
+ "CV": "Computer Vision and Pattern Recognition",
147
+ "CY": "Computers and Society",
148
+ "DB": "Databases",
149
+ "DC": "Distributed, Parallel, and Cluster Computing",
150
+ "DL": "Digital Libraries",
151
+ "DM": "Discrete Mathematics",
152
+ "DS": "Data Structures and Algorithms",
153
+ "ET": "Emerging Technologies",
154
+ "FL": "Formal Languages and Automata Theory",
155
+ "GL": "General Literature",
156
+ "GR": "Graphics",
157
+ "GT": "Computer Science and Game Theory",
158
+ "HC": "Human-Computer Interaction",
159
+ "IR": "Information Retrieval",
160
+ "IT": "Information Theory",
161
+ "LG": "Machine Learning",
162
+ "LO": "Logic in Computer Science",
163
+ "MA": "Multiagent Systems",
164
+ "MM": "Multimedia",
165
+ "MS": "Mathematical Software",
166
+ "NA": "Numerical Analysis",
167
+ "NE": "Neural and Evolutionary Computing",
168
+ "NI": "Networking and Internet Architecture",
169
+ "OH": "Other Computer Science",
170
+ "OS": "Operating Systems",
171
+ "PF": "Performance",
172
+ "PL": "Programming Languages",
173
+ "RO": "Robotics",
174
+ "SC": "Symbolic Computation",
175
+ "SD": "Sound",
176
+ "SE": "Software Engineering",
177
+ "SI": "Social and Information Networks",
178
+ "SY": "Systems and Control"
179
+ }
180
+ },
181
+ "q-bio": {
182
+ "description": "Quantitative Biology",
183
+ "subcategories": {
184
+ "BM": "Biomolecules",
185
+ "CB": "Cell Behavior",
186
+ "GN": "Genomics",
187
+ "MN": "Molecular Networks",
188
+ "NC": "Neurons and Cognition",
189
+ "OT": "Other Quantitative Biology",
190
+ "PE": "Populations and Evolution",
191
+ "QM": "Quantitative Methods",
192
+ "SC": "Subcellular Processes",
193
+ "TO": "Tissues and Organs"
194
+ }
195
+ },
196
+ "q-fin": {
197
+ "description": "Quantitative Finance",
198
+ "subcategories": {
199
+ "CP": "Computational Finance",
200
+ "EC": "Economics",
201
+ "GN": "General Finance",
202
+ "MF": "Mathematical Finance",
203
+ "PM": "Portfolio Management",
204
+ "PR": "Pricing of Securities",
205
+ "RM": "Risk Management",
206
+ "ST": "Statistical Finance",
207
+ "TR": "Trading and Market Microstructure"
208
+ }
209
+ },
210
+ "stat": {
211
+ "description": "Statistics",
212
+ "subcategories": {
213
+ "AP": "Applications",
214
+ "CO": "Computation",
215
+ "ME": "Methodology",
216
+ "ML": "Machine Learning",
217
+ "OT": "Other Statistics",
218
+ "TH": "Statistics Theory"
219
+ }
220
+ },
221
+ "eess": {
222
+ "description": "Electrical Engineering and Systems Science",
223
+ "subcategories": {
224
+ "AS": "Audio and Speech Processing",
225
+ "IV": "Image and Video Processing",
226
+ "SP": "Signal Processing",
227
+ "SY": "Systems and Control"
228
+ }
229
+ },
230
+ "econ": {
231
+ "description": "Economics",
232
+ "subcategories": {
233
+ "EM": "Econometrics",
234
+ "GN": "General Economics",
235
+ "TH": "Theoretical Economics"
236
+ }
237
+ }
238
+ }