jpwahle committed on
Commit
7574c0c
1 Parent(s): 8b8e2ce

Finalize PDF function and update on hf-hub

Browse files
Files changed (5) hide show
  1. Dockerfile +5 -0
  2. main.py +14 -5
  3. pdf.py +14 -0
  4. plots.py +61 -21
  5. s2.py +237 -92
Dockerfile CHANGED
@@ -14,6 +14,8 @@ RUN mkdir -p /var/run/supervisor && chmod 777 /var/run/supervisor
14
  # Install supervisord and python (for gradio)
15
  RUN apt-get update && apt-get install -y supervisor python3 python3-pip && rm -rf /var/lib/apt/lists/*
16
  RUN pip3 install gradio
 
 
17
 
18
  # Copy your gradio app to the image
19
  COPY . /app/
@@ -22,6 +24,9 @@ COPY ./data /app/data
22
  # Install gradio
23
  RUN pip3 install -r /app/requirements.txt
24
 
 
 
 
25
  # Supervisord configuration
26
  RUN echo "[supervisord]" > /etc/supervisor/conf.d/supervisord.conf && \
27
  echo "nodaemon=true" >> /etc/supervisor/conf.d/supervisord.conf && \
 
14
  # Install supervisord and python (for gradio)
15
  RUN apt-get update && apt-get install -y supervisor python3 python3-pip && rm -rf /var/lib/apt/lists/*
16
  RUN pip3 install gradio
17
+ RUN pip3 install git+https://github.com/kermitt2/grobid_client_python
18
+ RUN pip3 install git+https://github.com/titipata/scipdf_parser
19
 
20
  # Copy your gradio app to the image
21
  COPY . /app/
 
24
  # Install gradio
25
  RUN pip3 install -r /app/requirements.txt
26
 
27
+ # Download spacy en_core_web_sm
28
+ RUN python3 -m spacy download en_core_web_sm
29
+
30
  # Supervisord configuration
31
  RUN echo "[supervisord]" > /etc/supervisor/conf.d/supervisord.conf && \
32
  echo "nodaemon=true" >> /etc/supervisor/conf.d/supervisord.conf && \
main.py CHANGED
@@ -14,6 +14,7 @@ from s2 import (
14
  compute_stats_for_acl_author,
15
  compute_stats_for_acl_paper,
16
  compute_stats_for_acl_venue,
 
17
  compute_stats_for_s2_author,
18
  compute_stats_for_s2_paper,
19
  )
@@ -35,25 +36,32 @@ def create_compute_stats(submit_type=None):
35
  id_type, author_name = check_s2_id_type(s2_id)
36
  if id_type == "paper":
37
  results = compute_stats_for_s2_paper(s2_id)
 
38
  return plot_and_return_stats(*results)
39
  if id_type == "author":
40
  results = compute_stats_for_s2_author(s2_id, author_name)
 
41
  return plot_and_return_stats(*results)
42
  if submit_type == "acl_link" and acl_link:
43
  # Crawl all papers for the author or venue or just the paper if it is a paper link
44
  url_type = determine_page_type(acl_link)
45
  if url_type == "paper":
46
  results = compute_stats_for_acl_paper(acl_link)
 
47
  return plot_and_return_stats(*results)
48
  if url_type == "author":
49
  results = compute_stats_for_acl_author(acl_link)
 
50
  return plot_and_return_stats(*results)
51
  if url_type == "venue":
52
  results = compute_stats_for_acl_venue(acl_link)
 
53
  return plot_and_return_stats(*results)
54
- # if submit_type == "pdf_file" and pdf_file:
55
- # # Compute the citation field diversity index and citation age diversity index
56
- # pass
 
 
57
  return None, None, None, None, None, None, None, None
58
 
59
  return compute_stats
@@ -67,6 +75,7 @@ def plot_and_return_stats(
67
  cfdi,
68
  cadi,
69
  maoc,
 
70
  ):
71
  """
72
  Plots the data and returns statistics.
@@ -85,10 +94,10 @@ def plot_and_return_stats(
85
  the most common oldest papers, the cfdi, cadi, and the plots for cfdi and maoc.
86
  """
87
  # Generate cfdi plot
88
- plot_cfdi = generate_cfdi_plot(cfdi)
89
 
90
  # Generate cadi plot
91
- plot_maoc = generate_maoc_plot(maoc)
92
 
93
  # Get top 3 most cited fields
94
  top_fields_text = "\n".join(
 
14
  compute_stats_for_acl_author,
15
  compute_stats_for_acl_paper,
16
  compute_stats_for_acl_venue,
17
+ compute_stats_for_pdf,
18
  compute_stats_for_s2_author,
19
  compute_stats_for_s2_paper,
20
  )
 
36
  id_type, author_name = check_s2_id_type(s2_id)
37
  if id_type == "paper":
38
  results = compute_stats_for_s2_paper(s2_id)
39
+ results = results + ("paper",)
40
  return plot_and_return_stats(*results)
41
  if id_type == "author":
42
  results = compute_stats_for_s2_author(s2_id, author_name)
43
+ results = results + ("author",)
44
  return plot_and_return_stats(*results)
45
  if submit_type == "acl_link" and acl_link:
46
  # Crawl all papers for the author or venue or just the paper if it is a paper link
47
  url_type = determine_page_type(acl_link)
48
  if url_type == "paper":
49
  results = compute_stats_for_acl_paper(acl_link)
50
+ results = results + ("paper",)
51
  return plot_and_return_stats(*results)
52
  if url_type == "author":
53
  results = compute_stats_for_acl_author(acl_link)
54
+ results = results + ("author",)
55
  return plot_and_return_stats(*results)
56
  if url_type == "venue":
57
  results = compute_stats_for_acl_venue(acl_link)
58
+ results = results + ("proceedings",)
59
  return plot_and_return_stats(*results)
60
+ if submit_type == "pdf_file" and pdf_file:
61
+ # Compute the citation field diversity index and citation age diversity index
62
+ results = asyncio.run(compute_stats_for_pdf(pdf_file))
63
+ results = results + ("paper",)
64
+ return plot_and_return_stats(*results)
65
  return None, None, None, None, None, None, None, None
66
 
67
  return compute_stats
 
75
  cfdi,
76
  cadi,
77
  maoc,
78
+ compute_type,
79
  ):
80
  """
81
  Plots the data and returns statistics.
 
94
  the most common oldest papers, the cfdi, cadi, and the plots for cfdi and maoc.
95
  """
96
  # Generate cfdi plot
97
+ plot_cfdi = generate_cfdi_plot(cfdi, compute_type)
98
 
99
  # Generate cadi plot
100
+ plot_maoc = generate_maoc_plot(maoc, compute_type)
101
 
102
  # Get top 3 most cited fields
103
  top_fields_text = "\n".join(
pdf.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import scipdf
2
+
3
+
4
def parse_pdf_to_artcile_dict(pdf_path):
    """
    Parse a PDF into an article dictionary.

    Thin wrapper that forwards ``pdf_path`` unchanged to
    ``scipdf.parse_pdf_to_dict`` and hands back whatever it returns.

    Args:
        pdf_path: Location of the PDF, passed straight through to scipdf.

    Returns:
        The value returned by ``scipdf.parse_pdf_to_dict``.
    """
    article_dict = scipdf.parse_pdf_to_dict(pdf_path)
    return article_dict
6
+
7
+
8
# Manual smoke check: only runs when this module is executed as a script,
# never when it is imported.
if __name__ == "__main__":
    # NOTE(review): the path below is hard-coded to one developer's machine;
    # it has to be edited before this check can run anywhere else.
    article_dict = scipdf.parse_pdf_to_dict(
        "/Users/jp/Documents/papers/demo-test/EMNLP23_Influence_NLP_Citation_Analysis.pdf"
    )  # return dictionary
    # Show the top-level keys of the parsed article
    print(article_dict.keys())
    # Show the extracted title
    print(article_dict["title"])
    # Show the keys of the first parsed reference entry
    print(article_dict["references"][0].keys())
plots.py CHANGED
@@ -33,7 +33,7 @@ with open(
33
  mean_citation_ages.append(temp)
34
 
35
 
36
- def generate_cfdi_plot(input_cfdi):
37
  """
38
  Function to generate a plot for CFDI
39
  """
@@ -56,20 +56,40 @@ def generate_cfdi_plot(input_cfdi):
56
  interpolated_y_cfdi,
57
  c="r",
58
  marker="*",
59
- linewidths=1,
60
  zorder=2,
 
61
  )
62
  ax.vlines(
63
- input_cfdi, 0, interpolated_y_cfdi, color="tomato", ls="--", lw=1.5
 
 
 
 
 
64
  )
 
65
  epsilon = 0.005
66
- # ax.text(
67
- # input_cfdi + epsilon,
68
- # interpolated_y_cfdi + epsilon,
69
- # "Your paper",
70
- # {"color": "#DC143C", "fontsize": 13},
71
- # ha="left", # Horizontal alignment
72
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  ax.set_xlabel("Citation Field Diversity Index (CFDI)", fontsize=15)
75
  ax.set_ylabel("Density", fontsize=15)
@@ -78,9 +98,9 @@ def generate_cfdi_plot(input_cfdi):
78
  return fig
79
 
80
 
81
- def generate_maoc_plot(input_maoc):
82
  """
83
- Function to generate a plot for CFDI
84
  """
85
  # Using kdeplot to fill the distribution curve
86
  sns.set(font_scale=1.3, style="whitegrid")
@@ -100,20 +120,40 @@ def generate_maoc_plot(input_maoc):
100
  interpolated_y_cfdi,
101
  c="r",
102
  marker="*",
103
- linewidths=1,
104
  zorder=2,
 
105
  )
106
  ax.vlines(
107
- input_maoc, 0, interpolated_y_cfdi, color="tomato", ls="--", lw=1.5
 
 
 
 
 
108
  )
 
109
  epsilon = 0.005
110
- # ax.text(
111
- # input_maoc + epsilon,
112
- # interpolated_y_cfdi + epsilon,
113
- # "Your paper",
114
- # {"color": "#DC143C", "fontsize": 13},
115
- # ha="left", # Horizontal alignment
116
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  ax.set_xlabel("Mean Age of Citation (mAoC)", fontsize=15)
119
  ax.set_ylabel("Density", fontsize=15)
 
33
  mean_citation_ages.append(temp)
34
 
35
 
36
+ def generate_cfdi_plot(input_cfdi, compute_type="paper"):
37
  """
38
  Function to generate a plot for CFDI
39
  """
 
56
  interpolated_y_cfdi,
57
  c="r",
58
  marker="*",
59
+ linewidths=2,
60
  zorder=2,
61
+ s=32,
62
  )
63
  ax.vlines(
64
+ input_cfdi,
65
+ 0,
66
+ interpolated_y_cfdi,
67
+ color="tomato",
68
+ ls="--",
69
+ lw=1.5,
70
  )
71
+
72
  epsilon = 0.005
73
+ # Compute the average and plot it as a light grey vertical line
74
+ mean_val = np.mean(data)
75
+ # Interpolate the y value for the mean
76
+ interpolated_y_mean = np.interp(mean_val, x_vals, y_vals)
77
+
78
+ ax.vlines(mean_val, 0, interpolated_y_mean, color="grey", ls="--", lw=1.5)
79
+ ax.text(
80
+ mean_val + epsilon,
81
+ interpolated_y_mean + epsilon,
82
+ "Avg.",
83
+ {"color": "grey", "fontsize": 13},
84
+ ha="left", # Horizontal alignment
85
+ )
86
+ ax.text(
87
+ input_cfdi + epsilon,
88
+ interpolated_y_cfdi + epsilon,
89
+ f"This {compute_type}",
90
+ {"color": "#DC143C", "fontsize": 13},
91
+ ha="left", # Horizontal alignment
92
+ )
93
 
94
  ax.set_xlabel("Citation Field Diversity Index (CFDI)", fontsize=15)
95
  ax.set_ylabel("Density", fontsize=15)
 
98
  return fig
99
 
100
 
101
+ def generate_maoc_plot(input_maoc, compute_type="paper"):
102
  """
103
+ Function to generate a plot for MAOC
104
  """
105
  # Using kdeplot to fill the distribution curve
106
  sns.set(font_scale=1.3, style="whitegrid")
 
120
  interpolated_y_cfdi,
121
  c="r",
122
  marker="*",
123
+ linewidths=2,
124
  zorder=2,
125
+ s=32,
126
  )
127
  ax.vlines(
128
+ input_maoc,
129
+ 0,
130
+ interpolated_y_cfdi,
131
+ color="tomato",
132
+ ls="--",
133
+ lw=1.5,
134
  )
135
+
136
  epsilon = 0.005
137
+ # Compute the average and plot it as a light grey vertical line
138
+ mean_val = np.mean(data)
139
+ # Interpolate the y value for the mean
140
+ interpolated_y_mean = np.interp(mean_val, x_vals, y_vals)
141
+
142
+ ax.vlines(mean_val, 0, interpolated_y_mean, color="grey", ls="--", lw=1.5)
143
+ ax.text(
144
+ mean_val + epsilon,
145
+ interpolated_y_mean + epsilon,
146
+ "Avg.",
147
+ {"color": "grey", "fontsize": 13},
148
+ ha="left", # Horizontal alignment
149
+ )
150
+ ax.text(
151
+ input_maoc + epsilon,
152
+ interpolated_y_cfdi + epsilon,
153
+ f"This {compute_type}",
154
+ {"color": "#DC143C", "fontsize": 13},
155
+ ha="left", # Horizontal alignment
156
+ )
157
 
158
  ax.set_xlabel("Mean Age of Citation (mAoC)", fontsize=15)
159
  ax.set_ylabel("Density", fontsize=15)
s2.py CHANGED
@@ -1,11 +1,15 @@
1
  # Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
2
  # All rights reserved.
3
 
 
4
  import asyncio
 
5
  import os
6
  from collections import Counter
7
  from concurrent.futures import ThreadPoolExecutor, as_completed
 
8
 
 
9
  import requests
10
 
11
  from aclanthology import (
@@ -15,9 +19,16 @@ from aclanthology import (
15
  extract_venue_info,
16
  )
17
  from metrics import calculate_gini, calculate_gini_simpson
 
18
 
19
 
20
  def get_or_create_eventloop():
 
 
 
 
 
 
21
  try:
22
  return asyncio.get_event_loop()
23
  except RuntimeError as ex:
@@ -56,12 +67,10 @@ def check_s2_id_type(semantic_scholar_id):
56
  the name of the author (if the ID is valid for an author), or "invalid"
57
  if the ID is not valid for either a paper or an author.
58
  """
59
- # Define the base URL for Semantic Scholar API
60
- base_url = "https://api.semanticscholar.org/v1/"
61
-
62
  # First, check if it's a paper ID
63
  paper_response = requests.get(
64
- f"{base_url}paper/{semantic_scholar_id}", timeout=5
 
65
  )
66
 
67
  # If the response status code is 200, it means the ID is valid for a paper
@@ -70,7 +79,8 @@ def check_s2_id_type(semantic_scholar_id):
70
 
71
  # Next, check if it's an author ID
72
  author_response = requests.get(
73
- f"{base_url}author/{semantic_scholar_id}", timeout=5
 
74
  )
75
 
76
  # If the response status code is 200, it means the ID is valid for an author
@@ -101,6 +111,115 @@ def get_papers_from_author(ssid_author_id):
101
  return []
102
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  def compute_stats_for_s2_paper(ssid_paper_id):
105
  """
106
  Computes statistics for a given paper ID using the Semantic Scholar API.
@@ -143,87 +262,14 @@ def compute_stats_for_s2_paper(ssid_paper_id):
143
  title + "\n" + ", ".join([author["name"] for author in authors])
144
  )
145
 
146
- # Go over the references of the paper
147
- reference_year_list = []
148
- reference_title_list = []
149
- reference_fos_list = []
150
- with ThreadPoolExecutor() as executor:
151
- request_url_refs = [
152
- f"https://api.semanticscholar.org/graph/v1/paper/{ref_paper_key}?fields=title,year,s2FieldsOfStudy"
153
- for ref_paper_key in filtered_s2_ref_paper_keys
154
- ]
155
- futures = [
156
- executor.submit(send_s2_request, request_url_ref)
157
- for request_url_ref in request_url_refs
158
- ]
159
- for future in as_completed(futures):
160
- r_ref = future.result()
161
- if r_ref.status_code == 200:
162
- result_ref = r_ref.json()
163
- (title_ref, year_ref, fields_ref) = (
164
- result_ref["title"],
165
- result_ref["year"],
166
- result_ref["s2FieldsOfStudy"],
167
- )
168
- reference_year_list.append(year_ref)
169
- reference_title_list.append(title_ref)
170
- reference_fos_list.extend(
171
- field["category"]
172
- for field in fields_ref
173
- if field["source"] == "s2-fos-model"
174
- )
175
- else:
176
- print(
177
- f"Error retrieving reference {r_ref.status_code} for"
178
- f" paper {ssid_paper_id}"
179
- )
180
-
181
- # Remove all None from reference_year_list and reference_title_list
182
- reference_year_list = [
183
- year_ref
184
- for year_ref in reference_year_list
185
- if year_ref is not None
186
- ]
187
- reference_title_list = [
188
- title_ref
189
- for title_ref in reference_title_list
190
- if title_ref is not None
191
- ]
192
-
193
- # Count references
194
- num_references = len(reference_year_list)
195
-
196
- # Flatten list and count occurrences
197
- fields_of_study_counts = dict(
198
- Counter(
199
- [
200
- field
201
- for field in reference_fos_list
202
- if "Computer Science" not in field
203
- ]
204
- )
205
- )
206
-
207
- # Citation age list
208
- aoc_list = [
209
- year - year_ref
210
- for year_ref in reference_year_list
211
- if year_ref and year
212
- ]
213
- if not aoc_list:
214
- return None, None, None, None, None, None, None, None
215
-
216
- # Compute citation age
217
- output_maoc = sum(aoc_list) / len(aoc_list)
218
- cadi = calculate_gini(aoc_list)
219
-
220
- # Create a dictionary of year to title
221
- year_to_title_dict = dict(
222
- zip(reference_year_list, reference_title_list)
223
- )
224
-
225
- # Compute CFDI
226
- cfdi = calculate_gini_simpson(fields_of_study_counts)
227
 
228
  # Return the results
229
  return (
@@ -273,9 +319,6 @@ def compute_stats_for_acl_paper(url):
273
  return None
274
 
275
 
276
- import asyncio
277
-
278
-
279
  def compute_stats_for_acl_author(url):
280
  """
281
  Computes statistics for an author's papers in the ACL anthology.
@@ -303,6 +346,15 @@ def compute_stats_for_acl_author(url):
303
 
304
 
305
  def compute_stats_for_acl_venue(url):
 
 
 
 
 
 
 
 
 
306
  if paper_info := extract_venue_info(url):
307
  loop = get_or_create_eventloop()
308
  tasks = [
@@ -317,7 +369,26 @@ def compute_stats_for_acl_venue(url):
317
  return None
318
 
319
 
320
- def compute_stats_for_multiple_s2_papers(papers, title):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  num_references = 0
322
  top_fields = {}
323
  oldest_paper_dict = {}
@@ -337,8 +408,8 @@ def compute_stats_for_multiple_s2_papers(papers, title):
337
  num_references += results[1]
338
  for field, count in results[2].items():
339
  top_fields[field] = top_fields.get(field, 0) + count
340
- for year, title in results[3].items():
341
- oldest_paper_dict[year] = title
342
  cfdi += results[4]
343
  cadi += results[5]
344
  output_maoc += results[6]
@@ -352,3 +423,77 @@ def compute_stats_for_multiple_s2_papers(papers, title):
352
  cadi / len(papers),
353
  output_maoc / len(papers),
354
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
2
  # All rights reserved.
3
 
4
+
5
  import asyncio
6
+ import datetime
7
  import os
8
  from collections import Counter
9
  from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from typing import List, Tuple
11
 
12
+ import aiohttp
13
  import requests
14
 
15
  from aclanthology import (
 
19
  extract_venue_info,
20
  )
21
  from metrics import calculate_gini, calculate_gini_simpson
22
+ from pdf import parse_pdf_to_artcile_dict
23
 
24
 
25
  def get_or_create_eventloop():
26
+ """
27
+ Get the current event loop or create a new one if there is no current event loop in the thread.
28
+
29
+ Returns:
30
+ The current event loop.
31
+ """
32
  try:
33
  return asyncio.get_event_loop()
34
  except RuntimeError as ex:
 
67
  the name of the author (if the ID is valid for an author), or "invalid"
68
  if the ID is not valid for either a paper or an author.
69
  """
 
 
 
70
  # First, check if it's a paper ID
71
  paper_response = requests.get(
72
+ f"https://api.semanticscholar.org/v1/paper/{semantic_scholar_id}",
73
+ timeout=5,
74
  )
75
 
76
  # If the response status code is 200, it means the ID is valid for a paper
 
79
 
80
  # Next, check if it's an author ID
81
  author_response = requests.get(
82
+ f"https://api.semanticscholar.org/v1/author/{semantic_scholar_id}",
83
+ timeout=5,
84
  )
85
 
86
  # If the response status code is 200, it means the ID is valid for an author
 
111
  return []
112
 
113
 
114
def compute_stats_for_references(s2_ref_paper_keys, year):
    """
    Computes citation statistics for a list of reference paper keys.

    Every key is looked up through the Semantic Scholar graph API (fields
    ``title``, ``year`` and ``s2FieldsOfStudy``) via ``send_s2_request``; the
    lookups are submitted to a thread pool and consumed as they complete.

    Args:
        s2_ref_paper_keys (list): Semantic Scholar paper keys of the references.
        year (int): The year of the citing paper. Citation ages are computed
            as ``year - reference_year``.

    Returns:
        tuple: A 6-tuple containing:
            - num_references (int): The number of references with a known year.
            - fields_of_study_counts (dict): Count per field of study, taken
              from the "s2-fos-model" categories and skipping every category
              that contains "Computer Science".
            - year_to_title_dict (dict): Maps a reference year to the title of
              a reference from that year (one title is kept per year).
            - cfdi (float): Citation Field Diversity Index, the result of
              ``calculate_gini_simpson`` on the field counts.
            - cadi (float): Citation Age Diversity Index, the result of
              ``calculate_gini`` on the citation ages.
            - output_maoc (float): Mean age of citation.

        If no citation age can be computed (no keys, no reference with a
        year, or ``year`` is falsy), a tuple of six ``None`` values is returned.
    """
    no_stats = (None, None, None, None, None, None)

    # Nothing to look up: skip spinning up the thread pool entirely
    if not s2_ref_paper_keys:
        return no_stats

    # Year and title of one reference are stored as a pair so that filtering
    # out missing values can never shift titles onto the wrong year.
    year_title_pairs = []
    reference_fos_list = []
    with ThreadPoolExecutor() as executor:
        # Remember which key each future belongs to, for error reporting
        future_to_key = {
            executor.submit(
                send_s2_request,
                "https://api.semanticscholar.org/graph/v1/paper/"
                f"{ref_paper_key}?fields=title,year,s2FieldsOfStudy",
            ): ref_paper_key
            for ref_paper_key in s2_ref_paper_keys
        }
        for future in as_completed(future_to_key):
            r_ref = future.result()
            if r_ref.status_code != 200:
                print(
                    f"Error retrieving reference {r_ref.status_code} for"
                    f" paper {future_to_key[future]}"
                )
                continue

            result_ref = r_ref.json()
            year_title_pairs.append(
                (result_ref.get("year"), result_ref.get("title"))
            )
            # The fields entry may be missing or null for some papers
            reference_fos_list.extend(
                field["category"]
                for field in result_ref.get("s2FieldsOfStudy") or []
                if field["source"] == "s2-fos-model"
            )

    # Only references with a known year are counted
    reference_year_list = [
        year_ref for year_ref, _ in year_title_pairs if year_ref is not None
    ]
    num_references = len(reference_year_list)

    # Count occurrences of every non-"Computer Science" field
    fields_of_study_counts = dict(
        Counter(
            field
            for field in reference_fos_list
            if "Computer Science" not in field
        )
    )

    # Citation age list
    aoc_list = [
        year - year_ref
        for year_ref in reference_year_list
        if year_ref and year
    ]
    if not aoc_list:
        return no_stats

    # Compute citation age
    output_maoc = sum(aoc_list) / len(aoc_list)
    cadi = calculate_gini(aoc_list)

    # Year -> title, built from aligned pairs; later pairs overwrite earlier
    # ones that share the same year.
    year_to_title_dict = {
        year_ref: title_ref
        for year_ref, title_ref in year_title_pairs
        if year_ref is not None and title_ref is not None
    }

    # Compute CFDI
    cfdi = calculate_gini_simpson(fields_of_study_counts)

    return (
        num_references,
        fields_of_study_counts,
        year_to_title_dict,
        cfdi,
        cadi,
        output_maoc,
    )
221
+
222
+
223
  def compute_stats_for_s2_paper(ssid_paper_id):
224
  """
225
  Computes statistics for a given paper ID using the Semantic Scholar API.
 
262
  title + "\n" + ", ".join([author["name"] for author in authors])
263
  )
264
 
265
+ (
266
+ num_references,
267
+ fields_of_study_counts,
268
+ year_to_title_dict,
269
+ cfdi,
270
+ cadi,
271
+ output_maoc,
272
+ ) = compute_stats_for_references(filtered_s2_ref_paper_keys, year)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
  # Return the results
275
  return (
 
319
  return None
320
 
321
 
 
 
 
322
  def compute_stats_for_acl_author(url):
323
  """
324
  Computes statistics for an author's papers in the ACL anthology.
 
346
 
347
 
348
  def compute_stats_for_acl_venue(url):
349
+ """
350
+ Computes statistics for papers in a given ACL venue.
351
+
352
+ Args:
353
+ url (str): The URL of the ACL venue.
354
+
355
+ Returns:
356
+ dict: A dictionary containing statistics for the papers in the venue.
357
+ """
358
  if paper_info := extract_venue_info(url):
359
  loop = get_or_create_eventloop()
360
  tasks = [
 
369
  return None
370
 
371
 
372
+ def compute_stats_for_multiple_s2_papers(
373
+ papers: List[dict], title: str
374
+ ) -> Tuple[str, int, dict, dict, float, float, float]:
375
+ """
376
+ Computes statistics for multiple S2 papers.
377
+
378
+ Args:
379
+ papers (List[dict]): A list of S2 papers.
380
+ title (str): The title of the papers.
381
+
382
+ Returns:
383
+ A tuple containing the following statistics:
384
+ - title (str): The title of the papers.
385
+ - num_references (int): The total number of references in all papers.
386
+ - top_fields (dict): A dictionary containing the top fields and their counts.
387
+ - oldest_paper_dict (dict): A dictionary containing the oldest paper for each year.
388
+ - cfdi (float): The average CFDI score for all papers.
389
+ - cadi (float): The average CADI score for all papers.
390
+ - output_maoc (float): The average output MAOC score for all papers.
391
+ """
392
  num_references = 0
393
  top_fields = {}
394
  oldest_paper_dict = {}
 
408
  num_references += results[1]
409
  for field, count in results[2].items():
410
  top_fields[field] = top_fields.get(field, 0) + count
411
+ for year, ref_title in results[3].items():
412
+ oldest_paper_dict[year] = ref_title
413
  cfdi += results[4]
414
  cadi += results[5]
415
  output_maoc += results[6]
 
423
  cadi / len(papers),
424
  output_maoc / len(papers),
425
  )
426
+
427
+
428
async def send_s2_async_request(url):
    """
    Issue an asynchronous HTTP GET and decode the body as JSON.

    A fresh ``aiohttp.ClientSession`` is opened for the single request and
    closed again before the function returns.

    Args:
        url (str): The URL to request.

    Returns:
        The JSON-decoded response body.
    """
    async with aiohttp.ClientSession() as http_session, http_session.get(
        url
    ) as reply:
        payload = await reply.json()
    return payload
441
+
442
+
443
async def match_title_to_s2_paper(title, authors=None):
    """
    Matches a given paper title to Semantic Scholar to retrieve its S2 paper ID.

    The title is sent to the Semantic Scholar paper search endpoint and the
    first (most relevant) hit is taken as the match.

    Args:
        title (str): The title of the paper.
        authors (List[str], optional): List of authors of the paper. Accepted
            for interface compatibility; it is currently not used for
            matching. Defaults to None.

    Returns:
        str or None: The S2 paper ID of the first search hit, or None if the
        search returned no results.
    """
    # Function-scope import keeps this block self-contained
    from urllib.parse import urlencode

    # URL-encode the title: raw titles routinely contain spaces, '&', '#',
    # '?' or '+', which would otherwise truncate or corrupt the query string.
    # Only the top-ranked hit is used, so ask for a single result.
    query_string = urlencode({"query": title, "limit": 1})
    search_url = (
        "https://api.semanticscholar.org/graph/v1/paper/search"
        f"?{query_string}"
    )

    # Send request
    response = await send_s2_async_request(search_url)

    # "data" may be absent (e.g. error payloads) or null
    results = response.get("data") or []
    if not results:
        return None

    # Results are ranked by relevance; take the best one
    return results[0].get("paperId")
466
+
467
+
468
async def compute_stats_for_pdf(pdf_file):
    """
    Computes statistics for a given PDF file.

    The PDF is parsed into an article dictionary, every reference that has a
    title is matched to a Semantic Scholar paper ID (all lookups run
    concurrently), and the matched IDs are passed to
    ``compute_stats_for_references``.

    Args:
        pdf_file (file): The PDF file to compute statistics for; its ``name``
            attribute is used as the path handed to the PDF parser.

    Returns:
        tuple: The article title followed by the statistics returned by
        ``compute_stats_for_references``.
    """
    article_dict = parse_pdf_to_artcile_dict(pdf_file.name)
    references = article_dict["references"]

    # Get S2 paper IDs asynchronously; references without a title are skipped
    tasks = [
        match_title_to_s2_paper(reference["title"], reference.get("authors"))
        for reference in references
        if reference.get("title")
    ]
    # return_exceptions=True: one failed lookup must not discard the
    # results of all the other references.
    matches = await asyncio.gather(*tasks, return_exceptions=True)

    # Keep only successful, non-empty matches
    s2_paper_ids = []
    for match in matches:
        if isinstance(match, BaseException):
            print(f"Error matching reference to a S2 paper: {match!r}")
        elif match is not None:
            s2_paper_ids.append(match)

    # The current year is used as the citing year for the age computation
    year = datetime.date.today().year

    results = compute_stats_for_references(s2_paper_ids, year)
    return (article_dict["title"],) + results