import numpy as np
from urllib.parse import urlparse, parse_qs
from pyalex import Works, Authors, Institutions
import pandas as pd
import ast, json
def openalex_url_to_pyalex_query(url):
"""
Convert an OpenAlex search URL to a pyalex query.
Args:
url (str): The OpenAlex search URL.
Returns:
tuple: (Works object, dict of parameters)
"""
parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)
# Initialize the Works object
query = Works()
# Handle filters
if 'filter' in query_params:
filters = query_params['filter'][0].split(',')
for f in filters:
if ':' in f:
key, value = f.split(':', 1)
if key == 'default.search':
query = query.search(value)
else:
query = query.filter(**{key: value})
    # Handle sort parameters: field:direction, -field (descending), or bare field (ascending)
if 'sort' in query_params:
sort_params = query_params['sort'][0].split(',')
for s in sort_params:
if ':' in s: # Handle field:direction format
                field, direction = s.split(':', 1)
query = query.sort(**{field: direction})
elif s.startswith('-'): # Handle -field format
query = query.sort(**{s[1:]: 'desc'})
else: # Handle field format
query = query.sort(**{s: 'asc'})
# Handle other parameters
params = {}
for key in ['page', 'per-page', 'sample', 'seed']:
if key in query_params:
params[key] = query_params[key][0]
return query, params
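# A minimal usage sketch (illustrative: the URL below is hypothetical, and
# executing the query requires network access to the OpenAlex API):
#
#   query, params = openalex_url_to_pyalex_query(
#       "https://openalex.org/works?filter=default.search:kuramoto%20model,publication_year:2020"
#   )
#   results = query.get(per_page=25)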
def invert_abstract(inv_index):
"""Reconstruct abstract from OpenAlex' inverted-index.
Handles dicts, JSON / repr strings, or missing values gracefully.
"""
# Try to coerce a string into a Python object first
if isinstance(inv_index, str):
try:
inv_index = json.loads(inv_index) # double-quoted JSON
except Exception:
try:
inv_index = ast.literal_eval(inv_index) # single-quoted repr
except Exception:
inv_index = None
if isinstance(inv_index, dict):
l_inv = [(w, p) for w, pos in inv_index.items() for p in pos]
return " ".join(w for w, _ in sorted(l_inv, key=lambda x: x[1]))
else:
return " "
def get_pub(x):
    """Extract the publication (source) name from a record's primary_location."""
    try:
        source = x['source']['display_name']
        if source not in ['parsed_publication', 'Deleted Journal']:
            return source
        else:
            return ' '
    except (KeyError, TypeError):
        return ' '
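# Illustrative example, mirroring the shape of OpenAlex's primary_location field:
#
#   get_pub({'source': {'display_name': 'Nature'}})  # -> 'Nature'
#   get_pub(None)                                    # -> ' '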
def get_field(x):
    """Extract the academic subfield from a record's primary_topic."""
    try:
        field = x['primary_topic']['subfield']['display_name']
        if field is not None:
            return field
        else:
            return np.nan
    except (KeyError, TypeError):
        return np.nan
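# Illustrative example, mirroring the shape of OpenAlex's primary_topic field:
#
#   get_field({'primary_topic': {'subfield': {'display_name': 'Neuroscience'}}})  # -> 'Neuroscience'
#   get_field({})                                                                 # -> nan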
def process_records_to_df(records):
"""
Convert OpenAlex records to a pandas DataFrame with processed fields.
Can handle either raw OpenAlex records or an existing DataFrame.
Args:
records (list or pd.DataFrame): List of OpenAlex record dictionaries or existing DataFrame
Returns:
pandas.DataFrame: Processed DataFrame with abstracts, publications, and titles
"""
# If records is already a DataFrame, use it directly
if isinstance(records, pd.DataFrame):
records_df = records.copy()
# Only process abstract_inverted_index and primary_location if they exist
if 'abstract_inverted_index' in records_df.columns:
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
if 'primary_location' in records_df.columns:
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
            records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')  # fill missing values with a space when the column exists
else:
# Process raw records as before
records_df = pd.DataFrame(records)
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
# Fill missing values and deduplicate
records_df['abstract'] = records_df['abstract'].fillna(' ')
records_df['title'] = records_df['title'].fillna(' ')
records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)
return records_df
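# A minimal usage sketch (assumes network access; the search term is illustrative):
#
#   records = Works().search("kuramoto model").get(per_page=25)
#   df = process_records_to_df(records)
#   df[['title', 'parsed_publication', 'abstract']].head()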
def openalex_url_to_filename(url):
"""
Convert an OpenAlex URL to a filename-safe string with timestamp.
Args:
url (str): The OpenAlex search URL
Returns:
str: A filename-safe string with timestamp (without extension)
"""
from datetime import datetime
import re
# First parse the URL into query and params
parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)
# Create parts of the filename
parts = []
# Handle filters
if 'filter' in query_params:
filters = query_params['filter'][0].split(',')
for f in filters:
if ':' in f:
key, value = f.split(':', 1)
# Replace dots with underscores and clean the value
key = key.replace('.', '_')
# Clean the value to be filename-safe and add spaces around words
clean_value = re.sub(r'[^\w\s-]', '', value)
# Replace multiple spaces with single space and strip
clean_value = ' '.join(clean_value.split())
# Replace spaces with underscores for filename
clean_value = clean_value.replace(' ', '_')
if key == 'default_search':
parts.append(f"search_{clean_value}")
else:
parts.append(f"{key}_{clean_value}")
# Handle sort parameters
if 'sort' in query_params:
sort_params = query_params['sort'][0].split(',')
for s in sort_params:
if s.startswith('-'):
parts.append(f"sort_{s[1:].replace('.', '_')}_desc")
else:
parts.append(f"sort_{s.replace('.', '_')}_asc")
# Add timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# Combine all parts
filename = '__'.join(parts) if parts else 'openalex_query'
filename = f"{filename}__{timestamp}"
# Ensure filename is not too long (max 255 chars is common filesystem limit)
if len(filename) > 255:
filename = filename[:251] # leave room for potential extension
return filename
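# Illustrative example (the timestamp suffix depends on when the call is made):
#
#   openalex_url_to_filename("https://openalex.org/works?filter=default.search:kuramoto%20model")
#   # -> e.g. "search_kuramoto_model__20250101_120000"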
def get_records_from_dois(doi_list, block_size=50):
"""
Download OpenAlex records for a list of DOIs in blocks.
Args:
doi_list (list): List of DOIs (strings)
block_size (int): Number of DOIs to fetch per request (default 50)
Returns:
pd.DataFrame: DataFrame of OpenAlex records
"""
    from tqdm import tqdm
all_records = []
for i in tqdm(range(0, len(doi_list), block_size)):
sublist = doi_list[i:i+block_size]
doi_str = "|".join(sublist)
try:
record_list = Works().filter(doi=doi_str).get(per_page=block_size)
all_records.extend(record_list)
except Exception as e:
print(f"Error fetching DOIs {sublist}: {e}")
return pd.DataFrame(all_records)
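# A minimal usage sketch (the DOIs are illustrative; requires network access):
#
#   df = get_records_from_dois(["10.1038/s41586-020-2649-2", "10.1103/PhysRev.47.777"])
#   print(len(df))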
def openalex_url_to_readable_name(url):
"""
Convert an OpenAlex URL to a short, human-readable query description.
Args:
url (str): The OpenAlex search URL
Returns:
str: A short, human-readable description of the query
Examples:
- "Search: 'Kuramoto Model'"
- "Search: 'quantum physics', 2020-2023"
- "Cites: Popper (1959)"
- "From: University of Pittsburgh, 1999-2020"
- "By: Einstein, A., 1905-1955"
"""
# Parse the URL
parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)
# Initialize description parts
parts = []
year_range = None
# Handle filters
if 'filter' in query_params:
filters = query_params['filter'][0].split(',')
for f in filters:
if ':' not in f:
continue
key, value = f.split(':', 1)
try:
if key == 'default.search':
# Clean up search term (remove quotes if present)
search_term = value.strip('"\'')
parts.append(f"Search: '{search_term}'")
elif key == 'title_and_abstract.search':
# Handle title and abstract search specifically
from urllib.parse import unquote_plus
search_term = unquote_plus(value).strip('"\'')
parts.append(f"T&A: '{search_term}'")
elif key == 'publication_year':
# Handle year ranges or single years
if '-' in value:
start_year, end_year = value.split('-')
year_range = f"{start_year}-{end_year}"
else:
year_range = value
elif key == 'cites':
# Look up the cited work to get author and year
work_id = value
try:
cited_work = Works()[work_id]
if cited_work:
# Get first author's last name
author_name = "Unknown"
year = "Unknown"
if cited_work.get('authorships') and len(cited_work['authorships']) > 0:
first_author = cited_work['authorships'][0]['author']
if first_author.get('display_name'):
# Extract last name (assuming "First Last" format)
name_parts = first_author['display_name'].split()
author_name = name_parts[-1] if name_parts else first_author['display_name']
if cited_work.get('publication_year'):
year = str(cited_work['publication_year'])
parts.append(f"Cites: {author_name} ({year})")
else:
parts.append(f"Cites: Work {work_id}")
except Exception as e:
print(f"Could not fetch cited work {work_id}: {e}")
parts.append(f"Cites: Work {work_id}")
elif key == 'authorships.institutions.lineage':
# Look up institution name
inst_id = value
try:
institution = Institutions()[inst_id]
if institution and institution.get('display_name'):
parts.append(f"From: {institution['display_name']}")
else:
parts.append(f"From: Institution {inst_id}")
except Exception as e:
print(f"Could not fetch institution {inst_id}: {e}")
parts.append(f"From: Institution {inst_id}")
elif key == 'authorships.author.id':
# Look up author name
author_id = value
try:
author = Authors()[author_id]
if author and author.get('display_name'):
parts.append(f"By: {author['display_name']}")
else:
parts.append(f"By: Author {author_id}")
except Exception as e:
print(f"Could not fetch author {author_id}: {e}")
parts.append(f"By: Author {author_id}")
elif key == 'type':
# Handle work types
type_mapping = {
'article': 'Articles',
'book': 'Books',
'book-chapter': 'Book Chapters',
'dissertation': 'Dissertations',
'preprint': 'Preprints'
}
work_type = type_mapping.get(value, value.replace('-', ' ').title())
parts.append(f"Type: {work_type}")
                elif key == 'host_venue.id':
                    # Venue lookup is not implemented here; fall back to the raw ID
                    parts.append(f"In: Venue {value}")
elif key.startswith('concepts.id'):
# Handle concept filters - these are topic/concept IDs
concept_id = value
parts.append(f"Topic: {concept_id}") # Could be enhanced with concept lookup
else:
# Generic handling for other filters
from urllib.parse import unquote_plus
clean_key = key.replace('_', ' ').replace('.', ' ').title()
# Properly decode URL-encoded values
try:
clean_value = unquote_plus(value).replace('_', ' ')
                    except Exception:
                        clean_value = value.replace('_', ' ')
parts.append(f"{clean_key}: {clean_value}")
except Exception as e:
print(f"Error processing filter {f}: {e}")
continue
# Combine parts into final description
if not parts:
description = "OpenAlex Query"
else:
description = ", ".join(parts)
# Add year range if present
if year_range:
if parts:
description += f", {year_range}"
else:
description = f"Works from {year_range}"
# Limit length to keep it readable
if len(description) > 60:
description = description[:57] + "..."
    return description
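# Illustrative example (filters that reference OpenAlex IDs, such as `cites`,
# trigger live API lookups; the plain-search case below does not):
#
#   openalex_url_to_readable_name(
#       "https://openalex.org/works?filter=default.search:quantum%20physics,publication_year:2020-2023"
#   )
#   # -> "Search: 'quantum physics', 2020-2023"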