import numpy as np
from urllib.parse import urlparse, parse_qs
from pyalex import Works
import pandas as pd


def openalex_url_to_pyalex_query(url):
    """
    Convert an OpenAlex search URL to a pyalex query.

    Args:
        url (str): The OpenAlex search URL.

    Returns:
        tuple: (Works object, dict of parameters)
    """
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)

    # Initialize the Works object
    query = Works()

    # Handle filters
    if 'filter' in query_params:
        filters = query_params['filter'][0].split(',')
        for f in filters:
            if ':' in f:
                key, value = f.split(':', 1)
                if key == 'default.search':
                    query = query.search(value)
                else:
                    query = query.filter(**{key: value})

    # Handle sort in field:direction, -field, and bare-field formats
    if 'sort' in query_params:
        sort_params = query_params['sort'][0].split(',')
        for s in sort_params:
            if ':' in s:  # field:direction format
                field, direction = s.split(':')
                query = query.sort(**{field: direction})
            elif s.startswith('-'):  # -field format means descending
                query = query.sort(**{s[1:]: 'desc'})
            else:  # bare field defaults to ascending
                query = query.sort(**{s: 'asc'})

    # Handle paging and sampling parameters
    params = {}
    for key in ['page', 'per-page', 'sample', 'seed']:
        if key in query_params:
            params[key] = query_params[key][0]

    return query, params

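# Illustrative usage (a sketch only; the URL below is made up for demonstration
# and the call at the end hits the OpenAlex API via pyalex):
#
#     query, params = openalex_url_to_pyalex_query(
#         "https://openalex.org/works?filter=default.search:retrieval%20augmented%20generation"
#         "&sort=cited_by_count:desc&per-page=25"
#     )
#     records = query.get()
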
def invert_abstract(inv_index):
    """Reconstruct abstract from inverted index."""
    if inv_index is not None:
        l_inv = [(w, p) for w, pos in inv_index.items() for p in pos]
        return " ".join(map(lambda x: x[0], sorted(l_inv, key=lambda x: x[1])))
    else:
        return ' '

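# Illustrative example of the inverted-index format OpenAlex returns
# (word -> list of positions); this tiny index reconstructs "deep learning works":
#
#     invert_abstract({"deep": [0], "learning": [1], "works": [2]})
#     # -> "deep learning works"
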
def get_pub(x):
    """Extract the publication name from a record's primary_location dict."""
    try:
        source = x['source']['display_name']
        if source not in ['parsed_publication', 'Deleted Journal']:
            return source
        else:
            return ' '
    except (KeyError, TypeError):
        return ' '


def get_field(x):
    """Extract the academic subfield name from an OpenAlex record."""
    try:
        field = x['primary_topic']['subfield']['display_name']
        if field is not None:
            return field
        else:
            return np.nan
    except (KeyError, TypeError):
        return np.nan

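# Illustrative inputs (hypothetical, minimal dicts mirroring OpenAlex records):
#
#     get_pub({'source': {'display_name': 'Nature'}})   # -> 'Nature'
#     get_pub({'source': None})                          # -> ' '
#     get_field({'primary_topic': {'subfield': {'display_name': 'Artificial Intelligence'}}})
#     # -> 'Artificial Intelligence'
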
def process_records_to_df(records):
    """
    Convert OpenAlex records to a pandas DataFrame with processed fields.
    Can handle either raw OpenAlex records or an existing DataFrame.

    Args:
        records (list or pd.DataFrame): List of OpenAlex record dictionaries or an existing DataFrame.

    Returns:
        pandas.DataFrame: Processed DataFrame with abstracts, publications, and titles.
    """
    # If records is already a DataFrame, use it directly
    if isinstance(records, pd.DataFrame):
        records_df = records.copy()
        # Only process abstract_inverted_index and primary_location if they exist
        if 'abstract_inverted_index' in records_df.columns:
            records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
        if 'primary_location' in records_df.columns:
            records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
            records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
    else:
        # Process raw records
        records_df = pd.DataFrame(records)
        records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
        records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
        records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')

    # Fill missing values and deduplicate on the OpenAlex id
    records_df['abstract'] = records_df['abstract'].fillna(' ')
    records_df['title'] = records_df['title'].fillna(' ')
    records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)

    return records_df

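# Illustrative example with a minimal, hypothetical record (a sketch; real
# OpenAlex records carry many more fields):
#
#     sample = [{
#         "id": "https://openalex.org/W0000000001",
#         "title": "An example paper",
#         "abstract_inverted_index": {"An": [0], "example": [1], "abstract": [2]},
#         "primary_location": {"source": {"display_name": "Example Journal"}},
#     }]
#     df = process_records_to_df(sample)
#     # df["abstract"].iloc[0]           -> "An example abstract"
#     # df["parsed_publication"].iloc[0] -> "Example Journal"
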
def openalex_url_to_filename(url):
    """
    Convert an OpenAlex URL to a filename-safe string with timestamp.

    Args:
        url (str): The OpenAlex search URL.

    Returns:
        str: A filename-safe string with timestamp (without extension).
    """
    from datetime import datetime
    import re

    # First parse the URL into query parameters
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)

    # Create parts of the filename
    parts = []

    # Handle filters
    if 'filter' in query_params:
        filters = query_params['filter'][0].split(',')
        for f in filters:
            if ':' in f:
                key, value = f.split(':', 1)
                # Replace dots with underscores in the key
                key = key.replace('.', '_')
                # Strip characters that are not filename-safe
                clean_value = re.sub(r'[^\w\s-]', '', value)
                # Collapse multiple spaces into one and strip
                clean_value = ' '.join(clean_value.split())
                # Replace spaces with underscores for the filename
                clean_value = clean_value.replace(' ', '_')
                if key == 'default_search':
                    parts.append(f"search_{clean_value}")
                else:
                    parts.append(f"{key}_{clean_value}")

    # Handle sort parameters in field:direction, -field, and bare-field formats
    if 'sort' in query_params:
        sort_params = query_params['sort'][0].split(',')
        for s in sort_params:
            if ':' in s:  # field:direction format
                field, direction = s.split(':', 1)
                parts.append(f"sort_{field.replace('.', '_')}_{direction}")
            elif s.startswith('-'):  # -field format means descending
                parts.append(f"sort_{s[1:].replace('.', '_')}_desc")
            else:  # bare field defaults to ascending
                parts.append(f"sort_{s.replace('.', '_')}_asc")

    # Add timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Combine all parts
    filename = '__'.join(parts) if parts else 'openalex_query'
    filename = f"{filename}__{timestamp}"

    # Ensure the filename is not too long (255 chars is a common filesystem limit)
    if len(filename) > 255:
        filename = filename[:251]  # leave room for a potential extension

    return filename

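# Illustrative usage (a sketch; the URL is made up and the timestamp varies per call):
#
#     openalex_url_to_filename(
#         "https://openalex.org/works?filter=default.search:machine%20learning"
#         "&sort=cited_by_count:desc"
#     )
#     # -> e.g. "search_machine_learning__sort_cited_by_count_desc__20250101_120000"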