OpenAlex4NodeXL

Sleeping

App Files Files Community

OpenAlex4NodeXL / OpenAlex4NodeXL.R

Ifeanyi

Upload 2 files

fddf30f verified 5 months ago

raw

history blame

7.82 kB

	# Build a NodeXL flat-file (author -> publication) table from OpenAlex.
	#
	# Queries the OpenAlex "works" entity through openalexR::oa_fetch() for
	# works whose title matches `keywords`, with more than 50 citations, that
	# were published between `pub_start_date` and `pub_end_date`; results are
	# requested sorted by citation count, descending. Every author of every
	# returned work becomes one row: the author is Vertex1, the publication
	# title is Vertex2, and the remaining columns carry attributes of each.
	# Rows with a missing value in any atomic column are dropped.
	#
	# Args:
	#   keywords:       value(s) forwarded to oa_fetch() as `title.search`.
	#   pub_start_date: forwarded as `from_publication_date`
	#                   (the usage example in this file passes "YYYY-MM-DD").
	#   pub_end_date:   forwarded as `to_publication_date`.
	#
	# Returns:
	#   A data frame with the 13 columns listed in `vertex_columns` below.
	#   When the search yields no works (or no authors) a zero-row data frame
	#   with those same column names is returned instead of raising an error.
	#
	# Side effects:
	#   Attaches openalexR and tidyverse, and sets the global option
	#   `openalexR.mailto`.
	OpenAlex4NodeXL <- function(keywords, pub_start_date, pub_end_date) {
	  # load software libraries
	  suppressPackageStartupMessages(library(openalexR))
	  suppressPackageStartupMessages(library(tidyverse))

	  # set options
	  options(openalexR.mailto = "youremail@email.com") # replace with your email address

	  # final column names, in NodeXL flat-file order
	  vertex_columns <- c(
	    "Vertex1",
	    "Vertex2",
	    "Vertex1 Position",
	    "Vertex1 Affiliation",
	    "Vertex1 Institution",
	    "Vertex1 Institution Country",
	    "Vertex2 Abstract",
	    "Vertex2 Type",
	    "Vertex2 Publication Year",
	    "Vertex2 Cited By Count",
	    "Vertex2 Referenced Works",
	    "Vertex2 Host Organization",
	    "Vertex2 SO"
	  )

	  # zero-row result with the expected column names
	  empty_result <- function() {
	    out <- as.data.frame(
	      matrix(character(0), nrow = 0L, ncol = length(vertex_columns))
	    )
	    colnames(out) <- vertex_columns
	    out
	  }

	  # fetch data from openalex.org api
	  search_data <- oa_fetch(
	    entity = "works",
	    title.search = c(keywords),
	    cited_by_count = ">50",
	    from_publication_date = pub_start_date,
	    to_publication_date = pub_end_date,
	    options = list(sort = "cited_by_count:desc"),
	    verbose = FALSE
	  )

	  # NROW() is 0 for NULL as well as for a zero-row table
	  n_works <- NROW(search_data)
	  if (n_works == 0L) {
	    return(empty_result())
	  }

	  # Per-work author tables. An entry that is not a data frame (e.g. NA for
	  # a work without author data) counts as zero authors, so the per-work
	  # lengths always stay aligned with the rows of `search_data`.
	  has_authors <- vapply(search_data$author, is.data.frame, logical(1))
	  author_tables <- unname(search_data$author[has_authors])

	  # get length of each authors collaboration
	  authors_length <- integer(n_works)
	  authors_length[has_authors] <- vapply(author_tables, nrow, integer(1))

	  # row index into `search_data`, repeated once per author of that work
	  work_idx <- rep(seq_len(n_works), times = authors_length)
	  if (length(work_idx) == 0L) {
	    return(empty_result())
	  }

	  # Concatenate one column, selected by position, across all author tables.
	  # NOTE(review): the positions used below (2, 4, 7, 9, 10) depend on the
	  # column layout of the author table produced by the installed openalexR
	  # version -- confirm they still map to name / position / affiliation /
	  # country code / institution type.
	  author_column <- function(position) {
	    do.call(c, lapply(author_tables, function(tbl) tbl[[position]]))
	  }

	  # create authors data frame
	  authorAtt_df <- data.frame(
	    Authors = author_column(2L),
	    Position = author_column(4L),
	    Affiliation = author_column(7L),
	    Institution = author_column(10L)
	  )

	  # back ticks allow a column name containing a space
	  authorAtt_df$`Institution Country` <- author_column(9L)

	  # publication attributes, repeated once per author of the publication
	  authorAtt_df$Publication <- search_data$display_name[work_idx]
	  authorAtt_df$`Abstract` <- search_data$ab[work_idx]
	  authorAtt_df$`Publication Type` <- search_data$type[work_idx]
	  authorAtt_df$`Publication Year` <- search_data$publication_year[work_idx]
	  authorAtt_df$`Cited By Count` <- search_data$cited_by_count[work_idx]
	  authorAtt_df$`Referenced Works` <- search_data$referenced_works[work_idx]
	  authorAtt_df$`Host Organization` <- search_data$host_organization[work_idx]
	  authorAtt_df$SO <- search_data$so[work_idx]

	  # filter out missing values, then arrange columns for the NodeXL
	  # flat file csv format (Publication directly after Authors)
	  authorAtt_df <- authorAtt_df |>
	    na.omit() |>
	    relocate(Abstract, .after = Publication) |>
	    relocate(Publication, .after = Authors)

	  # rename columns
	  colnames(authorAtt_df)[seq_along(vertex_columns)] <- vertex_columns

	  # convert list column into character column (space-separated ids);
	  # vapply keeps the result a character vector even with zero rows
	  authorAtt_df$`Vertex2 Referenced Works` <- vapply(
	    authorAtt_df$`Vertex2 Referenced Works`,
	    function(x) paste(x, collapse = " "),
	    character(1),
	    USE.NAMES = FALSE
	  )

	  authorAtt_df
	}

	# test software program
	# mydata <- OpenAlex4NodeXL(
	# keywords = c("software", "information"),
	# pub_start_date = "2019-01-01",
	# pub_end_date = "2023-09-30"
	# )

	#
	# view returned data
	# mydata |> view()