# Spaces:
# Sleeping
# Sleeping
#' Build a NodeXL author-publication edge list from an OpenAlex title search
#'
#' Fetches works with `oa_fetch()` (entity `"works"`, title search on
#' `keywords`, filter `cited_by_count = ">50"`, sorted by
#' `cited_by_count:desc`) and expands the result to one row per
#' (author, work) pair. Author attributes are attached to `Vertex1`,
#' publication attributes to `Vertex2`.
#'
#' Side effects: attaches the `openalexR` and `tidyverse` packages and, when
#' the `openalexR.mailto` option is not already set, sets it to a placeholder
#' address.
#'
#' @param keywords Character vector forwarded to `oa_fetch(title.search = )`.
#' @param pub_start_date Forwarded to `from_publication_date`,
#'   e.g. `"2019-01-01"`.
#' @param pub_end_date Forwarded to `to_publication_date`,
#'   e.g. `"2023-09-30"`.
#' @return A data frame with the columns `Vertex1`, `Vertex2`,
#'   `Vertex1 Position`, `Vertex1 Affiliation`, `Vertex1 Institution`,
#'   `Vertex1 Institution Country`, `Vertex2 Abstract`, `Vertex2 Type`,
#'   `Vertex2 Publication Year`, `Vertex2 Cited By Count`,
#'   `Vertex2 Referenced Works` (space-separated string),
#'   `Vertex2 Host Organization` and `Vertex2 SO`. Rows containing any
#'   missing value are removed. A zero-row data frame with the same column
#'   names is returned when the search yields no usable author rows.
OpenAlex4NodeXL <- function(keywords, pub_start_date, pub_end_date) {
  # Final column layout, in NodeXL flat-file order.
  nodexl_columns <- c(
    "Vertex1",
    "Vertex2",
    "Vertex1 Position",
    "Vertex1 Affiliation",
    "Vertex1 Institution",
    "Vertex1 Institution Country",
    "Vertex2 Abstract",
    "Vertex2 Type",
    "Vertex2 Publication Year",
    "Vertex2 Cited By Count",
    "Vertex2 Referenced Works",
    "Vertex2 Host Organization",
    "Vertex2 SO"
  )

  # Zero-row result with the expected column names.
  empty_edge_list <- function() {
    out <- as.data.frame(
      matrix(character(0), nrow = 0, ncol = length(nodexl_columns)),
      stringsAsFactors = FALSE
    )
    names(out) <- nodexl_columns
    out
  }

  # create search engine function
  search_engine <- function(keywords, pub_start_date, pub_end_date) {
    # load software libraries
    suppressPackageStartupMessages(library(openalexR))
    suppressPackageStartupMessages(library(tidyverse))
    # set options; do not clobber an address the user has already configured
    if (is.null(getOption("openalexR.mailto"))) {
      options(openalexR.mailto = "youremail@email.com") # replace with your email address
    }
    # search engine
    oa_fetch(
      entity = "works",
      title.search = c(keywords),
      cited_by_count = ">50",
      from_publication_date = pub_start_date,
      to_publication_date = pub_end_date,
      options = list(sort = "cited_by_count:desc"),
      verbose = FALSE
    )
  }

  # fetch data from openalex.org api
  search_data <- search_engine(keywords, pub_start_date, pub_end_date)
  if (is.null(search_data) || NROW(search_data) == 0) {
    return(empty_edge_list())
  }

  author_tables <- search_data[["author"]]
  if (is.null(author_tables)) {
    stop(
      "The oa_fetch() result has no `author` list column; ",
      "cannot build the author edge list.",
      call. = FALSE
    )
  }

  # Works without an author table cannot contribute rows; dropping them up
  # front keeps the per-work author counts aligned with the works.
  has_authors <- vapply(author_tables, is.data.frame, logical(1))
  search_data <- search_data[has_authors, , drop = FALSE]
  author_tables <- author_tables[has_authors]

  # number of authors per work (length of each collaboration group)
  n_authors <- vapply(author_tables, nrow, integer(1))
  if (sum(n_authors) == 0) {
    return(empty_edge_list())
  }

  # Author fields are taken by column position, not by name. Presumably these
  # are display name, author position, institution name, institution country
  # code and institution type -- TODO confirm against the installed openalexR.
  author_field <- c(
    name = 2L,
    position = 4L,
    affiliation = 7L,
    country = 9L,
    type = 10L
  )

  # Concatenate one positional column across every work's author table.
  author_column <- function(field) {
    index <- author_field[[field]]
    unlist(
      lapply(author_tables, function(tbl) tbl[[index]]),
      use.names = FALSE
    )
  }

  # Repeat a per-work value once for each of that work's authors.
  per_author <- function(values) {
    rep(values, times = n_authors)
  }

  # author attributes and atomic publication attributes
  edges <- data.frame(
    `Vertex1` = author_column("name"),
    `Vertex2` = per_author(search_data[["display_name"]]),
    `Vertex1 Position` = author_column("position"),
    `Vertex1 Affiliation` = author_column("affiliation"),
    `Vertex1 Institution` = author_column("type"),
    `Vertex1 Institution Country` = author_column("country"),
    `Vertex2 Abstract` = per_author(search_data[["ab"]]),
    `Vertex2 Type` = per_author(search_data[["type"]]),
    `Vertex2 Publication Year` = per_author(search_data[["publication_year"]]),
    `Vertex2 Cited By Count` = per_author(search_data[["cited_by_count"]]),
    `Vertex2 Host Organization` = per_author(search_data[["host_organization"]]),
    `Vertex2 SO` = per_author(search_data[["so"]]),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # referenced works stay a list column until after the NA filter, so a
  # missing reference list is dropped as NA rather than becoming the text "NA"
  edges[["Vertex2 Referenced Works"]] <-
    per_author(search_data[["referenced_works"]])

  # rearrange columns for NodeXL flat file csv format
  edges <- edges[nodexl_columns]

  # filter out missing values from the data frame
  edges <- na.omit(edges)

  # convert list column into character column
  edges[["Vertex2 Referenced Works"]] <- vapply(
    edges[["Vertex2 Referenced Works"]],
    paste,
    character(1),
    collapse = " "
  )

  edges
}
# test software program
# mydata <- OpenAlex4NodeXL(
#   keywords = c("software", "information"),
#   pub_start_date = "2019-01-01",
#   pub_end_date = "2023-09-30"
# )
#
# view returned data
# mydata |> view()