OpenAlex4NodeXL

Sleeping

File size: 7,815 Bytes

fddf30f

OpenAlex4NodeXL <- function(keywords, pub_start_date, pub_end_date) {
  
  keywords <- keywords
  pub_start_date <- pub_start_date
  pub_end_date <- pub_end_date

  # create search engine function
  search_engine <- function(keywords, pub_start_date, pub_end_date) {
    # load software libraries
    suppressPackageStartupMessages(library(openalexR))
    suppressPackageStartupMessages(library(tidyverse))

    # set options
    options(openalexR.mailto = "youremail@email.com") # replace with your email address

    # search engine
    works_search <- oa_fetch(
      entity = "works",
      title.search = c(keywords),
      cited_by_count = ">50",
      from_publication_date = pub_start_date,
      to_publication_date = pub_end_date,
      options = list(sort = "cited_by_count:desc"),
      verbose = FALSE
    )

    return(works_search)
  }
  
  # fetch data from openalex.org api 
  search_data <- search_engine(keywords, pub_start_date, pub_end_date)

  # grab authors and group them according to collaboration
  authors_collaboration_groups <- list()
  for (i in 1:nrow(search_data)) {
    authors_collaboration_groups[[i]] <- search_data$author[[i]][2]
  }

  all_authors <- c()
  for (i in 1:length(authors_collaboration_groups)) {
    all_authors <- c(all_authors, authors_collaboration_groups[[i]][[1]])
  }

  # grab author position
  authors_position <- list()
  for (i in 1:nrow(search_data)) {
    authors_position[[i]] <- search_data$author[[i]][4]
  }

  all_authors_positions <- c() # grab all authors positions
  for (i in 1:length(authors_position)) {
    all_authors_positions <- c(all_authors_positions, authors_position[[i]][[1]])
  }

  # grab author affiliation
  authors_affiliation <- list()
  for (i in 1:nrow(search_data)) {
    authors_affiliation[[i]] <- search_data$author[[i]][7]
  }

  all_authors_affiliation <- c() # grab all authors affiliations
  for (i in 1:length(authors_affiliation)) {
    all_authors_affiliation <- c(all_authors_affiliation, authors_affiliation[[i]][[1]])
  }

  # grab authors institution country code
  authors_institution_country_code <- list()
  for (i in 1:nrow(search_data)) {
    authors_institution_country_code[[i]] <- search_data$author[[i]][9]
  }


  all_authors_institution_country_code <- c() # grab all authors institution country code
  for (i in 1:length(authors_institution_country_code)) {
    all_authors_institution_country_code <- c(all_authors_institution_country_code, authors_institution_country_code[[i]][[1]])
  }

  # grab author institution type
  authors_institution_type <- list()
  for (i in 1:nrow(search_data)) {
    authors_institution_type[[i]] <- search_data$author[[i]][10]
  }


  all_authors_institution_type <- c() # grab all authors institution type
  for (i in 1:length(authors_institution_type)) {
    all_authors_institution_type <- c(all_authors_institution_type, authors_institution_type[[i]][[1]])
  }

  # get length of each authors collaboration
  authors_length <- c()
  for (authors in 1:length(authors_collaboration_groups)) {
    authors_length <- c(authors_length, authors_collaboration_groups[[authors]] |> nrow())
  }


  # create authors data frame
  authorAtt_df <- data.frame(
    Authors = all_authors,
    Position = all_authors_positions,
    Affiliation = all_authors_affiliation,
    Institution = all_authors_institution_type
  )

  # I did not want to have to use underscore to separate
  # the two words (Institution_Country). That is why I
  # created that column in the data frame using back ticks
  # instead as shown below
  authorAtt_df$`Institution Country` <- all_authors_institution_country_code

  # publication attributes
  # grab all publications

  publications <- list()
  for (i in 1:nrow(search_data)) {
    publications[[i]] <- rep(search_data$display_name[i], each = authors_length[i])
  }

  all_publications <- c()
  for (i in 1:length(publications)) {
    all_publications <- c(all_publications, publications[[i]])
  }

  # grab all so
  pub_so <- list()
  for (i in 1:nrow(search_data)) {
    pub_so[[i]] <- rep(search_data$so[i], each = authors_length[i])
  }

  all_so <- c()
  for (i in 1:length(pub_so)) {
    all_so <- c(all_so, pub_so[[i]])
  }

  # grab all host organization
  hostOrg <- list()
  for (i in 1:nrow(search_data)) {
    hostOrg[[i]] <- rep(search_data$host_organization[i], each = authors_length[i])
  }

  all_hostOrg <- c()
  for (i in 1:length(hostOrg)) {
    all_hostOrg <- c(all_hostOrg, hostOrg[[i]])
  }

  # grab all cited by count
  citedby_count <- list()
  for (i in 1:nrow(search_data)) {
    citedby_count[[i]] <- rep(search_data$cited_by_count[i], each = authors_length[i])
  }

  all_citedby_count <- c()
  for (i in 1:length(citedby_count)) {
    all_citedby_count <- c(all_citedby_count, citedby_count[[i]])
  }

  # grab all publication year
  pub_year <- list()
  for (i in 1:nrow(search_data)) {
    pub_year[[i]] <- rep(search_data$publication_year[i], each = authors_length[i])
  }

  all_pub_year <- c()
  for (i in 1:length(citedby_count)) {
    all_pub_year <- c(all_pub_year, pub_year[[i]])
  }

  # grab all type
  type <- list()
  for (i in 1:nrow(search_data)) {
    type[[i]] <- rep(search_data$type[i], each = authors_length[i])
  }

  all_type <- c()
  for (i in 1:length(type)) {
    all_type <- c(all_type, type[[i]])
  }

  # grab all abstract
  abstract <- list()
  for (i in 1:nrow(search_data)) {
    abstract[[i]] <- rep(search_data$ab[i], each = authors_length[i])
  }

  all_abstracts <- c()
  for (i in 1:length(abstract)) {
    all_abstracts <- c(all_abstracts, abstract[[i]])
  }
  
  # grab all referenced works
  referenced <- list()
  for (i in 1:nrow(search_data)) {
    referenced[[i]] <- rep(search_data$referenced_works[i], each = authors_length[i])
  }
  
  all_referenced <- c()
  for (i in 1:length(referenced)) {
    all_referenced <- c(all_referenced, referenced[[i]])
  }

  # update the authors data frame
  {
    authorAtt_df$Publication <- all_publications
    authorAtt_df$`Abstract` <- all_abstracts
    authorAtt_df$`Publication Type` <- all_type
    authorAtt_df$`Publication Year` <- all_pub_year
    authorAtt_df$`Cited By Count` <- all_citedby_count
    authorAtt_df$`Referenced Works` <- all_referenced
    authorAtt_df$`Host Organization` <- all_hostOrg
    authorAtt_df$SO <- all_so
  }


  # filter out missing values from the data frame
  authorAtt_df <- authorAtt_df |>
    na.omit()

  # move abstract column to behind Publication
  authorAtt_df <- authorAtt_df |>
    relocate(Abstract, .after = Publication)

  # rearrange columns for NodeXL flat file csv format
  authorAtt_df <- authorAtt_df |>
    relocate(Publication, .after = Authors)


  # rename columns
  colnames(authorAtt_df)[c(1:13)] <- c(
    "Vertex1",
    "Vertex2",
    "Vertex1 Position",
    "Vertex1 Affiliation",
    "Vertex1 Institution",
    "Vertex1 Institution Country",
    "Vertex2 Abstract",
    "Vertex2 Type",
    "Vertex2 Publication Year",
    "Vertex2 Cited By Count",
    "Vertex2 Referenced Works",
    "Vertex2 Host Organization",
    "Vertex2 SO"
  )

  list2vec <- function(x){
    paste(x,collapse = " ")
  }
  # convert list column into character column
  authorAtt_df$`Vertex2 Referenced Works` <- sapply(authorAtt_df$`Vertex2 Referenced Works`,list2vec)
  

  return(authorAtt_df)
  

  
}

# test software program
# mydata <- OpenAlex4NodeXL(
#   keywords = c("software", "information"),
#   pub_start_date = "2019-01-01",
#   pub_end_date = "2023-09-30"
# )

# 
# view returned data
# mydata |> view()