box::use( quanteda[corpus, docnames, dfm, convert, dfm_tfidf, dfm_subset], quanteda.textstats[textstat_simil], utils[head], spacyr[spacy_parse], dplyr[mutate, filter, select], ) create_ref_corpus <- function(data_tab) { corp <- corpus(data_tab, text_field = "description") docnames(corp) <- data_tab$title return(corp) } spacy_pipeline <- function(corp) { res <- corp |> spacy_parse() res_tokens <- res |> filter( ! pos %in% c("PUNCT", "PART", "NUM", "SYM") ) |> mutate( lemma = tolower(lemma) ) |> as.tokens( use_lemma = TRUE ) |> tokens_remove(stopwords("en")) corp_dfm <- res_tokens |> dfm() saveRDS(corp_dfm, "./data/ref_corp_dfm.rds") corp_tfidf <- corp_dfm |> dfm_tfidf() saveRDS(corp_tfidf, "./data/ref_corp_tfidf.rds") } #' export get_recommendations <- function(corp_dfm, query_book_titles, simil_method = "ejaccard", how_many) { query_dfm <- dfm_subset(corp_dfm, docname_ %in% query_book_titles) tstat <- textstat_simil( query_dfm, corp_dfm, margin = "documents", method = simil_method ) stat_list <- as.list(tstat) ordered <- sort(unlist(stat_list), decreasing = TRUE) top_n <- head(ordered, n = how_many) names(top_n) <- names(top_n) |> gsub(pattern = "\\..*$", replacement = "") return(names(top_n)) } #' export parse_recommendations <- function(rec_book_names, data_tab) { subset_books <- data_tab |> filter( title %in% rec_book_names ) |> select( title, average_rating, description, url, image_url, genres, author_name ) return(subset_books) } # hp3 <- dfm_subset(corp_dfm, docname_ %in% "A Game of Thrones (A Song of Ice and Fire, #1)") # tstat <- textstat_simil(hp3, corp_dfm, # margin = "documents", method = "ejaccard") # stat_list <- as.list(tstat) # ordered <- sort(unlist(stat_list), decreasing = TRUE) # top_ten <- head(ordered, n = 10) # names(top_ten) <- names(top_ten) |> gsub(pattern = "\\..*$", replacement = "") # top_ten