File size: 2,041 Bytes
f888423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
box::use(
  quanteda[
    corpus, docnames, `docnames<-`, dfm, convert, dfm_tfidf, dfm_subset,
    as.tokens, tokens_remove, stopwords
  ],
  quanteda.textstats[textstat_simil],
  utils[head],
  spacyr[spacy_parse],
  dplyr[mutate, filter, select],
)



create_ref_corpus <- function(data_tab) {
  # Build the reference corpus from a table of books.
  #
  # Args:
  #   data_tab: table with (at least) a `description` column, used as the
  #     document text, and a `title` column, used as the document names.
  #
  # Returns:
  #   The corpus created by `corpus()`, with its docnames set to the titles.

  # Fail early with a readable message instead of letting a missing column
  # surface later as NULL document names or an opaque error.
  required_cols <- c("title", "description")
  missing_cols <- setdiff(required_cols, names(data_tab))
  if (length(missing_cols) > 0) {
    stop(
      "`data_tab` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  corp <- corpus(data_tab, text_field = "description")
  # `[[` matches the column name exactly; `$` may partially match on a plain
  # data.frame.
  docnames(corp) <- data_tab[["title"]]
  corp
}


spacy_pipeline <- function(corp, out_dir = "./data") {
  # Parse a corpus with spaCy, build a document-feature matrix from the
  # lemmas, and save both the raw and the tf-idf weighted matrix as RDS files.
  #
  # Args:
  #   corp: the corpus (or other input accepted by `spacy_parse()`).
  #   out_dir: directory the two RDS files are written to. Defaults to
  #     "./data", i.e. the paths "./data/ref_corp_dfm.rds" and
  #     "./data/ref_corp_tfidf.rds".
  #
  # Returns:
  #   NULL, invisibly; the function is called for its side effect of writing
  #   the two files.

  # Make sure the target directory exists *before* the parse, so a missing
  # directory does not throw away the parsing work at the saveRDS() step.
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  parsed <- spacy_parse(corp)

  res_tokens <- parsed |>
    # Drop tokens tagged as punctuation, particles, numbers and symbols.
    filter(!pos %in% c("PUNCT", "PART", "NUM", "SYM")) |>
    mutate(lemma = tolower(lemma)) |>
    # Build the tokens object from the (lower-cased) lemmas.
    as.tokens(use_lemma = TRUE) |>
    tokens_remove(stopwords("en"))

  corp_dfm <- dfm(res_tokens)
  saveRDS(corp_dfm, file.path(out_dir, "ref_corp_dfm.rds"))

  corp_tfidf <- dfm_tfidf(corp_dfm)
  saveRDS(corp_tfidf, file.path(out_dir, "ref_corp_tfidf.rds"))

  invisible(NULL)
}

#' export
get_recommendations <- function(corp_dfm,
                                query_book_titles,
                                simil_method = "ejaccard",
                                how_many) {
  # Return the names of the documents in `corp_dfm` with the highest
  # similarity to the query books.
  #
  # Args:
  #   corp_dfm: document-feature matrix of the reference corpus.
  #   query_book_titles: character vector of document names to use as query.
  #   simil_method: similarity method passed to `textstat_simil()`.
  #   how_many: number of top-scoring entries to return (passed to `head()`).
  #
  # Returns:
  #   Character vector of document names ordered by decreasing similarity;
  #   `character(0)` when there is nothing to compare.
  #
  # NOTE(review): the marker line above this function reads `#' export`;
  # box's export tag is spelled `#' @export` -- confirm which is intended.

  # No query titles means no recommendations.
  if (length(query_book_titles) == 0L) {
    return(character(0))
  }

  query_dfm <- dfm_subset(corp_dfm, docname_ %in% query_book_titles)
  # None of the requested titles is present in the corpus.
  if (nrow(query_dfm) == 0L) {
    return(character(0))
  }

  tstat <- textstat_simil(
    query_dfm, corp_dfm,
    margin = "documents",
    method = simil_method
  )

  stat_list <- as.list(tstat)
  scores <- unlist(stat_list, use.names = FALSE)
  if (length(scores) == 0L) {
    return(character(0))
  }

  # One entry of `doc_names` per score, taken from the list names directly.
  # Recovering them from the "outer.inner" names that unlist() generates (by
  # cutting at the first ".") truncates every title that contains a dot.
  doc_names <- rep(names(stat_list), times = lengths(stat_list))

  # Same ordering as sort(..., decreasing = TRUE): NAs dropped, ties kept in
  # their original order.
  ranking <- order(scores, decreasing = TRUE, na.last = NA)
  doc_names[head(ranking, n = how_many)]
}

#' export
parse_recommendations <- function(rec_book_names, data_tab) {
  # Look up the recommended books in `data_tab` and keep only the columns
  # needed to present them.
  #
  # Args:
  #   rec_book_names: character vector of recommended book titles.
  #   data_tab: table of books with the columns selected below.
  #
  # Returns:
  #   The rows of `data_tab` whose `title` is in `rec_book_names`, restricted
  #   to the presentation columns. Rows keep the order they have in
  #   `data_tab`.
  #
  # NOTE(review): the marker line above this function reads `#' export`;
  # box's export tag is spelled `#' @export` -- confirm which is intended.
  recommended_rows <- filter(data_tab, title %in% rec_book_names)

  select(
    recommended_rows,
    title,
    average_rating,
    description,
    url,
    image_url,
    genres,
    author_name
  )
}

# hp3 <- dfm_subset(corp_dfm, docname_ %in% "A Game of Thrones (A Song of Ice and Fire, #1)")
# tstat <- textstat_simil(hp3, corp_dfm,
#                         margin = "documents", method = "ejaccard")
# stat_list <- as.list(tstat)
# ordered <- sort(unlist(stat_list), decreasing = TRUE)
# top_ten <- head(ordered, n = 10)
# names(top_ten) <- names(top_ten) |> gsub(pattern = "\\..*$", replacement = "")
# top_ten