box::use(
  quanteda[
    corpus, docvars, `docvars<-`, docnames, `docnames<-`,
    dfm, as.tokens, dfm_tfidf, dfm_subset
  ],
  quanteda.textstats[textstat_simil],
  spacyr[spacy_parse],
  dplyr[mutate, filter, group_by],
  data.table[as.data.table, fsetdiff, rbindlist, setorder, setorderv],
)

box::use(
  app/logic/utils[parse_recommendations]
)


# Build a quanteda corpus from `data_tab`, reading text from `field` and
# using `book_id` as document names.
create_ref_corpus <- function(data_tab, field) {
  corp <- corpus(data_tab, text_field = field)
  docnames(corp) <- data_tab$book_id
  return(corp)
}
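
# Usage sketch (hypothetical data: a character `description` column and a
# unique `book_id` column are assumed):
#
#   books <- data.frame(
#     book_id = c("b1", "b2"),
#     description = c("A wizard sets out on a quest.",
#                     "A detective untangles a cold case.")
#   )
#   ref_corp <- create_ref_corpus(books, "description")
#   docnames(ref_corp)  # "b1" "b2"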


# Parse the corpus with spaCy, collapse entity-bearing noun phrases into
# single tokens, and persist the resulting dfm and its tf-idf weighting.
spacy_pipeline <- function(corp) {
  res <- corp |> spacy_parse(nounphrase = TRUE)
  # Drop punctuation-like tokens and person entities; lower-case lemmas.
  res_tokens <- res |>
    filter(
      !pos %in% c("PUNCT", "PART", "NUM", "SYM"),
      !entity %in% c("PERSON_B", "PERSON_I")
    ) |>
    mutate(
      lemma = tolower(lemma)
    )
  
  # Number noun phrases within each sentence and flag tokens that belong to
  # a named entity.
  all <- res_tokens |>
    group_by(sentence_id, nounphrase) |>
    mutate(nounphrase_id = cumsum(nounphrase %in% c("beg_root", ""))) |>
    group_by(sentence_id, nounphrase_id) |>
    mutate(has_entity = ifelse(entity != "", 1, 0)) |>
    as.data.table()

  # Only noun-phrase tokens that carry an entity get concatenated; the rest
  # pass through unchanged.
  phrases <- all |>
    filter(nounphrase_id == 0, has_entity == 1)
  
  non_phrases <- fsetdiff(all, phrases)

  phrases <- phrases |>
    mutate(nounphrase_id = cumsum(nounphrase == "beg"))

  # seq_id groups runs of consecutive tokens (same sentence, adjacent token
  # ids) so that each run can be collapsed into a single phrase below.
  phrases$seq_id <- cumsum(c(TRUE, phrases$sentence_id[-1] != phrases$sentence_id[-nrow(phrases)] |
                               phrases$token_id[-1] != phrases$token_id[-nrow(phrases)] + 1))
  
  # Collapse each run into one row by pasting its tokens, lemmas, POS tags
  # and entity labels together.
  phrases_concat <- phrases[, c("token", "lemma", "pos", "entity") :=
                              .(paste(token, collapse = " "),
                                paste(lemma, collapse = " "),
                                paste(pos, collapse = " "),
                                paste(entity, collapse = " ")),
                            by = .(nounphrase_id, sentence_id, seq_id)]

  phrases_concat <- unique(phrases_concat, by = c("sentence_id", "nounphrase_id", "token", "seq_id"))
  non_phrases[, c("nounphrase_id", "has_entity") := NULL]
  phrases_concat[, c("nounphrase_id", "has_entity", "seq_id") := NULL]
  
  # Recombine phrase and non-phrase tokens in document order and hand the
  # result back to quanteda as lemma-based tokens.
  joined <- rbindlist(list(non_phrases, phrases_concat))
  setorder(joined, doc_id, sentence_id, token_id)

  class(joined) <- c("spacyr_parsed", class(joined))
  res_tokens <- joined |> as.tokens(
    use_lemma = TRUE
  )

  corp_dfm <- res_tokens |> dfm()
  docvars(corp_dfm) <- docvars(corp)

  saveRDS(corp_dfm, "./data/ref_corp_dfm_new.rds")

  corp_tfidf <- corp_dfm |> dfm_tfidf()

  saveRDS(corp_tfidf, "./data/ref_corp_tfidf_new.rds")
}
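
# Usage sketch: assumes a spaCy language model has been initialised via
# spacyr (e.g. spacyr::spacy_initialize()) and that a ./data directory
# exists for the two RDS artifacts written above:
#
#   ref_corp <- create_ref_corpus(books, "description")
#   spacy_pipeline(ref_corp)
#   corp_tfidf <- readRDS("./data/ref_corp_tfidf_new.rds")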

#' @export
get_recommendations <- function(corp_dfm, data_tab, query_book_ids, genres,
                                simil_method = "cosine", how_many) {
  query_dfm <- dfm_subset(corp_dfm, docname_ %in% query_book_ids)
  # Optionally restrict the candidate pool to documents whose genres match
  # the requested pattern.
  if (!is.null(genres)) {
    corp_dfm <- corp_dfm[grep(genres, paste(docvars(corp_dfm)$genres)), ]
  }
  rest_dfm <- dfm_subset(corp_dfm, !docname_ %in% query_book_ids)

  # Rank the remaining documents by similarity to the query set and return
  # the top `how_many` matches.
  tstat <- textstat_simil(
    query_dfm, rest_dfm,
    margin = "documents",
    method = simil_method
  ) |>
    as.data.frame()
  setorderv(tstat, cols = c(simil_method), order = -1)
  recommendations <- parse_recommendations(tstat[1:how_many, ]$document2, data_tab, "TFIDF")
  return(recommendations)
}