|
box::use(
  data.table[as.data.table, fsetdiff, rbindlist, setorder, setorderv],
  dplyr[filter, group_by, mutate, select],
  quanteda[
    as.tokens, convert, corpus, dfm, dfm_subset, dfm_tfidf,
    docnames, `docnames<-`, docvars, `docvars<-`
  ],
  quanteda.textstats[textstat_simil],
  spacyr[spacy_parse],
  utils[head],
)
|
|
|
box::use( |
|
app/logic/utils[parse_recommendations] |
|
) |
|
|
|
|
|
#' Build the reference corpus from a table of books
#'
#' @param data_tab Table handed to `corpus()`; its `book_id` column supplies
#'   the document names.
#' @param field Name of the column holding the text, forwarded to `corpus()`
#'   as `text_field`.
#' @return The corpus, with its document names set to `data_tab$book_id`.
create_ref_corpus <- function(data_tab, field) {
  book_corpus <- data_tab |> corpus(text_field = field)
  # Name documents by book id so they can later be looked up by that id.
  docnames(book_corpus) <- data_tab$book_id
  book_corpus
}
|
|
|
|
|
#' Parse a corpus with spaCy and persist its document-feature matrices
#'
#' Runs `spacy_parse()` with noun-phrase tagging on `corp`, drops unwanted
#' tokens, lower-cases the lemmas, collapses selected runs of adjacent tokens
#' into single space-joined tokens, and converts the result into a dfm built
#' from lemmas. The plain dfm is written to `./data/ref_corp_dfm_new.rds` and
#' its tf-idf weighted version to `./data/ref_corp_tfidf_new.rds`.
#'
#' @param corp Corpus handed to `spacy_parse()`; its docvars are copied onto
#'   the resulting dfm.
#' @return The value of the final `saveRDS()` call; the function is called
#'   for its side effect of writing the two RDS files.
spacy_pipeline <- function(corp) {
  parsed <- corp |> spacy_parse(nounphrase = TRUE)

  # Drop tokens carrying the listed part-of-speech tags and tokens tagged as
  # person entities, then normalise lemmas to lower case.
  res_tokens <- parsed |>
    filter(
      !pos %in% c("PUNCT", "PART", "NUM", "SYM"),
      !entity %in% c("PERSON_B", "PERSON_I")
    ) |>
    mutate(lemma = tolower(lemma))

  # nounphrase_id counts "beg_root" / "" tags within each
  # (sentence_id, nounphrase) group; has_entity flags tokens with a
  # non-empty entity label.
  # NOTE(review): grouping uses sentence_id without doc_id -- confirm this is
  # intended when the corpus holds more than one document.
  tagged <- res_tokens |>
    group_by(sentence_id, nounphrase) |>
    mutate(nounphrase_id = cumsum(nounphrase %in% c("beg_root", ""))) |>
    group_by(sentence_id, nounphrase_id) |>
    mutate(has_entity = ifelse(entity != "", 1, 0)) |>
    as.data.table()

  # Rows selected for merging; every other row passes through unchanged.
  phrases <- tagged |>
    filter(nounphrase_id == 0, has_entity == 1)

  non_phrases <- fsetdiff(tagged, phrases)

  phrases <- phrases |>
    mutate(nounphrase_id = cumsum(nounphrase == "beg"))

  # seq_id numbers runs of adjacent tokens: a new run starts whenever the
  # sentence changes or token_id is not the previous token_id + 1.
  n_phrase_rows <- nrow(phrases)
  new_sentence <-
    phrases$sentence_id[-1] != phrases$sentence_id[-n_phrase_rows]
  token_gap <-
    phrases$token_id[-1] != phrases$token_id[-n_phrase_rows] + 1
  phrases$seq_id <- cumsum(c(TRUE, new_sentence | token_gap))

  # Replace the text columns of every row in a run with the space-joined
  # values of the whole run (`:=` updates `phrases` by reference) ...
  phrases_concat <- phrases[
    ,
    c("token", "lemma", "pos", "entity") := .(
      paste(token, collapse = " "),
      paste(lemma, collapse = " "),
      paste(pos, collapse = " "),
      paste(entity, collapse = " ")
    ),
    by = .(nounphrase_id, sentence_id, seq_id)
  ]

  # ... then keep a single row per run.
  phrases_concat <- unique(
    phrases_concat,
    by = c("sentence_id", "nounphrase_id", "token", "seq_id")
  )

  # Remove the helper columns so both tables share the same columns again.
  non_phrases[, c("nounphrase_id", "has_entity") := NULL]
  phrases_concat[, c("nounphrase_id", "has_entity", "seq_id") := NULL]

  joined <- rbindlist(list(non_phrases, phrases_concat))
  setorder(joined, doc_id, sentence_id, token_id)

  # Re-add the spacyr class so as.tokens() dispatches to the method for
  # spacy_parse() output.
  class(joined) <- c("spacyr_parsed", class(joined))
  res_tokens <- joined |> as.tokens(use_lemma = TRUE)

  corp_dfm <- res_tokens |> dfm()
  docvars(corp_dfm) <- docvars(corp)

  saveRDS(corp_dfm, "./data/ref_corp_dfm_new.rds")

  corp_tfidf <- corp_dfm |> dfm_tfidf()

  saveRDS(corp_tfidf, "./data/ref_corp_tfidf_new.rds")
}
|
|
|
|
|
#' Recommend the documents most similar to a set of query books
#'
#' Computes document-level similarity between the query books and the
#' remaining documents of `corp_dfm`, orders the pairs by decreasing
#' similarity and passes the `document2` values of the best pairs to
#' `parse_recommendations()`.
#'
#' @param corp_dfm Document-feature matrix; query and candidate documents are
#'   both taken from it.
#' @param data_tab Table forwarded unchanged to `parse_recommendations()`.
#' @param query_book_ids Document names of the query books.
#' @param genres `NULL` for no genre filter; otherwise a pattern passed to
#'   `grep()` and matched against the `genres` docvar of `corp_dfm`.
#' @param simil_method Similarity method forwarded to `textstat_simil()`; also
#'   the name of the column the results are sorted by.
#' @param how_many Maximum number of similarity pairs to keep. When fewer
#'   pairs exist, only the existing ones are used.
#' @return Whatever `parse_recommendations()` returns for the selected
#'   documents.
get_recommendations <- function(corp_dfm, data_tab, query_book_ids, genres,
                                simil_method = "cosine", how_many) {
  # Pick the query documents before the genre filter is applied, so they are
  # available even if they fall outside the requested genres.
  query_dfm <- dfm_subset(corp_dfm, docname_ %in% query_book_ids)

  if (!is.null(genres)) {
    genre_hits <- grep(genres, paste(docvars(corp_dfm)$genres))
    corp_dfm <- corp_dfm[genre_hits, ]
  }

  # Candidates are all remaining documents except the query books themselves.
  rest_dfm <- dfm_subset(corp_dfm, !docname_ %in% query_book_ids)

  tstat <- textstat_simil(
    query_dfm, rest_dfm,
    margin = "documents",
    method = simil_method
  ) |>
    as.data.frame()

  # Sort in place, most similar pair first.
  setorderv(tstat, cols = simil_method, order = -1)

  # head() never reads past the last row, so a how_many larger than the
  # number of pairs yields no NA entries and how_many = 0 yields none at all.
  top_matches <- head(tstat, how_many)

  parse_recommendations(top_matches$document2, data_tab, "TFIDF")
}
|
|
|
|
|
|
|
|
|
|