# bookrecc/app/logic/tfidf_model.R
# Ubuntu
# init
# f888423
box::use(
  data.table[as.data.table, fsetdiff, rbindlist, setorder, setorderv],
  dplyr[filter, group_by, mutate, select],
  quanteda[
    as.tokens, convert, corpus, dfm, dfm_subset, dfm_tfidf,
    docnames, `docnames<-`, docvars, `docvars<-`
  ],
  quanteda.textstats[textstat_simil],
  spacyr[spacy_parse],
  utils[head]
)
box::use(
  app/logic/utils[parse_recommendations]
)
#' Build the reference corpus from a table of books.
#'
#' @param data_tab Table with one row per book; must contain a `book_id`
#'   column and the column named by `field`.
#' @param field Name of the column in `data_tab` holding the document text.
#' @return A quanteda corpus whose document names are the `book_id` values.
create_ref_corpus <- function(data_tab, field) {
  ref_corpus <- corpus(data_tab, text_field = field)
  # Name documents by book id so they can later be looked up by id.
  docnames(ref_corpus) <- data_tab$book_id
  ref_corpus
}
#' Build and persist the lemma-based dfm and its TF-IDF weighting.
#'
#' Parses `corp` with spaCy, removes tokens whose part of speech is PUNCT,
#' PART, NUM or SYM and tokens tagged `PERSON_B` / `PERSON_I`, lower-cases the
#' lemmas, and collapses runs of consecutive entity tokens whose noun-phrase
#' tag is neither `"beg_root"` nor `""` into one space-joined token. The
#' resulting tokens (lemmas) are turned into a dfm carrying the docvars of
#' `corp`, which is saved together with its TF-IDF weighted version.
#'
#' @param corp A quanteda corpus to parse.
#' @return The value of the final `saveRDS()` call; the function is used for
#'   its side effect of writing `./data/ref_corp_dfm_new.rds` and
#'   `./data/ref_corp_tfidf_new.rds`.
spacy_pipeline <- function(corp) {
  parsed <- corp |> spacy_parse(nounphrase = TRUE)

  kept_tokens <- parsed |>
    filter(
      !pos %in% c("PUNCT", "PART", "NUM", "SYM"),
      !entity %in% c("PERSON_B", "PERSON_I")
    ) |>
    mutate(
      lemma = tolower(lemma)
    )

  # nounphrase_id stays 0 exactly for tokens whose noun-phrase tag is neither
  # "beg_root" nor ""; has_entity flags tokens carrying any entity tag.
  all_tokens <- kept_tokens |>
    group_by(sentence_id, nounphrase) |>
    mutate(nounphrase_id = cumsum(nounphrase %in% c("beg_root", ""))) |>
    group_by(sentence_id, nounphrase_id) |>
    mutate(has_entity = ifelse(entity != "", 1, 0)) |>
    as.data.table()

  phrases <- all_tokens |>
    filter(nounphrase_id == 0, has_entity == 1)
  non_phrases <- fsetdiff(all_tokens, phrases)

  # Renumber phrases by counting "beg" tags over the phrase tokens.
  phrases <- phrases |>
    mutate(nounphrase_id = cumsum(nounphrase == "beg"))

  # A new run starts at the first row and wherever the document or sentence
  # changes or the token ids stop being consecutive. Checking doc_id keeps
  # tokens from neighbouring documents out of the same run.
  n_phrase <- nrow(phrases)
  starts_run <- c(
    TRUE,
    phrases$doc_id[-1] != phrases$doc_id[-n_phrase] |
      phrases$sentence_id[-1] != phrases$sentence_id[-n_phrase] |
      phrases$token_id[-1] != phrases$token_id[-n_phrase] + 1
  )
  phrases$seq_id <- cumsum(starts_run)

  # Replace every token of a run with the space-joined run, then keep one row
  # per run.
  phrases_concat <- phrases[
    ,
    c("token", "lemma", "pos", "entity") := .(
      paste(token, collapse = " "),
      paste(lemma, collapse = " "),
      paste(pos, collapse = " "),
      paste(entity, collapse = " ")
    ),
    by = .(nounphrase_id, sentence_id, seq_id)
  ]
  phrases_concat <- unique(
    phrases_concat,
    by = c("sentence_id", "nounphrase_id", "token", "seq_id")
  )

  # Drop the helper columns so both tables share the same layout again.
  non_phrases[, c("nounphrase_id", "has_entity") := NULL]
  phrases_concat[, c("nounphrase_id", "has_entity", "seq_id") := NULL]

  joined <- rbindlist(list(non_phrases, phrases_concat))
  setorder(joined, doc_id, sentence_id, token_id)
  # as.tokens() dispatches on the spacyr_parsed class.
  class(joined) <- c("spacyr_parsed", class(joined))

  lemma_tokens <- joined |> as.tokens(
    use_lemma = TRUE
  )
  corp_dfm <- lemma_tokens |> dfm()

  # setorder() above sorts by the character doc_id, so the dfm's document
  # order need not match the corpus; align docvars by document name instead
  # of by position.
  doc_rows <- match(docnames(corp_dfm), docnames(corp))
  docvars(corp_dfm) <- docvars(corp)[doc_rows, , drop = FALSE]
  saveRDS(corp_dfm, "./data/ref_corp_dfm_new.rds")

  corp_tfidf <- corp_dfm |> dfm_tfidf()
  saveRDS(corp_tfidf, "./data/ref_corp_tfidf_new.rds")
}
#' Recommend the books most similar to a set of query books.
#'
#' @param corp_dfm A quanteda dfm whose document names are book ids. It must
#'   carry a `genres` docvar when `genres` is not `NULL`.
#' @param data_tab Book table handed on to `parse_recommendations()`.
#' @param query_book_ids Document names (book ids) of the query books.
#' @param genres `NULL` to consider every book, or a single regular
#'   expression matched against the `genres` docvar of each document.
#' @param simil_method Similarity measure passed to `textstat_simil()`.
#' @param how_many Maximum number of recommendations to return.
#' @return The result of `parse_recommendations()` for the selected ids.
#' @export
get_recommendations <- function(corp_dfm, data_tab, query_book_ids, genres,
                                simil_method = "cosine", how_many) {
  # The query rows are taken before genre filtering, so query books are kept
  # even when they fall outside `genres`.
  query_dfm <- dfm_subset(corp_dfm, docname_ %in% query_book_ids)

  if (!is.null(genres)) {
    genre_hits <- grep(genres, paste(docvars(corp_dfm)$genres))
    corp_dfm <- corp_dfm[genre_hits, ]
  }
  rest_dfm <- dfm_subset(corp_dfm, !docname_ %in% query_book_ids)

  tstat <- textstat_simil(
    query_dfm, rest_dfm,
    margin = "documents",
    method = simil_method
  ) |>
    as.data.frame()
  # The similarity column is named after the method; sort best match first.
  setorderv(tstat, cols = c(simil_method), order = -1)

  # head() caps at the number of available rows, so asking for more
  # recommendations than there are candidates no longer yields NA ids, and
  # how_many = 0 yields none.
  top_ids <- head(tstat$document2, how_many)
  parse_recommendations(top_ids, data_tab, "TFIDF")
}