"""Generate a similarity matrix (doc-term score matrix) based on textacy.representation.Vectorizer. refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer). originally docterm_scores.py. """ from typing import Dict, Iterable, List, Optional, Union import numpy as np from itertools import chain from psutil import virtual_memory from more_itertools import ilen from textacy.representations import Vectorizer # from textacy.representations.vectorizers import Vectorizer from logzero import logger # from smatrix.gen_model import gen_model from gradiobee.gen_model import gen_model # fmt: off def smatrix( doc1: Iterable[Iterable[str]], # List[List[str]], doc2: Iterable[Iterable[str]], model: Vectorizer = None, tf_type: str = 'linear', idf_type: Optional[str] = "smooth", # dl_type: Optional[str] = "sqrt", # "lucene-style tfidf" dl_type: Optional[str] = None, # norm: Optional[str] = "l2", # + "l2" min_df: Union[int, float] = 1, max_df: Union[int, float] = 1.0, max_n_terms: Optional[int] = None, vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None ) -> np.ndarray: # fmt: on """Generate a doc-term score matrix based on textacy.representation.Vectorizer. Args doc1: tokenized doc of n1 doc2: tokenized doc of n2 model: if None, generate one ad hoc from doc1 and doc2 ("lucene-style tfidf"). rest: refer to textacy.representation.Vectorizer Attributes vectorizer Returns n1 x n2 similarity matrix of float numbers """ # make sure doc1/doc2 is of the right typing try: for xelm in iter(doc1): for elm in iter(xelm): assert isinstance(elm, str) except AssertionError: raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ") except Exception as e: logger.error(e) raise try: for xelm in iter(doc2): for elm in iter(xelm): assert isinstance(elm, str) except AssertionError: raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ") except Exception as e: logger.error(e) raise if model is None: model = gen_model( [*chain(doc1, doc2)], tf_type=tf_type, idf_type=idf_type, dl_type=dl_type, norm=norm, min_df=min_df, max_df=max_df, max_n_terms=max_n_terms, vocabulary_terms=vocabulary_terms ) # docterm_scores.model = model smatrix.model = model # a1 = dt.toarray(), a2 = doc_term_matrix.toarray() # np.all(np.isclose(a1, a2)) dt1 = model.transform(doc1) dt2 = model.transform(doc2) # virtual_memory().available / 8: 64bits float require_ram = ilen(iter(doc1)) * ilen(iter(doc2)) * 8 if require_ram > virtual_memory().available: logger.warning("virtual_memory().available: %s", virtual_memory().available) logger.warning("memory required: %s", require_ram) if require_ram > virtual_memory().available * 10: logger.warning("You're likely to encounter memory problem, such as slowing down response and/or OOM.") # return dt1.doc(dt2.T) return dt2.toarray().dot(dt1.toarray().T)