|
import numpy as np |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
|
|
def create_vectorizer(processed_texts):
    """
    Fit a TF-IDF vectorizer on the corpus and return it with the document matrix.

    Args:
        processed_texts (iterable): Preprocessed documents. Each document is
            normally a sequence of string tokens; a document that is already
            a single string is used as-is.

    Returns:
        tuple: ``(vectorizer, X)`` where ``vectorizer`` is the fitted
            ``TfidfVectorizer`` and ``X`` is the matrix returned by its
            ``fit_transform`` call (one entry per input document).
    """
    # TfidfVectorizer takes one string per document, so token sequences are
    # re-joined with single spaces. A plain string is passed through
    # untouched: joining it would insert a space between every character.
    documents = [
        text if isinstance(text, str) else ' '.join(text)
        for text in processed_texts
    ]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(documents)
    return vectorizer, X
|
|
|
def retrieve(query, X, vectorizer, top_k=5):
    """
    Rank the indexed documents against a query and return the best matches.

    The query is transformed with ``vectorizer`` and scored against every row
    of ``X`` by dot product.

    Args:
        query (str): Query string.
        X (matrix): TF-IDF document matrix, one row per document, built with
            the same ``vectorizer``.
        vectorizer (TfidfVectorizer): Fitted vectorizer used to build ``X``.
        top_k (int): Maximum number of results to return. Values larger than
            the number of documents return every document; values <= 0
            return an empty result.

    Returns:
        numpy.ndarray: 1-D integer array of row indices into ``X``, ordered
            from highest to lowest score. Documents with equal scores keep
            their original row order.
    """
    # Guard explicitly: slicing with ``[-0:]`` would select *everything*
    # rather than nothing, and negative values would slice arbitrarily.
    if top_k <= 0:
        return np.array([], dtype=np.intp)

    query_vec = vectorizer.transform([query])

    # Use the matrix-multiplication operator; np.dot does not understand
    # SciPy sparse containers and is not a supported way to multiply them.
    scores = X @ query_vec.T
    if hasattr(scores, "toarray"):
        # Sparse result -> dense column of scores.
        scores = scores.toarray()
    scores = np.asarray(scores, dtype=float).ravel()

    # Stable sort on the negated scores gives descending order with
    # deterministic tie-breaking (lower row index first).
    ranking = np.argsort(-scores, kind="stable")
    return ranking[:top_k]
|
|