"""Train a TF-IDF model on a sample of cleaned arXiv abstracts.

Running this file loads the model configuration, fits a ``TfidfVectorizer``
on a random sample of the processed papers and writes the resulting
document-term matrix to a parquet file.
"""

# NOTE: the import order is deliberately left as it was. The star import can
# rebind any name imported before it, while ``sys`` and ``pprint`` are
# imported after it and therefore always take precedence.
import pandas as pd
from sklearn.pipeline import Pipeline
# NOTE(review): ``ColumnSelector`` does not appear to be exported by
# ``sklearn.feature_selection`` (mlxtend ships a class with that name) and it
# is unused in this script -- confirm this import resolves in the project
# environment.
from sklearn.feature_selection import ColumnSelector
from sklearn.feature_extraction.text import TfidfVectorizer
from utils.utilities import *
import sys
from pprint import pprint

CONFIG_FILE_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/configs.yaml"

# Input/output locations and sample size used by ``train_tfidf``.
_PAPERS_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers.parquet.gzip"
_TFIDF_OUTPUT_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_tfidf.parquet.gzip"
_SAMPLE_SIZE = 500000

# The configuration is loaded (and echoed) once, at import time.
config = read_yaml_config(CONFIG_FILE_PATH)
pprint(config)


@execution_time
def train_tfidf():
    """Fit TF-IDF on a sample of abstracts and persist the dense matrix.

    Steps:
      1. Build a ``TfidfVectorizer`` from the keyword arguments stored under
         ``config["models"]["tfidf"]["tfidf_deffault"]`` and print them.
      2. Read the processed papers and draw a random sample of at most
         ``_SAMPLE_SIZE`` rows (the whole table when it is smaller).
      3. Fit the vectorizer on the ``cleaned_abstracts`` column.
      4. Write a DataFrame with one column per vocabulary term to
         ``_TFIDF_OUTPUT_PATH``.

    Returns:
        None. The result is the parquet file written in step 4.
    """
    # The key spelling ("tfidf_deffault") is a runtime lookup into the YAML
    # config and must match that file, so it is left untouched.
    tfidf_params = config["models"]["tfidf"]["tfidf_deffault"]
    vectorizer = TfidfVectorizer(**tfidf_params)
    pprint(tfidf_params)

    papers = pd.read_parquet(_PAPERS_PATH)
    # Cap the sample at the table size: ``sample`` raises ValueError when
    # asked for more rows than exist (sampling is without replacement).
    sample_size = min(_SAMPLE_SIZE, len(papers))
    df = papers.sample(sample_size).reset_index(drop=True)

    vectors = vectorizer.fit_transform(df['cleaned_abstracts'])

    # ``toarray`` densifies the sparse matrix, so memory use scales with
    # (sampled rows x vocabulary size); the vocabulary size is governed by
    # the vectorizer parameters taken from the config.
    tfidf_df = pd.DataFrame(
        vectors.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )
    tfidf_df.to_parquet(_TFIDF_OUTPUT_PATH)


# Kept unguarded so the module-level behaviour of the original file (training
# starts as soon as the file is executed or imported) is unchanged.
train_tfidf()