"""Score test queries against their candidate FAQs and write a submission.

For every row of the test CSV the script:

1. vectorizes the ``Query`` text with ``model["tokenizer"].transform``;
2. selects the rows of ``model["X_faq"]`` whose id in ``model["faq_ids"]``
   equals one of the row's ``FAQ1``..``FAQ3`` values;
3. takes the maximum cosine similarity between the query and those rows.

A row is predicted ``1`` when that maximum exceeds ``model["thr"]`` and ``0``
otherwise.  Predictions are written to ``submission.csv`` with the columns
``id`` (``testid0000``, ``testid0001``, ...) and ``pred``.

NOTE(review): ``model["tokenizer"]`` is presumably a fitted text vectorizer
and ``model["X_faq"]`` the matrix it produced for the FAQ texts, row-aligned
with ``model["faq_ids"]`` -- confirm against the training script.
"""

import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

TEST_CSV_PATH = "/tmp/data/test.csv"
MODEL_PATH = "model.pkl"
SUBMISSION_PATH = "submission.csv"

# Columns of the test CSV that hold the candidate FAQ ids for each query.
FAQ_COLUMNS = [f"FAQ{i}" for i in range(1, 4)]

# Score used when a row has no known candidate FAQ.  ``-inf`` can never
# exceed the threshold, so such rows are always predicted 0.
NO_CANDIDATE_SCORE = -np.inf

test_df = pd.read_csv(TEST_CSV_PATH)

# NOTE: pickle.load can execute arbitrary code while deserializing.  Only
# load a model file that was produced by a trusted training run.
with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

# Loop-invariant lookups, hoisted out of the per-row loop.
tokenizer = model["tokenizer"]
faq_ids = np.asarray(model["faq_ids"])
X_faq = model["X_faq"]

scores = []
for row_label, row in test_df.iterrows():
    X_query = tokenizer.transform([row["Query"]])

    # True for every FAQ whose id matches any of this row's candidates.
    is_cand = np.logical_or.reduce(
        [faq_ids == row[col] for col in FAQ_COLUMNS]
    )

    if not np.any(is_cand):
        # Without this guard an empty candidate set aborts the whole run
        # (there is nothing to take a maximum over).
        warnings.warn(
            f"row {row_label!r}: none of the candidate FAQ ids are present "
            "in the model; predicting 0",
            stacklevel=1,
        )
        scores.append(NO_CANDIDATE_SCORE)
        continue

    # cosine_similarity returns one row per query; there is a single query.
    sim = cosine_similarity(X_query, X_faq[is_cand])[0]
    scores.append(sim.max())

predict = (np.array(scores) > model["thr"]).astype(int)

df = pd.DataFrame(
    [(f"testid{i:04}", v) for i, v in enumerate(predict)],
    columns=["id", "pred"],
)
df.to_csv(SUBMISSION_PATH, index=False)