|
import argparse
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
# Columns of the input table that each get their own TF-IDF model.
_TEXT_COLUMNS = ('prompt', 'completion')


def _build_parser():
    """Return the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description='Fit TF-IDF vectorizers on the prompt and completion '
                    'columns of a parquet file and save them with joblib.')
    # Both options are required: without them the script would only fail
    # later with an opaque TypeError on None.
    parser.add_argument(
        '--input', type=str, required=True,
        help="Input file path (file should be in parquet format and have "
             "'prompt' and 'completion' columns)")
    parser.add_argument(
        '--output', type=str, required=True,
        help='Output path prefix; artifact file names are appended to it '
             'verbatim (end it with a path separator to write into a '
             'directory)')
    return parser


def _fit_and_save(texts, output_prefix, name):
    """Fit a TF-IDF model on *texts* and persist the model and its matrix.

    Two files are written with joblib:
    ``<output_prefix><name>-vectorizer.pkl`` (the fitted vectorizer) and
    ``<output_prefix><name>-tfidf-matrix.pkl`` (the TF-IDF matrix of *texts*).
    """
    vectorizer = TfidfVectorizer()
    # fit_transform yields the same matrix as fit() followed by transform()
    # while processing the documents only once.
    matrix = vectorizer.fit_transform(texts)

    # The prefix is concatenated, not path-joined, so existing invocations
    # keep producing exactly the same file names.
    vectorizer_path = output_prefix + name + '-vectorizer.pkl'
    matrix_path = output_prefix + name + '-tfidf-matrix.pkl'

    # joblib.dump does not create missing directories; do it up front.
    parent_dir = os.path.dirname(vectorizer_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(matrix, matrix_path)


def main(argv=None):
    """Fit and save TF-IDF artifacts for the prompt and completion columns.

    Reads the parquet file given by ``--input`` and, for each of the
    'prompt' and 'completion' columns, fits a ``TfidfVectorizer`` and dumps
    both the vectorizer and the resulting TF-IDF matrix under the
    ``--output`` prefix.

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Raises:
        SystemExit: If the command-line arguments are invalid or missing.
        KeyError: If the input table lacks a required column.
    """
    args = _build_parser().parse_args(argv)

    df = pd.read_parquet(args.input)

    # Validate every column before writing anything, so a malformed input
    # cannot leave a half-written set of artifacts behind.
    missing = [column for column in _TEXT_COLUMNS if column not in df.columns]
    if missing:
        raise KeyError(
            'Input file {!r} is missing required column(s): {}'.format(
                args.input, ', '.join(missing)))

    for column in _TEXT_COLUMNS:
        _fit_and_save(df[column], args.output, column)

    print("Done!")
|
|
|
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if "__main__" == __name__:
    main()
|
|
|
|