nicholasKluge committed on
Commit
e2a3f6a
1 Parent(s): d6379a0

Upload 6 files

Browse files
completion-tfidf-matrix.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16f7cb342da64a6334bb035d162a29579853926af2243c14029fb5043d4fbd81
3
+ size 116328867
completion-vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:add850bde149e5de855d3c0334cd99ef5055289f8d103626250db2b5a1bbd0dc
3
+ size 4036115
create-tfidf-matrix.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ import pandas as pd
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ import argparse
6
+
7
def _fit_and_save(texts, output_dir, prefix):
    """Fit a TF-IDF vectorizer on *texts* and save it together with its matrix.

    Two files are written into *output_dir* with ``joblib.dump``:
    ``<prefix>-vectorizer.pkl`` (the fitted ``TfidfVectorizer``) and
    ``<prefix>-tfidf-matrix.pkl`` (the TF-IDF matrix of *texts*).

    Args:
        texts: Iterable of documents (here: one DataFrame column).
        output_dir: Directory the two files are written to.
        prefix: File-name prefix, e.g. ``'prompt'`` or ``'completion'``.
    """
    import os

    vectorizer = TfidfVectorizer()
    # fit_transform yields the same matrix as fit() followed by transform()
    # on the same data, without tokenizing the corpus twice.
    tfidf_matrix = vectorizer.fit_transform(texts)

    # os.path.join keeps the documented `--output ./` usage unchanged while
    # also working for directories given without a trailing separator.
    joblib.dump(vectorizer, os.path.join(output_dir, prefix + '-vectorizer.pkl'))
    joblib.dump(tfidf_matrix, os.path.join(output_dir, prefix + '-tfidf-matrix.pkl'))


def main():
    """Build and persist TF-IDF vectorizers/matrices for a prompt dataset.

    Reads a parquet file given by ``--input`` and, for each of its
    ``prompt`` and ``completion`` columns, fits a ``TfidfVectorizer`` and
    writes the vectorizer and the resulting matrix into the directory
    given by ``--output``.

    Raises:
        KeyError: If the input file lacks a ``prompt`` or ``completion``
            column.
    """
    import os

    parser = argparse.ArgumentParser(
        description="Fit TF-IDF vectorizers on the 'prompt' and 'completion' "
                    "columns of a parquet file and save them with their matrices.")
    # Both arguments are mandatory: without them the script previously failed
    # later with an unrelated-looking error on a None value.
    parser.add_argument('--input', type=str, required=True, help="Input file path (file should be in parquet format and have 'prompt' and 'completion' columns)")
    parser.add_argument('--output', type=str, required=True, help='Output directory the .pkl files are written to')
    args = parser.parse_args()

    df = pd.read_parquet(args.input)

    # Fail early, before any expensive fitting, if a column is missing.
    missing = [name for name in ('prompt', 'completion') if name not in df.columns]
    if missing:
        raise KeyError('Input file is missing required column(s): ' + ', '.join(missing))

    # An empty string means "current directory"; makedirs('') would raise.
    if args.output:
        os.makedirs(args.output, exist_ok=True)

    # Same order as before: prompt artifacts first, then completion.
    _fit_and_save(df['prompt'], args.output, 'prompt')
    _fit_and_save(df['completion'], args.output, 'completion')

    print("Done!")
42
+
43
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
45
+
46
+ # example usage: python create-tfidf-matrix.py --input fine-tuning-data.parquet --output ./
prompt-tfidf-matrix.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95d8c1d302b36e5fef3da79e802354972158b247051715c98d55f351b8993fe2
3
+ size 37977659
prompt-vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:739df119b82ef1f2d8dfd4d85bc1ee489d2705b48d1bd701627df9222e15cc8f
3
+ size 3324940