nes470 committed
Commit 629e2f1
1 Parent(s): 9c75d1e

Update tfidf.py

Files changed (1):
  1. tfidf.py +32 -9
tfidf.py CHANGED
@@ -8,10 +8,13 @@ import os
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
+from huggingface_hub import hf_hub_download
+import joblib
 
 
 class TfidfWikiGuesser:
     def __init__(self, wikidump = 'resources/wiki_text_16.json') -> None:
+        print("init TfidfWikieGuesser")
         self.tfidf = None
         self.corpus = None
         self.titles = None
@@ -21,15 +24,28 @@ class TfidfWikiGuesser:
         #model_file = "processed_tfidf_wiki_16_model.pkl"
         # full_model_path = model_file
         full_model_path = os.path.join("./models", model_file)
+        print(full_model_path)
 
-        if os.path.exists(full_model_path):
-            print("Loading model from pickle...")
-            self.load_from_pkl(full_model_path)
-        else:
-            if wikidump:
-                print("No pre-trained model found, loading data from dump...")
-                self.load_model(wikidump)
-                self.save_model(full_model_path)
+
+
+        REPO_ID = "nes470/pipeline-as-repo"
+        FILENAME = "processed_tfidf_wiki_page_text_model.pkl"
+
+        model = joblib.load(
+            hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
+        )
+
+        print("loading from hugginface pkl file")
+        self.load_from_pk_direct(model)
+
+        # if os.path.exists(full_model_path):
+        #     print("Loading model from pickle...")
+        #     self.load_from_pkl(full_model_path)
+        # else:
+        #     if wikidump:
+        #         print("No pre-trained model found, loading data from dump...")
+        #         self.load_model(wikidump)
+        #         self.save_model(full_model_path)
         # self.load_model(wikidump)
 
     def load_model(self, wikidump):
@@ -99,4 +115,11 @@ class TfidfWikiGuesser:
         self.vectorizer = data['vectorizer']
         self.tfidf = data['tfidf_matrix']
         self.titles = data['titles']
-        # self.corpus = data['corpus']
+        # self.corpus = data['corpus']
+
+    def load_from_pk_direct(self, pkl):
+        #data = pickle.load(pkl)
+        data = pkl
+        self.vectorizer = data['vectorizer']
+        self.tfidf = data['tfidf_matrix']
+        self.titles = data['titles']
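
For reference, the new loading path this commit wires into __init__ reduces to the standalone sketch below. The repo id, filename, and the dict keys ('vectorizer', 'tfidf_matrix', 'titles') come straight from the diff; the variable names outside the class are illustrative, not part of the committed file.

# Minimal sketch of the Hub-based loading path introduced in this commit,
# assuming the pickle on the Hub is a dict with the keys that
# load_from_pk_direct unpacks.
import joblib
from huggingface_hub import hf_hub_download

REPO_ID = "nes470/pipeline-as-repo"
FILENAME = "processed_tfidf_wiki_page_text_model.pkl"

# hf_hub_download fetches the file from the Hub (caching it locally)
# and returns the local path; joblib.load then deserializes the dict.
local_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
model = joblib.load(local_path)

vectorizer = model["vectorizer"]
tfidf_matrix = model["tfidf_matrix"]
titles = model["titles"]

Compared with the previous behavior (check ./models for a local pickle, else rebuild from the wiki dump and save), this makes the Hub download unconditional, which is why the old os.path.exists branch is now commented out rather than deleted.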