from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import zipfile
import pickle
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from huggingface_hub import hf_hub_download
import joblib
class TfidfWikiGuesser:
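    """TF-IDF retrieval model that maps a question to the most similar Wikipedia page titles."""
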
    def __init__(self, wikidump='resources/wiki_text_16.json') -> None:
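        """Load a pre-fitted TF-IDF model from the Hugging Face Hub, or (via the
        commented-out fallback below) build one from a local wiki dump JSON file."""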
        print("init TfidfWikiGuesser")
        self.tfidf = None
        self.corpus = None
        self.titles = None
        self.vectorizer = None
        self.lemmatizer = WordNetLemmatizer()

        # Best accuracy so far (built from wiki_page_text.json in the gdrive folder).
        model_file = "processed_tfidf_wiki_page_text_model.pkl"
        # model_file = "processed_tfidf_wiki_16_model.pkl"
        full_model_path = os.path.join("./models", model_file)
        print(full_model_path)

        # Download the pre-fitted model from the Hugging Face Hub and load it directly.
        REPO_ID = "nes470/pipeline-as-repo"
        FILENAME = "processed_tfidf_wiki_page_text_model.pkl"
        model = joblib.load(
            hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
        )
        print("loading from Hugging Face pkl file")
        self.load_from_pk_direct(model)

        # Fallback: load a local pickle if present, otherwise rebuild the model from the wiki dump.
        # if os.path.exists(full_model_path):
        #     print("Loading model from pickle...")
        #     self.load_from_pkl(full_model_path)
        # else:
        #     if wikidump:
        #         print("No pre-trained model found, loading data from dump...")
        #         self.load_model(wikidump)
        #         self.save_model(full_model_path)

    def load_model(self, wikidump):
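        """Build the TF-IDF matrix from a wiki dump JSON file and keep the page titles."""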
        # The wiki dump is a JSON array of objects with "page" and "text" fields.
        with open(wikidump) as f:
            doc = json.load(f)

        # Alternative: read the dump directly from a zip archive.
        # with zipfile.ZipFile('resources/wiki_text_8.json.zip', 'r') as z:
        #     with z.open('wiki_text_8.json') as f:
        #         doc = json.load(f)

        self.corpus, self.titles = self.create_corpus(doc)
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf = self.vectorizer.fit_transform(self.corpus)

    def preprocess_text(self, text):
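        """Lowercase, tokenize, remove English stopwords, and lemmatize a piece of text.

        Note: relies on the NLTK tokenizer, stopword, and WordNet data being available.
        """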
        # Guard against NaN/float values that can appear in scraped text fields.
        if isinstance(text, float):
            return str(text)
        tokens = word_tokenize(text.lower())
        stop_words = set(stopwords.words('english'))  # build the stopword set once, not per token
        filtered_tokens = [token for token in tokens if token not in stop_words]
        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in filtered_tokens]
        return ' '.join(lemmatized_tokens)

    def create_corpus(self, json_file):
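        """Split the parsed dump into parallel lists of page texts and page titles."""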
        corpus = []
        page_titles = []
        for json_obj in json_file:
            # corpus.append(self.preprocess_text(json_obj['text']))  # optional preprocessing variant
            corpus.append(json_obj['text'])
            page_titles.append(json_obj['page'])
        return (corpus, page_titles)

    def make_guess(self, question, num_guesses=1):
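        """Return the titles of the num_guesses pages most similar to the question under cosine similarity."""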
        tfidf_question = self.vectorizer.transform([question])
        sim = cosine_similarity(self.tfidf, tfidf_question)

        # Sort document indices by similarity (descending) and keep the top num_guesses.
        sim_indices = np.argsort(sim.flatten())[::-1]
        best_indices = sim_indices[:num_guesses]

        best_guesses = []
        for i in best_indices:
            # best_docs.append(self.corpus[i])  # corpus is not kept when loading from pickle
            best_guesses.append(self.titles[i])

        return best_guesses

    def save_model(self, file_name):
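        """Pickle the fitted vectorizer, TF-IDF matrix, and page titles to file_name."""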
        with open(file_name, 'wb') as f:
            pickle.dump({
                'vectorizer': self.vectorizer,
                'tfidf_matrix': self.tfidf,
                'titles': self.titles,
                # 'corpus': self.corpus
            }, f)

    def load_from_pkl(self, file_name):
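        """Restore the vectorizer, TF-IDF matrix, and titles from a pickle file on disk."""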
        with open(file_name, 'rb') as f:
            data = pickle.load(f)
        self.vectorizer = data['vectorizer']
        self.tfidf = data['tfidf_matrix']
        self.titles = data['titles']
        # self.corpus = data['corpus']

    def load_from_pk_direct(self, pkl):
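        """Restore the model from an already-unpickled dict (e.g. the object returned by joblib.load)."""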
        # The argument is already an unpickled dict, not a file handle.
        # data = pickle.load(pkl)
        data = pkl
        self.vectorizer = data['vectorizer']
        self.tfidf = data['tfidf_matrix']
        self.titles = data['titles']
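

# Example usage: assumes the Hugging Face model referenced in __init__ can be downloaded
# (network access + huggingface_hub installed) and that the local scikit-learn version is
# compatible with the pickled vectorizer. The question string is only an illustration.
if __name__ == "__main__":
    guesser = TfidfWikiGuesser()
    print(guesser.make_guess("Which English author wrote Pride and Prejudice?", num_guesses=3))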