Backedman committed
Commit 0a180e0
1 Parent(s): 01920b8

Update tfidf_model.py

Files changed (1)
  1. tfidf_model.py +283 -275
tfidf_model.py CHANGED
@@ -1,275 +1,283 @@
-import json
-import nltk
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize, sent_tokenize
-from nltk.stem import PorterStemmer, WordNetLemmatizer
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-import os
-import math
-import pickle
-import joblib
-import multiprocessing
-from concurrent.futures import ProcessPoolExecutor
-from tqdm import tqdm # Import tqdm for progress tracking
-from collections import defaultdict
-
-
-nltk.download('punkt')
-nltk.download('stopwords')
-nltk.download('averaged_perceptron_tagger')
-nltk.download('maxent_ne_chunker')
-nltk.download('words')
-nltk.download('wordnet')
-nltk.download('omw-1.4')
-
-
-# Helper function to map NLTK POS tags to WordNet POS tags
-def get_wordnet_pos(treebank_tag):
-    if treebank_tag.startswith('J'):
-        return nltk.corpus.wordnet.ADJ
-    elif treebank_tag.startswith('V'):
-        return nltk.corpus.wordnet.VERB
-    elif treebank_tag.startswith('N'):
-        return nltk.corpus.wordnet.NOUN
-    elif treebank_tag.startswith('R'):
-        return nltk.corpus.wordnet.ADV
-    else:
-        return nltk.corpus.wordnet.NOUN
-
-class NLPModel:
-    def __init__(self): # Initialize the model with necessary parameters
-        # Initialize model components (preprocessing, training, etc.)
-        #self.model
-
-        self.tfidf = TfidfVectorizer(tokenizer=self.tokenize, lowercase=False)
-
-        self.training_tfidf = None
-
-        #self.manager = multiprocessing.Manager()
-
-        self.flattened_sentences = []
-        self.training_tagged = []
-        self.answers = []
-
-
-
-    def tokenize(self, text):
-        # Your tokenization logic goes here
-        return text # No tokenization needed, return the input as-is
-
-    def preprocess_text(self, text):
-        # Tokenization
-        sentences = sent_tokenize(text)
-
-        preprocessed_sentences = []
-        batch_size = 50 # Adjust the batch size based on your system's capabilities
-        for i in range(0, len(sentences), batch_size):
-            batch_sentences = sentences[i:i + batch_size]
-            batch_words = [word_tokenize(sentence) for sentence in batch_sentences]
-
-            # Filtering Stop Words
-            stop_words = set(stopwords.words('english'))
-            filtered_words = [[word for word in words if word.lower() not in stop_words] for words in batch_words]
-
-            # Stemming
-            stemmer = PorterStemmer()
-            stemmed_words = [[stemmer.stem(word) for word in words] for words in filtered_words]
-
-            # Tagging Parts of Speech
-            pos_tags = [nltk.pos_tag(words) for words in stemmed_words]
-
-            # Lemmatizing
-            lemmatizer = WordNetLemmatizer()
-            lemmatized_words = [[lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos] for pos in pos_tags]
-
-            preprocessed_sentences.extend(lemmatized_words)
-
-        return preprocessed_sentences
-
-    def process_data(self, data_json):
-        #print("Processing data in parallel...")
-        batch_size = 10000 # Experiment with different batch sizes
-        num_processes = int(multiprocessing.cpu_count()/2) # Utilize more processes
-
-        batches = [data_json[i:i + batch_size] for i in range(0, len(data_json), batch_size)]
-
-        #print('batches')
-
-        #training_tagged = [] # Initialize or clear self.training_tagged
-        sentence_answers = []
-
-        with ProcessPoolExecutor(max_workers=num_processes) as executor:
-            results = list(tqdm(executor.map(self.process_data_batch, batches), total=len(batches)))
-
-        #with multiprocessing.Pool() as pool:
-        #results = []
-        #for batch in batches:
-        #results.append(self.process_data_batch(batch))
-
-        for batch_result in results:
-            for result in batch_result:
-                sentence_answers.extend(result)
-                #print("here")
-
-        # Create a dictionary to group sentences by answer
-        answer_groups = defaultdict(list)
-
-        # Iterate through each (sentence, answer) pair in batch_results
-        for sentence, answer in sentence_answers:
-            answer_groups[answer].extend(sentence)
-
-        #print(list(answer_groups.items())[0])
-
-        # Create a new list with sentences grouped by answer
-        sentence_answers.extend([(sentence,answer) for answer, sentence in answer_groups.items()])
-
-        self.flattened_sentences.extend([x[0] for x in sentence_answers])
-        self.training_tagged.extend([x[1] for x in sentence_answers])
-
-
-
-        #print("Data processing complete.")
-
-    def process_data_batch(self, batch):
-        batch_results = []
-
-
-
-        for data in batch:
-            text = data["text"]
-            answer = data["answer"]
-            preprocessed_sentences = self.preprocess_text(text)
-            training_tagged = [(sentence, answer) for sentence in preprocessed_sentences]
-
-
-
-            #print(training_tagged)
-            batch_results.append(training_tagged)
-
-        #create another list where instead, the "sentence" of elements with the same answer are appended with each other
-
-        return batch_results
-
-    def train_model(self):
-        # Fit and transform the TF-IDF vectorizer
-
-        #print(self.flattened_sentences)
-        if(self.flattened_sentences):
-            self.training_tfidf = self.tfidf.fit_transform(self.flattened_sentences)
-            self.flattened_sentences = []
-        #self.
-
-        #print(self.training_tfidf)
-        #print(self.training_tagged)
-
-
-
-
-    def save(self, file_path):
-        model_data = {
-            'training_tagged': list(self.training_tagged),
-            'tfidf': self.tfidf,
-            'training_tfidf': self.training_tfidf
-        }
-        #print(model_data)
-        with open(file_path, 'wb') as f:
-            joblib.dump(model_data, f)
-
-    def load(self, file_path):
-
-        if os.path.exists(file_path):
-            with open(file_path, 'rb') as f:
-                print(os.path.exists(file_path))
-                model_data = joblib.load(file_path)
-                self.training_tagged = list(model_data['training_tagged'])
-                self.tfidf = model_data['tfidf']
-                print(self.tfidf)
-                self.training_tfidf = model_data['training_tfidf']
-
-        return self
-
-    def predict(self, input_data):
-        # Preprocess input data
-        new_text_processed = self.preprocess_text(input_data)
-        new_text_processed_tfidf = self.tfidf.transform(new_text_processed)
-        training_tfidf = self.training_tfidf
-
-        # Calculate sentence similarities
-        sentence_similarities = cosine_similarity(new_text_processed_tfidf, training_tfidf)
-
-        # Initialize data structures
-        similarities_max = {}
-        answers = []
-
-        # Iterate over sentence similarities
-        for similarity_row in sentence_similarities:
-            for answer, similarity in zip(self.training_tagged, similarity_row):
-                if isinstance(answer, list):
-                    continue
-                # Update similarities_max only when the new similarity is greater
-                if answer not in similarities_max or similarity > similarities_max[answer]:
-                    similarities_max[answer] = similarity
-
-            if not answers:
-                answers.extend(similarities_max.keys())
-
-        # Calculate total similarity for each answer and find the maximum similarity and its index
-        total_similarities = np.array([similarities_max[answer] for answer in answers])
-        closest_index = np.argmax(total_similarities)
-        closest_answer = answers[closest_index]
-
-        return total_similarities[closest_index], closest_answer
-
-
-
-
-
-
-
-        #return (sentences.max(),self.training_tagged[closest_index])
-
-
-
-
-
-    def evaluate(self, test_data, labels):
-        # Evaluate the performance of the model on test data
-        # Return evaluation metrics
-        pass
-
-# Additional functions for model tuning, hyperparameter optimization, etc.
-
-if __name__ == "__main__":
-    # Train a simple model on QB data, save it to a file
-    import argparse
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument('--data', type=str)
-    parser.add_argument('--model', type=str)
-    parser.add_argument('--predict', type=str)
-
-    flags = parser.parse_args()
-
-    model = NLPModel()
-
-    if flags.data:
-        with open(flags.data, 'r') as data_file:
-            data_json = json.load(data_file)
-
-        model.process_data(data_json)
-        model.train_model()
-        print(model.predict("My name is bobby, bobby newport. your name is jeff?"))
-        model.save("model.pkl")
-
-    if flags.model:
-        model.load(flags.model)
-
-    if flags.predict:
-        print(model.predict(flags.predict))
-
-
-
-
-
+import json
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.stem import PorterStemmer, WordNetLemmatizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import os
+import math
+import pickle
+import joblib
+import multiprocessing
+from concurrent.futures import ProcessPoolExecutor
+from tqdm import tqdm # Import tqdm for progress tracking
+from collections import defaultdict
+
+
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('maxent_ne_chunker')
+nltk.download('words')
+nltk.download('wordnet')
+nltk.download('omw-1.4')
+
+
+# Helper function to map NLTK POS tags to WordNet POS tags
+def get_wordnet_pos(treebank_tag):
+    if treebank_tag.startswith('J'):
+        return nltk.corpus.wordnet.ADJ
+    elif treebank_tag.startswith('V'):
+        return nltk.corpus.wordnet.VERB
+    elif treebank_tag.startswith('N'):
+        return nltk.corpus.wordnet.NOUN
+    elif treebank_tag.startswith('R'):
+        return nltk.corpus.wordnet.ADV
+    else:
+        return nltk.corpus.wordnet.NOUN
+
+class NLPModel:
+    def __init__(self): # Initialize the model with necessary parameters
+        # Initialize model components (preprocessing, training, etc.)
+        #self.model
+
+        self.tfidf = TfidfVectorizer(tokenizer=self.tokenize, lowercase=False)
+
+        self.training_tfidf = None
+
+        #self.manager = multiprocessing.Manager()
+
+        self.flattened_sentences = []
+        self.training_tagged = []
+        self.answers = []
+
+
+
+    def tokenize(self, text):
+        # Your tokenization logic goes here
+        return text # No tokenization needed, return the input as-is
+
+    def preprocess_text(self, text):
+        # Tokenization
+        sentences = sent_tokenize(text)
+
+        preprocessed_sentences = []
+        batch_size = 50 # Adjust the batch size based on your system's capabilities
+        for i in range(0, len(sentences), batch_size):
+            batch_sentences = sentences[i:i + batch_size]
+            batch_words = [word_tokenize(sentence) for sentence in batch_sentences]
+
+            # Filtering Stop Words
+            stop_words = set(stopwords.words('english'))
+            filtered_words = [[word for word in words if word.lower() not in stop_words] for words in batch_words]
+
+            # Stemming
+            stemmer = PorterStemmer()
+            stemmed_words = [[stemmer.stem(word) for word in words] for words in filtered_words]
+
+            # Tagging Parts of Speech
+            pos_tags = [nltk.pos_tag(words) for words in stemmed_words]
+
+            # Lemmatizing
+            lemmatizer = WordNetLemmatizer()
+            lemmatized_words = [[lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos] for pos in pos_tags]
+
+            preprocessed_sentences.extend(lemmatized_words)
+
+        return preprocessed_sentences
+
+    def process_data(self, data_json):
+        #print("Processing data in parallel...")
+        batch_size = 10000 # Experiment with different batch sizes
+        num_processes = int(multiprocessing.cpu_count()/2) # Utilize more processes
+
+        batches = [data_json[i:i + batch_size] for i in range(0, len(data_json), batch_size)]
+
+        #print('batches')
+
+        #training_tagged = [] # Initialize or clear self.training_tagged
+        sentence_answers = []
+
+        with ProcessPoolExecutor(max_workers=num_processes) as executor:
+            results = list(tqdm(executor.map(self.process_data_batch, batches), total=len(batches)))
+
+        #with multiprocessing.Pool() as pool:
+        #results = []
+        #for batch in batches:
+        #results.append(self.process_data_batch(batch))
+
+        for batch_result in results:
+            for result in batch_result:
+                sentence_answers.extend(result)
+                #print("here")
+
+        # Create a dictionary to group sentences by answer
+        answer_groups = defaultdict(list)
+
+        # Iterate through each (sentence, answer) pair in batch_results
+        for sentence, answer in sentence_answers:
+            answer_groups[answer].extend(sentence)
+
+        #print(list(answer_groups.items())[0])
+
+        # Create a new list with sentences grouped by answer
+        sentence_answers.extend([(sentence,answer) for answer, sentence in answer_groups.items()])
+
+        self.flattened_sentences.extend([x[0] for x in sentence_answers])
+        self.training_tagged.extend([x[1] for x in sentence_answers])
+
+
+
+        #print("Data processing complete.")
+
+    def process_data_batch(self, batch):
+        batch_results = []
+
+
+
+        for data in batch:
+            text = data["text"]
+            answer = data["answer"]
+            preprocessed_sentences = self.preprocess_text(text)
+            training_tagged = [(sentence, answer) for sentence in preprocessed_sentences]
+
+
+
+            #print(training_tagged)
+            batch_results.append(training_tagged)
+
+        #create another list where instead, the "sentence" of elements with the same answer are appended with each other
+
+        return batch_results
+
+    def train_model(self):
+        # Fit and transform the TF-IDF vectorizer
+
+        #print(self.flattened_sentences)
+        if(self.flattened_sentences):
+            self.training_tfidf = self.tfidf.fit_transform(self.flattened_sentences)
+            self.flattened_sentences = []
+        #self.
+
+        #print(self.training_tfidf)
+        #print(self.training_tagged)
+
+
+
+
+    def save(self, file_path):
+        model_data = {
+            'training_tagged': list(self.training_tagged),
+            'tfidf': self.tfidf,
+            'training_tfidf': self.training_tfidf
+        }
+        #print(model_data)
+        with open(file_path, 'wb') as f:
+            joblib.dump(model_data, f)
+
+    def load(self, file_path):
+
+        if os.path.exists(file_path):
+            with open(file_path, 'rb') as f:
+                print(os.path.exists(file_path))
+                model_data = joblib.load(file_path)
+                self.training_tagged = list(model_data['training_tagged'])
+                self.tfidf = model_data['tfidf']
+                print(self.tfidf)
+                self.training_tfidf = model_data['training_tfidf']
+
+        return self
+
+    def predict(self, input_data):
+        # Preprocess input data
+        new_text_processed = self.preprocess_text(input_data)
+        new_text_processed_tfidf = self.tfidf.transform(new_text_processed)
+        training_tfidf = self.training_tfidf
+
+        # Calculate sentence similarities
+        sentence_similarities = cosine_similarity(new_text_processed_tfidf, training_tfidf)
+
+        # Initialize data structures
+        similarities_max = {}
+        similarities_per_sentence = []
+        answers = None
+
+        # Iterate over sentence similarities
+        for similarity_row in sentence_similarities:
+            for answer, similarity in zip(self.training_tagged, similarity_row):
+                if isinstance(answer, list):
+                    continue
+                # Update similarities_max only when the new similarity is greater
+                if answer not in similarities_max or similarity > similarities_max[answer]:
+                    similarities_max[answer] = similarity
+
+            if not answers:
+                answers.extend(similarities_max.keys())
+                similarities_per_sentence = similarities_max
+            else:
+                for answer, similarity in similarities_max:
+                    similarities_per_sentence[answer] += similarity
+
+            similarities_max = {}
+
+
+        # Calculate total similarity for each answer and find the maximum similarity and its index
+        total_similarities = np.array([similarities_per_sentence[answer] for answer in answers])
+        closest_index = np.argmax(total_similarities)
+        closest_answer = answers[closest_index]
+
+        return total_similarities[closest_index], closest_answer
+
+
+
+
+
+
+
+        #return (sentences.max(),self.training_tagged[closest_index])
+
+
+
+
+
+    def evaluate(self, test_data, labels):
+        # Evaluate the performance of the model on test data
+        # Return evaluation metrics
+        pass
+
+# Additional functions for model tuning, hyperparameter optimization, etc.
+
+if __name__ == "__main__":
+    # Train a simple model on QB data, save it to a file
+    import argparse
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--data', type=str)
+    parser.add_argument('--model', type=str)
+    parser.add_argument('--predict', type=str)
+
+    flags = parser.parse_args()
+
+    model = NLPModel()
+
+    if flags.data:
+        with open(flags.data, 'r') as data_file:
+            data_json = json.load(data_file)
+
+        model.process_data(data_json)
+        model.train_model()
+        print(model.predict("My name is bobby, bobby newport. your name is jeff?"))
+        model.save("model.pkl")
+
+    if flags.model:
+        model.load(flags.model)
+
+    if flags.predict:
+        print(model.predict(flags.predict))
+
+
+
+
+
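For context, the heart of this change is the rewritten predict(): instead of keeping a single maximum similarity per answer across the whole query, it now takes, for each query sentence, the best cosine similarity seen for each candidate answer and accumulates those per-sentence maxima into a running total before picking the highest-scoring answer. A minimal standalone sketch of that aggregation step (hypothetical names and toy inputs, not the committed code) could look like this:

# Sketch of the per-answer aggregation the new predict() is aiming for:
# for each query sentence keep the best similarity per answer, then sum
# those maxima across sentences and return the top-scoring answer.
from collections import defaultdict
import numpy as np

def rank_answers(sentence_similarities, answer_labels):
    # sentence_similarities: (n_query_sentences, n_training_rows) array
    # answer_labels: answer string for each training row
    totals = defaultdict(float)
    for similarity_row in sentence_similarities:
        best_per_answer = {}
        for answer, similarity in zip(answer_labels, similarity_row):
            if answer not in best_per_answer or similarity > best_per_answer[answer]:
                best_per_answer[answer] = similarity
        for answer, similarity in best_per_answer.items():
            totals[answer] += similarity
    best_answer = max(totals, key=totals.get)
    return totals[best_answer], best_answer

# Toy example: two query sentences scored against three training rows
sims = np.array([[0.2, 0.7, 0.1],
                 [0.4, 0.3, 0.9]])
labels = ["Paris", "London", "Paris"]
print(rank_answers(sims, labels))  # 'Paris' wins with the largest summed similarity

As committed, the script is driven from the command line: --data takes a JSON file containing a list of objects with "text" and "answer" fields, trains the TF-IDF model, and saves it to model.pkl, while --model loads a previously saved model and --predict scores a question string against it.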