Update README.md
Browse files
README.md
CHANGED
@@ -49,6 +49,13 @@ filename='lr_clf_test2.joblib'
|
|
49 |
model_file_path=hf_hub_download(repo_id=repo_id, filename=filename) <br>
|
50 |
model=joblib.load(model_file_path)
|
51 |
print(model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
#Load test dataset (assuming the name is the same as the one in the Ed post) <br>
|
54 |
test_df = pd.read_csv(file_path)
|
@@ -215,78 +222,22 @@ X_test = X_test.dropna(subset = ['title'])
|
|
215 |
X_test = handle_missing_data(X_test, 'title')
|
216 |
X_test = consistency_checks(X_test, 'title') </pre>
|
217 |
|
218 |
-
# Load the embedding model from Hugging Face (transformer: DistilBERT)
|
219 |
-
|
220 |
-
|
221 |
-
<pre>
|
222 |
-
def get_embeddings(text_all, tokenizer, model, device, max_len=128):
|
223 |
-
'''
|
224 |
-
Generate embeddings using a transformer model on GPU if available.
|
225 |
-
Args:
|
226 |
-
- text_all: List of input texts
|
227 |
-
- tokenizer: Tokenizer for the model
|
228 |
-
- model: Transformer model
|
229 |
-
- device: torch.device to run the computations
|
230 |
-
- max_len: Maximum token length for the input
|
231 |
-
Returns:
|
232 |
-
- embeddings: List of embeddings for each input text
|
233 |
-
'''
|
234 |
-
embeddings = []
|
235 |
-
|
236 |
-
count = 0
|
237 |
-
print('Start embeddings:')
|
238 |
-
|
239 |
-
for text in text_all:
|
240 |
-
count += 1
|
241 |
-
if count % (len(text_all) // 10) == 0:
|
242 |
-
print(f'{count / len(text_all) * 100:.1f}% done ...')
|
243 |
-
|
244 |
-
# Tokenize the input text
|
245 |
-
model_input_token = tokenizer(
|
246 |
-
text,
|
247 |
-
add_special_tokens=True,
|
248 |
-
max_length=max_len,
|
249 |
-
padding='max_length',
|
250 |
-
truncation=True,
|
251 |
-
return_tensors='pt'
|
252 |
-
).to(device) # Move input tensors to GPU
|
253 |
-
|
254 |
-
# Generate embeddings without gradient computation
|
255 |
-
with torch.no_grad():
|
256 |
-
model_output = model(**model_input_token)
|
257 |
-
cls_embedding = model_output.last_hidden_state[:, 0, :] # Use CLS token embedding
|
258 |
-
cls_embedding = cls_embedding.squeeze().cpu().numpy() # Move back to CPU for numpy
|
259 |
-
embeddings.append(cls_embedding)
|
260 |
-
|
261 |
-
return embeddings </pre>
|
262 |
-
|
263 |
-
|
264 |
-
# Check for GPU availability
|
265 |
-
<pre>
|
266 |
-
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
267 |
-
print(f'Using device: {device}')
|
268 |
-
|
269 |
-
# Load the tokenizer and model for 'distilbert-base-uncased'
|
270 |
-
print("Loading model and tokenizer...")
|
271 |
-
# Load model and tokenizer
|
272 |
-
tokenizer_news = AutoTokenizer.from_pretrained('distilbert-base-uncased')
|
273 |
-
model_news = AutoModel.from_pretrained('distilbert-base-uncased').to(device)
|
274 |
|
275 |
-
# Set the model to evaluation mode
|
276 |
-
model_news.eval()
|
277 |
|
278 |
-
#############################################
|
279 |
############################################# Embedding #############################################
|
280 |
-
|
|
|
281 |
|
282 |
y_test = X_test['labels']
|
283 |
X_test = X_test['title']
|
284 |
-
|
285 |
-
X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_news, model_news, device, max_len=128)
|
286 |
-
print("DBERT embeddings for training data computed!")
|
287 |
|
|
|
288 |
|
289 |
-
|
|
|
|
|
|
|
290 |
</pre>
|
291 |
# Accuracy
|
292 |
<pre>label_map = {'NBC': 0, 'FoxNews': 1}
|
|
|
49 |
model_file_path=hf_hub_download(repo_id=repo_id, filename=filename) <br>
|
50 |
model=joblib.load(model_file_path)
|
51 |
print(model)
|
52 |
+
|
53 |
+
repo_id2='awngsz/tfidf_model' ############# <--- check tfidf model name
|
54 |
+
filename2='embed_tfidf.joblib'
|
55 |
+
|
56 |
+
model_file_path2=hf_hub_download(repo_id=repo_id2, filename=filename2) <br>
|
57 |
+
tfidf_model=joblib.load(model_file_path2)
|
58 |
+
print(tfidf_model)
|
59 |
|
60 |
#Load test dataset (assuming the name is the same as the one in the Ed post) <br>
|
61 |
test_df = pd.read_csv(file_path)
|
|
|
222 |
X_test = handle_missing_data(X_test, 'title')
|
223 |
X_test = consistency_checks(X_test, 'title') </pre>
|
224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
|
|
|
|
|
226 |
|
227 |
+
############################################# TF-IDF Embedding #############################################
|
228 |
############################################# Embedding #############################################
|
229 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
230 |
+
print("Computing embeddings ...")
|
231 |
|
232 |
y_test = X_test['labels']
|
233 |
X_test = X_test['title']
|
|
|
|
|
|
|
234 |
|
235 |
+
X_test_tfidf = tfidf_model.transform(X_test)
|
236 |
|
237 |
+
#X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_news, model_news, device, max_len=128)
|
238 |
+
print("Embeddings computed!")
|
239 |
+
|
240 |
+
prediction = model.predict(X_test_tfidf)
|
241 |
</pre>
|
242 |
# Accuracy
|
243 |
<pre>label_map = {'NBC': 0, 'FoxNews': 1}
|