import os
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
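
# Path to the fine-tuned model directory, resolved relative to this script's location.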
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(_SCRIPT_DIR, "fine_tuned_model")

print(f"DEBUG (predict.py): Model path set to: {MODEL_PATH}")

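# Select the inference device: use CUDA when a GPU is available, otherwise fall back to the CPU.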
if torch.cuda.is_available():
    device = torch.device("cuda")
    try:
        gpu_name = torch.cuda.get_device_name(0)
        print(f"INFO (predict.py): GPU is available ({gpu_name}), using CUDA.")
    except Exception as e:
        print(
            f"INFO (predict.py): GPU is available, using CUDA. (Could not get GPU name: {e})"
        )
else:
    device = torch.device("cpu")
    print(
        "INFO (predict.py): GPU not available, using CPU. Predictions might be slower."
    )

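# Load the model and tokenizer once at import time so repeated calls to
# predict_sentiments() reuse them; the mapping below is a fallback in case the
# model config does not provide id2label.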
model = None
tokenizer = None
id2label_mapping = {0: "negative", 1: "neutral", 2: "positive"}

try:
    print(f"INFO (predict.py): Loading model from {MODEL_PATH}...")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    model.to(device)
    model.eval()
    print("INFO (predict.py): Model loaded successfully and set to evaluation mode.")

    print(f"INFO (predict.py): Loading tokenizer from {MODEL_PATH}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    print("INFO (predict.py): Tokenizer loaded successfully.")

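    # Prefer the id2label mapping stored in the model config; JSON-loaded configs
    # keep the keys as strings, so normalize them to ints.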
    if hasattr(model.config, "id2label") and model.config.id2label:
        id2label_mapping = model.config.id2label
        id2label_mapping = {int(k): v for k, v in id2label_mapping.items()}
        print(
            f"INFO (predict.py): Loaded id2label mapping from model config: {id2label_mapping}"
        )
    else:
        print(
            "WARN (predict.py): id2label not found in model config, using default mapping."
        )

except FileNotFoundError:
    print("--- CRITICAL ERROR (predict.py) ---")
    print(f"Model or Tokenizer files NOT FOUND at the specified path: {MODEL_PATH}")
    print(
        f"Please ensure the '{os.path.basename(MODEL_PATH)}' directory exists at '{_SCRIPT_DIR}' and contains all necessary model files (pytorch_model.bin/model.safetensors, config.json, tokenizer files, etc.)."
    )
except Exception as e:
    print("--- ERROR (predict.py) ---")
    print(f"An unexpected error occurred loading model or tokenizer: {e}")


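# Mask @user handles and URLs so inputs match the tweet-style normalization the
# model is assumed to have been fine-tuned with.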
def preprocess_tweet(text):
    """Replaces @user mentions and http links with placeholders."""
    preprocessed_text = []
    if text is None:
        return ""

    for t in text.split(" "):
        if len(t) > 0:
            t = "@user" if t.startswith("@") else t
            t = "http" if t.startswith("http") else t
            preprocessed_text.append(t)
    return " ".join(preprocessed_text)


def predict_sentiments(comment_list: list):
    """
    Predicts sentiments for a list of comment strings.
    Returns a list of dictionaries, each containing the predicted label
    and the probabilities (scores) for each class.
    e.g., [{'label': 'positive', 'scores': {'negative': 0.1, 'neutral': 0.2, 'positive': 0.7}}, ...]
    """
    if model is None or tokenizer is None:
        print(
            "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot predict."
        )
        return [{"label": "Error: Model not loaded", "scores": {}}] * len(comment_list)

    if not comment_list:
        return []

    inference_batch_size = 64
    print(
        f"INFO (predict.py): Predicting sentiments for {len(comment_list)} comments in batches of {inference_batch_size}..."
    )

    all_results_list = []
    i = 0  # Keep the batch index defined for the error message in the except block below.

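    # Run inference in fixed-size batches to keep memory use bounded.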
    try:
        total_comments = len(comment_list)
        for i in range(0, total_comments, inference_batch_size):
            batch_comments = comment_list[i : i + inference_batch_size]

            current_batch_num = i // inference_batch_size + 1
            total_batches = (
                total_comments + inference_batch_size - 1
            ) // inference_batch_size
            print(
                f"DEBUG (predict.py): Processing batch {current_batch_num}/{total_batches}..."
            )

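            # Clean each comment in the batch with preprocess_tweet.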
            processed_batch = [preprocess_tweet(comment) for comment in batch_comments]

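            # Tokenize the batch with padding and truncation; cap sequences at the
            # tokenizer's model_max_length when set, otherwise at 512 tokens.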
            inputs = tokenizer(
                processed_batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=(
                    tokenizer.model_max_length
                    if hasattr(tokenizer, "model_max_length")
                    and tokenizer.model_max_length
                    else 512
                ),
            )

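            # Move the tokenized inputs onto the same device as the model.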
            inputs = {k: v.to(device) for k, v in inputs.items()}

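            # Forward pass with gradient tracking disabled.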
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits

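            # Convert logits to class probabilities and take the highest-probability class.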
            probabilities_batch = torch.softmax(logits, dim=-1)
            predicted_class_ids_batch = torch.argmax(probabilities_batch, dim=-1)

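            # Copy the results back to the CPU as plain Python lists for post-processing.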
            probs_list_batch = probabilities_batch.cpu().numpy().tolist()
            ids_list_batch = predicted_class_ids_batch.cpu().numpy().tolist()

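            # Build one {'label', 'scores'} result dict per comment in the batch.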
            batch_results = []
            for j in range(len(ids_list_batch)):
                pred_id = ids_list_batch[j]
                pred_label = id2label_mapping.get(pred_id, "Unknown")
                pred_scores = {
                    label_name: probs_list_batch[j][label_id]
                    for label_id, label_name in id2label_mapping.items()
                    if 0 <= label_id < probabilities_batch.shape[-1]
                }
                batch_results.append({"label": pred_label, "scores": pred_scores})

            all_results_list.extend(batch_results)

        print(
            f"INFO (predict.py): Finished processing all {len(all_results_list)} comments."
        )

    except Exception as e:
        print("--- ERROR (predict.py - predict_sentiments loop) ---")
        print(
            f"An error occurred during batch prediction (around comment index {i}): {e}"
        )
        import traceback

        traceback.print_exc()

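        # Pad the output with error placeholders so its length still matches the input list.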
        num_processed = len(all_results_list)
        num_remaining = len(comment_list) - num_processed
        all_results_list.extend(
            [{"label": "Error: Batch failed", "scores": {}}] * num_remaining
        )

    return all_results_list


if __name__ == "__main__":
    print("\n--- Testing predict.py Script Directly ---")
    if model and tokenizer:
        sample_comments_for_testing = [
            "This is an amazing movie, I loved it!",
            "I'm not sure how I feel about this, it was okay.",
            "Worst experience ever, would not recommend.",
            "The food was alright, but the service was slow.",
            "What a fantastic day! #blessed",
            "I hate waiting in long lines.",
            "@user Check out http this is cool.",
            "Just a normal sentence, nothing special here.",
            "",
            "This new update is absolutely terrible and full of bugs.",
        ]

        print("\nInput Comments for Direct Test:")
        for i, c in enumerate(sample_comments_for_testing):
            print(f"{i+1}. '{c}'")

        prediction_results = predict_sentiments(sample_comments_for_testing)

        print("\nPredicted Sentiments and Scores (Direct Test):")
        for i, (comment, result) in enumerate(
            zip(sample_comments_for_testing, prediction_results)
        ):
            print(f"{i+1}. Comment: '{comment}'")
            scores_dict = result.get("scores", {})
            formatted_scores = ", ".join(
                [f"{name}: {score:.3f}" for name, score in scores_dict.items()]
            )
            print(f" -> Predicted Label: {result.get('label', 'N/A')}")
            print(f" -> Scores: {{{formatted_scores}}}")
        print("--- Direct Test Finished ---")
    else:
        print("ERROR (predict.py - main test): Model and/or tokenizer not loaded.")
        print(
            f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
        )