# Spaces: Running
from flask import Flask, request, jsonify, send_from_directory | |
import pickle | |
import torch | |
import re | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.stem import WordNetLemmatizer | |
from transformers import DistilBertTokenizer, DistilBertModel | |
import torch.nn as nn | |
import os | |
import numpy | |
# Register an extra NLTK search directory, then fetch the NLTK resources
# this module requests, in the same order as before.
nltk.data.path.append('/usr/local/share/nltk_data')
for _nltk_resource in ('punkt_tab', 'stopwords', 'wordnet', 'punkt'):
    nltk.download(_nltk_resource)

# Flask application; files under 'build' are exposed at the URL root
# because static_url_path is the empty string.
app = Flask(__name__, static_folder='build', static_url_path='')
# Define DistilBERT model class
class DistilBERTClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    Args:
        dropout_rate: Dropout probability applied to the pooled representation
            before the linear head.
        model_name: Name or path passed to ``DistilBertModel.from_pretrained``.
        num_labels: Number of output logits produced by the linear head.
    """

    def __init__(self, dropout_rate=0.2, model_name='distilbert-base-uncased',
                 num_labels=2):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        # Size the head from the encoder's own config instead of a literal;
        # `dim` is 768 for 'distilbert-base-uncased', so the default layout
        # (and therefore any saved state_dict) is unchanged.
        self.classifier = nn.Linear(self.distilbert.config.dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of tokenized inputs.

        Args:
            input_ids: Token id tensor forwarded to the DistilBERT encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            The output of the linear head (``num_labels`` values per example).
        """
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the pooled
        # representation of each example.
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)
# Clean text function
def clean_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    The text is lower-cased, then stripped (in this order) of URL-like
    substrings, ``<...>`` tags, characters that are neither word characters
    nor whitespace, and digit runs. The remainder is tokenized with NLTK,
    English stop words are dropped, and each surviving token is lemmatized
    with ``WordNetLemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Order matters: each pattern is applied to the output of the previous one.
    removal_patterns = (
        r'http\S+|www\S+|https\S+',
        r'<.*?>',
        r'[^\w\s]',
        r'\d+',
    )
    normalized = text.lower()
    for pattern in removal_patterns:
        normalized = re.sub(pattern, '', normalized)

    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    kept = []
    for token in nltk.word_tokenize(normalized):
        if token in english_stop_words:
            continue
        kept.append(wordnet.lemmatize(token))
    return ' '.join(kept)
# Load models
def _load_pickle(path):
    """Unpickle and return the object stored in the file at *path*."""
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point this at model artifacts you produced or trust.
    with open(path, 'rb') as f:
        return pickle.load(f)


def load_models(model_dir='models'):
    """Load every model artifact used by the service.

    Args:
        model_dir: Directory containing ``tfidf_vectorizer.pkl``,
            ``lr_model.pkl``, ``rf_model.pkl`` and ``distilbert_model.pt``.
            Defaults to ``'models'``, resolved against the current working
            directory.

    Returns:
        A tuple ``(tfidf_vectorizer, lr_model, rf_model, distilbert_model,
        tokenizer, device)``.

    Raises:
        OSError: If one of the artifact files cannot be opened.
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # TF-IDF vectorizer, Logistic Regression and Random Forest are pickled.
    tfidf_vectorizer = _load_pickle(os.path.join(model_dir, 'tfidf_vectorizer.pkl'))
    lr_model = _load_pickle(os.path.join(model_dir, 'lr_model.pkl'))
    rf_model = _load_pickle(os.path.join(model_dir, 'rf_model.pkl'))

    # load DistilBERT
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    distilbert_model = DistilBERTClassifier()
    # NOTE(security): torch.load also unpickles; keep the checkpoint trusted.
    # NOTE(review): consider weights_only=True if the installed torch
    # version supports it — confirm before enabling.
    state_dict = torch.load(
        os.path.join(model_dir, 'distilbert_model.pt'),
        map_location=device,
    )
    distilbert_model.load_state_dict(state_dict)
    distilbert_model.to(device)
    # Evaluation mode so dropout is inactive during inference.
    distilbert_model.eval()

    return tfidf_vectorizer, lr_model, rf_model, distilbert_model, tokenizer, device
# Load all models once when the module is imported; the request handlers
# below read these module-level names.
(
    tfidf_vectorizer,
    lr_model,
    rf_model,
    distilbert_model,
    tokenizer,
    device,
) = load_models()
def serve():
    """Return ``index.html`` from the app's static folder."""
    # NOTE(review): no @app.route decorator is visible on this function, so
    # as written it is never registered as a view — confirm the intended URL
    # (presumably '/') and register it.
    index_page = 'index.html'
    return send_from_directory(app.static_folder, index_page)
# Values accepted for the request's 'model' field.
_MODEL_OPTIONS = ("lr", "rf", "distilbert", "all")


def _build_result(is_real, fake_prob, real_prob):
    """Build the per-model result dict returned to the client."""
    return {
        "prediction": "Real" if is_real else "Fake",
        "fake_prob": float(fake_prob),
        "real_prob": float(real_prob),
    }


def _tfidf_model_result(model, text_tfidf):
    """Run a TF-IDF based classifier and format its prediction."""
    pred = model.predict(text_tfidf)[0]
    prob = model.predict_proba(text_tfidf)[0]
    # Label 1 is treated as "Real"; index 0/1 of predict_proba are reported
    # as fake/real. NOTE(review): assumes model.classes_ is [0, 1] — confirm.
    return _build_result(pred == 1, prob[0], prob[1])


def _distilbert_result(cleaned_text):
    """Run the DistilBERT classifier on *cleaned_text* and format its prediction."""
    encoding = tokenizer(
        cleaned_text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )
    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        outputs = distilbert_model(input_ids=input_ids, attention_mask=attention_mask)
        print("Raw model output:", outputs.cpu().numpy())
        probs = torch.softmax(outputs, dim=1).cpu().numpy()[0]
    print("After softmax:", probs)
    print(f"Text: {cleaned_text[:50]}...")
    # Labels here match the response below (index 0 = Fake, index 1 = Real);
    # the previous message printed them the other way round.
    print(f"Probabilities: Fake={probs[0]:.4f}, Real={probs[1]:.4f}")
    return _build_result(probs[1] > probs[0], probs[0], probs[1])


def analyze():
    """Classify the submitted news text with the requested model(s).

    Expects a JSON object body with:
        text: Non-empty string to classify.
        model: One of "lr", "rf", "distilbert" or "all".

    Returns:
        On success, a JSON response ``{'results': {...}}`` keyed by model
        name ("Logistic Regression", "Random Forest", "DistilBERT"), each
        with ``prediction``, ``fake_prob`` and ``real_prob``. With "all",
        an "Overall" entry carries the majority verdict and vote counts.
        On invalid input, a JSON ``{'error': ...}`` body with status 400.
    """
    # NOTE(review): no @app.route decorator is visible on this function, so
    # as written it is never registered as a view — confirm the intended URL
    # and HTTP method (it reads a JSON body) and register it.
    data = request.get_json()
    # Require a JSON object; a list or string body would otherwise fail
    # on the key lookups below.
    if not isinstance(data, dict) or 'text' not in data or 'model' not in data:
        return jsonify({'error': 'Missing required fields'}), 400

    news_text = data['text']
    model_option = data['model']

    if news_text is not None and not isinstance(news_text, str):
        return jsonify({'error': 'Text must be a string'}), 400
    # Whitespace-only input carries nothing to classify, so treat it as empty.
    if not news_text or not news_text.strip():
        return jsonify({'error': 'Text cannot be empty'}), 400
    # Reject unknown options instead of answering 200 with empty results.
    if model_option not in _MODEL_OPTIONS:
        return jsonify({'error': 'Unknown model option'}), 400

    # Clean text
    cleaned_text = clean_text(news_text)
    results = {}

    # Logistic Regression and Random Forest share one TF-IDF transform.
    if model_option in ("lr", "rf", "all"):
        text_tfidf = tfidf_vectorizer.transform([cleaned_text])
        if model_option in ("lr", "all"):
            results["Logistic Regression"] = _tfidf_model_result(lr_model, text_tfidf)
        if model_option in ("rf", "all"):
            results["Random Forest"] = _tfidf_model_result(rf_model, text_tfidf)

    # Using DistilBERT
    if model_option in ("distilbert", "all"):
        results["DistilBERT"] = _distilbert_result(cleaned_text)

    # Calculate overall results for "all models" option
    if model_option == "all":
        # Count before inserting "Overall" so it is not tallied as a model.
        total_models = len(results)
        real_votes = sum(
            1 for result in results.values() if result["prediction"] == "Real"
        )
        fake_votes = total_models - real_votes
        results["Overall"] = {
            "prediction": "Real" if real_votes >= fake_votes else "Fake",
            "real_votes": real_votes,
            "fake_votes": fake_votes,
            "total_models": total_models,
        }

    return jsonify({'results': results})
if __name__ == '__main__':
    # Listen on all interfaces; the port comes from the PORT environment
    # variable, falling back to 7860 when it is unset.
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))