Spaces:
Sleeping
Sleeping
| import csv | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| from sklearn.model_selection import train_test_split, cross_val_score | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc | |
| from sklearn.utils import shuffle | |
| from sklearn.model_selection import learning_curve | |
| import gender_guesser.detector as gender | |
def read_datasets(genuine_path="data/users.csv", fake_path="data/fusers.csv"):
    """Read genuine and fake user profiles from CSV files and label them.

    Args:
        genuine_path: CSV file holding the genuine user profiles.
        fake_path: CSV file holding the fake user profiles.

    Returns:
        A tuple ``(x, y)``. ``x`` is one DataFrame with the genuine rows
        followed by the fake rows; each input keeps its own index, so index
        labels may repeat. ``y`` is a list aligned with the rows of ``x``
        holding 1 for every genuine row and 0 for every fake row.
    """
    genuine_users = pd.read_csv(genuine_path)
    fake_users = pd.read_csv(fake_path)
    x = pd.concat([genuine_users, fake_users])
    # Labels follow the concatenation order: genuine first, then fake.
    y = [1] * len(genuine_users) + [0] * len(fake_users)
    return x, y
def predict_sex(names):
    """Guess a numeric sex code for each name from its first word.

    Args:
        names: Iterable of display names. Entries that are not strings
            (for example the float NaN pandas uses for a missing cell)
            are accepted and receive the default code.

    Returns:
        A list with one int per input name, in input order:
        2 when the detector answers ``'female'``, 1 when it answers
        ``'male'``, and 0 for every other answer (such as
        ``'mostly_female'`` or ``'mostly_male'``) or for a non-string name.
    """
    sex_predictor = gender.Detector(case_sensitive=False)
    # Only the two unambiguous answers get their own code; everything else
    # falls through to the default of 0.
    code_by_sex = {'female': 2, 'male': 1}
    sex_code = []
    for name in names:
        if not isinstance(name, str):
            # A missing name has no first word to look up; calling
            # .split() on it would raise AttributeError.
            sex_code.append(0)
            continue
        first_name = name.split(' ')[0]
        sex = sex_predictor.get_gender(first_name)
        sex_code.append(code_by_sex.get(sex, 0))
    return sex_code
def extract_features(x):
    """Build the feature table used by the classifier.

    Args:
        x: DataFrame of user profiles. It must contain a ``name`` column
            and the count columns listed in ``feature_columns_to_use``.
            The input frame is not modified.

    Returns:
        A new DataFrame holding only the feature columns, including a
        ``sex_code`` column derived from ``name`` via ``predict_sex``.
    """
    feature_columns_to_use = ['statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count', 'sex_code']
    # assign() works on a copy, so the caller's frame does not silently
    # gain a 'sex_code' column.
    with_sex_code = x.assign(sex_code=predict_sex(x['name']))
    return with_sex_code[feature_columns_to_use]
| # Rest of your code... | |
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves on a new figure.

    Args:
        estimator: Model handed to ``learning_curve``.
        title: Figure title.
        X: Training samples handed to ``learning_curve``.
        y: Targets handed to ``learning_curve``.
        ylim: Optional pair unpacked into ``plt.ylim``; skipped when None.
        cv: Forwarded unchanged to ``learning_curve``.
        n_jobs: Forwarded unchanged to ``learning_curve``.
        train_sizes: Forwarded unchanged to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can show or save
        the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean over folds, std over folds, colour, label).
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    plt.grid()
    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, colour, _label in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _std, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Render a confusion matrix as a colour image on the current figure.

    Args:
        cm: Matrix passed to ``plt.imshow``.
        title: Title placed above the image.
        cmap: Colour map used for the image.

    Both axes are ticked with the labels 'Fake' and 'Genuine'. The function
    neither creates a figure nor calls ``plt.show()``.
    """
    class_labels = ['Fake', 'Genuine']
    positions = np.arange(len(class_labels))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # Same labels on both axes; the x labels are tilted 45 degrees.
    plt.xticks(positions, class_labels, rotation=45)
    plt.yticks(positions, class_labels)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
def plot_roc_curve(y_test, y_pred):
    """Print the ROC rates, then plot the ROC curve with its AUC and show it.

    Args:
        y_test: True labels, passed to ``roc_curve``.
        y_pred: Predicted values, passed to ``roc_curve``.

    The curve is drawn on the current figure together with a dashed
    diagonal from (0, 0) to (1, 1); ``plt.show()`` is called at the end.
    """
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred)
    print("False Positive rate: ", fpr)
    print("True Positive rate: ", tpr)

    area = auc(fpr, tpr)
    curve_label = 'AUC = %0.2f' % area

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=curve_label)
    plt.legend(loc='lower right')
    # Reference diagonal, drawn after the legend so it has no legend entry.
    plt.plot([0, 1], [0, 1], 'r--')

    axis_range = [-0.1, 1.2]
    plt.xlim(list(axis_range))
    plt.ylim(list(axis_range))
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def train(X_train, y_train, X_test, y_test=None):
    """Train a Random Forest classifier, report on it, and predict X_test.

    Steps, in order: fit a 40-tree ``RandomForestClassifier``, print its
    5-fold cross-validation scores, plot and show its learning curve,
    predict ``X_test``, and pickle the fitted classifier to ``data.pkl``
    in the current working directory.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict.
        y_test: Optional true labels for ``X_test``; returned unchanged so
            the caller can unpack ``(y_test, y_pred)``. When omitted, the
            module-level ``y_test`` is used, which is what this function
            always returned before the parameter existed.

    Returns:
        A tuple ``(y_test, y_pred)`` where ``y_pred`` is the classifier's
        prediction for ``X_test``.

    Raises:
        NameError: If ``y_test`` is omitted and no module-level ``y_test``
            is defined.
    """
    import pickle

    clf = RandomForestClassifier(n_estimators=40, oob_score=True)
    clf.fit(X_train, y_train)
    print("The best classifier is: ", clf)

    # Estimate score
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(scores)
    # NOTE(review): the spread printed here is std / 2; a "+/-" interval is
    # more commonly reported as std * 2 -- confirm which was intended.
    print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))

    title = 'Learning Curves (Random Forest)'
    plot_learning_curve(clf, title, X_train, y_train, cv=5)
    plt.show()

    # Predict
    y_pred = clf.predict(X_test)

    # Persist the fitted model.
    with open('data.pkl', 'wb') as file:
        pickle.dump(clf, file)

    if y_test is None:
        # Backward compatibility: older callers pass three arguments and
        # rely on the module-level y_test being handed back.
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError("name 'y_test' is not defined") from None
    return y_test, y_pred
# End-to-end run: load the profiles, build features, train the classifier,
# then report accuracy, confusion matrices and the ROC curve.
print("Reading datasets...\n")
x, y = read_datasets()
# Summary of the raw data; the result was previously computed and discarded.
print(x.describe())

print("Extracting features...\n")
x = extract_features(x)
print(x.columns)
print(x.describe())

print("Splitting datasets into train and test dataset...\n")
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=44)

print("Training datasets...\n")
# train() hands back the test labels alongside its predictions.
y_test, y_pred = train(X_train, y_train, X_test)
print('Classification Accuracy on Test dataset: ', accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix, without normalization')
print(cm)
# Each plot gets its own figure; otherwise both matrices and the ROC curve
# are drawn on top of one another on a single implicit figure.
plt.figure()
plot_confusion_matrix(cm)
plt.show()

# Row-normalise so every true class sums to 1.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
plt.show()

print(classification_report(y_test, y_pred, target_names=['Fake', 'Genuine']))

plt.figure()
plot_roc_curve(y_test, y_pred)