import pickle

import numpy as np
import pandas as pd


def fetch_pageviews(title):
    """Sum the daily pageview counts for an article between 2015-07-01 and 2021-06-07."""
    import pageviewapi

    page_views = pageviewapi.per_article(
        'en.wikipedia', title, '20150701', '20210607',
        access='all-access', agent='all-agents', granularity='daily')
    view_counter = 0
    for item in page_views['items']:
        view_counter += item['views']
    return view_counter
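
# Sketch, assumption only: per_article calls can fail transiently (HTTP errors,
# or pageviewapi raising when no data is loaded for the article), so a retry
# wrapper like this may be useful. fetch_pageviews_with_retry is a hypothetical
# helper, not part of the original flow, and the broad `except Exception` is a
# guess at the library's failure modes.
def fetch_pageviews_with_retry(title, max_retries=10):
    import time

    for attempt in range(1, max_retries + 1):
        try:
            return fetch_pageviews(title)
        except Exception as exc:
            print("Pageview fetch attempt", attempt, "failed:", exc)
            time.sleep(1)  # brief pause before retrying
    raise RuntimeError("Pageview API still failing after " + str(max_retries) + " retries")
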
def fetch_details_from_info_page(title):
    """Scrape page statistics from the article's action=info page."""
    import requests

    url = "https://en.wikipedia.org/w/index.php?action=info&title=" + title
    response = requests.get(url)
    # read_html parses every table on the page into a list of DataFrames
    df_list = pd.read_html(response.text)

    # Basic information table
    try:
        display_title = df_list[1][1][0]
    except IndexError:
        print("IndexError for Basic info table, so skipping")
        return None
    print("Display Title = ", display_title)

    # Process Table 1 - Basic information
    dict_table1 = df_list[1].to_dict()

    # Declare vars (each stays "" if its row is missing from the page)
    page_length = ""
    page_id = ""
    number_page_watchers = ""
    number_page_watchers_recent_edits = ""
    number_of_redirects = ""
    page_views_past_30days = ""
    total_edits = ""
    recent_number_of_edits = ""
    number_distinct_authors = ""
    number_categories = ""

    for key, value in dict_table1[0].items():
        if value == 'Page length (in bytes)':
            page_length = dict_table1[1][key]
            print("Page Length = ", page_length)
        elif value == 'Page ID':
            page_id = dict_table1[1][key]
            print("Scraped Page ID = ", page_id)
        elif value == 'Number of page watchers':
            number_page_watchers = dict_table1[1][key]
            print("Number of Page Watchers = ", number_page_watchers)
        elif value == 'Number of page watchers who visited recent edits':
            number_page_watchers_recent_edits = dict_table1[1][key]
            print("Number of Page Watchers with recent edits = ", number_page_watchers_recent_edits)
        elif value == 'Number of redirects to this page':
            number_of_redirects = dict_table1[1][key]
            print("Number of redirects = ", number_of_redirects)
        elif value == 'Page views in the past 30 days':
            page_views_past_30days = dict_table1[1][key]
            print("Page views past 30 days = ", page_views_past_30days)

    # Process Table 3 - Edit history
    try:
        dict_table3 = df_list[3].to_dict()
        for key, value in dict_table3[0].items():
            if value == 'Total number of edits':
                total_edits = dict_table3[1][key]
                print("Total Edits = ", total_edits)
            elif value == 'Recent number of edits (within past 30 days)':
                recent_number_of_edits = dict_table3[1][key]
                print("Recent number of edits = ", recent_number_of_edits)
            elif value == 'Recent number of distinct authors':
                number_distinct_authors = dict_table3[1][key]
                print("Distinct authors =", number_distinct_authors)
    except IndexError:
        print("Couldn't find the Edit History Table, so skipping...")

    # Page properties table
    try:
        categories_string = df_list[4][0][0]
        print(categories_string)
        if categories_string.startswith("Hidden categories"):
            # Collect the digits, e.g. "Hidden categories (12)" -> "12"
            for c in categories_string:
                if c.isdigit():
                    number_categories = number_categories + c
            print("Total number of categories = ", number_categories)
    except IndexError:
        print("Couldn't find the Page Properties Table, so skipping...")

    print("============================================== EOP ======================================")

    features_dict = {
        'page_length': page_length,
        'page_id': page_id,
        'number_page_watchers': number_page_watchers,
        'number_page_watchers_recent_edits': number_page_watchers_recent_edits,
        'number_of_redirects': number_of_redirects,
        'page_views_past_30days': page_views_past_30days,
        'total_edits': total_edits,
        'recent_number_of_edits': recent_number_of_edits,
        'number_distinct_authors': number_distinct_authors,
        'number_categories': number_categories,
    }
    return features_dict


def mapping_function(page_views, features_dict):
    """Map page_views and features_dict onto the 12-element input array the model expects."""
    features_of_test_sample = np.empty([12, ])
    features_of_test_sample[0] = features_dict['page_id']
    features_of_test_sample[1] = page_views
    features_of_test_sample[2] = features_dict['page_length']
    features_of_test_sample[3] = features_dict['number_page_watchers']
    features_of_test_sample[4] = features_dict['number_page_watchers_recent_edits']
    features_of_test_sample[5] = features_dict['number_of_redirects']
    features_of_test_sample[6] = features_dict['page_views_past_30days']
    features_of_test_sample[7] = features_dict['total_edits']
    features_of_test_sample[8] = features_dict['recent_number_of_edits']
    features_of_test_sample[9] = features_dict['number_distinct_authors']
    features_of_test_sample[10] = features_dict['number_categories']
    features_of_test_sample[11] = features_dict['page_id']  # page_id also fills the last slot
    wikipedia_url = "https://en.wikipedia.org/?curid=" + str(features_dict['page_id'])
    return features_of_test_sample, wikipedia_url


def get_features(title):
    # Total pageview count from the API
    page_views = fetch_pageviews(title)
    print('Title:', title, 'View Count:', page_views)
    # Scraped statistics from the info page
    features_dict = fetch_details_from_info_page(title)
    if features_dict is None:
        raise ValueError("Could not scrape info-page features for: " + title)
    # Map both onto a single numpy feature vector
    features_of_test_sample, wikipedia_url = mapping_function(page_views, features_dict)
    return features_of_test_sample, wikipedia_url


def predict_protection_level(title):
    features_of_test_sample, wikipedia_url = get_features(title)
    print("Page URL: ", wikipedia_url)

    # Load the trained model
    filename = 'rfmodel.sav'
    with open(filename, 'rb') as f:
        loaded_model = pickle.load(f)

    # Predict; the model expects a 2-D array, hence the extra list wrapper
    y_pred = loaded_model.predict([features_of_test_sample])
    print("Predicted protection_level: ", y_pred[0])

    predicted_protection_level = y_pred[0]
    if predicted_protection_level == 0:
        predicted_protection_level_str = "unprotected"
    elif predicted_protection_level == 1:
        predicted_protection_level_str = "autoconfirmed"
    elif predicted_protection_level == 2:
        predicted_protection_level_str = "extendedconfirmed"
    elif predicted_protection_level == 3:
        predicted_protection_level_str = "sysop"
    else:
        predicted_protection_level_str = "unknown"
    return predicted_protection_level_str


def main():
    predicted_protection_level_str = predict_protection_level("Donald Trump")
    print("Protection level:", predicted_protection_level_str)


if __name__ == '__main__':
    main()
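
# Sketch, assumption only: this script loads 'rfmodel.sav' but the training
# step is not included here. One plausible way to produce a compatible model is
# a scikit-learn RandomForestClassifier fit on the 12-column layout built by
# mapping_function(), with integer labels 0-3 as decoded in
# predict_protection_level(). X_train and y_train are hypothetical
# placeholders, not data from this project.
def train_and_save_model(X_train, y_train, filename='rfmodel.sav'):
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)  # X_train: shape (n_samples, 12); y_train: ints 0-3
    with open(filename, 'wb') as f:
        pickle.dump(model, f)  # same pickle format predict_protection_level() loads
    return model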