# Predicts the protection level of a Wikipedia article from its page-view
# history and scraped info-page statistics, using a pre-trained
# random-forest model (rfmodel.sav).
import pickle
import numpy as np
import pandas as pd
def fetch_pageviews(title, start='20150701', end='20210607'):
    """Return the total daily page views for *title* on en.wikipedia.

    Args:
        title: Article title to query.
        start, end: Inclusive YYYYMMDD date strings; the defaults match
            the window the original script used.

    Returns:
        The sum of daily view counts over the window, as an int.
    """
    # Third-party dependency kept function-local (as in the original) so the
    # rest of the module imports even when pageviewapi is not installed.
    import pageviewapi
    page_views = pageviewapi.per_article(
        'en.wikipedia', title, start, end,
        access='all-access', agent='all-agents', granularity='daily')
    # Fold the per-day items into one total (replaces the index loop;
    # also drops the unused retry_count/MAX_RETRIES locals).
    return sum(item['views'] for item in page_views['items'])
def fetch_details_from_info_page(title):
    """Scrape model features for *title* from its Wikipedia "info" page.

    Downloads https://en.wikipedia.org/w/index.php?action=info&title=<title>
    and parses every HTML table on the page with pandas.  Values are kept
    exactly as scraped (strings); features whose rows are absent stay "".

    Returns:
        A dict with keys page_length, page_id, number_page_watchers,
        number_page_watchers_recent_edits, number_of_redirects,
        page_views_past_30days, total_edits, recent_number_of_edits,
        number_distinct_authors and number_categories — or None when the
        basic-information table cannot be located (e.g. missing page).
    """
    import requests  # third-party; kept function-local as in the original
    url = "https://en.wikipedia.org/w/index.php?action=info&title=" + title
    html_content = requests.get(url)
    # pd.read_html parses all <table> elements on the page into DataFrames.
    df_list = pd.read_html(html_content.text)

    # ---- Table 1: basic information -----------------------------------
    try:
        display_title = df_list[1][1][0]
    except IndexError:
        print("IndexError for Basic info table, so skipping")
        return None  # explicit: original fell through with a bare return
    print("Display Title = ", display_title)

    dict_table1 = df_list[1].to_dict()
    # Defaults: empty string when a statistic is missing from the page.
    # (The original initialized page_views_past_30days twice; fixed.)
    page_length = ""
    page_id = ""
    number_page_watchers = ""
    number_page_watchers_recent_edits = ""
    page_views_past_30days = ""
    number_of_redirects = ""
    total_edits = ""
    recent_number_of_edits = ""
    number_distinct_authors = ""
    number_categories = ""

    # Column 0 holds the row label, column 1 the value.
    for key, value in dict_table1[0].items():
        if value == 'Page length (in bytes)':
            page_length = dict_table1[1][key]
            print("Page Length = ", page_length)
        elif value == 'Page ID':
            page_id = dict_table1[1][key]
            print("Scrapped Page ID = ", page_id)
        elif value == 'Number of page watchers':
            number_page_watchers = dict_table1[1][key]
            print("Number of Page Watchers = ", number_page_watchers)
        elif value == 'Number of page watchers who visited recent edits':
            number_page_watchers_recent_edits = dict_table1[1][key]
            print("Number of Page Watchers with recent edits = ", number_page_watchers_recent_edits)
        elif value == 'Number of redirects to this page':
            number_of_redirects = dict_table1[1][key]
            print("Number of redirects = ", number_of_redirects)
        elif value == 'Page views in the past 30 days':
            page_views_past_30days = dict_table1[1][key]
            print("Page views past 30 days = ", page_views_past_30days)

    # ---- Table 3: edit history ----------------------------------------
    try:
        dict_table3 = df_list[3].to_dict()
        for key, value in dict_table3[0].items():
            if value == 'Total number of edits':
                total_edits = dict_table3[1][key]
                print("Total Edits = ", total_edits)
            elif value == 'Recent number of edits (within past 30 days)':
                recent_number_of_edits = dict_table3[1][key]
                print("Recent number of edits = ", recent_number_of_edits)
            elif value == 'Recent number of distinct authors':
                number_distinct_authors = dict_table3[1][key]
                print("Distinct authors =", number_distinct_authors)
    except IndexError:
        print("Couldn't find the Edit History Table, so skipping...")

    # ---- Table 4: page properties / category count --------------------
    try:
        categories_string = df_list[4][0][0]
        print(categories_string)
        if categories_string.startswith("Hidden categories"):
            # e.g. "Hidden categories (12)": keep only the digits.
            number_categories = "".join(c for c in categories_string if c.isdigit())
            print("Total number of categories = ", number_categories)
    except IndexError:
        print("Couldn't find the Page Properties Table, so skipping...")

    print("============================================== EOP ======================================")
    features_dict = {'page_length': page_length,
                     'page_id': page_id,
                     'number_page_watchers': number_page_watchers,
                     'number_page_watchers_recent_edits': number_page_watchers_recent_edits,
                     'number_of_redirects': number_of_redirects,
                     'page_views_past_30days': page_views_past_30days,
                     'total_edits': total_edits,
                     'recent_number_of_edits': recent_number_of_edits,
                     'number_distinct_authors': number_distinct_authors,
                     'number_categories': number_categories}
    return features_dict
# MAP page_views and features_dict to np input array
def mapping_function(page_views, features_dict):
    """Pack the view total and scraped features into the model input vector.

    Args:
        page_views: Total historical page views (numeric).
        features_dict: Feature dict as returned by
            fetch_details_from_info_page.

    Returns:
        A tuple of (12-element float array in the model's training-time
        feature order, Wikipedia URL built from the page id).
    """
    # Slot order must match what the model was trained on; slot 1 holds the
    # page-view total and slots 0 and 11 both carry the page id.
    slot_keys = [
        'page_id',
        None,  # page_views goes here, not a dict entry
        'page_length',
        'number_page_watchers',
        'number_page_watchers_recent_edits',
        'number_of_redirects',
        'page_views_past_30days',
        'total_edits',
        'recent_number_of_edits',
        'number_distinct_authors',
        'number_categories',
        'page_id',
    ]
    raw_values = [page_views if key is None else features_dict[key]
                  for key in slot_keys]
    features_of_test_sample = np.array(raw_values, dtype=float)
    wikipedia_url = "https://en.wikipedia.org/?curid=" + str(features_dict['page_id'])
    return features_of_test_sample, wikipedia_url
def get_features(title):
    """Collect the full model input for the article *title*.

    Combines the historical page-view total with the scraped info-page
    statistics and maps them onto the model's feature vector.

    Returns:
        (features_of_test_sample, wikipedia_url).

    Raises:
        ValueError: when the info page could not be scraped.
    """
    page_views = fetch_pageviews(title)
    print('Title:', title, 'View Count:', page_views)  # fixed 'Tilte' typo
    features_dict = fetch_details_from_info_page(title)
    if features_dict is None:
        # fetch_details_from_info_page returns None when the basic-info
        # table is missing; fail loudly rather than crashing in the mapper
        # with an opaque TypeError.
        raise ValueError("Could not scrape info page for title: %r" % (title,))
    features_of_test_sample, wikipedia_url = mapping_function(page_views, features_dict)
    return features_of_test_sample, wikipedia_url
def predict_protection_level(title):
    """Predict the protection level of the Wikipedia article *title*.

    Loads the pre-trained random-forest model from 'rfmodel.sav' and
    classifies the article's scraped feature vector.

    Returns:
        One of "unprotected", "autoconfirmed", "extendedconfirmed",
        "sysop", or "unknown" for an unexpected class label (the original
        raised NameError in that case).
    """
    # Model class labels -> human-readable protection levels.
    LEVEL_NAMES = {
        0: "unprotected",
        1: "autoconfirmed",
        2: "extendedconfirmed",
        3: "sysop",
    }
    features_of_test_sample, wikipedia_url = get_features(title)
    print("Page URL: ", wikipedia_url)
    # Load the model; `with` closes the file handle the original leaked
    # via pickle.load(open(...)).  NOTE(review): unpickling runs arbitrary
    # code — only load model files from a trusted source.
    filename = 'rfmodel.sav'
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    y_pred = loaded_model.predict([features_of_test_sample])
    print("Predicted protection_level: ", y_pred[0])
    # int() normalizes numpy integer labels for the dict lookup.
    return LEVEL_NAMES.get(int(y_pred[0]), "unknown")
def main():
    """Entry point: predict and report the protection level of one article."""
    level_name = predict_protection_level("Donald Trump")
    print("Protection level:", level_name)


if __name__ == '__main__':
    main()