File size: 7,662 Bytes
98bbe69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import pickle 
import numpy as np
import pandas as pd

def fetch_pageviews(title):
    """Return the total pageview count for *title* on English Wikipedia.

    Sums the daily view counts between 2015-07-01 and 2021-06-07 as
    reported by the Wikimedia Pageviews REST API (via the third-party
    ``pageviewapi`` package).

    Args:
        title: Article title as used in the Wikipedia URL.

    Returns:
        int: total views over the fixed date range.
    """
    # Third-party package; imported lazily so the module imports without it.
    import pageviewapi
    page_views = pageviewapi.per_article(
        'en.wikipedia', title, '20150701', '20210607',
        access='all-access', agent='all-agents', granularity='daily')
    # Removed unused retry_count / MAX_RETRIES locals (no retry loop existed).
    return sum(item['views'] for item in page_views['items'])

def fetch_details_from_info_page(title):
    """Scrape feature values from a Wikipedia article's ``action=info`` page.

    Fetches ``https://en.wikipedia.org/w/index.php?action=info&title=<title>``
    and parses its HTML tables with ``pandas.read_html``, extracting fields
    from the basic-information, edit-history and page-properties tables.

    Args:
        title: Article title as used in the Wikipedia URL.

    Returns:
        dict | None: feature name -> raw scraped value (string; "" when a
        field was not found on the page). Returns ``None`` when the
        basic-information table is missing entirely.
    """
    import requests
    url = "https://en.wikipedia.org/w/index.php?action=info&title=" + title

    html_content = requests.get(url)
    df_list = pd.read_html(html_content.text)  # parses every <table> on the page into a list of DataFrames

    # --- Table 1: basic information ---
    try:
        display_title = df_list[1][1][0]
    except IndexError:
        # Page layout not as expected; caller must handle the None return.
        print("IndexError for Basic info table, so skipping")
        return
    print("Display Title = ", display_title)

    dict_table1 = df_list[1].to_dict()

    # Default every feature to "" so the returned dict always has all keys.
    # (The original initialized page_views_past_30days twice; fixed.)
    page_length = ""
    page_id = ""
    number_page_watchers = ""
    number_page_watchers_recent_edits = ""
    page_views_past_30days = ""
    number_of_redirects = ""
    total_edits = ""
    recent_number_of_edits = ""
    number_distinct_authors = ""
    number_categories = ""

    # Table rows are (label, value) pairs: column 0 = label, column 1 = value.
    for key, value in dict_table1[0].items():
        if value == 'Page length (in bytes)':
            page_length = dict_table1[1][key]
            print("Page Length = ", page_length)

        elif (value == 'Page ID'):
            page_id = dict_table1[1][key]
            print("Scrapped Page ID = ", page_id)

        elif value == 'Number of page watchers':
            number_page_watchers = dict_table1[1][key]
            print("Number of Page Watchers = ", number_page_watchers)

        elif value == 'Number of page watchers who visited recent edits':
            number_page_watchers_recent_edits = dict_table1[1][key]
            print("Number of Page Watchers with recent edits = ", number_page_watchers_recent_edits)

        elif value == 'Number of redirects to this page':
            number_of_redirects = dict_table1[1][key]
            print("Number of redirects = ", number_of_redirects)

        elif value == 'Page views in the past 30 days':
            page_views_past_30days = dict_table1[1][key]
            print("Page views past 30 days = ", page_views_past_30days)

    # --- Table 3: edit history (may be absent on some pages) ---
    try:
        dict_table3 = df_list[3].to_dict()
        for key, value in dict_table3[0].items():
            if value == 'Total number of edits':
                total_edits = dict_table3[1][key]
                print("Total Edits = ", total_edits)

            elif value == 'Recent number of edits (within past 30 days)':
                recent_number_of_edits = dict_table3[1][key]
                print("Recent number of edits = ", recent_number_of_edits)

            elif value == 'Recent number of distinct authors':
                number_distinct_authors = dict_table3[1][key]
                print("Distinct authors =", number_distinct_authors)
    except IndexError:
        # Best-effort: fields stay "" when the table is missing.
        print("Couldn't find the Edit History Table, so skipping...")

    # --- Table 4: page properties (hidden-categories count) ---
    try:
        categories_string = df_list[4][0][0]
        print(categories_string)
        number_categories = ""
        if categories_string.startswith("Hidden categories"):
            # Concatenate the digits embedded in the label text,
            # e.g. "Hidden categories (12)" -> "12".
            for c in categories_string:
                if c.isdigit():
                    number_categories = number_categories + c

            print("Total number of categories = ", number_categories)
    except IndexError:
        # Best-effort: number_categories stays "" when the table is missing.
        print("Couldn't find the Page Properties Table, so skipping...")

    print("============================================== EOP ======================================")

    features_dict = {   'page_length': page_length, 
                        'page_id': page_id, 
                        'number_page_watchers': number_page_watchers, 
                        'number_page_watchers_recent_edits': number_page_watchers_recent_edits, 
                        'number_of_redirects' : number_of_redirects, 
                        'page_views_past_30days' :page_views_past_30days, 
                        'total_edits': total_edits, 
                        'recent_number_of_edits': recent_number_of_edits, 
                        'number_distinct_authors': number_distinct_authors, 
                        'number_categories': number_categories }

    return features_dict

# MAP page_views and features_dict to np input array

def mapping_function(page_views, features_dict):
    """Pack the scraped features into the 12-element vector the model expects.

    Args:
        page_views: total historical view count (numeric).
        features_dict: mapping of feature name -> value, as produced by
            ``fetch_details_from_info_page``.

    Returns:
        tuple: (numpy array of shape (12,), Wikipedia URL built from page_id).
    """
    # Column order must match the layout the model was trained on.
    # Note: page_id deliberately appears at both index 0 and index 11,
    # and index 1 is the raw page_views argument (marked with None).
    feature_order = (
        'page_id',
        None,
        'page_length',
        'number_page_watchers',
        'number_page_watchers_recent_edits',
        'number_of_redirects',
        'page_views_past_30days',
        'total_edits',
        'recent_number_of_edits',
        'number_distinct_authors',
        'number_categories',
        'page_id',
    )
    row = [page_views if name is None else features_dict[name]
           for name in feature_order]
    features_of_test_sample = np.array(row, dtype=float)

    wikipedia_url = "https://en.wikipedia.org/?curid=" + str(features_dict['page_id'])

    return features_of_test_sample, wikipedia_url

def get_features(title):
    """Collect all model features for a Wikipedia article.

    Combines the historical pageview total with the fields scraped from
    the article's info page, mapped into the model's input vector.

    Args:
        title: Article title as used in the Wikipedia URL.

    Returns:
        tuple: (numpy feature vector, Wikipedia URL for the page).

    Raises:
        ValueError: if the info page could not be scraped.
    """
    # Get pageview total
    page_views = fetch_pageviews(title)
    print('Title:', title, 'View Count:', page_views)  # fixed 'Tilte' typo

    # Get features from the info page
    features_dict = fetch_details_from_info_page(title)
    if features_dict is None:
        # fetch_details_from_info_page returns None when scraping fails;
        # fail loudly here instead of crashing inside mapping_function.
        raise ValueError("Could not scrape info page for title: " + title)

    # Map both into the model's numpy input vector
    features_of_test_sample, wikipedia_url = mapping_function(page_views, features_dict)

    return features_of_test_sample, wikipedia_url

def predict_protection_level(title):
    """Predict the protection level of a Wikipedia article.

    Scrapes the article's features, loads the pre-trained random-forest
    model from ``rfmodel.sav`` and returns the predicted level as a string.

    Args:
        title: Article title as used in the Wikipedia URL.

    Returns:
        str: one of "unprotected", "autoconfirmed", "extendedconfirmed",
        "sysop", or "unknown" for an unexpected class label.
    """
    import pickle
    features_of_test_sample, wikipedia_url = get_features(title)
    print("Page URL: ", wikipedia_url)

    # Load the model; 'with' closes the file handle (the original
    # pickle.load(open(...)) leaked it).
    # SECURITY NOTE: pickle.load executes arbitrary code from the file —
    # only load a model file from a trusted source.
    filename = 'rfmodel.sav'
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    # Predict on the single sample (model expects a 2-D input).
    y_pred = loaded_model.predict([features_of_test_sample])
    print("Predicted protection_level: ", y_pred[0])

    # Map the numeric class to its label. The original if/elif chain left
    # the result name unbound (NameError) for any class outside 0-3.
    level_names = {
        0: "unprotected",
        1: "autoconfirmed",
        2: "extendedconfirmed",
        3: "sysop",
    }
    return level_names.get(y_pred[0], "unknown")

def main():
    """Demo entry point: predict and print the protection level of one page."""
    level = predict_protection_level("Donald Trump")
    print("Protection level:", level)


if __name__ == '__main__':
    main()