abhi12ravi committed
Commit 98bbe69
1 Parent(s): a0f2a88

Upload gradiohelper.py

Files changed (1)
  1. gradiohelper.py +195 -0

gradiohelper.py ADDED
@@ -0,0 +1,195 @@
import pickle

import numpy as np
import pandas as pd
import pageviewapi
import requests


def fetch_pageviews(title):
    """Sum the daily page views for `title` over the hard-coded date range."""
    page_views = pageviewapi.per_article('en.wikipedia', title, '20150701', '20210607',
                                         access='all-access', agent='all-agents',
                                         granularity='daily')

    # Each item in the response carries the view count for a single day.
    view_counter = 0
    for item in page_views['items']:
        view_counter += item['views']

    return view_counter

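# Illustrative shape of the pageviews response consumed above (a sketch; the
# article title and dates are placeholders, and actual counts will vary):
#
#   pv = pageviewapi.per_article('en.wikipedia', 'Python_(programming_language)',
#                                '20210601', '20210607', access='all-access',
#                                agent='all-agents', granularity='daily')
#   # pv['items'] is a list with one dict per day, each carrying an integer
#   # 'views' field, which is what the summing loop in fetch_pageviews adds up.
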
def fetch_details_from_info_page(title):
    """Scrape feature values from the article's action=info page."""
    url = "https://en.wikipedia.org/w/index.php?action=info&title=" + title

    html_content = requests.get(url)
    df_list = pd.read_html(html_content.text)  # parses every table on the page into a list of DataFrames

    # Get features from all tables

    # Basic information table
    try:
        display_title = df_list[1][1][0]
    except IndexError:
        print("IndexError for Basic information table, so skipping")
        return None
    print("Display Title = ", display_title)

    # Process Table 1 - Basic Information
    dict_table1 = df_list[1].to_dict()

    # Declare feature variables (an empty string means "not found")
    page_length = ""
    page_id = ""
    number_page_watchers = ""
    number_page_watchers_recent_edits = ""
    number_of_redirects = ""
    page_views_past_30days = ""
    total_edits = ""
    recent_number_of_edits = ""
    number_distinct_authors = ""
    number_categories = ""

    for key, value in dict_table1[0].items():
        if value == 'Page length (in bytes)':
            page_length = dict_table1[1][key]
            print("Page Length = ", page_length)

        elif value == 'Page ID':
            page_id = dict_table1[1][key]
            print("Scraped Page ID = ", page_id)

        elif value == 'Number of page watchers':
            number_page_watchers = dict_table1[1][key]
            print("Number of Page Watchers = ", number_page_watchers)

        elif value == 'Number of page watchers who visited recent edits':
            number_page_watchers_recent_edits = dict_table1[1][key]
            print("Number of Page Watchers with recent edits = ", number_page_watchers_recent_edits)

        elif value == 'Number of redirects to this page':
            number_of_redirects = dict_table1[1][key]
            print("Number of redirects = ", number_of_redirects)

        elif value == 'Page views in the past 30 days':
            page_views_past_30days = dict_table1[1][key]
            print("Page views past 30 days = ", page_views_past_30days)

    # Process Table 3 - Edit History
    try:
        dict_table3 = df_list[3].to_dict()
        for key, value in dict_table3[0].items():
            if value == 'Total number of edits':
                total_edits = dict_table3[1][key]
                print("Total Edits = ", total_edits)

            elif value == 'Recent number of edits (within past 30 days)':
                recent_number_of_edits = dict_table3[1][key]
                print("Recent number of edits = ", recent_number_of_edits)

            elif value == 'Recent number of distinct authors':
                number_distinct_authors = dict_table3[1][key]
                print("Distinct authors =", number_distinct_authors)
    except IndexError:
        print("Couldn't find the Edit History table, so skipping...")

    # Page properties table
    try:
        categories_string = df_list[4][0][0]
        print(categories_string)
        if categories_string.startswith("Hidden categories"):
            # Collect the digits from a string like "Hidden categories (3)"
            for c in categories_string:
                if c.isdigit():
                    number_categories = number_categories + c

        print("Total number of categories = ", number_categories)
    except IndexError:
        print("Couldn't find the Page Properties table, so skipping...")

    print("============================================== EOP ======================================")

    features_dict = {'page_length': page_length,
                     'page_id': page_id,
                     'number_page_watchers': number_page_watchers,
                     'number_page_watchers_recent_edits': number_page_watchers_recent_edits,
                     'number_of_redirects': number_of_redirects,
                     'page_views_past_30days': page_views_past_30days,
                     'total_edits': total_edits,
                     'recent_number_of_edits': recent_number_of_edits,
                     'number_distinct_authors': number_distinct_authors,
                     'number_categories': number_categories}

    return features_dict

# Map page_views and features_dict onto the model's numpy input array

def mapping_function(page_views, features_dict):

    # The slot order below must match the feature order the model was trained on.
    features_of_test_sample = np.empty([12, ])

    features_of_test_sample[0] = features_dict['page_id']
    features_of_test_sample[1] = page_views
    features_of_test_sample[2] = features_dict['page_length']
    features_of_test_sample[3] = features_dict['number_page_watchers']
    features_of_test_sample[4] = features_dict['number_page_watchers_recent_edits']
    features_of_test_sample[5] = features_dict['number_of_redirects']
    features_of_test_sample[6] = features_dict['page_views_past_30days']
    features_of_test_sample[7] = features_dict['total_edits']
    features_of_test_sample[8] = features_dict['recent_number_of_edits']
    features_of_test_sample[9] = features_dict['number_distinct_authors']
    features_of_test_sample[10] = features_dict['number_categories']
    features_of_test_sample[11] = features_dict['page_id']  # page_id fills both the first and last slot

    wikipedia_url = "https://en.wikipedia.org/?curid=" + str(features_dict['page_id'])

    return features_of_test_sample, wikipedia_url

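# Illustrative call (hypothetical values, for shape only; real inputs come
# from fetch_pageviews and fetch_details_from_info_page):
#
#   features, url = mapping_function(1000000, features_dict)
#   # features is a 12-element numpy array in the slot order listed above, and
#   # url resolves the article by its curid, e.g. en.wikipedia.org/?curid=12345
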
def get_features(title):
    # Get the page-view total
    page_views = fetch_pageviews(title)
    print('Title:', title, 'View Count:', page_views)

    # Get features from the action=info page
    # (note: this is None if the info page could not be parsed)
    features_dict = fetch_details_from_info_page(title)

    # Map both onto the numpy feature array
    features_of_test_sample, wikipedia_url = mapping_function(page_views, features_dict)

    return features_of_test_sample, wikipedia_url

def predict_protection_level(title):
    features_of_test_sample, wikipedia_url = get_features(title)
    print("Page URL: ", wikipedia_url)

    # Load the pickled model
    filename = 'rfmodel.sav'
    with open(filename, 'rb') as f:
        loaded_model = pickle.load(f)

    # Predict on the single test sample
    y_pred = loaded_model.predict([features_of_test_sample])

    print("Predicted protection_level: ", y_pred[0])

    predicted_protection_level = y_pred[0]

    # Map the numeric class back to Wikipedia's protection-level names
    predicted_protection_level_str = "unknown"  # fallback for an unexpected class
    if predicted_protection_level == 0:
        predicted_protection_level_str = "unprotected"
    elif predicted_protection_level == 1:
        predicted_protection_level_str = "autoconfirmed"
    elif predicted_protection_level == 2:
        predicted_protection_level_str = "extendedconfirmed"
    elif predicted_protection_level == 3:
        predicted_protection_level_str = "sysop"

    return predicted_protection_level_str

def main():
    predicted_protection_level_str = predict_protection_level("Donald Trump")
    print("Protection level:", predicted_protection_level_str)


if __name__ == '__main__':
    main()
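
Since the helper is named gradiohelper.py, it is presumably consumed by a Gradio app elsewhere in the Space. A minimal sketch of that wiring, assuming the standard gradio Interface API (the app.py filename and the labels are assumptions, not part of this commit):

# app.py (hypothetical) - wires the helper into a simple Gradio UI
import gradio as gr

from gradiohelper import predict_protection_level

# The function takes an article title string and returns the predicted
# protection-level name, so plain text in / text out suffices here.
demo = gr.Interface(fn=predict_protection_level, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()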