Spaces:

abhi12ravi
/

wikipedia-protection

Build error

App Files Files Community

wikipedia-protection / gradiohelper.py

abhi12ravi

Upload gradiohelper.py

98bbe69 almost 3 years ago

raw

history blame contribute delete

7.66 kB

	import pickle
	import numpy as np
	import pandas as pd

	def fetch_pageviews(title):
	import pageviewapi
	retry_count = 0
	MAX_RETRIES = 10
	page_views = pageviewapi.per_article('en.wikipedia', title, '20150701', '20210607', access='all-access', agent='all-agents', granularity='daily')
	view_counter = 0
	for i in range (0, len(page_views['items'])):
	view_counter += page_views['items'][i]['views']

	return view_counter

	def fetch_details_from_info_page(title):
	import requests
	url = "https://en.wikipedia.org/w/index.php?action=info&title=" + title

	html_content = requests.get(url)
	df_list = pd.read_html(html_content.text) # this parses all the tables in webpages to a list

	#Get Features from all tables

	#Basic info table
	try:
	display_title = df_list[1][1][0]
	except IndexError:
	print("IndexError for Basic info table, so skipping")
	return
	print("Display Title = ", display_title)

	# Process Table 1 - Basic Information
	dict_table1 = df_list[1].to_dict()

	#Declare vars
	page_length = ""
	page_id = ""
	number_page_watchers = ""
	number_page_watchers_recent_edits = ""
	page_views_past_30days = ""
	number_of_redirects = ""
	page_views_past_30days = ""
	total_edits = ""
	recent_number_of_edits = ""
	number_distinct_authors = ""
	number_categories = ""

	for key, value in dict_table1[0].items():
	if value == 'Page length (in bytes)':
	page_length = dict_table1[1][key]
	print("Page Length = ", page_length)

	elif (value == 'Page ID'):
	page_id = dict_table1[1][key]
	print("Scrapped Page ID = ", page_id)

	elif value == 'Number of page watchers':
	number_page_watchers = dict_table1[1][key]
	print("Number of Page Watchers = ", number_page_watchers)

	elif value == 'Number of page watchers who visited recent edits':
	number_page_watchers_recent_edits = dict_table1[1][key]
	print("Number of Page Watchers with recent edits = ", number_page_watchers_recent_edits)

	elif value == 'Number of redirects to this page':
	number_of_redirects = dict_table1[1][key]
	print("Number of redirects = ", number_of_redirects)

	elif value == 'Page views in the past 30 days':
	page_views_past_30days = dict_table1[1][key]
	print("Page views past 30 days = ", page_views_past_30days)

	#Process Table 3 - Edit History
	try:
	dict_table3 = df_list[3].to_dict()
	for key, value in dict_table3[0].items():
	if value == 'Total number of edits':
	total_edits = dict_table3[1][key]
	print("Total Edits = ", total_edits)

	elif value == 'Recent number of edits (within past 30 days)':
	recent_number_of_edits = dict_table3[1][key]
	print("Recent number of edits = ", recent_number_of_edits)

	elif value == 'Recent number of distinct authors':
	number_distinct_authors = dict_table3[1][key]
	print("Distinct authors =", number_distinct_authors)
	except IndexError:
	print("Couldn't find the Edit History Table, so skipping...")
	pass

	#Page properties Table
	try:
	categories_string = df_list[4][0][0]
	print(categories_string)
	number_categories = ""
	if categories_string.startswith("Hidden categories"):
	#Get number of categories
	for c in categories_string:
	if c.isdigit():
	number_categories = number_categories + c

	print("Total number of categories = ", number_categories)
	except IndexError:
	print("Couldn't find the Page Properties Table, so skipping...")
	pass

	print("============================================== EOP ======================================")

	features_dict = { 'page_length': page_length,
	'page_id': page_id,
	'number_page_watchers': number_page_watchers,
	'number_page_watchers_recent_edits': number_page_watchers_recent_edits,
	'number_of_redirects' : number_of_redirects,
	'page_views_past_30days' :page_views_past_30days,
	'total_edits': total_edits,
	'recent_number_of_edits': recent_number_of_edits,
	'number_distinct_authors': number_distinct_authors,
	'number_categories': number_categories }

	return features_dict

	# MAP page_views and features_dict to np input array

	def mapping_function(page_views, features_dict):

	features_of_test_sample = np.empty([12,])

	features_of_test_sample[0] = features_dict['page_id']
	features_of_test_sample[1] = page_views
	features_of_test_sample[2] = features_dict['page_length']
	features_of_test_sample[3] = features_dict['number_page_watchers']
	features_of_test_sample[4] = features_dict ['number_page_watchers_recent_edits']
	features_of_test_sample[5] = features_dict['number_of_redirects']
	features_of_test_sample[6] = features_dict['page_views_past_30days']
	features_of_test_sample[7] = features_dict['total_edits']
	features_of_test_sample[8] = features_dict['recent_number_of_edits']
	features_of_test_sample[9] = features_dict['number_distinct_authors']
	features_of_test_sample[10] = features_dict['number_categories']
	features_of_test_sample[11] = features_dict['page_id']

	wikipedia_url = "https://en.wikipedia.org/?curid=" + str(features_dict['page_id'])

	return features_of_test_sample, wikipedia_url

	def get_features(title):
	#Get pageview
	page_views = fetch_pageviews(title)
	print('Tilte:', title, 'View Count:',page_views)

	#Get features from info pages
	features_dict = fetch_details_from_info_page(title)

	#MAP both to numpy array
	features_of_test_sample, wikipedia_url = mapping_function(page_views, features_dict)

	return features_of_test_sample, wikipedia_url

	def predict_protection_level(title):
	import pickle
	features_of_test_sample, wikipedia_url = get_features(title)
	print("Page URL: ", wikipedia_url)

	#Load the model
	filename = 'rfmodel.sav'
	loaded_model = pickle.load(open(filename, 'rb'))


	#predict
	#print("Features 1st row:", X_test[0])
	y_pred = loaded_model.predict([features_of_test_sample])

	print("Predicted protection_level: ", y_pred[0])

	predicted_protection_level = y_pred

	if(predicted_protection_level == 0):
	predicted_protection_level_str = "unprotected"
	elif(predicted_protection_level == 1):
	predicted_protection_level_str = "autoconfirmed"
	elif(predicted_protection_level == 2):
	predicted_protection_level_str = "extendedconfirmed"
	elif(predicted_protection_level == 3):
	predicted_protection_level_str = "sysop"

	return predicted_protection_level_str

	def main():
	predicted_protection_level_str = predict_protection_level("Donald Trump")
	print("Protection level:", predicted_protection_level_str)


	if __name__=='__main__':
	main()