# %%
# Import all libraries
from bs4 import BeautifulSoup
import bibtexparser
from dateutil import parser
import json
import requests
import tldextract
from collections import defaultdict
import re
import mwparserfromhell
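
# Dependencies (assumed from the imports above; names as published on PyPI):
#   pip install beautifulsoup4 bibtexparser python-dateutil requests tldextract mwparserfromhell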
# Given the DOI, PMID, or PMC number, fetch the journal's metadata
def get_metainfo_doi(doi):
    """Input: DOI string
    Output: the journal name and publication date of the article. Returns None for each value that can't be parsed.
    """
    res = requests.get("http://dx.doi.org/" + doi, headers={"Accept": "application/x-bibtex"})
    res = res.content.decode('utf-8')
    bibtext = bibtexparser.loads(res).entries
    if len(bibtext) > 0:
        journal = bibtext[0]["journal"].strip() if "journal" in bibtext[0] else None
        time_published = ""
        if "year" in bibtext[0]:
            time_published += bibtext[0]["year"]
        if "month" in bibtext[0]:
            time_published += " " + bibtext[0]["month"]
        if "day" in bibtext[0]:
            time_published += " " + bibtext[0]["day"]
        if len(time_published) > 0:
            time_published = parser.parse(time_published)
        else:
            time_published = None
        return journal, time_published
    else:
        return None, None
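
# Illustrative smoke test (assumptions: network access, and the DOI below is a
# hypothetical placeholder; dx.doi.org returns a BibTeX record when the Accept
# header is "application/x-bibtex", which bibtexparser turns into entry dicts):
# journal, published = get_metainfo_doi("10.1234/example-doi")  # hypothetical DOI
# print(journal, published)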
def get_metainfo_pmc(pmc):
    """Input: PMC identifier string
    Output: the journal name and publication date, or (None, None) on error."""
    res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id=" + pmc + "&retmode=json")
    res = json.loads(res.content.decode("utf-8"))
    data = res["result"][pmc]
    if "error" in data:
        return None, None
    journal = data["fulljournalname"].strip()
    time_published = parser.parse(data["pubdate"])
    return journal, time_published
def get_metainfo_pmid(pmid):
    """Input: PubMed identifier string
    Output: the journal name and publication date, or (None, None) on error."""
    res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=" + pmid + "&retmode=json")
    res = json.loads(res.content.decode("utf-8"))
    data = res["result"][pmid]
    if "error" in data:
        return None, None
    journal = data["fulljournalname"].strip()
    time_published = parser.parse(data["pubdate"])
    return journal, time_published
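
# For reference, the NCBI esummary JSON that both helpers rely on is shaped roughly
# like this (simplified sketch; only the fields used above are shown):
# {"result": {"<id>": {"fulljournalname": "Some Journal", "pubdate": "2020 Jan 15", ...}}}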
def parse_html(page_url):
    """This function parses citation metadata from the page's HTML tags.
    Input: wiki_url
    Output: a dict of citations parsed from HTML. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
        {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}"""
    citation_types = {'web', 'journal', 'book', 'conference', 'news'}
    all_parsed_citations = defaultdict(dict)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get all the references
    all_ref = []
    ordered_ref_lst = soup.find_all("ol", {"class": "references"})
    for each in ordered_ref_lst:
        refs = each.find_all("li")
        all_ref += refs
    for ele in all_ref:
        # Check if it has <span class="reference-text">
        ref = ele.find("span", {"class": "reference-text"})
        source_type = "other"  # default value for source_type
        if ref:
            # TASK: get all essential information from the citation tag
            citation_key = ref.get_text()
            hyperlink = ref.find("a", {"class": "external text"})
            external_link = hyperlink["href"] if hyperlink else None
            # TASK: find the source type, i.e. whether it's 'web', 'journal', 'book', 'conference', or 'news'
            cite_tag = ref.find("cite")
            if cite_tag:
                for class_tag in cite_tag["class"]:
                    if class_tag in citation_types:
                        source_type = class_tag
                        break
            # TASK: get the publisher (journal name for journal or conference, website domain for web, 'work' for news)
            # For journal, conference, and other, look for a DOI, PMID, or PMC identifier
            if source_type in {'journal', 'conference', 'other'}:
                has_doi = ref.find("a", {"title": "Doi (identifier)"})
                has_pmc = ref.find("a", {"title": "PMC (identifier)"})
                has_pmid = ref.find("a", {"title": "PMID (identifier)"})
                journal, date = None, None
                if has_doi:
                    doi = has_doi.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_doi(doi.text)
                elif has_pmc:
                    pmc = has_pmc.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmc(pmc.text)
                elif has_pmid:
                    pmid = has_pmid.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmid(pmid.text)
                # 'other' citations with no identifier fall back to the link's domain
                if source_type == 'other' and journal is None and external_link:
                    journal = tldextract.extract(external_link).domain
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": journal, "date": date}
            # For news, web, and book, the publisher is the domain of the website
            else:
                publisher = tldextract.extract(external_link).domain if external_link else None
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": publisher, "date": None}
    return all_parsed_citations
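
# Example usage (illustrative; requires network access):
# citations = parse_html("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
# for text, meta in list(citations.items())[:3]:
#     print(meta["type"], meta["publisher"], meta["external_link"])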
# After finishing the HTML parsing, we fetch the wikitext version of the page and match it against the HTML citations to extract more information
# %%
def parse_match_wikitext(wiki_url):
    """
    This function parses the wikitext version of the citations, matches it with the HTML version,
    and extracts more information, such as publisher and date, that wasn't extracted from the HTML.
    Input: wiki_url
    Output: a fully parsed dict of citations. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
        {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    """
    parsed_citation = parse_html(wiki_url)
    print("ALL citations", len(parsed_citation))
    wiki_page = wiki_url.split("wiki/")[1]
    url = "https://en.wikipedia.org/w/index.php?title=" + wiki_page + "&action=raw"
    response = requests.get(url)
    text = response.text
    wikicode = mwparserfromhell.parse(text)
    # Collect the not-fully-parsed citations, i.e. ones that lack a publisher or date
    not_fully_parsed = defaultdict(dict)
    for key, val in parsed_citation.items():
        if not val["publisher"] or not val["date"]:
            not_fully_parsed[key] = val
    for tpl in wikicode.filter_templates(matches="{{cite"):
        # tpl is a citation template in the wikitext
        found_match = None
        # Match on external link
        if tpl.has_param("url"):
            # Use .value rather than splitting on "=", which would truncate URLs that contain "="
            external_url = str(tpl.get("url").value)
            for key, val in not_fully_parsed.items():
                if val["external_link"]:
                    if val["external_link"].strip() == external_url.strip():
                        found_match = key
                        break
        # If no match by URL, match by title
        if not found_match:
            if tpl.has_param("title"):
                # Get the title of the citation without formatting text
                title = str(tpl.get("title").value)
                title = re.sub('[^A-Za-z0-9 ]+', '', title)  # filter out extra formatting
                for key in not_fully_parsed.keys():
                    if title in key:
                        found_match = key
                        break
        if found_match:
            # Fetch the publisher/journal name from the wikitext
            if not parsed_citation[found_match]["publisher"]:
                publisher = None
                if tpl.has_param("journal"):  # journal name
                    publisher = str(tpl.get("journal").value)
                elif tpl.has_param("publisher"):  # website or book publisher
                    publisher = str(tpl.get("publisher").value)
                elif tpl.has_param("work"):  # news/magazine name
                    publisher = str(tpl.get("work").value)
                if publisher:
                    publisher = re.sub('[^A-Za-z0-9 ]+', '', publisher)
                    parsed_citation[found_match]["publisher"] = publisher
            # Fetch the publication date from the wikitext
            if not parsed_citation[found_match]["date"]:
                if tpl.has_param("date"):
                    date = str(tpl.get("date").value).strip()
                    if len(date) >= 4:  # at least 4 characters, e.g. a year or yy-mm
                        try:
                            parsed_citation[found_match]["date"] = parser.parse(date)
                        except (ValueError, OverflowError):
                            pass  # unparseable date strings stay None
    return parsed_citation
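
# For reference, a typical citation template in wikitext looks like this (made-up values):
#   {{cite journal |url=https://example.org/article |title=Some Title |journal=Some Journal |date=2020-01-15}}
# mwparserfromhell exposes each "|name=value" pair as a Parameter, so
# tpl.get("journal").value yields the value alone, without the parameter name.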
def eval_scholarly_sources(citation):
    """
    This function evaluates the tag for a scholarly source (journal, conference, or other type).
    Input:
        the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation (red, green, yellow, unknown)
    """
    # Read the dictionaries of flags from the JSON file
    with open("scholarly_flags.json", "r") as f:
        all_flags = json.load(f)
    # Check on the domain of the external link
    if citation["external_link"]:
        domain = tldextract.extract(citation["external_link"]).domain
        if domain in all_flags["red_scholarly_reverse"]:
            return "red"
        elif domain in all_flags["yellow_scholarly_reverse"]:
            return "yellow"
        elif domain in all_flags["green_scholarly_reverse"]:
            return "green"
    # Check on the name dictionary
    if citation["publisher"] in all_flags["red_scholarly"]:
        return "red"
    elif citation["publisher"] in all_flags["yellow_scholarly"]:
        return "yellow"
    elif citation["publisher"] in all_flags["green_scholarly"]:
        return "green"
    return "unknown"
def eval_non_scholarly_sources(citation, citation_val):
    """
    This function evaluates the tag for a non-scholarly source (web, book, or news type).
    Input:
        citation: the text version of the citation (the dictionary key)
        citation_val: the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation (red, green, yellow, unknown)
    """
    with open("non_scholarly_flags.json", "r") as f:
        non_scholarly_flags = json.load(f)
    # Check if a flagged source appears in either the citation text or the external link
    for key, val in non_scholarly_flags.items():
        for source in val:
            if citation_val["external_link"] and source in citation_val["external_link"]:
                return key
            elif source in citation:
                return key
    return "unknown"
def check_source_quality(wiki_url):
    """
    Go through each parsed citation and check it against the red-yellow-green tags.
    Return: red, yellow, green, and unknown lists containing the citations that belong to each category.
    """
    parsed = parse_match_wikitext(wiki_url)
    red_flag_list = []
    yellow_flag_list = []
    green_flag_list = []
    unknown_list = []
    for citation, val in parsed.items():
        verdict = None  # renamed from `eval` to avoid shadowing the builtin
        # Check journals/conferences/other against the scholarly flags
        if val["type"] in {"journal", "conference", "other"}:
            verdict = eval_scholarly_sources(val)
        # Check web/book/news against the non-scholarly flags ('other' is already handled above)
        elif val["type"] in {"web", "book", "news"}:
            verdict = eval_non_scholarly_sources(citation, val)
        if verdict == "red":
            red_flag_list.append((citation, val["publisher"]))
        elif verdict == "yellow":
            yellow_flag_list.append((citation, val["publisher"]))
        elif verdict == "green":
            green_flag_list.append((citation, val["publisher"]))
        elif verdict == "unknown":
            unknown_list.append((citation, val["publisher"]))
    return red_flag_list, yellow_flag_list, green_flag_list, unknown_list
# TEST
a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
print("Red flag sources:", a[0])
print("Yellow flag sources:", a[1])
print("Green sources:", a[2])
print("Undetermined sources:", a[3])