# wikicredibility / source_eval_model.py
# Author: doantrang982@uni.minerva.edu
# Commit: add scholarly green flags (7be836b)
# %%
# Import all libraries
from bs4 import BeautifulSoup
import bibtexparser
from dateutil import parser
import json
import requests
import tldextract
from collections import defaultdict
import re
import mwparserfromhell
# Given the DOI, PMID, PMC number, fetch journal's meta data
def get_metainfo_doi(doi):
    """Fetch journal metadata for an article given its DOI.

    Input: doi string (e.g. "10.1000/xyz123").
    Output: (journal, time_published) tuple parsed from the BibTeX record
    returned by dx.doi.org. Either value is None when it cannot be parsed.
    """
    # Ask the DOI resolver for a BibTeX record of the article.
    res = requests.get(
        "http://dx.doi.org/" + doi,
        headers={"Accept": "application/x-bibtex"},
        timeout=30,  # avoid hanging forever on an unresponsive resolver
    )
    entries = bibtexparser.loads(res.content.decode("utf-8")).entries
    if not entries:
        return None, None
    entry = entries[0]
    journal = entry["journal"].strip() if "journal" in entry else None
    # Assemble "year month day" from whichever parts the record provides;
    # dateutil fills in defaults for the missing pieces.
    parts = [entry[k] for k in ("year", "month", "day") if k in entry]
    time_published = parser.parse(" ".join(parts)) if parts else None
    return journal, time_published
def get_metainfo_pmc(pmc):
    """Fetch journal name and publication date for a PMC article id.

    Input: pmc id string (numeric, as it appears in the citation link).
    Output: (journal, time_published); (None, None) when the id is unknown.
    """
    res = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
        "?db=pmc&id=" + pmc + "&retmode=json",
        timeout=30,  # avoid hanging on a slow NCBI endpoint
    )
    data = json.loads(res.content.decode("utf-8"))["result"][pmc]
    if "error" in data:
        # NCBI returns an "error" field for unknown ids.
        return None, None
    journal = data["fulljournalname"].strip()
    time_published = parser.parse(data["pubdate"])
    return journal, time_published
def get_metainfo_pmid(pmid):
    """Fetch journal name and publication date for a PubMed article id.

    Input: pmid string (numeric, as it appears in the citation link).
    Output: (journal, time_published); (None, None) when the id is unknown.
    """
    res = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
        "?db=pubmed&id=" + pmid + "&retmode=json",
        timeout=30,  # avoid hanging on a slow NCBI endpoint
    )
    data = json.loads(res.content.decode("utf-8"))["result"][pmid]
    if "error" in data:
        # NCBI returns an "error" field for unknown ids.
        return None, None
    journal = data["fulljournalname"].strip()
    time_published = parser.parse(data["pubdate"])
    return journal, time_published
def parse_html(page_url):
    """Parse citation metadata from the rendered HTML of a Wikipedia page.

    Input: wiki page URL.
    Output: a parsed citation dict. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
        {"external_link": str, "type": str, "html_tag": HTMLElement,
         "publisher": str | None, "date": datetime | None}
    """
    citation_types = {'web', 'journal', 'book', 'conference', 'news'}
    all_parsed_citations = defaultdict(dict)
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Collect every <li> from all reference lists on the page.
    all_ref = []
    for ref_list in soup.find_all("ol", {"class": "references"}):
        all_ref += ref_list.find_all("li")
    for ele in all_ref:
        # Only entries with <span class="reference-text"> are real citations.
        ref = ele.find("span", {"class": "reference-text"})
        if not ref:
            continue
        citation_key = ref.get_text()
        hyperlink = ref.find("a", {"class": "external text"})
        external_link = hyperlink["href"] if hyperlink else None
        # Source type comes from the <cite> tag's CSS classes; default 'other'.
        source_type = "other"
        cite_tag = ref.find("cite")
        if cite_tag:
            for class_tag in cite_tag["class"]:
                if class_tag in citation_types:
                    source_type = class_tag
                    break
        # For journal/conference/other, try DOI, then PMC, then PMID
        # identifiers to fetch the journal name and publication date.
        journal, date = None, None
        if source_type in {'journal', 'conference', 'other'}:
            has_doi = ref.find("a", {"title": "Doi (identifier)"})
            has_pmc = ref.find("a", {"title": "PMC (identifier)"})
            has_pmid = ref.find("a", {"title": "PMID (identifier)"})
            if has_doi:
                doi = has_doi.find_next("a", {"class": "external text"})
                journal, date = get_metainfo_doi(doi.text)
            elif has_pmc:
                pmc = has_pmc.find_next("a", {"class": "external text"})
                journal, date = get_metainfo_pmc(pmc.text)
            elif has_pmid:
                pmid = has_pmid.find_next("a", {"class": "external text"})
                journal, date = get_metainfo_pmid(pmid.text)
        # BUG FIX: 'book' previously matched neither branch and those
        # citations were silently dropped; 'other' in the second branch was
        # unreachable, so unparsed 'other' citations never got the promised
        # domain fallback. Now: publisher falls back to the link's domain for
        # news/web/book and for any citation with no identifier metadata.
        if journal is not None:
            publisher = journal
        else:
            publisher = tldextract.extract(external_link).domain if external_link else None
        all_parsed_citations[citation_key] = {
            "external_link": external_link,
            "type": source_type,
            "html_tag": ele,
            "publisher": publisher,
            "date": date,
        }
    return all_parsed_citations
# After finish parsing with HTML tag, we fetch the wikitext version of the page, match it with the HTML tag to extract more information about the citation
# %%
def parse_match_wikitext(wiki_url):
    """
    Parse the wikitext version of the citations, match it with the HTML
    version, and extract more information, such as publisher and date that
    weren't extracted from the HTML.

    Input: wiki_url
    Output: a fully parsed citation dict. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
        {"external_link": str, "type": str, "html_tag": HTMLElement,
         "publisher": str | None, "date": datetime | None}
    """
    parsed_citation = parse_html(wiki_url)
    print("ALL citation", len(parsed_citation))
    # Fetch the raw wikitext of the same page.
    wiki_page = wiki_url.split("wiki/")[1]
    url = "https://en.wikipedia.org/w/index.php?title=" + wiki_page + "&action=raw"
    response = requests.get(url, timeout=30)
    wikicode = mwparserfromhell.parse(response.text)

    # Citations still missing a publisher or a date — candidates for matching.
    not_fully_parsed = defaultdict(dict)
    for key, val in parsed_citation.items():
        if not val["publisher"] or not val["date"]:
            not_fully_parsed[key] = val

    def _param_text(tpl, name):
        # Plain-text value of a template parameter.
        # BUG FIX: the old `.split("=")[1]` truncated any value containing
        # '=' (common in URLs with query strings); Parameter.value is the
        # full right-hand side.
        return str(tpl.get(name).value).strip()

    for tpl in wikicode.filter_templates(matches="{{cite"):
        found_match = None
        # 1) Try to match a citation by its external URL.
        if tpl.has_param("url"):
            external_url = _param_text(tpl, "url")
            for key, val in not_fully_parsed.items():
                if val["external_link"] and val["external_link"].strip() == external_url:
                    found_match = key
                    break
        # 2) Fall back to matching by title substring (formatting stripped).
        if not found_match and tpl.has_param("title"):
            title = re.sub('[^A-Za-z0-9 ]+', '', _param_text(tpl, "title"))
            for key in not_fully_parsed:
                if title in key:
                    found_match = key
                    break
        if not found_match:
            continue
        # Fetch publisher / journal name from the wikitext parameters.
        if not parsed_citation[found_match]["publisher"]:
            publisher = None
            if tpl.has_param("journal"):  # journal name
                publisher = _param_text(tpl, "journal")
            elif tpl.has_param("publisher"):  # BUG FIX: was has_param("publishder")
                publisher = _param_text(tpl, "publisher")
            elif tpl.has_param("work"):  # news / magazine name
                publisher = _param_text(tpl, "work")
            if publisher:
                publisher = re.sub('[^A-Za-z0-9 ]+', '', publisher)
                parsed_citation[found_match]["publisher"] = publisher
        # Fetch publication date from the wikitext parameters.
        if not parsed_citation[found_match]["date"]:
            date = None
            if tpl.has_param("date"):
                raw = _param_text(tpl, "date")
                if len(raw) >= 4:  # at least 4 digits for year, or yy-mm format
                    # BUG FIX: previously the raw unparsed string was stored
                    # when it was too short, and an unparseable date crashed.
                    try:
                        date = parser.parse(raw)
                    except (ValueError, OverflowError):
                        date = None
            parsed_citation[found_match]["date"] = date
    return parsed_citation
def eval_scholarly_sources(citation):
    """
    Evaluate the tag for a scholarly source (journal, conference, or other type).

    Input:
        citation: dict with format {"external_link": str, "type": str,
        "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation ("red", "green", "yellow", "unknown")
    """
    # Load the flag lists once and cache them on the function object —
    # re-reading the JSON file for every single citation was wasted I/O.
    all_flags = getattr(eval_scholarly_sources, "_flags", None)
    if all_flags is None:
        with open("scholarly_flags.json", "r") as f:
            all_flags = json.load(f)
        eval_scholarly_sources._flags = all_flags
    # First check the domain of the external link.
    if citation["external_link"]:
        domain = tldextract.extract(citation["external_link"]).domain
        if domain in all_flags["red_scholarly_reverse"]:
            return "red"
        if domain in all_flags["yellow_scholarly_reverse"]:
            return "yellow"
        if domain in all_flags["green_scholarly_reverse"]:
            return "green"
    # Then check the publisher / journal name.
    if citation["publisher"] in all_flags["red_scholarly"]:
        return "red"
    if citation["publisher"] in all_flags["yellow_scholarly"]:
        return "yellow"
    if citation["publisher"] in all_flags["green_scholarly"]:
        return "green"
    return "unknown"
def eval_non_scholarly_sources(citation, citation_val):
    """
    Evaluate the tag for a non-scholarly source (web, book, or news type).

    Input:
        citation: the citation text (the key produced by parse_html).
        citation_val: dict with format {"external_link": str, "type": str,
        "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation ("red", "green", "yellow", "unknown")
    """
    with open("non_scholarly_flags.json", "r") as f:
        non_scholarly_flags = json.load(f)
    # BUG FIX: external_link may be None (parse_html produces None when the
    # citation has no hyperlink); `source in None` raised a TypeError.
    external_link = citation_val["external_link"] or ""
    # Tag is found if a known source appears in the link or the citation text.
    for tag, sources in non_scholarly_flags.items():
        for source in sources:
            if source in external_link or source in citation:
                return tag
    return "unknown"
def check_source_quality(wiki_url):
    """
    Go through each parsed citation and tag it red / yellow / green / unknown.

    Input: wiki_url of the page to evaluate.
    Return: (red, yellow, green, unknown) lists of (citation_text, publisher)
    tuples, one list per category.
    """
    parsed = parse_match_wikitext(wiki_url)
    red_flag_list = []
    yellow_flag_list = []
    green_flag_list = []
    unknown_list = []
    # Dispatch table replaces the if/elif append chain.
    buckets = {
        "red": red_flag_list,
        "yellow": yellow_flag_list,
        "green": green_flag_list,
        "unknown": unknown_list,
    }
    for citation, val in parsed.items():
        # `verdict` rather than `eval` — don't shadow the builtin.
        if val["type"] in {"journal", "conference", "other"}:
            verdict = eval_scholarly_sources(val)
        elif val["type"] in {"web", "book", "news"}:
            verdict = eval_non_scholarly_sources(citation, val)
        else:
            verdict = None
        if verdict in buckets:
            buckets[verdict].append((citation, val["publisher"]))
    return red_flag_list, yellow_flag_list, green_flag_list, unknown_list
# TEST — guard so the network call runs only when executed as a script,
# not as a side effect of importing this module.
if __name__ == "__main__":
    a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
    print("Red flag source:", a[0])
    print("Yellow flag source: ", a[1])
    print("Green source: ", a[2])
    print("Undetermined sources: ", a[3])