# %%
# Import all libraries
from bs4 import BeautifulSoup
import bibtexparser
from dateutil import parser
import json
import requests
import tldextract
from collections import defaultdict
import re
import mwparserfromhell
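
# Dependencies (assumed from the imports above; names as published on PyPI):
#   pip install beautifulsoup4 bibtexparser python-dateutil requests tldextract mwparserfromhell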
# Given the DOI, PMID, or PMC number, fetch the journal's metadata
def get_metainfo_doi(doi):
    """Input: DOI string
    Output: the journal name and publication date of the article. Returns None for each value that can't be parsed.
    """
    res = requests.get("http://dx.doi.org/" + doi, headers={"Accept": "application/x-bibtex"})
    res = res.content.decode('utf-8')
    bibtext = bibtexparser.loads(res).entries
    if len(bibtext) > 0:
        journal = bibtext[0]["journal"].strip() if "journal" in bibtext[0] else None
        time_published = ""
        if "year" in bibtext[0]:
            time_published += bibtext[0]["year"]
        if "month" in bibtext[0]:
            time_published += " " + bibtext[0]["month"]
        if "day" in bibtext[0]:
            time_published += " " + bibtext[0]["day"]
        if len(time_published) > 0:
            time_published = parser.parse(time_published)
        else:
            time_published = None
        return journal, time_published
    else:
        return None, None
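
# Illustrative smoke test (assumptions: network access, and the DOI below is a
# hypothetical placeholder; dx.doi.org returns a BibTeX record when the Accept
# header is "application/x-bibtex", which bibtexparser turns into entry dicts):
# journal, published = get_metainfo_doi("10.1234/example-doi")  # hypothetical DOI
# print(journal, published)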
def get_metainfo_pmc(pmc):
    """Input: PMC identifier string
    Output: the journal name and publication date, or (None, None) on error."""
    res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id=" + pmc + "&retmode=json")
    res = json.loads(res.content.decode("utf-8"))
    data = res["result"][pmc]
    if "error" in data:
        return None, None
    journal = data["fulljournalname"].strip()
    time_published = parser.parse(data["pubdate"])
    return journal, time_published
def get_metainfo_pmid(pmid):
    """Input: PubMed identifier string
    Output: the journal name and publication date, or (None, None) on error."""
    res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=" + pmid + "&retmode=json")
    res = json.loads(res.content.decode("utf-8"))
    data = res["result"][pmid]
    if "error" in data:
        return None, None
    journal = data["fulljournalname"].strip()
    time_published = parser.parse(data["pubdate"])
    return journal, time_published
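
# For reference, the NCBI esummary JSON that both helpers rely on is shaped roughly
# like this (simplified sketch; only the fields used above are shown):
# {"result": {"<id>": {"fulljournalname": "Some Journal", "pubdate": "2020 Jan 15", ...}}}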
def parse_html(page_url):
    """This function parses citation metadata from the page's HTML tags.
    Input: wiki_url
    Output: a dict of citations parsed from HTML. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
        {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}"""
    citation_types = {'web', 'journal', 'book', 'conference', 'news'}
    all_parsed_citations = defaultdict(dict)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get all the references
    all_ref = []
    ordered_ref_lst = soup.find_all("ol", {"class": "references"})
    for each in ordered_ref_lst:
        refs = each.find_all("li")
        all_ref += refs
    for ele in all_ref:
        # Check if it has <span class="reference-text">
        ref = ele.find("span", {"class": "reference-text"})
        source_type = "other"  # default value for source_type
        if ref:
            # TASK: get all essential information from the citation tag
            citation_key = ref.get_text()
            hyperlink = ref.find("a", {"class": "external text"})
            external_link = hyperlink["href"] if hyperlink else None
            # TASK: find the source type, i.e. whether it's 'web', 'journal', 'book', 'conference', or 'news'
            cite_tag = ref.find("cite")
            if cite_tag:
                for class_tag in cite_tag["class"]:
                    if class_tag in citation_types:
                        source_type = class_tag
                        break
            # TASK: get the publisher (journal name for journal or conference, website domain for web, 'work' for news)
            # For journal, conference, and other, look for a DOI, PMID, or PMC identifier
            if source_type in {'journal', 'conference', 'other'}:
                has_doi = ref.find("a", {"title": "Doi (identifier)"})
                has_pmc = ref.find("a", {"title": "PMC (identifier)"})
                has_pmid = ref.find("a", {"title": "PMID (identifier)"})
                journal, date = None, None
                if has_doi:
                    doi = has_doi.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_doi(doi.text)
                elif has_pmc:
                    pmc = has_pmc.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmc(pmc.text)
                elif has_pmid:
                    pmid = has_pmid.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmid(pmid.text)
                # 'other' citations with no identifier fall back to the link's domain
                if source_type == 'other' and journal is None and external_link:
                    journal = tldextract.extract(external_link).domain
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": journal, "date": date}
            # For news, web, and book, the publisher is the domain of the website
            else:
                publisher = tldextract.extract(external_link).domain if external_link else None
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": publisher, "date": None}
    return all_parsed_citations
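
# Example usage (illustrative; requires network access):
# citations = parse_html("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
# for text, meta in list(citations.items())[:3]:
#     print(meta["type"], meta["publisher"], meta["external_link"])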
# After finishing the HTML parsing, we fetch the wikitext version of the page and match it against the HTML citations to extract more information
# %%
def parse_match_wikitext(wiki_url):
    """
    This function parses the wikitext version of the citations, matches it with the HTML version,
    and extracts more information, such as publisher and date, that wasn't extracted from the HTML.
    Input: wiki_url
    Output: a fully parsed dict of citations. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
        {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    """
    parsed_citation = parse_html(wiki_url)
    print("ALL citations", len(parsed_citation))
    wiki_page = wiki_url.split("wiki/")[1]
    url = "https://en.wikipedia.org/w/index.php?title=" + wiki_page + "&action=raw"
    response = requests.get(url)
    text = response.text
    wikicode = mwparserfromhell.parse(text)
    # Collect the not-fully-parsed citations, i.e. ones that lack a publisher or date
    not_fully_parsed = defaultdict(dict)
    for key, val in parsed_citation.items():
        if not val["publisher"] or not val["date"]:
            not_fully_parsed[key] = val
    for tpl in wikicode.filter_templates(matches="{{cite"):
        # tpl is a citation template in the wikitext
        found_match = None
        # Match on external link
        if tpl.has_param("url"):
            # Use .value rather than splitting on "=", which would truncate URLs that contain "="
            external_url = str(tpl.get("url").value)
            for key, val in not_fully_parsed.items():
                if val["external_link"]:
                    if val["external_link"].strip() == external_url.strip():
                        found_match = key
                        break
        # If no match by URL, match by title
        if not found_match:
            if tpl.has_param("title"):
                # Get the title of the citation without formatting text
                title = str(tpl.get("title").value)
                title = re.sub('[^A-Za-z0-9 ]+', '', title)  # filter out extra formatting
                for key in not_fully_parsed.keys():
                    if title in key:
                        found_match = key
                        break
        if found_match:
            # Fetch the publisher/journal name from the wikitext
            if not parsed_citation[found_match]["publisher"]:
                publisher = None
                if tpl.has_param("journal"):  # journal name
                    publisher = str(tpl.get("journal").value)
                elif tpl.has_param("publisher"):  # website or book publisher
                    publisher = str(tpl.get("publisher").value)
                elif tpl.has_param("work"):  # news/magazine name
                    publisher = str(tpl.get("work").value)
                if publisher:
                    publisher = re.sub('[^A-Za-z0-9 ]+', '', publisher)
                    parsed_citation[found_match]["publisher"] = publisher
            # Fetch the publication date from the wikitext
            if not parsed_citation[found_match]["date"]:
                if tpl.has_param("date"):
                    date = str(tpl.get("date").value).strip()
                    if len(date) >= 4:  # at least 4 characters, e.g. a year or yy-mm
                        try:
                            parsed_citation[found_match]["date"] = parser.parse(date)
                        except (ValueError, OverflowError):
                            pass  # unparseable date strings stay None
    return parsed_citation
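
# For reference, a typical citation template in wikitext looks like this (made-up values):
#   {{cite journal |url=https://example.org/article |title=Some Title |journal=Some Journal |date=2020-01-15}}
# mwparserfromhell exposes each "|name=value" pair as a Parameter, so
# tpl.get("journal").value yields the value alone, without the parameter name.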
def eval_scholarly_sources(citation):
    """
    This function evaluates the tag for a scholarly source (journal, conference, or other type).
    Input:
        the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation (red, green, yellow, unknown)
    """
    # Read the dictionaries of flags from the JSON file
    with open("scholarly_flags.json", "r") as f:
        all_flags = json.load(f)
    # Check on the domain of the external link
    if citation["external_link"]:
        domain = tldextract.extract(citation["external_link"]).domain
        if domain in all_flags["red_scholarly_reverse"]:
            return "red"
        elif domain in all_flags["yellow_scholarly_reverse"]:
            return "yellow"
        elif domain in all_flags["green_scholarly_reverse"]:
            return "green"
    # Check on the name dictionary
    if citation["publisher"] in all_flags["red_scholarly"]:
        return "red"
    elif citation["publisher"] in all_flags["yellow_scholarly"]:
        return "yellow"
    elif citation["publisher"] in all_flags["green_scholarly"]:
        return "green"
    return "unknown"
def eval_non_scholarly_sources(citation, citation_val):
    """
    This function evaluates the tag for a non-scholarly source (web, book, or news type).
    Input:
        citation: the text version of the citation (the dictionary key)
        citation_val: the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation (red, green, yellow, unknown)
    """
    with open("non_scholarly_flags.json", "r") as f:
        non_scholarly_flags = json.load(f)
    # Check if a flagged source appears in either the citation text or the external link
    for key, val in non_scholarly_flags.items():
        for source in val:
            if citation_val["external_link"] and source in citation_val["external_link"]:
                return key
            elif source in citation:
                return key
    return "unknown"
def check_source_quality(wiki_url):
    """
    Go through each parsed citation and check it against the red-yellow-green tags.
    Return: red, yellow, green, and unknown lists containing the citations that belong to each category.
    """
    parsed = parse_match_wikitext(wiki_url)
    red_flag_list = []
    yellow_flag_list = []
    green_flag_list = []
    unknown_list = []
    for citation, val in parsed.items():
        verdict = None  # renamed from `eval` to avoid shadowing the builtin
        # Check journals/conferences/other against the scholarly flags
        if val["type"] in {"journal", "conference", "other"}:
            verdict = eval_scholarly_sources(val)
        # Check web/book/news against the non-scholarly flags ('other' is already handled above)
        elif val["type"] in {"web", "book", "news"}:
            verdict = eval_non_scholarly_sources(citation, val)
        if verdict == "red":
            red_flag_list.append((citation, val["publisher"]))
        elif verdict == "yellow":
            yellow_flag_list.append((citation, val["publisher"]))
        elif verdict == "green":
            green_flag_list.append((citation, val["publisher"]))
        elif verdict == "unknown":
            unknown_list.append((citation, val["publisher"]))
    return red_flag_list, yellow_flag_list, green_flag_list, unknown_list
# TEST
a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
print("Red flag sources:", a[0])
print("Yellow flag sources:", a[1])
print("Green sources:", a[2])
print("Undetermined sources:", a[3])