# %%
# Import all libraries
from bs4 import BeautifulSoup
import bibtexparser
from dateutil import parser
import json
import requests
import tldextract
from collections import defaultdict
import re
import mwparserfromhell
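
# All of the above are third-party packages; assuming a standard pip-based
# environment, they can be installed with:
#   pip install beautifulsoup4 bibtexparser python-dateutil requests tldextract mwparserfromhell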

# Given the DOI, PMID, or PMC number, fetch the journal's metadata
def get_metainfo_doi(doi):
    """Input: doi string
    Output: the journal name and date published of the article. Returns None for
    each value that can't be parsed.
    """
    res = requests.get("http://dx.doi.org/" + doi, headers={"Accept": "application/x-bibtex"})
    res = res.content.decode('utf-8')
    bibtext = bibtexparser.loads(res).entries
    if len(bibtext) > 0:
        journal = bibtext[0]["journal"].strip() if "journal" in bibtext[0] else None
        time_published = ""
        if "year" in bibtext[0]:
            time_published += bibtext[0]["year"]
        if "month" in bibtext[0]:
            time_published += " " + bibtext[0]["month"]
        if "day" in bibtext[0]:
            time_published += " " + bibtext[0]["day"]
        if len(time_published) > 0:
            time_published = parser.parse(time_published)
        else:
            time_published = None
        return journal, time_published
    else:
        return None, None
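
# A quick sanity check for the DOI lookup; "10.1000/182" (the DOI Handbook's
# own DOI) is used here purely as a placeholder -- substitute any DOI of interest:
#   journal, date = get_metainfo_doi("10.1000/182")
#   print(journal, date)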

def get_metainfo_pmc(pmc):
    """Input: PMC id string
    Output: the journal name and date published, or (None, None) on an error response.
    """
    res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id=" + pmc + "&retmode=json")
    res = res.content.decode("utf-8")
    res = json.loads(res)
    data = res["result"][pmc]
    if "error" in data:
        return None, None
    journal = data["fulljournalname"].strip()
    time_published = parser.parse(data["pubdate"])
    return journal, time_published

def get_metainfo_pmid(pmid):
    """Input: PMID string
    Output: the journal name and date published, or (None, None) on an error response.
    """
    res = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=" + pmid + "&retmode=json")
    res = res.content.decode("utf-8")
    res = json.loads(res)
    data = res["result"][pmid]
    if "error" in data:
        return None, None
    journal = data["fulljournalname"].strip()
    time_published = parser.parse(data["pubdate"])
    return journal, time_published

def parse_html(page_url):
    """This function parses citation metadata from the HTML tags of a Wikipedia page.
    Input: wiki_url
    Output: a dict of citations parsed from the HTML. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
        {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    """
    citation_types = {'web', 'journal', 'book', 'conference', 'news'}
    all_parsed_citations = defaultdict(dict)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Get all the references
    all_ref = []
    ordered_ref_lst = soup.find_all("ol", {"class": "references"})
    for each in ordered_ref_lst:
        refs = each.find_all("li")
        all_ref += refs
    for ele in all_ref:
        # Check if it has <span class="reference-text">
        ref = ele.find("span", {"class": "reference-text"})
        source_type = "other"  # default value for source_type
        if ref:
            # TASK: get all essential information from the citation tag
            citation_key = ref.get_text()
            hyperlink = ref.find("a", {"class": "external text"})
            external_link = hyperlink["href"] if hyperlink else None
            # TASK: find the source type, i.e. whether it's 'web', 'journal', 'book', 'conference', or 'news'
            cite_tag = ref.find("cite")
            if cite_tag:
                for class_tag in cite_tag["class"]:
                    if class_tag in citation_types:
                        source_type = class_tag
                        break
            # TASK: get the publisher (journal name for journal or conference, website domain for web, 'work' for news)
            # for journal, conference, and other, look for a DOI, PMID, or PMC id
            if source_type in {'journal', 'conference', 'other'}:
                has_doi = ref.find("a", {"title": "Doi (identifier)"})
                has_pmc = ref.find("a", {"title": "PMC (identifier)"})
                has_pmid = ref.find("a", {"title": "PMID (identifier)"})
                journal, date = None, None
                if has_doi:
                    doi = has_doi.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_doi(doi.text)
                elif has_pmc:
                    pmc = has_pmc.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmc(pmc.text)
                elif has_pmid:
                    pmid = has_pmid.find_next("a", {"class": "external text"})
                    journal, date = get_metainfo_pmid(pmid.text)
                # an 'other' source with no identifier falls back to the website domain
                if journal is None and source_type == "other" and external_link:
                    journal = tldextract.extract(external_link).domain
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": journal, "date": date}
            # for news, web, and book, the publisher is the domain of the website
            else:
                publisher = tldextract.extract(external_link).domain if external_link else None
                all_parsed_citations[citation_key] = {"external_link": external_link, "type": source_type, "html_tag": ele, "publisher": publisher, "date": None}
    return all_parsed_citations
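
# A spot-check of the HTML pass on its own (any English Wikipedia article URL
# should work here):
#   parsed = parse_html("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
#   for text, meta in list(parsed.items())[:3]:
#       print(meta["type"], "|", meta["publisher"], "|", meta["external_link"])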

# After finishing the HTML pass, we fetch the wikitext version of the page and match it
# against the HTML citations to extract more information about each one
# %%
def parse_match_wikitext(wiki_url):
    """
    This function parses the wikitext version of the citations, matches it with the HTML version,
    and extracts more information, such as publishers and dates that weren't available in the HTML.
    Input: wiki_url
    Output: a fully parsed citation dict. Each citation has format key: value
        key: the text version of the citation
        value: a dictionary with schema
        {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    """
    parsed_citation = parse_html(wiki_url)
    print("ALL citations:", len(parsed_citation))
    wiki_page = wiki_url.split("wiki/")[1]
    url = "https://en.wikipedia.org/w/index.php?title=" + wiki_page + "&action=raw"
    response = requests.get(url)
    text = response.text
    wikicode = mwparserfromhell.parse(text)
    # Collect the citations that aren't fully parsed yet, i.e. ones that lack a publisher or date
    not_fully_parsed = defaultdict(dict)
    for key, val in parsed_citation.items():
        if not val["publisher"] or not val["date"]:
            not_fully_parsed[key] = val
    for tpl in wikicode.filter_templates(matches="{{cite"):
        # tpl is a {{cite ...}} template in the wikitext
        found_match = None
        # Match on external link:
        if tpl.has_param("url"):
            external_url = str(tpl.get("url").value).strip()
            for key, val in not_fully_parsed.items():
                if val["external_link"]:
                    if val["external_link"].strip() == external_url:
                        found_match = key
                        break
        # if there is no match by URL, match by title
        if not found_match:
            if tpl.has_param("title"):
                # Get the title of the citation without formatting markup
                title = str(tpl.get("title").value)
                title = re.sub('[^A-Za-z0-9 ]+', '', title)  # filter out extra formatting
                for key in not_fully_parsed.keys():
                    if title in key:
                        found_match = key
                        break
        if found_match:
            # Fetch the publisher / journal name from the wikitext
            if not parsed_citation[found_match]["publisher"]:
                publisher = None
                if tpl.has_param("journal"):  # journal name
                    publisher = str(tpl.get("journal").value)
                elif tpl.has_param("publisher"):  # website or book publisher
                    publisher = str(tpl.get("publisher").value)
                elif tpl.has_param("work"):  # news / magazine name
                    publisher = str(tpl.get("work").value)
                if publisher:
                    publisher = re.sub('[^A-Za-z0-9 ]+', '', publisher)
                    parsed_citation[found_match]["publisher"] = publisher
            # Fetch the publication date from the wikitext
            if not parsed_citation[found_match]["date"]:
                if tpl.has_param("date"):
                    date = str(tpl.get("date").value).strip()
                    if len(date) >= 4:  # at least 4 digits for a year, or a yy-mm format
                        parsed_citation[found_match]["date"] = parser.parse(date)
    return parsed_citation
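
# For reference, this is how mwparserfromhell exposes template parameters; the
# snippet below is a toy example, not taken from a live page:
#   code = mwparserfromhell.parse("{{cite web |url=https://example.com/a |title=Example |date=2020-05-01}}")
#   tpl = code.filter_templates(matches="{{cite")[0]
#   str(tpl.get("url").value).strip()  # -> 'https://example.com/a'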

def eval_scholarly_sources(citation):
    """
    This function evaluates the tag for a scholarly source (journal, conference, or other type)
    Input:
        the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation (red, green, yellow, unknown)
    """
    # read the dictionaries of flags from the json file
    with open("scholarly_flags.json", "r") as f:
        all_flags = json.load(f)
    # Check on the domain of the external link
    if citation["external_link"]:
        domain = tldextract.extract(citation["external_link"]).domain
        if domain in all_flags["red_scholarly_reverse"]:
            return "red"
        elif domain in all_flags["yellow_scholarly_reverse"]:
            return "yellow"
        elif domain in all_flags["green_scholarly_reverse"]:
            return "green"
    # check on the name dictionary
    if citation["publisher"] in all_flags["red_scholarly"]:
        return "red"
    elif citation["publisher"] in all_flags["yellow_scholarly"]:
        return "yellow"
    elif citation["publisher"] in all_flags["green_scholarly"]:
        return "green"
    return "unknown"

def eval_non_scholarly_sources(citation, citation_val):
    """
    This function evaluates the tag for a non-scholarly source (web, book, or news type)
    Input:
        citation: the text version of the citation
        citation_val: the citation dictionary, which has format {"external_link": str, "type": str, "html_tag": HTMLElement, "publisher": str | None, "date": datetime | None}
    Output:
        the tag for the citation (red, green, yellow, unknown)
    """
    with open("non_scholarly_flags.json", "r") as f:
        non_scholarly_flags = json.load(f)
    # Check if a flagged source appears in either the external link or the citation text
    for key, val in non_scholarly_flags.items():
        for source in val:
            if citation_val["external_link"] and source in citation_val["external_link"]:
                return key
            elif source in citation:
                return key
    return "unknown"

def check_source_quality(wiki_url):
    """
    Go through each parsed citation and check it against the red-yellow-green tags.
    Return: red, yellow, green, and unknown lists holding the citations that belong to each category
    """
    parsed = parse_match_wikitext(wiki_url)
    red_flag_list = []
    yellow_flag_list = []
    green_flag_list = []
    unknown_list = []
    for citation, val in parsed.items():
        tag = None
        # Check journals / conferences / other against the scholarly flags
        if val["type"] in {"journal", "conference", "other"}:
            tag = eval_scholarly_sources(val)
        # Check web / book / news against the non-scholarly flags
        elif val["type"] in {"web", "book", "news"}:
            tag = eval_non_scholarly_sources(citation, val)
        if tag == "red":
            red_flag_list.append((citation, val["publisher"]))
        elif tag == "yellow":
            yellow_flag_list.append((citation, val["publisher"]))
        elif tag == "green":
            green_flag_list.append((citation, val["publisher"]))
        elif tag == "unknown":
            unknown_list.append((citation, val["publisher"]))
    return red_flag_list, yellow_flag_list, green_flag_list, unknown_list

# TEST
a = check_source_quality("https://en.wikipedia.org/wiki/Democratic_Party_of_Albania")
print("Red flag sources:", a[0])
print("Yellow flag sources:", a[1])
print("Green sources:", a[2])
print("Undetermined sources:", a[3])