"""Solara app that converts an article PDF into a navigable HTML page.

The PDF is handed to ``GParse_Paper`` (project helper in ``utils``); its
output is parsed with BeautifulSoup's ``"xml"`` parser and turned into
section text, a navigation list and one block of details per citation.
"""
from pathlib import Path
import sys

HERE = Path(__file__).parent
# sys.path entries must be plain strings; non-string entries are ignored by
# the import machinery, so the Path has to be converted before appending.
sys.path.append(str(HERE))

import requests
import solara
from bs4 import BeautifulSoup, NavigableString, Tag
from packaging.version import Version, InvalidVersion
from solara.components.file_drop import FileInfo

from utils import GParse_Paper, Get_Bibliography, Normalize_Section

app_style = (HERE / "style.css").read_text()

DEMO_PDF_URL = "https://familymedicine.med.wayne.edu/mph/project/green_2006_narrative_literature_reviews.pdf"
DEMO_TIMEOUT_SECONDS = 30


def Get_HTMLTop(title):
    """Return the top part of the page, built around the document *title*."""
    return f"\n\n{title}\n\n"


def Get_Controls():
    """Return the controls fragment that is placed ahead of the navigation."""
    return " "


def Get_Sections(soup):
    """Build the section content from every ``div`` element in *soup*.

    Args:
        soup: parsed document; must support ``find_all("div")``.

    Returns:
        A 3-tuple ``(sections_list, sections_content, citation_modals)``:
        ``sections_list`` holds one ``{'num', 'text'}`` dict per div that has
        a ``head`` child, ``sections_content`` is the concatenated text of all
        sections, and ``citation_modals`` holds one details string per
        citation whose target is found in the bibliography.
    """
    sections_list = []
    content_parts = []
    citation_modals = []
    bib = Get_Bibliography(soup)

    for div in soup.find_all("div"):
        header = div.find("head")
        if header is not None:
            section_number = header.get("n", "")
            section_id = header.text.replace(" ", "_")
            sections_list.append(
                {"num": Normalize_Section(section_number), "text": section_id}
            )
            content_parts.append("\n")
            content_parts.append(f"\n\n{section_number} {header.text}\n\n")
        else:
            content_parts.append("\n")

        for paragraph in div.find_all("p"):
            paragraph_parts = []
            for element in paragraph.contents:
                if isinstance(element, NavigableString):
                    paragraph_parts.append(element)
                    continue

                # Only <ref target="#id"> elements can point at a
                # bibliography entry.
                ref_id = None
                if isinstance(element, Tag) and element.name == "ref":
                    target = element.get("target")
                    if target is not None:
                        ref_id = target.lstrip("#")

                if ref_id is not None and ref_id in bib:
                    cit_info = bib[ref_id]
                    paragraph_parts.append(element.text)
                    citation_modals.append(
                        f"\n{element.text}\n"
                        f"Title: {cit_info['title']}\n"
                        f"Authors: {', '.join(cit_info['authors'])}\n"
                        f"Year: {cit_info['year']}\n"
                        f"Journal: {cit_info['journal']}\n"
                        f"DOI: {cit_info['doi']}\n"
                    )
                else:
                    # Anything that is not rendered as a citation still
                    # contributes its plain text, so no paragraph content
                    # is silently dropped.
                    paragraph_parts.append(element.text)

            content_parts.append(f"\n\n{''.join(paragraph_parts)}\n\n")

        content_parts.append("\n")

    return sections_list, "".join(content_parts), citation_modals


def Get_Navigation(controls, sections_list):
    """Build the navigation fragment listing every section.

    Args:
        controls: fragment placed ahead of the "Navigation" heading.
        sections_list: dicts with ``'num'`` and ``'text'`` keys, as produced
            by ``Get_Sections``.

    Returns:
        The navigation fragment as a single string.
    """
    parts = ["\n" + controls + "\n\nNavigation\n\n"]
    for section in sections_list:
        no_tabs = section["num"].count(".")
        # 20px of indentation per nesting level; zero dots gives "0px".
        # NOTE(review): `left` is not interpolated into the entry below -
        # confirm whether the entry template is meant to use it.
        left = f"{20 * no_tabs}px"
        parts.append(f'\n\n{section["num"]} {section["text"]}\n\n')
    parts.append("\n")
    return "".join(parts)


def Get_Article_HTML(pdf):
    """Convert *pdf* into the final page string.

    Args:
        pdf: passed unchanged to ``GParse_Paper``; callers in this file pass
            either downloaded bytes or an uploaded file object.

    Returns:
        The top part, section content, navigation and citation details
        concatenated into one string.
    """
    article = GParse_Paper(pdf)
    soup = BeautifulSoup(article, "xml")

    try:
        document_title = soup.find("fileDesc").find("title").text
    except AttributeError:
        # find() returned None: the document has no fileDesc/title element.
        document_title = ""

    html_top = Get_HTMLTop(document_title)
    sections_list, sections_content, citation_modals = Get_Sections(soup)
    navigation = Get_Navigation(Get_Controls(), sections_list)

    # Combine all parts into the final page. The result is deliberately not
    # named `html`, which is the module-level reactive variable.
    return (
        "\n"
        + html_top
        + sections_content
        + navigation
        + "\n".join(citation_modals)
        + "\n"
    )


html = solara.reactive("\n\nArticle PDF to HTML converter\n\n")
xml = solara.reactive("")
loaded = solara.reactive(False)


@solara.component
def Page():
    """Render the converter: input controls, the result and a download link."""
    solara.Style(app_style)

    def show_article(pdf):
        # Shared by both input paths: convert, publish, mark as loaded.
        html.value = "Please wait..."
        html.value = Get_Article_HTML(pdf)
        loaded.value = True

    def on_file(f: FileInfo):
        show_article(f["file_obj"])

    def on_demo():
        try:
            response = requests.get(DEMO_PDF_URL, timeout=DEMO_TIMEOUT_SECONDS)
            response.raise_for_status()
        except requests.RequestException as err:
            # Report the failure instead of feeding an error page to the parser.
            html.value = f"Could not download the demo PDF: {err}"
            return
        show_article(response.content)

    if not loaded.value:
        solara.Button(label="Use Demo PDF", on_click=on_demo)
        solara.FileDrop(
            label="Drag and drop custom article pdf",
            on_file=on_file,
            lazy=True,
        )

    solara.HTML(unsafe_innerHTML=html.value)

    if loaded.value:
        # Pass the current string, not the reactive wrapper object.
        solara.FileDownload(
            html.value,
            filename="articledemo.html",
            label="Download HTML",
        )