pdf-toweb / app.py
Corran's picture
Update app.py
8e61481 verified
raw
history blame contribute delete
No virus
6.21 kB
import sys
from pathlib import Path

import requests
import solara
from bs4 import BeautifulSoup
from bs4 import NavigableString, Tag
from packaging.version import Version, InvalidVersion
from solara.components.file_drop import FileInfo

# Directory containing this file; used to locate the sibling ``utils`` module
# and ``style.css``.
HERE = Path(__file__).parent

# sys.path entries must be plain strings: the import machinery ignores any
# other type, so appending the Path object itself has no effect.
sys.path.append(str(HERE))

from utils import GParse_Paper, Get_Bibliography, Normalize_Section  # noqa: E402

# Stylesheet text injected into the page by the Page component.
app_style = (HERE / "style.css").read_text()
def Get_HTMLTop(title):
    """Return the opening HTML fragment of the article page.

    The fragment opens ``<body>``, renders *title* as the ``<h1>`` heading and
    appends an empty schema.org Person span carrying a hard-coded ORCID.

    Args:
        title: Document title. It is converted with ``str()`` and HTML-escaped,
            because it is extracted from an uploaded PDF and may contain
            ``<``, ``&`` or quote characters.

    Returns:
        The HTML fragment as a string.
    """
    # Imported locally: the module-level name ``html`` is rebound to a solara
    # reactive in this file, so the stdlib module cannot be used under that
    # name at module scope.
    from html import escape

    safe_title = escape(str(title))
    # Top part of HTML
    html_top = (
        "\n"
        "<body>\n"
        f"<h1>{safe_title}</h1>\n"
        '<span typeof="schema:Person" resource="http://orcid.org/0000-0003-1279-3709">\n'
        "</span>\n"
    )
    return html_top
def Get_Controls():
    """Return the HTML for the reader controls.

    The markup consists of a text-size ``<select>`` (10px to 50px, with 16px
    preselected) followed by two buttons that call ``toggleOpenDyslexic()``
    and ``toggleAccessibleBackground()``. The string starts and ends with a
    newline.
    """
    sizes = (10, 12, 14, 16, 18, 20, 24, 28, 32, 36, 40, 44, 48, 50)
    preselected = 16

    option_rows = []
    for size in sizes:
        marker = " selected" if size == preselected else ""
        option_rows.append(f'<option value="{size}"{marker}>{size}px</option>')

    # The empty first and last entries produce the leading and trailing
    # newline of the markup.
    rows = [
        "",
        '<label for="textSize">Text Size: </label>',
        '<select id="textSize" name="textSize" onchange="adjustTextSize(this.value)">',
        *option_rows,
        "</select>",
        '<button onclick="toggleOpenDyslexic()">Accessible Font</button>',
        '<button onclick="toggleAccessibleBackground()">Accessible Background</button>',
        "",
    ]
    return "\n".join(rows)
def Get_Sections(soup):
    """Render every ``<div>`` of the parsed article as an HTML ``<section>``.

    Each ``<div>`` becomes one ``<section>``. A ``<head>`` child, when present,
    supplies the section heading and id (heading text with spaces replaced by
    underscores). Every ``<p>`` inside the div becomes a ``<p>``; inline
    ``<ref target="#id">`` elements whose id is found in the bibliography are
    turned into clickable spans that open a citation dialog.

    All text taken from the document is HTML-escaped, since it originates from
    an uploaded PDF.

    Args:
        soup: Parsed article tree (BeautifulSoup).

    Returns:
        A 3-tuple ``(sections_list, sections_content, citation_modals)``:
        ``sections_list`` is a list of ``{'num', 'text'}`` dicts (normalized
        section number and unescaped section id) for sections with a heading;
        ``sections_content`` is the HTML of all sections; ``citation_modals``
        is a list of dialog ``<div>`` strings, one per cited reference.
    """
    # Imported locally: the module-level name ``html`` is rebound to a solara
    # reactive in this file.
    from html import escape

    # Assumed to map reference ids to dicts with the keys 'title', 'authors'
    # (iterable of strings), 'year', 'journal' and 'doi' -- verify against
    # Get_Bibliography.
    bib = Get_Bibliography(soup)

    sections_list = []
    content_parts = []
    citation_modals = []
    # Reference ids that already have a dialog; emitting one dialog per
    # citation occurrence would produce duplicate element ids.
    refs_with_modal = set()

    # Generate sections from divs
    for div in soup.find_all("div"):
        header = div.find("head")
        if header is not None:
            section_number = header.get("n", "")
            section_id = header.text.replace(" ", "_")
            sections_list.append(
                {"num": Normalize_Section(section_number), "text": section_id}
            )
            content_parts.append(f"<section id='{escape(section_id)}'>")
            content_parts.append(
                f"<h2>{escape(str(section_number))} {escape(header.text)}</h2>"
            )
        else:
            content_parts.append("<section id=''>")

        for paragraph in div.find_all("p"):
            fragments = []
            for element in paragraph.contents:
                if isinstance(element, NavigableString):
                    fragments.append(escape(str(element)))
                    continue

                ref_id = None
                if isinstance(element, Tag) and element.name == "ref":
                    target = element.get("target")
                    if target is not None:
                        ref_id = target.lstrip("#")

                # Anything that is not a resolvable citation (other inline
                # tags, refs without a target, refs missing from the
                # bibliography) contributes its plain text so nothing is lost.
                if ref_id is None or ref_id not in bib:
                    fragments.append(escape(element.text))
                    continue

                safe_id = escape(ref_id)
                label = escape(element.text)
                fragments.append(
                    f"""<span class="text-area" onclick="openDialog(event, '{safe_id}')">{label}</span>"""
                )

                if ref_id in refs_with_modal:
                    continue
                refs_with_modal.add(ref_id)

                cit_info = bib[ref_id]
                authors = escape(", ".join(cit_info["authors"]))
                doi = escape(str(cit_info["doi"]))
                citation_modals.append(
                    "\n".join(
                        [
                            f'<div id="{safe_id}" class="dialog">',
                            f"<b>{label}</b><br>",
                            f"<b>Title:</b> {escape(str(cit_info['title']))}<br>",
                            f"<b>Authors:</b> {authors}<br>",
                            f"<b>Year:</b> {escape(str(cit_info['year']))}<br>",
                            f"<b>Journal:</b> {escape(str(cit_info['journal']))}<br>",
                            f'<b>DOI:</b> <a href="https://doi.org/{doi}">{doi} </a><br>',
                            f"""<button class="close-button" onclick="closeDialog('{safe_id}')">Close</button>""",
                            "</div>",
                        ]
                    )
                )
            content_parts.append(f"<p>{''.join(fragments)}</p>")

        content_parts.append("</section>")

    return sections_list, "".join(content_parts), citation_modals
def Get_Navigation(controls, sections_list):
    """Build the sticky navigation panel listing all sections.

    Args:
        controls: HTML string placed at the top of the panel, inserted as-is.
        sections_list: Iterable of dicts with the keys ``'num'`` (section
            number; the count of ``.`` characters determines the indent) and
            ``'text'`` (section id used as both link target and link label).

    Returns:
        The navigation ``<div>`` as an HTML string. Section numbers and ids
        are HTML-escaped because they are derived from document text.
    """
    # Imported locally: the module-level name ``html`` is rebound to a solara
    # reactive in this file.
    from html import escape

    # Generate navigation for sections
    parts = [
        "<div class='sticky-content' style='max-height: 100%; overflow-y: auto;'>",
        controls,
        " <h2> Navigation </h2>",
    ]
    for section in sections_list:
        number = str(section["num"])
        anchor = escape(str(section["text"]))
        # One indent step (20px) per nesting level; adjust the multiplier for
        # the desired tab width.
        left = f"{20 * number.count('.')}px"
        parts.append(
            f'<p style="margin-left: {left}; font-size: 10px;">'
            f'<a href="#{anchor}">{escape(number)} {anchor}</a></p>'
        )
    parts.append("</div>")
    return "".join(parts)
def Get_Article_HTML(pdf):
    """Convert an article PDF into one HTML string.

    The PDF is handed to ``GParse_Paper``; its result is parsed as XML and
    rendered into a title block, the article sections, the navigation panel
    and the citation dialogs.

    Args:
        pdf: The PDF to convert, passed unchanged to ``GParse_Paper``.

    Returns:
        The assembled HTML string.
    """
    article = GParse_Paper(pdf)
    soup = BeautifulSoup(article, "xml")

    # find() returns None for a missing element, so a document without
    # <fileDesc> or <title> raises AttributeError; use an empty title then.
    try:
        document_title = soup.find("fileDesc").find("title").text
    except AttributeError:
        document_title = ""

    html_top = Get_HTMLTop(document_title)
    sections_list, sections_content, citation_modals = Get_Sections(soup)
    controls = Get_Controls()
    navigation = Get_Navigation(controls, sections_list)

    # Combine all parts into final HTML. The citation dialogs are separated by
    # newlines (the separator used to be the literal letter "n", which leaked
    # stray characters into the page).
    # NOTE(review): Get_HTMLTop opens <body> inside <article>, while the tags
    # are closed here in the opposite order; left as-is.
    return (
        "<article id='article'>"
        + str(html_top)
        + str(sections_content)
        + str(navigation)
        + "\n".join(citation_modals)
        + "</article></body>"
    )
# Reactive state shared with the Page component.
# HTML currently shown on the page; starts as the landing heading and is
# replaced by the converted article. NOTE: this rebinds the module-level name
# ``html``, so the standard-library ``html`` module is not usable under that
# name at module scope in this file.
html = solara.reactive("<h1> Article PDF to HTML converter </h1>")
# Not read or written anywhere in the visible part of this file.
xml = solara.reactive("")
# Set to True by Page's callbacks once an article has been converted.
loaded = solara.reactive(False)
@solara.component
def Page():
    """Main page: pick a PDF (upload or demo), show and download the HTML.

    Until an article has been converted, a demo button and a file drop area
    are shown. The current HTML is always rendered; once a conversion has
    finished a download button for the generated HTML appears.
    """
    solara.Style(app_style)

    def on_file(f: FileInfo):
        """Convert an uploaded PDF and publish the resulting HTML."""
        html.value = "Please wait..."
        html.value = Get_Article_HTML(f["file_obj"])
        loaded.value = True

    def on_demo():
        """Download the demo PDF, convert it and publish the resulting HTML."""
        demo_url = "https://familymedicine.med.wayne.edu/mph/project/green_2006_narrative_literature_reviews.pdf"
        html.value = "Please wait..."
        try:
            # A timeout keeps the callback from hanging on an unresponsive
            # host; raise_for_status() stops an HTTP error page from being
            # fed to the PDF parser.
            response = requests.get(demo_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException:
            html.value = "<p>Could not download the demo PDF. Please try again or upload a file.</p>"
            return
        html.value = Get_Article_HTML(response.content)
        loaded.value = True

    if not loaded.value:
        solara.Button(label="Use Demo PDF", on_click=on_demo)
        solara.FileDrop(label="Drag and drop custom article pdf", on_file=on_file, lazy=True)
    solara.HTML(unsafe_innerHTML=html.value)
    if loaded.value:
        # Pass the HTML string itself rather than the reactive wrapper.
        solara.FileDownload(html.value, filename="articledemo.html", label="Download HTML")