pdf-toweb

Sleeping

App Files Files Community

pdf-toweb / app.py

Corran

Update app.py

8e61481 verified 4 months ago

raw

history blame contribute delete

No virus

6.21 kB

	from pathlib import Path

	# Directory that contains this file; used to locate sibling modules and assets.
	HERE = Path(__file__).parent

	import sys
	# sys.path entries must be plain strings: the import system ignores entries
	# of any other type, so a bare Path object would never be searched.
	sys.path.append(str(HERE))

	from utils import GParse_Paper, Get_Bibliography, Normalize_Section
	import solara
	from bs4 import BeautifulSoup
	from solara.components.file_drop import FileInfo
	from packaging.version import Version, InvalidVersion
	from bs4 import NavigableString,Tag
	import requests

	# Stylesheet text handed to solara.Style by the Page component. Decoded
	# explicitly as UTF-8 so the result does not depend on the platform's
	# default text encoding.
	app_style = (HERE / "style.css").read_text(encoding="utf-8")


	def Get_HTMLTop(title):
	    """Return the opening HTML of the article.

	    The fragment opens ``<body>``, renders *title* as an ``<h1>`` heading and
	    adds an empty ``schema:Person`` span pointing at an ORCID resource.

	    Args:
	        title: Document title as plain text. It is HTML-escaped before being
	            placed in the heading, so characters such as ``&`` or ``<`` in a
	            title cannot break or inject markup.

	    Returns:
	        str: The HTML fragment. The ``<body>`` tag it opens is not closed here.
	    """
	    # Imported locally: the module-level name ``html`` is bound to a solara
	    # reactive further down this file, so a top-level ``import html`` would clash.
	    from html import escape

	    # Top part of HTML
	    html_top = f"""
	<body>
	<h1>{escape(str(title))}</h1>
	<span typeof="schema:Person" resource="http://orcid.org/0000-0003-1279-3709">
	</span>
	"""
	    return html_top

	def Get_Controls():
	    """Return the static HTML for the reader controls.

	    The markup contains a text-size ``<select>`` (10px to 50px, 16px
	    pre-selected) wired to ``adjustTextSize`` and two buttons wired to
	    ``toggleOpenDyslexic`` and ``toggleAccessibleBackground``. Those
	    JavaScript functions are not defined in this function.

	    Returns:
	        str: The controls markup.
	    """
	    return """
	<label for="textSize">Text Size: </label>
	<select id="textSize" name="textSize" onchange="adjustTextSize(this.value)">
	<option value="10">10px</option>
	<option value="12">12px</option>
	<option value="14">14px</option>
	<option value="16" selected>16px</option>
	<option value="18">18px</option>
	<option value="20">20px</option>
	<option value="24">24px</option>
	<option value="28">28px</option>
	<option value="32">32px</option>
	<option value="36">36px</option>
	<option value="40">40px</option>
	<option value="44">44px</option>
	<option value="48">48px</option>
	<option value="50">50px</option>
	</select>
	<button onclick="toggleOpenDyslexic()">Accessible Font</button>
	<button onclick="toggleAccessibleBackground()">Accessible Background</button>
	"""


	def Get_Sections(soup):
	    """Convert the ``<div>`` elements of a parsed document into HTML sections.

	    Every ``<div>`` found in *soup* becomes one ``<section>``. A div with a
	    ``<head>`` child gets an ``<h2>`` heading and an id derived from the head
	    text (spaces replaced by underscores); a div without one gets an empty id.
	    Each ``<p>`` inside the div is rendered as a paragraph. A ``<ref>`` whose
	    ``target`` (minus leading ``#``) is a key of the bibliography becomes a
	    clickable span that opens a citation dialog; any other inline element is
	    reduced to its text.

	    All text taken from the document is HTML-escaped before it is inserted
	    into the generated markup.

	    Args:
	        soup: Parsed document exposing the BeautifulSoup API (``find_all``,
	            ``find``); also passed to ``Get_Bibliography``.

	    Returns:
	        tuple: ``(sections_list, sections_content, citation_modals)``

	        * ``sections_list`` - list of ``{'num': ..., 'text': ...}`` dicts, one
	          per div that has a ``<head>``; ``num`` is the result of
	          ``Normalize_Section`` on the head's ``n`` attribute and ``text`` is
	          the (unescaped) section id.
	        * ``sections_content`` - str with the HTML of all sections.
	        * ``citation_modals`` - list of str, one dialog ``<div>`` per distinct
	          cited reference.
	    """
	    # Imported locally: the module-level name ``html`` is bound to a solara
	    # reactive further down this file, so a top-level ``import html`` would clash.
	    from html import escape

	    bib = Get_Bibliography(soup)
	    sections_list = []
	    section_parts = []      # joined once at the end instead of repeated ``+=``
	    citation_modals = []
	    rendered_modals = set()  # reference ids that already have a dialog

	    for div in soup.find_all("div"):
	        header = div.find("head")
	        if header is not None:
	            section_number = header.get('n', "")
	            section_id = header.text.replace(" ", "_")
	            sections_list.append({'num': Normalize_Section(section_number), 'text': section_id})
	            section_parts.append(f"<section id='{escape(section_id)}'>")
	            section_parts.append(f"<h2>{escape(str(section_number))} {escape(header.text)}</h2>")
	        else:
	            section_parts.append("<section id=''>")

	        for paragraph in div.find_all("p"):
	            pieces = []
	            for element in paragraph.contents:
	                if isinstance(element, NavigableString):
	                    pieces.append(escape(element, quote=False))
	                    continue

	                ref_id = None
	                if isinstance(element, Tag) and element.name == "ref" and element.get("target") is not None:
	                    ref_id = element.get("target").lstrip("#")

	                if ref_id is None or ref_id not in bib:
	                    # Not a resolvable citation: keep the element's text so
	                    # that no part of the paragraph is silently lost.
	                    pieces.append(escape(element.text, quote=False))
	                    continue

	                safe_id = escape(ref_id)
	                label = escape(element.text, quote=False)
	                pieces.append(f"""<span class="text-area" onclick="openDialog(event, '{safe_id}')">{label}</span>""")

	                # A reference cited several times needs only one dialog;
	                # repeating it would produce duplicate element ids.
	                if ref_id in rendered_modals:
	                    continue
	                rendered_modals.add(ref_id)

	                cit_info = bib[ref_id]
	                doi = escape(str(cit_info['doi']))
	                citation_modals.append(f"""<div id="{safe_id}" class="dialog">
	<b>{label}</b><br>
	<b>Title:</b> {escape(str(cit_info['title']))}<br>
	<b>Authors:</b> {escape(", ".join(cit_info['authors']))}<br>
	<b>Year:</b> {escape(str(cit_info['year']))}<br>
	<b>Journal:</b> {escape(str(cit_info['journal']))}<br>
	<b>DOI:</b> <a href="https://doi.org/{doi}">{doi} </a><br>
	<button class="close-button" onclick="closeDialog('{safe_id}')">Close</button>
	</div>""")

	            section_parts.append(f"<p>{''.join(pieces)}</p>")

	        section_parts.append("</section>")

	    sections_content = "".join(section_parts)
	    return sections_list, sections_content, citation_modals

	def Get_Navigation(controls,sections_list):
	    """Build the sticky navigation panel.

	    The panel contains the *controls* markup, a "Navigation" heading and one
	    link per section. Each link is indented by 20px per dot in the section
	    number, so a section numbered ``1.2.3`` is indented by 40px and one without
	    dots by 0px.

	    Args:
	        controls: HTML markup inserted verbatim at the top of the panel.
	        sections_list: Iterable of dicts with string values under ``'num'``
	            (section number) and ``'text'`` (section id used as link target
	            and link label). Both values are HTML-escaped before insertion.

	    Returns:
	        str: The navigation ``<div>``.
	    """
	    # Imported locally: the module-level name ``html`` is bound to a solara
	    # reactive further down this file, so a top-level ``import html`` would clash.
	    from html import escape

	    # Generate navigation for sections
	    parts = ["<div class='sticky-content' style='max-height: 100%; overflow-y: auto;'>" + controls + " <h2> Navigation </h2>"]
	    for section in sections_list:
	        number = section['num']
	        # Adjust the multiplier for desired tab width
	        left = f"{20 * number.count('.')}px"
	        anchor = escape(section["text"])
	        parts.append(f'<p style="margin-left: {left}; font-size: 10px;"><a href="#{anchor}">{escape(number)} {anchor}</a></p>')

	    parts.append("</div>")
	    return "".join(parts)



	def Get_Article_HTML(pdf):
	    """Convert an article PDF into a single HTML string.

	    The PDF is handed to ``GParse_Paper``; its result is parsed as XML and
	    turned into a title block, the article sections, a navigation panel and
	    the citation dialogs, which are then concatenated.

	    Args:
	        pdf: The PDF in whatever form ``GParse_Paper`` accepts. The callers in
	            this file pass an uploaded file object or raw bytes.

	    Returns:
	        str: The assembled article HTML.
	    """
	    article = GParse_Paper(pdf)

	    soup = BeautifulSoup(article, "xml")

	    # find() returns None for a missing element, so a document without
	    # <fileDesc> or <title> shows up as AttributeError; use an empty title then.
	    try:
	        document_title = soup.find("fileDesc").find("title").text
	    except AttributeError:
	        document_title = ""

	    html_top = Get_HTMLTop(document_title)
	    sections_list, sections_content, citation_modals = Get_Sections(soup)
	    controls = Get_Controls()
	    navigation = Get_Navigation(controls, sections_list)

	    # Combine all parts into final HTML. The dialogs are separated by newlines.
	    # NOTE(review): the <body> opened by Get_HTMLTop sits inside <article>, yet
	    # </article> is closed before </body>; the nesting is left unchanged here.
	    page = (
	        "<article id='article'>"
	        + html_top
	        + sections_content
	        + navigation
	        + "\n".join(citation_modals)
	        + "</article></body>"
	    )

	    return page


	# Reactive application state read and written by the Page component.
	# HTML currently rendered on the page; starts as the landing heading.
	html = solara.reactive("<h1> Article PDF to HTML converter </h1>")
	# NOTE(review): not read or written anywhere in the visible part of this file.
	xml = solara.reactive("")
	# Set to True once an article has been converted; Page uses it to decide
	# which widgets to show.
	loaded = solara.reactive(False)

	@solara.component
	def Page():
	    """Render the converter page.

	    Before an article is loaded the page offers a demo button and a file drop
	    area. The current HTML is always rendered, and once an article has been
	    converted a download button for that HTML is shown.
	    """
	    solara.Style(app_style)

	    def on_file(f: FileInfo):
	        # Convert the dropped PDF and publish the result.
	        html.value = "Please wait..."
	        html.value = Get_Article_HTML(f["file_obj"])
	        loaded.value = True

	    def on_demo():
	        # Fetch the demo PDF and convert it like an uploaded file.
	        html.value = "Please wait..."
	        try:
	            # A timeout keeps the handler from hanging forever on a stalled
	            # connection; raise_for_status stops an HTTP error page from
	            # being fed to the PDF parser.
	            response = requests.get(
	                "https://familymedicine.med.wayne.edu/mph/project/green_2006_narrative_literature_reviews.pdf",
	                timeout=30,
	            )
	            response.raise_for_status()
	        except requests.RequestException:
	            html.value = "<p>Could not download the demo PDF. Please try again or upload a file.</p>"
	            return
	        html.value = Get_Article_HTML(response.content)
	        loaded.value = True

	    if not loaded.value:
	        solara.Button(label="Use Demo PDF", on_click=on_demo)
	        solara.FileDrop(label="Drag and drop custom article pdf", on_file=on_file, lazy=True)
	    solara.HTML(unsafe_innerHTML=html.value)
	    if loaded.value:
	        # Pass the string itself rather than the reactive wrapper around it.
	        solara.FileDownload(html.value, filename="articledemo.html", label="Download HTML")