File size: 6,210 Bytes
f0b6424
 
 
315e4ca
 
 
 
6cab06b
315e4ca
007ba69
d697159
e3c7c31
 
105978d
315e4ca
f0b6424
 
 
 
 
 
9537b09
f0b6424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2abddc0
 
f0b6424
b1646ac
f0b6424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cab06b
f0b6424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c79ebf
f0b6424
31cd46f
f0b6424
 
 
 
 
 
 
 
 
 
 
b950b09
f0b6424
 
 
 
 
 
 
63872ab
f0b6424
 
 
 
 
 
 
3c79ebf
f0b6424
 
 
 
47a068e
bdb6fd9
3c79ebf
f0b6424
 
d697159
93c85b9
2110f9e
 
f0b6424
 
 
 
33d6a09
e7287b1
a9f16c1
2110f9e
4301cea
 
d7d6da7
4301cea
 
 
89a55c6
677b606
4301cea
a15384c
 
af7c477
fb36c61
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import sys
from pathlib import Path

# Directory containing this file; used to locate sibling modules and assets.
HERE = Path(__file__).parent

# sys.path entries must be strings -- the import system ignores entries of
# any other type -- so convert the Path before appending it.
sys.path.append(str(HERE))

from utils import GParse_Paper, Get_Bibliography, Normalize_Section
import solara
from bs4 import BeautifulSoup
from solara.components.file_drop import FileInfo
from packaging.version import Version, InvalidVersion
from bs4 import NavigableString,Tag
import requests

# Stylesheet text passed to solara.Style() in Page(); read once at import time.
# The encoding is explicit so the result does not depend on the platform's
# locale default.
app_style = (HERE / "style.css").read_text(encoding="utf-8")


def Get_HTMLTop(title):
    """Return the opening HTML of the article: ``<body>``, heading, author span.

    Args:
        title: Document title shown in the ``<h1>``. It is converted to ``str``
            and HTML-escaped, because it is text extracted from a document
            rather than trusted markup.

    Returns:
        str: The HTML fragment. The ``<body>`` tag it opens is left unclosed;
        the caller is expected to close it.
    """
    # Imported locally so this function needs no change to the file's imports.
    from html import escape

    safe_title = escape(str(title))
    # Top part of HTML
    html_top = f"""
    <body>
    <h1>{safe_title}</h1>
    <span typeof="schema:Person" resource="http://orcid.org/0000-0003-1279-3709">
    </span>
    """
    return html_top

def Get_Controls():
    """Return the HTML for the reader controls.

    The snippet contains a text-size ``<select>`` (16px preselected) and two
    buttons that call ``toggleOpenDyslexic()`` and
    ``toggleAccessibleBackground()``. Those JavaScript functions, and
    ``adjustTextSize()``, are not defined in this function.

    Returns:
        str: The controls markup, starting with a newline and indented so it
        matches the layout of the surrounding generated HTML.
    """
    outer_pad = " " * 8
    option_pad = " " * 12
    font_sizes = (10, 12, 14, 16, 18, 20, 24, 28, 32, 36, 40, 44, 48, 50)
    preselected = 16

    rows = [
        "",  # the snippet starts with a line break
        outer_pad + '<label for="textSize">Text Size: </label>',
        outer_pad + '<select id="textSize" name="textSize" onchange="adjustTextSize(this.value)">',
    ]
    for size in font_sizes:
        marker = " selected" if size == preselected else ""
        rows.append(f'{option_pad}<option value="{size}"{marker}>{size}px</option>')
    rows.extend(
        [
            outer_pad + "</select>",
            outer_pad + '<button onclick="toggleOpenDyslexic()">Accessible Font</button>',
            outer_pad + '<button onclick="toggleAccessibleBackground()">Accessible Background</button>',
            outer_pad,  # trailing indentation after the last line break
        ]
    )
    return "\n".join(rows)


def _citation_modal(ref_id, label, cit_info):
    """Return the dialog markup describing one bibliography entry.

    Args:
        ref_id: Reference id; used as the dialog's ``id`` and in its close
            button handler.
        label: In-text citation label shown at the top of the dialog.
        cit_info: Bibliography record; the ``title``, ``authors`` (iterable),
            ``year``, ``journal`` and ``doi`` keys are read.

    Returns:
        str: A ``<div class="dialog">`` element with every value HTML-escaped.
    """
    from html import escape

    def esc(value):
        return escape(str(value))

    safe_id = esc(ref_id)
    authors = ", ".join(str(author) for author in cit_info['authors'])
    doi = esc(cit_info['doi'])
    return f"""<div id="{safe_id}" class="dialog">
                              <b>{esc(label)}</b><br>
                              <b>Title:</b> {esc(cit_info['title'])}<br>
                              <b>Authors:</b> {esc(authors)}<br>
                              <b>Year:</b> {esc(cit_info['year'])}<br>
                              <b>Journal:</b> {esc(cit_info['journal'])}<br>
                              <b>DOI:</b> <a href="https://doi.org/{doi}">{doi} </a><br>
                              <button class="close-button" onclick="closeDialog('{safe_id}')">Close</button>
                          </div>"""


def Get_Sections(soup):
    """Convert every ``<div>`` of the parsed document into an HTML ``<section>``.

    Each ``<div>`` becomes one ``<section>``. If the div has a ``<head>``, its
    text (spaces replaced by underscores) becomes the section id and an
    ``<h2>`` is emitted. Paragraph text is copied over; a ``<ref>`` whose
    target is found in the bibliography becomes a clickable span plus a
    dialog describing the cited work.

    All text taken from the document is HTML-escaped before being placed in
    the output, since it originates from an arbitrary parsed file.

    Args:
        soup: Parsed document (BeautifulSoup). It is also handed to
            ``Get_Bibliography``, whose result is used as a mapping from
            reference id to a record with ``title``, ``authors``, ``year``,
            ``journal`` and ``doi`` keys.

    Returns:
        tuple: ``(sections_list, sections_content, citation_modals)`` where
        ``sections_list`` is a list of ``{'num': ..., 'text': ...}`` dicts
        (``num`` from ``Normalize_Section``, ``text`` the unescaped section
        id), ``sections_content`` is the HTML string of all sections, and
        ``citation_modals`` is a list of dialog HTML strings, one per distinct
        cited reference.
    """
    # Imported locally so this function needs no change to the file's imports.
    from html import escape

    bib = Get_Bibliography(soup)
    sections_list = []
    chunks = []  # pieces of the sections HTML, joined once at the end
    citation_modals = []
    refs_with_modal = set()

    for div in soup.find_all("div"):
        header = div.find("head")
        if header is not None:
            section_number = header.get('n', "")
            section_id = header.text.replace(" ", "_")
            sections_list.append({'num': Normalize_Section(section_number), 'text': section_id})
            chunks.append(f"<section id='{escape(section_id)}'>")
            chunks.append(f"<h2>{escape(str(section_number))} {escape(header.text)}</h2>")
        else:
            chunks.append("<section id=''>")

        for paragraph in div.find_all("p"):
            pieces = []
            for element in paragraph.contents:
                if isinstance(element, NavigableString):
                    pieces.append(escape(str(element)))
                    continue

                ref_id = None
                if isinstance(element, Tag) and element.name == "ref":
                    target = element.get("target")
                    if target is not None:
                        ref_id = target.lstrip("#")

                if ref_id is None or ref_id not in bib:
                    # Not a resolvable citation: keep the visible text so the
                    # sentence stays intact instead of losing the reference.
                    pieces.append(escape(element.text))
                    continue

                # NOTE(review): ref_id is only HTML-escaped; it is assumed to
                # be a simple token that is safe inside the JS string literal.
                safe_id = escape(ref_id)
                pieces.append(
                    f"""<span class="text-area" onclick="openDialog(event, '{safe_id}')">{escape(element.text)}</span>"""
                )
                # One dialog per reference: repeating it would produce
                # duplicate element ids in the page.
                if ref_id not in refs_with_modal:
                    refs_with_modal.add(ref_id)
                    citation_modals.append(_citation_modal(ref_id, element.text, bib[ref_id]))

            chunks.append(f"<p>{''.join(pieces)}</p>")

        chunks.append("</section>")

    return sections_list, "".join(chunks), citation_modals

def Get_Navigation(controls, sections_list):
    """Build the sticky navigation panel: the controls plus one link per section.

    Args:
        controls: Trusted HTML for the reader controls; inserted unchanged.
        sections_list: Iterable of dicts with ``'num'`` (section number, whose
            count of ``"."`` sets the indent depth) and ``'text'`` (section id,
            used both as link target and as link label).

    Returns:
        str: A ``<div class='sticky-content'>`` element. Section numbers and
        ids are HTML-escaped because they come from the parsed document.
    """
    # Imported locally so this function needs no change to the file's imports.
    from html import escape

    parts = [
        "<div class='sticky-content' style='max-height: 100%; overflow-y: auto;'>"
        + controls
        + " <h2> Navigation </h2>"
    ]
    for section in sections_list:
        # 20px of indentation per nesting level ("1.2.3" -> two levels).
        indent_px = 20 * section['num'].count(".")
        number = escape(str(section['num']))
        label = escape(str(section['text']))
        parts.append(
            f'<p style="margin-left: {indent_px}px; font-size: 10px;">'
            f'<a href="#{label}">{number} {label}</a></p>'
        )
    parts.append("</div>")
    return "".join(parts)



def Get_Article_HTML(pdf):
    """Convert an article PDF into a single HTML string.

    The PDF is handed to ``GParse_Paper``; its result is parsed as XML and
    rendered into a title block, the sections, the navigation panel and the
    citation dialogs.

    Args:
        pdf: The PDF to convert, passed unchanged to ``GParse_Paper``
            (callers in this file pass an uploaded file object or raw bytes).

    Returns:
        str: The assembled ``<article id='article'>`` markup.
    """
    article = GParse_Paper(pdf)
    soup = BeautifulSoup(article, "xml")

    # find() returns None for a missing element, so a document without
    # <fileDesc> or <title> shows up as AttributeError; fall back to no title
    # without hiding unrelated errors.
    try:
        document_title = soup.find("fileDesc").find("title").text
    except AttributeError:
        document_title = ""

    html_top = Get_HTMLTop(document_title)
    sections_list, sections_content, citation_modals = Get_Sections(soup)
    controls = Get_Controls()
    navigation = Get_Navigation(controls, sections_list)

    # Combine all parts into final HTML. The dialogs are separated by line
    # breaks (the previous "n" separator put a literal letter between them).
    # NOTE(review): <body> is opened inside <article> by the top block and
    # closed after </article>; kept as-is to leave the page structure unchanged.
    page_html = (
        "<article id='article'>"
        + str(html_top)
        + str(sections_content)
        + str(navigation)
        + "\n".join(citation_modals)
        + "</article></body>"
    )

    return page_html


# Module-level reactive state used by Page().
# HTML currently rendered by Page(); starts as the landing heading and is
# replaced with the converted article.
html = solara.reactive("<h1> Article PDF to HTML converter </h1>")
# Not read or written anywhere in the visible part of this file.
xml = solara.reactive("")
# Set to True once an article has been converted; Page() uses it to swap the
# input widgets for the download button.
loaded = solara.reactive(False)

@solara.component
def Page():
    """Render the converter UI.

    Before an article is loaded, a demo button and a file drop area are shown.
    After conversion, the generated HTML is displayed together with a download
    button. State is kept in the module-level reactives ``html`` and
    ``loaded``.
    """
    solara.Style(app_style)

    demo_pdf_url = "https://familymedicine.med.wayne.edu/mph/project/green_2006_narrative_literature_reviews.pdf"

    def on_file(f: FileInfo):
        # Convert the dropped file and show the result.
        html.value = "Please wait..."
        html.value = Get_Article_HTML(f["file_obj"])
        loaded.value = True

    def on_demo():
        # Download the demo PDF, then convert it like an uploaded file.
        html.value = "Please wait..."
        try:
            # A timeout keeps the UI from hanging forever on a stalled
            # connection; raise_for_status() stops an HTTP error page from
            # being fed to the PDF parser.
            response = requests.get(demo_pdf_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException:
            html.value = "<p>Could not download the demo PDF. Please try again or upload a file.</p>"
            return
        html.value = Get_Article_HTML(response.content)
        loaded.value = True

    if not loaded.value:
        solara.Button(label="Use Demo PDF", on_click=on_demo)
        solara.FileDrop(label="Drag and drop custom article pdf", on_file=on_file, lazy=True)
    solara.HTML(unsafe_innerHTML=html.value)
    if loaded.value:
        # Pass the HTML string itself rather than the reactive wrapper.
        solara.FileDownload(html.value, filename="articledemo.html", label="Download HTML")