hlydecker committed on
Commit
93ee3ba
1 Parent(s): ca7bed3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -0
app.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from docx2pdf import convert
4
+ import subprocess
5
+ from platform import system
6
+ import logging
7
+ from pathlib import Path
8
+
9
class WordToPDFConverter:
    """
    A cross-platform Word to PDF converter that preserves formatting and hyperlinks.

    Uses docx2pdf for Windows/Mac and LibreOffice for Linux.
    """

    def __init__(self):
        """
        Detect the host platform, set up logging and check prerequisites.

        Raises:
            SystemError: On Linux, if the ``libreoffice`` executable cannot
                be launched.
        """
        self.platform = system()
        self.logger = self._setup_logger()

        # LibreOffice is only used on the Linux code path, so only check there.
        if self.platform == "Linux":
            self._verify_libreoffice()

    def _setup_logger(self) -> logging.Logger:
        """Return the 'WordToPDFConverter' logger, attaching a handler once."""
        logger = logging.getLogger('WordToPDFConverter')
        logger.setLevel(logging.INFO)

        # getLogger() hands back the same object for the same name, so only
        # add a handler when none is attached yet; otherwise every new
        # converter instance would duplicate each log line.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)

        return logger

    def _verify_libreoffice(self) -> None:
        """
        Verify LibreOffice is installed on Linux systems.

        Raises:
            SystemError: If the ``libreoffice`` executable is not found.
        """
        try:
            subprocess.run(
                ['libreoffice', '--version'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except FileNotFoundError as err:
            # Keep the original exception type for callers, but chain the
            # underlying cause so the traceback shows what actually failed.
            raise SystemError(
                "LibreOffice is required for Linux systems. "
                "Please install it using: sudo apt-get install libreoffice"
            ) from err

    def _convert_with_libreoffice(self, input_path, output_path=None) -> str:
        """
        Convert Word to PDF using LibreOffice (for Linux).

        Args:
            input_path: Path to the input Word document.
            output_path: Optional path for the output PDF. Defaults to the
                input path with a ``.pdf`` suffix.

        Returns:
            str: Absolute path to the created PDF file.

        Raises:
            RuntimeError: If LibreOffice exits with a non-zero status, or
                exits successfully without producing the expected PDF.
        """
        input_path = Path(input_path).absolute()

        if output_path:
            output_path = Path(output_path).absolute()
        else:
            output_path = input_path.with_suffix('.pdf')

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        cmd = [
            'libreoffice',
            '--headless',
            '--convert-to',
            'pdf',
            '--outdir',
            str(output_path.parent),
            str(input_path),
        ]

        try:
            process = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )

            if process.returncode != 0:
                raise RuntimeError(f"LibreOffice conversion failed: {process.stderr}")

            # LibreOffice creates PDF with the same name in the output directory
            created_pdf = output_path.parent / input_path.with_suffix('.pdf').name

            # A zero exit status does not prove a file was written; fail
            # loudly here instead of crashing in the rename below or
            # returning a path to a file that does not exist.
            if not created_pdf.is_file():
                raise RuntimeError(
                    "LibreOffice conversion failed: no PDF was produced at "
                    f"{created_pdf}. Output: {process.stdout} {process.stderr}"
                )

            # Move if a specific output path was requested; replace()
            # overwrites an existing target on every platform.
            if created_pdf != output_path:
                created_pdf.replace(output_path)

            return str(output_path)

        except Exception as e:
            self.logger.error("Error during LibreOffice conversion: %s", e)
            raise

    def convert_to_pdf(self, input_path, output_path=None) -> str:
        """
        Convert a Word document to PDF while preserving formatting and hyperlinks.

        Args:
            input_path (str): Path to the input Word document
            output_path (str, optional): Path for the output PDF. Defaults to
                the input path with its extension replaced by ``.pdf``.

        Returns:
            str: Path to the created PDF file

        Raises:
            FileNotFoundError: If ``input_path`` is not an existing file.
            Exception: Any error raised by the underlying converter is
                logged and re-raised unchanged.
        """
        input_path = os.path.abspath(input_path)

        # isfile() rather than exists(): a directory is not a document.
        if not os.path.isfile(input_path):
            raise FileNotFoundError(f"Word document not found: {input_path}")

        if output_path:
            output_path = os.path.abspath(output_path)
        else:
            output_path = os.path.splitext(input_path)[0] + '.pdf'

        try:
            if self.platform == "Linux":
                self.logger.info("Converting %s using LibreOffice...", input_path)
                return self._convert_with_libreoffice(input_path, output_path)

            # `convert` is the module-level import from docx2pdf.
            self.logger.info("Converting %s using docx2pdf...", input_path)
            convert(input_path, output_path)
            return output_path

        except Exception as e:
            self.logger.error("Conversion failed: %s", e)
            raise
127
+
128
def convert_word_to_pdf(input_file):
    """
    Gradio-friendly wrapper for Word to PDF conversion.

    Args:
        input_file (str): Path to the uploaded Word document

    Returns:
        str: Path to the converted PDF file

    Raises:
        gr.Error: If no file path was supplied or the conversion fails; the
            message is intended to be shown to the user.
    """
    # Guard against an empty submission (no path supplied) so the user gets
    # a clear message instead of a TypeError raised from os.path.splitext.
    if not input_file:
        raise gr.Error("Conversion failed: no Word document was uploaded.")

    try:
        converter = WordToPDFConverter()

        # Write the PDF next to the upload, same base name with a .pdf suffix.
        output_file = os.path.splitext(input_file)[0] + '.pdf'

        # Convert the file
        return converter.convert_to_pdf(input_file, output_file)
    except Exception as e:
        # Chain the cause so the server-side traceback keeps the real error.
        raise gr.Error(f"Conversion failed: {str(e)}") from e
150
+
151
+ # Create Gradio Interface
152
def create_gradio_interface():
    """
    Create a Gradio interface for Word to PDF conversion.

    Returns:
        gr.Interface: Configured Gradio interface
    """
    # Only advertise example documents that are actually present on disk;
    # example rows pointing at missing files cannot be loaded by the UI.
    example_paths = [
        "/path/to/sample1.docx",
        "/path/to/sample2.doc",
    ]
    available_examples = [[path] for path in example_paths if os.path.isfile(path)]

    interface = gr.Interface(
        fn=convert_word_to_pdf,
        inputs=gr.File(
            label="Upload Word Document",
            type="filepath",
            file_types=['.doc', '.docx'],
        ),
        outputs=gr.File(label="Download PDF"),
        title="Word to PDF Converter",
        description="Upload a Word document and convert it to PDF while preserving formatting and hyperlinks.",
        theme="soft",
        # Pass None (no examples section) when nothing is available.
        examples=available_examples or None,
    )

    return interface
173
+
174
+ # Launch the Gradio app
175
if __name__ == "__main__":
    # Build the UI only when run as a script, then start serving it.
    # share=True asks Gradio to also expose a public share link.
    app = create_gradio_interface()
    app.launch(share=True)