Spaces:
Sleeping
Sleeping
import re | |
from fpdf import FPDF | |
class PDF(FPDF):
    """FPDF document with an empty header band and a page-number footer."""

    def header(self):
        """Write an empty, centred bold cell and move to the next line."""
        title = ""
        self.set_font("Arial", "B", 12)
        self.cell(0, 10, title, 0, 1, "C")

    def footer(self):
        """Write a centred "Page N" label in small italics near the page bottom."""
        # A negative Y is passed so the position is taken relative to the
        # bottom of the page rather than the top.
        self.set_y(-15)
        self.set_font("Arial", "I", 8)
        page_label = f"Page {self.page_no()}"
        self.cell(0, 10, page_label, 0, 0, "C")
def sanitize_content(content: str) -> str:
    """Return *content* with code points that cannot be UTF-8 encoded removed.

    In practice this only strips lone surrogates (e.g. ``'\\ud800'``); every
    other character, including non-ASCII text, passes through unchanged.

    Args:
        content: Text to clean.

    Returns:
        The cleaned text.
    """
    # The 'ignore' error handler drops unencodable code points instead of
    # raising, and the resulting bytes are always valid UTF-8, so this
    # round-trip cannot fail with a Unicode error. The former
    # ``except UnicodeEncodeError`` ASCII fallback was therefore unreachable
    # and has been removed.
    return content.encode('utf-8', 'ignore').decode('utf-8')
def replace_problematic_characters(content):
    """Swap common typographic Unicode characters for ASCII look-alikes.

    Characters not listed in the table are returned unchanged.
    """
    ascii_lookalikes = str.maketrans({
        '\u2013': '-',    # en dash
        '\u2014': '--',   # em dash
        '\u2018': "'",    # left single quotation mark
        '\u2019': "'",    # right single quotation mark
        '\u201c': '"',    # left double quotation mark
        '\u201d': '"',    # right double quotation mark
        '\u2026': '...',  # horizontal ellipsis
        '\u2010': '-',    # hyphen
        '\u2022': '*',    # bullet
        '\u2122': 'TM',   # trade mark sign
    })
    # None of the replacements contains a character that is itself a key, so
    # a single translate pass gives the same result as chained replaces.
    return content.translate(ascii_lookalikes)
def generate_pdf_from_md(content, filename='output.pdf'):
    """Render a small subset of Markdown to a PDF file.

    Supported syntax: ``#`` headers (levels 1-4, deeper levels are rendered
    as level 4), ``***bold italic***``, ``**bold**``, ``*italic*`` and
    ``[text](url)`` links. Everything else is written as plain text, one
    PDF line per input line.

    Args:
        content: Markdown source text.
        filename: Path of the PDF file to write.

    Returns:
        ``"PDF generated: <filename>"`` on success, otherwise
        ``"Error generating PDF: <reason>"``. Failures are reported through
        the return value; no exception is propagated to the caller.
    """
    # Compiled once here instead of being rebuilt for every line/part.
    inline_token = re.compile(
        r'(\*\*\*.*?\*\*\*|\*\*.*?\*\*|\*.*?\*|\[.*?\]\(.*?\)|\([^ ]+?\))'
    )
    # Ordered from most to fewest asterisks so '***x***' is not taken as bold.
    emphasis_styles = (
        (re.compile(r'\*\*\*.*?\*\*\*'), 'BI'),
        (re.compile(r'\*\*.*?\*\*'), 'B'),
        (re.compile(r'\*.*?\*'), 'I'),
    )
    link_token = re.compile(r'\[(.*?)\]\((.*?)\)')
    closing_hashes = re.compile(r'(?:^|\s+)#+$')

    try:
        pdf = PDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font('Arial', '', 12)

        text = replace_problematic_characters(sanitize_content(content))
        # The built-in "Arial" font can only encode Latin-1. Downgrade any
        # remaining character to '?' so one stray symbol does not make the
        # whole document fail. This runs after the typographic replacements
        # so dashes and quotes keep their ASCII look-alikes.
        text = text.encode('latin-1', 'replace').decode('latin-1')

        for line in text.split('\n'):
            if line.startswith('#'):
                # Only the leading run of '#' sets the level; a '#' inside
                # the title (e.g. "# C# notes") must not change it.
                after_hashes = line.lstrip('#')
                header_level = min(len(line) - len(after_hashes), 4)
                # Drop an optional Markdown closing run ("# Title ##") but
                # keep a '#' attached to the title text ("# C#").
                header_text = closing_hashes.sub('', after_hashes.strip())
                header_text = re.sub(r'\*{2,}', '', header_text)
                pdf.set_font('Arial', 'B', 12 + (4 - header_level) * 2)
                pdf.multi_cell(0, 10, header_text)
                pdf.set_font('Arial', '', 12)
                continue

            for part in inline_token.split(line):
                if not part:
                    # re.split yields empty strings around adjacent tokens.
                    continue

                style = next(
                    (name for pattern, name in emphasis_styles
                     if pattern.match(part)),
                    None,
                )
                link = link_token.match(part)

                if style is not None:
                    pdf.set_font('Arial', style, 12)
                    pdf.write(10, part.strip('*'))
                elif link:
                    # Take label and URL from one anchored match so that
                    # parentheses inside the label are not mistaken for
                    # the URL.
                    display_text, url = link.group(1), link.group(2)
                    pdf.set_text_color(0, 0, 255)  # blue link text
                    pdf.set_font('', 'U')
                    pdf.write(10, display_text, url)
                else:
                    pdf.write(10, part)

                # Return to the default look before the next fragment.
                pdf.set_text_color(0, 0, 0)
                pdf.set_font('Arial', '', 12)
            pdf.ln(10)

        pdf.output(filename)
        return f"PDF generated: {filename}"
    except Exception as e:
        # Boundary handler: callers receive a status string, not a raise.
        return f"Error generating PDF: {e}"