Spaces:

awacke1
/

Pillow-PyMuPDF-ReportLab

Sleeping

App Files Files Community

Pillow-PyMuPDF-ReportLab / app.py

awacke1

Update app.py

d2cd664 verified about 1 month ago

raw

history blame

10.1 kB

	import streamlit as st
	import base64
	from reportlab.lib.pagesizes import A4
	from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
	from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
	from reportlab.lib import colors
	import io
	import re
	import fitz # PyMuPDF
	from PIL import Image

	# Configure the Streamlit page: use the wide layout and start with the
	# sidebar (which holds the font-size settings) collapsed.
	st.set_page_config(layout="wide", initial_sidebar_state="collapsed")

	# Initial markdown content used to seed the editor/session state.
	# Structure understood by markdown_to_pdf_content: a "# " title line
	# (ignored by the parser), "## " section headings, numbered items
	# ("1. ..."), and "- " sub-items belonging to the preceding numbered item.
	default_markdown = """# Cutting-Edge ML Outline

	## Core ML Techniques
	1. 🌟 Mixture of Experts (MoE)
	- Conditional computation techniques
	- Sparse gating mechanisms
	- Training specialized sub-models

	2. 🔥 Supervised Fine-Tuning (SFT) using PyTorch
	- Loss function customization
	- Gradient accumulation strategies
	- Learning rate schedulers

	3. 🤖 Large Language Models (LLM) using Transformers
	- Attention mechanisms
	- Tokenization strategies
	- Position encodings

	## Training Methods
	4. 📊 Self-Rewarding Learning using NPS 0-10 and Verbatims
	- Custom reward functions
	- Feedback categorization
	- Signal extraction from text

	5. 👍 Reinforcement Learning from Human Feedback (RLHF)
	- Preference datasets
	- PPO implementation
	- KL divergence constraints

	6. 🔗 MergeKit: Merging Models to Same Embedding Space
	- TIES merging
	- Task arithmetic
	- SLERP interpolation

	## Optimization & Deployment
	7. 📏 DistillKit: Model Size Reduction with Spectrum Analysis
	- Knowledge distillation
	- Quantization techniques
	- Model pruning strategies

	8. 🧠 Agentic RAG Agents using Document Inputs
	- Vector database integration
	- Query planning
	- Self-reflection mechanisms

	9. ⏳ Longitudinal Data Summarization from Multiple Docs
	- Multi-document compression
	- Timeline extraction
	- Entity tracking

	## Knowledge Representation
	10. 📑 Knowledge Extraction using Markdown Knowledge Graphs
	- Entity recognition
	- Relationship mapping
	- Hierarchical structuring

	11. 🗺️ Knowledge Mapping with Mermaid Diagrams
	- Flowchart generation
	- Sequence diagram creation
	- State diagrams

	12. 💻 ML Code Generation with Streamlit/Gradio/HTML5+JS
	- Code completion
	- Unit test generation
	- Documentation synthesis
	"""

	# Process multilevel markdown for PDF output
	def markdown_to_pdf_content(markdown_text):
	    """Parse outline-style markdown into two columns of PDF content.

	    Recognised line types (after stripping surrounding whitespace):

	    * ``# Title``    -- ignored; the PDF title is rendered separately.
	    * ``## Section`` -- emitted as the string ``"<b>Section</b>"``.
	    * ``1. Item``    -- starts a numbered item, emitted as
	      ``[item_text, [sub_item, ...]]`` once the item is complete.
	    * ``- Sub``      -- appended to the current numbered item's sub-items.
	    * anything else  -- emitted as a plain string when no numbered item is
	      open; otherwise skipped.

	    Numbered items without any ``- `` sub-items are kept (with an empty
	    sub-item list) instead of being dropped.

	    Args:
	        markdown_text: The markdown source. ``None`` is treated as empty.

	    Returns:
	        A ``(left_column, right_column)`` tuple of lists. The parsed entries
	        are split at the midpoint of the entry count; with an odd count the
	        right column receives the extra entry.
	    """
	    numbered_item = re.compile(r'^\d+\.')
	    pdf_content = []
	    current_item = None
	    sub_items = []
	    in_list_item = False

	    def flush_item():
	        # Emit the open numbered item (even when it has no sub-items) and
	        # reset the accumulator so no state leaks into the next section.
	        nonlocal current_item, sub_items
	        if current_item is not None:
	            pdf_content.append([current_item, sub_items])
	        current_item = None
	        sub_items = []

	    for raw_line in (markdown_text or '').strip().split('\n'):
	        line = raw_line.strip()
	        if not line or line.startswith('# '):
	            continue

	        if line.startswith('## '):
	            flush_item()
	            # Slice off only the leading marker; a later "## " inside the
	            # heading text is part of the heading.
	            pdf_content.append(f"<b>{line[3:].strip()}</b>")
	            in_list_item = False
	        elif numbered_item.match(line):
	            flush_item()
	            current_item = line
	            in_list_item = True
	        elif line.startswith('- ') and in_list_item:
	            sub_items.append(line)
	        elif not in_list_item:
	            pdf_content.append(line)

	    flush_item()

	    mid_point = len(pdf_content) // 2
	    return pdf_content[:mid_point], pdf_content[mid_point:]

	# Main PDF creation with parameterized text sizes
	def create_main_pdf(markdown_text, base_font_size=10, auto_size=False):
	    """Render the markdown outline as a two-column landscape-A4 PDF.

	    The markdown is parsed with ``markdown_to_pdf_content`` and laid out as a
	    two-column ReportLab table under a fixed title paragraph.

	    Args:
	        markdown_text: Outline markdown to render.
	        base_font_size: Point size for numbered items; sub-item, section and
	            title sizes are derived from it. Ignored when ``auto_size`` is
	            true.
	        auto_size: When true, derive the base size from the number of
	            rendered entries, clamped to the 6-12 point range.

	    Returns:
	        The generated PDF document as ``bytes``.
	    """
	    # Local stdlib import so this function does not depend on a file-level
	    # import being added elsewhere.
	    from xml.sax.saxutils import escape

	    margin = 36
	    spacer_height = 10
	    page_width, page_height = A4[1], A4[0]  # swapped -> landscape

	    buffer = io.BytesIO()
	    doc = SimpleDocTemplate(
	        buffer,
	        pagesize=(page_width, page_height),
	        leftMargin=margin,
	        rightMargin=margin,
	        topMargin=margin,
	        bottomMargin=margin,
	    )

	    styles = getSampleStyleSheet()
	    left_column, right_column = markdown_to_pdf_content(markdown_text)

	    # Count every rendered line: each entry plus its sub-items.
	    total_items = sum(
	        1 + len(item[1]) if isinstance(item, list) else 1
	        for column in (left_column, right_column)
	        for item in column
	    )

	    # 🔧 Adjust this multiplier to control autosizing sensitivity
	    if auto_size:
	        # max(..., 1) avoids a ZeroDivisionError on an empty outline.
	        base_font_size = max(6, min(12, 200 / max(total_items, 1)))

	    # 🔧 Font size parameters - tweak these ratios as needed
	    item_font_size = base_font_size
	    subitem_font_size = base_font_size * 0.9
	    section_font_size = base_font_size * 1.2
	    title_font_size = min(16, base_font_size * 1.5)

	    # The stylesheet is created fresh on every call, so adjusting Heading1
	    # in place does not affect other calls.
	    title_style = styles['Heading1']
	    title_style.textColor = colors.darkblue
	    title_style.alignment = 1
	    title_style.fontSize = title_font_size

	    section_style = ParagraphStyle(
	        'SectionStyle',
	        parent=styles['Heading2'],
	        textColor=colors.darkblue,
	        fontSize=section_font_size,
	        leading=section_font_size * 1.2,
	        spaceAfter=2,
	    )
	    item_style = ParagraphStyle(
	        'ItemStyle',
	        parent=styles['Normal'],
	        fontSize=item_font_size,
	        leading=item_font_size * 1.2,
	        fontName='Helvetica-Bold',
	        spaceAfter=1,
	    )
	    subitem_style = ParagraphStyle(
	        'SubItemStyle',
	        parent=styles['Normal'],
	        fontSize=subitem_font_size,
	        leading=subitem_font_size * 1.2,
	        leftIndent=10,
	        spaceAfter=1,
	    )

	    def build_cells(column):
	        # Turn one column of parsed entries into a flat list of Paragraphs.
	        # Text is XML-escaped because Paragraph interprets markup, and the
	        # markdown comes straight from the user's editor.
	        cells = []
	        for item in column:
	            if isinstance(item, list):
	                main_item, sub_items = item
	                cells.append(Paragraph(escape(main_item), item_style))
	                cells.extend(
	                    Paragraph(escape(sub_item), subitem_style)
	                    for sub_item in sub_items
	                )
	            elif item.startswith('<b>'):
	                # Section headings arrive wrapped in <b>...</b> by the
	                # parser; strip only that wrapper before escaping.
	                heading = item[len('<b>'):]
	                if heading.endswith('</b>'):
	                    heading = heading[:-len('</b>')]
	                cells.append(Paragraph(escape(heading), section_style))
	            else:
	                cells.append(Paragraph(escape(item), item_style))
	        return cells

	    story = [
	        Paragraph("Cutting-Edge ML Outline (ReportLab)", title_style),
	        Spacer(1, spacer_height),
	    ]

	    left_cells = build_cells(left_column)
	    right_cells = build_cells(right_column)

	    # Pad the shorter column so both have the same number of rows.
	    max_cells = max(len(left_cells), len(right_cells))
	    left_cells.extend([""] * (max_cells - len(left_cells)))
	    right_cells.extend([""] * (max_cells - len(right_cells)))

	    table_data = list(zip(left_cells, right_cells))

	    # Only build the table when there is at least one row; with an empty
	    # outline the PDF contains just the title.
	    if table_data:
	        col_width = (page_width - 2 * margin) / 2.0
	        table = Table(table_data, colWidths=[col_width, col_width], hAlign='CENTER')
	        table.setStyle(TableStyle([
	            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
	            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
	            ('BACKGROUND', (0, 0), (-1, -1), colors.white),
	            ('GRID', (0, 0), (-1, -1), 0, colors.white),
	            ('LINEAFTER', (0, 0), (0, -1), 0.5, colors.grey),
	            ('LEFTPADDING', (0, 0), (-1, -1), 2),
	            ('RIGHTPADDING', (0, 0), (-1, -1), 2),
	            ('TOPPADDING', (0, 0), (-1, -1), 1),
	            ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
	        ]))
	        story.append(table)

	    doc.build(story)
	    return buffer.getvalue()

	# Function to convert PDF bytes to image using fitz
	def pdf_to_image(pdf_bytes):
	    """Render the first page of a PDF to a PIL image for on-screen preview.

	    Args:
	        pdf_bytes: The PDF document as bytes.

	    Returns:
	        A PIL ``Image`` of page 0 rendered at 2x zoom, or ``None`` if
	        rendering fails (the error is reported via ``st.error``).
	    """
	    doc = None
	    try:
	        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
	        page = doc[0]
	        pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))  # 2x zoom
	        # NOTE(review): "RGB" assumes the pixmap has no alpha channel --
	        # confirm against get_pixmap's defaults.
	        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	    except Exception as e:
	        # Best-effort preview: surface the problem in the UI and let the
	        # caller fall back to offering the download only.
	        st.error(f"Failed to render PDF preview: {e}")
	        return None
	    finally:
	        # Close the document on both the success and failure paths.
	        if doc is not None:
	            doc.close()

	# NOTE(review): block nesting below was reconstructed from a source whose
	# indentation had been flattened; the order of Streamlit calls is unchanged.

	# Sidebar settings (the sidebar starts collapsed): either auto-size the
	# text or let the user pick a base font size.
	with st.sidebar:
	    auto_size = st.checkbox("Auto-size text", value=True)
	    if auto_size:
	        base_font_size = 10
	        st.info("Font size will auto-adjust between 6-12 points based on content length.")
	    else:
	        base_font_size = st.slider("Base Font Size (points)", min_value=6, max_value=16, value=10, step=1)

	# Seed the session with the default outline on the first run so edits
	# persist across reruns.
	if 'markdown_content' not in st.session_state:
	    st.session_state.markdown_content = default_markdown

	# The session value only changes right before st.rerun(), so reading it
	# once per run is sufficient.
	outline_markdown = st.session_state.markdown_content

	# Build the PDF for the current outline.
	with st.spinner("Generating PDF..."):
	    pdf_bytes = create_main_pdf(outline_markdown, base_font_size, auto_size)

	# Full-width preview of the first page, with a hint when rendering failed.
	with st.container():
	    pdf_image = pdf_to_image(pdf_bytes)
	    if not pdf_image:
	        st.info("Download the PDF to view it locally.")
	    else:
	        st.image(pdf_image, use_container_width=True)

	# Offer the generated PDF for download.
	st.download_button(
	    label="Download PDF",
	    data=pdf_bytes,
	    file_name="ml_outline.pdf",
	    mime="application/pdf",
	)

	# Editor pre-filled with the outline currently stored in the session.
	edited_markdown = st.text_area(
	    "Modify the markdown content below:",
	    value=outline_markdown,
	    height=300,
	)

	# Commit the edited text and rerun the script so the PDF is regenerated.
	if st.button("Update PDF"):
	    st.session_state.markdown_content = edited_markdown
	    st.rerun()

	# Offer the stored (last committed) markdown for download.
	st.download_button(
	    label="Save Markdown",
	    data=outline_markdown,
	    file_name="ml_outline.md",
	    mime="text/markdown",
	)