import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.schema import Document, MetadataMode
import textstat
from markdownify import markdownify as md


# --- Core Logic Classes ---

class WebpageContentProcessor:
    """
    Handles fetching, converting, and parsing webpage content into structured chunks.
    This class is responsible for the entire content processing pipeline.
    """

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetches HTML content, removes common boilerplate tags from the entire page,
        and then converts the remaining body content to Markdown using markdownify.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            html_content = response.text

            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove common boilerplate and non-content tags from the entire document
            tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form']
            for tag_name in tags_to_remove:
                for element in soup.find_all(tag_name):
                    element.decompose()

            # Process the entire remaining body
            content_container = soup.find('body')
            if not content_container:
                return "Error: Could not find the <body> of the webpage."

            markdown_output = md(str(content_container))

            # Post-processing to clean up the resulting Markdown:
            # collapse runs of three or more newlines, then drop empty list
            # bullets left behind by the HTML-to-Markdown conversion.
            markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
            markdown_output = re.sub(
                r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n',
                markdown_output, flags=re.MULTILINE
            )

            return markdown_output.strip()

        except requests.exceptions.Timeout:
            return "Error: The request timed out. The server is taking too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching the URL: {e}. Please check the URL and your connection."
        except Exception as e:
            return f"Error: An unexpected error occurred during content processing: {e}"

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Parses Markdown content into logically separated chunks based on its structure.
        Uses MarkdownNodeParser to respect headers and sections.
        """
        # All failure messages from fetch_and_convert_to_markdown start with
        # "Error", so a prefix check avoids false positives on pages that
        # merely contain the word.
        if not markdown_content or markdown_content.startswith("Error"):
            return []

        parser = MarkdownNodeParser(include_metadata=True)
        doc = Document(text=markdown_content)
        nodes = parser.get_nodes_from_documents([doc])

        structured_chunks = []
        for i, node in enumerate(nodes):
            content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            if not content:
                continue

            # Use a leading Markdown header as the chunk title when present;
            # otherwise fall back to a truncated first line.
            title_match = re.match(r"^(#+)\s*(.*)", content)
            if title_match:
                title = title_match.group(2).strip()
                content_text = content[len(title_match.group(0)):].strip()
            else:
                first_line = content.split('\n')[0].strip()
                title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
                content_text = content

            if not title:
                title = f"[Chunk {i + 1}]"

            structured_chunks.append({
                "id": i,
                "title": title,
                "content": content_text
            })
        return structured_chunks
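
# Illustrative sketch (never called by the app): how the two processing steps
# fit together outside the Streamlit UI. The URL is a placeholder.
def _demo_processing_pipeline():
    processor = WebpageContentProcessor()
    markdown = processor.fetch_and_convert_to_markdown("https://example.com/article")
    for chunk in processor.parse_markdown_into_chunks(markdown):
        print(chunk["id"], chunk["title"], chunk["content"][:60])
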
class ChunkManager:
    """
    Manages the state of chunks, including their content, statistics, and targets.
    """

    def __init__(self):
        self._chunks = []
        self.target_flesch_min = 60
        self.target_grade_max = 9
        self.target_min_chunk_words = 40
        self.target_max_chunk_words = 600

    def set_chunks(self, chunks: list):
        self._chunks = [self._add_stats_to_chunk(chunk) for chunk in chunks]

    def get_chunks(self) -> list:
        return self._chunks

    def _add_stats_to_chunk(self, chunk: dict) -> dict:
        chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
        return chunk

    def _calculate_chunk_stats(self, text: str) -> dict:
        """Calculates readability and other metrics for a text chunk."""
        stats = {}
        try:
            stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
            stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
            stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
        except Exception:
            stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
        return stats

    def format_chunk_stats(self, stats: dict) -> str:
        """Creates a formatted string of stats with color-coding based on targets."""
        flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
        grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
        word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
        # Streamlit markdown supports :color[...] syntax for colored text, so
        # each metric is wrapped in its on/off-target color.
        return (
            f":{word_color}[**Word Count:** {stats.get('word_count', 0)}] | "
            f":{flesch_color}[**Reading Ease:** {stats.get('flesch_reading_ease', 0):.2f}] | "
            f":{grade_color}[**Grade Level:** {stats.get('flesch_kincaid_grade', 0):.2f}]"
        )

    def get_document_summary_stats(self) -> str:
        """Calculates and formats stats for the entire document."""
        if not self._chunks:
            return "No document loaded."

        total_words = sum(c['stats']['word_count'] for c in self._chunks)
        avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks)
        avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks)

        return (
            f"- **Total Chunks:** {len(self._chunks)}\n"
            f"- **Total Words:** {total_words}\n"
            f"- **Avg. Reading Ease:** {avg_ease:.2f}\n"
            f"- **Avg. Grade Level:** {avg_grade:.2f}"
        )

    def get_chunk_by_id(self, chunk_id: int) -> dict | None:
        return next((c for c in self._chunks if c["id"] == chunk_id), None)

    def update_chunk_content(self, chunk_id: int, new_content: str):
        chunk = self.get_chunk_by_id(chunk_id)
        if chunk:
            chunk["content"] = new_content
            self._add_stats_to_chunk(chunk)
            # Refresh placeholder titles ("[Chunk n]") from the new first line
            if chunk["title"].startswith("["):
                first_line = new_content.split('\n')[0].strip()
                new_title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
                if new_title:
                    chunk["title"] = new_title

    def delete_chunk(self, chunk_id: int):
        self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
        # Re-index the remaining chunks so IDs stay contiguous
        for i, chunk in enumerate(self._chunks):
            chunk['id'] = i

    def get_final_markdown(self) -> str:
        if not self._chunks:
            return "No content to display."

        final_doc_parts = []
        for c in self._chunks:
            is_header = re.match(r"^(#+)\s*(.*)", c['title'])
            # Promote real titles to H2 headers; placeholder titles and titles
            # that are already Markdown headers pass the content through as-is.
            if not c['title'].startswith("[") and not is_header:
                final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
            else:
                final_doc_parts.append(c['content'])
        return "\n\n---\n\n".join(final_doc_parts)

    def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
        self.target_flesch_min = flesch_min
        self.target_grade_max = grade_max
        self.target_min_chunk_words = min_words
        self.target_max_chunk_words = max_words
        # Re-run stats so color-coding reflects the new targets
        self.set_chunks(self.get_chunks())
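
# Illustrative sketch (never called by the app): how ChunkManager computes
# stats and applies color targets. The chunk content here is hypothetical.
def _demo_chunk_manager():
    manager = ChunkManager()
    manager.set_chunks([
        {"id": 0, "title": "Intro", "content": "Short sentences read well. Keep them plain."}
    ])
    manager.set_targets(flesch_min=60, grade_max=9, min_words=1, max_words=600)
    chunk = manager.get_chunk_by_id(0)
    # Prints each metric wrapped in Streamlit's :green[...] / :red[...] syntax
    print(manager.format_chunk_stats(chunk["stats"]))
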
# --- Streamlit UI Application ---

st.set_page_config(layout="wide", page_title="Webpage Content Editor")

# --- MODIFIED: Custom CSS to increase sidebar width ---
# The data-testid selector is Streamlit's hook for the sidebar container;
# adjust the pixel width to taste.
st.markdown(
    """
    <style>
    [data-testid="stSidebar"] {
        min-width: 450px;
        max-width: 450px;
    }
    </style>
    """,
    unsafe_allow_html=True
)


def init_session_state():
    if 'processor' not in st.session_state:
        st.session_state.processor = WebpageContentProcessor()
    if 'manager' not in st.session_state:
        st.session_state.manager = ChunkManager()
    if 'selected_chunk_id' not in st.session_state:
        st.session_state.selected_chunk_id = None
    if 'status_message' not in st.session_state:
        st.session_state.status_message = ""


init_session_state()
processor = st.session_state.processor
manager = st.session_state.manager

with st.sidebar:
    # --- MODIFIED: Removed the st.image line for the logo ---
    st.title("Settings & Overview")

    with st.expander("About this App & AI Writing Guidelines", expanded=True):
        st.info(
            """
            This app helps you refine web content for AI synthesis by chunking it
            into logical, verifiable blocks.

            **Writing for AI Verifiability:**
            * **Structure with Headers:** Use H1, H2, H3 tags logically.
            * **Write for Clarity:** Use short, direct sentences. State facts explicitly.
            * **Create Verifiable Blocks:** Format content as definitions, Q&As, or step-by-step guides.
            * **Use the Editor's Metrics:** Aim for a **Reading Ease > 60** and a
              **Word Count** between 40 and 600 per chunk. The colors will guide you.
            """,
            icon="💡"
        )

    st.subheader("📊 Document Overview")
    st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
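
    # Background for the targets below: textstat computes the two standard
    # readability formulas used throughout this app.
    #   Flesch Reading Ease  = 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words)
    #   Flesch-Kincaid Grade = 0.39*(words/sentences) + 11.8*(syllables/words) - 15.59
    # Higher Reading Ease means easier text; a lower Grade means easier text.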
    st.subheader("🎯 Content Targets")
    with st.form("targets_form"):
        st.write("Set readability targets to guide your editing. Colors in the editor will reflect these targets.")
        c1, c2 = st.columns(2)
        f_min = c1.number_input(
            "Min Flesch Reading Ease", value=float(manager.target_flesch_min),
            help="Measures readability. Higher scores mean the text is easier to read. Scores of 60-70 are considered plain English."
        )
        g_max = c2.number_input(
            "Max Flesch-Kincaid Grade", value=float(manager.target_grade_max),
            help="Estimates the U.S. school grade level needed to understand the text. A score of 8.0 means an eighth grader can read it. Lower scores are easier to read."
        )
        w_min = c1.number_input("Min Chunk Words", value=int(manager.target_min_chunk_words))
        w_max = c2.number_input("Max Chunk Words", value=int(manager.target_max_chunk_words))
        if st.form_submit_button("Set New Targets", use_container_width=True):
            manager.set_targets(f_min, g_max, w_min, w_max)
            st.session_state.status_message = "Content targets have been updated."
            st.rerun()

    st.subheader("📋 Final Compiled Document")
    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=300, key="final_markdown")

# --- Main Page Layout ---
st.title("📝 Content Chunk Editor")
st.caption("Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's [work on content chunking](https://wordlift.io/blog/en/googles-ai-mode-product-pages/).")
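
# Shape of the compiled output produced by ChunkManager.get_final_markdown()
# (hypothetical chunk titles and content; chunks are joined by horizontal rules):
#
#   ## First Section Title
#
#   First section content...
#
#   ---
#
#   Content of a chunk whose title is a placeholder or already a header,
#   emitted as-is.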