import streamlit as st import requests from bs4 import BeautifulSoup import re from llama_index.core.node_parser import MarkdownNodeParser from llama_index.core.schema import Document, MetadataMode import textstat from markdownify import markdownify as md # --- Core Logic Classes --- class WebpageContentProcessor: """ Handles fetching, converting, and parsing webpage content into structured chunks. This class is responsible for the entire content processing pipeline. """ def __init__(self): pass def fetch_and_convert_to_markdown(self, url: str) -> str: """ Fetches HTML content, removes common boilerplate tags from the entire page, and then converts the remaining body content to Markdown using markdownify. """ try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } response = requests.get(url, headers=headers, timeout=15) response.raise_for_status() html_content = response.text soup = BeautifulSoup(html_content, 'html.parser') # Remove common boilerplate and non-content tags from the entire document tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form'] for tag_name in tags_to_remove: for element in soup.find_all(tag_name): element.decompose() # Process the entire remaining body content_container = soup.find('body') if not content_container: return "Error: Could not find the of the webpage." markdown_output = md(str(content_container)) # Post-processing to clean up the resulting Markdown markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output) markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE) return markdown_output.strip() except requests.exceptions.Timeout: return "Error: The request timed out. The server is taking too long to respond." except requests.exceptions.RequestException as e: return f"Error fetching the URL: {e}. Please check the URL and your connection." except Exception as e: return f"An unexpected error occurred during content processing: {e}" def parse_markdown_into_chunks(self, markdown_content: str) -> list: """ Parses Markdown content into logically separated chunks based on its structure. Uses MarkdownNodeParser to respect headers and sections. """ if not markdown_content or "Error" in markdown_content: return [] parser = MarkdownNodeParser(include_metadata=True) doc = Document(text=markdown_content) nodes = parser.get_nodes_from_documents([doc]) structured_chunks = [] for i, node in enumerate(nodes): content = node.get_content(metadata_mode=MetadataMode.NONE).strip() if not content: continue title_match = re.match(r"^(#+)\s*(.*)", content) if title_match: title = title_match.group(2).strip() content_text = content[len(title_match.group(0)):].strip() else: first_line = content.split('\n')[0].strip() title = (first_line[:75] + '...') if len(first_line) > 75 else first_line content_text = content if not title: title = f"[Chunk {i+1}]" structured_chunks.append({ "id": i, "title": title, "content": content_text }) return structured_chunks class ChunkManager: """ Manages the state of chunks, including their content, statistics, and targets. """ def __init__(self): self._chunks = [] self.target_flesch_min = 60 self.target_grade_max = 9 self.target_min_chunk_words = 40 self.target_max_chunk_words = 600 def set_chunks(self, chunks: list): self._chunks = [self._add_stats_to_chunk(chunk) for chunk in chunks] def get_chunks(self) -> list: return self._chunks def _add_stats_to_chunk(self, chunk: dict) -> dict: chunk['stats'] = self._calculate_chunk_stats(chunk['content']) return chunk def _calculate_chunk_stats(self, text: str) -> dict: """Calculates readability and other metrics for a text chunk.""" stats = {} try: stats['word_count'] = textstat.lexicon_count(text, removepunct=True) stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text) stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text) except (Exception, TypeError): stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0}) return stats def format_chunk_stats(self, stats: dict) -> str: """Creates a formatted string of stats with color-coding based on targets.""" flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red" grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red" word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red" return ( f"**Word Count:** {stats.get('word_count', 0)}   |   " f"**Reading Ease:** {stats.get('flesch_reading_ease', 0):.2f}   |   " f"**Grade Level:** {stats.get('flesch_kincaid_grade', 0):.2f}" ) def get_document_summary_stats(self) -> str: """Calculates and formats stats for the entire document.""" if not self._chunks: return "No document loaded." total_words = sum(c['stats']['word_count'] for c in self._chunks) if len(self._chunks) > 0: avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks) avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks) else: avg_ease = avg_grade = 0 return ( f"- **Total Chunks:** {len(self._chunks)}\n" f"- **Total Words:** {total_words}\n" f"- **Avg. Reading Ease:** {avg_ease:.2f}\n" f"- **Avg. Grade Level:** {avg_grade:.2f}" ) def get_chunk_by_id(self, chunk_id: int) -> dict | None: return next((c for c in self._chunks if c["id"] == chunk_id), None) def update_chunk_content(self, chunk_id: int, new_content: str): chunk = self.get_chunk_by_id(chunk_id) if chunk: chunk["content"] = new_content self._add_stats_to_chunk(chunk) if chunk["title"].startswith("["): first_line = new_content.split('\n')[0].strip() new_title = (first_line[:75] + '...') if len(first_line) > 75 else first_line if new_title: chunk["title"] = new_title def delete_chunk(self, chunk_id: int): self._chunks = [c for c in self._chunks if c["id"] != chunk_id] for i, chunk in enumerate(self._chunks): chunk['id'] = i def get_final_markdown(self) -> str: if not self._chunks: return "No content to display." final_doc_parts = [] for c in self._chunks: is_header = re.match(r"^(#+)\s*(.*)", c['title']) if not c['title'].startswith("[") and not is_header: final_doc_parts.append(f"## {c['title']}\n\n{c['content']}") else: final_doc_parts.append(c['content']) return "\n\n---\n\n".join(final_doc_parts) def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int): self.target_flesch_min = flesch_min self.target_grade_max = grade_max self.target_min_chunk_words = min_words self.target_max_chunk_words = max_words self.set_chunks(self.get_chunks()) # --- Streamlit UI Application --- st.set_page_config(layout="wide", page_title="Webpage Content Editor") # --- MODIFIED: Custom CSS to increase sidebar width --- st.markdown( """ """, unsafe_allow_html=True ) def init_session_state(): if 'processor' not in st.session_state: st.session_state.processor = WebpageContentProcessor() if 'manager' not in st.session_state: st.session_state.manager = ChunkManager() if 'selected_chunk_id' not in st.session_state: st.session_state.selected_chunk_id = None if 'status_message' not in st.session_state: st.session_state.status_message = "" init_session_state() processor = st.session_state.processor manager = st.session_state.manager with st.sidebar: # --- MODIFIED: Removed the st.image line for the logo --- st.title("Settings & Overview") with st.expander("About this App & AI Writing Guidelines", expanded=True): st.info( """ This app helps you refine web content for AI synthesis by chunking it into logical, verifiable blocks. **Writing for AI Verifiability:** * **Structure with Headers:** Use H1, H2, H3 tags logically. * **Write for Clarity:** Use short, direct sentences. State facts explicitly. * **Create Verifiable Blocks:** Format content as definitions, Q&As, or step-by-step guides. * **Use the Editor's Metrics:** Aim for a **Reading Ease > 60** and a **Word Count** between 40-600 per chunk. The colors will guide you. """, icon="💡" ) st.subheader("📊 Document Overview") st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True) st.subheader("🎯 Content Targets") with st.form("targets_form"): st.write("Set readability targets to guide your editing. Colors in the editor will reflect these targets.") c1, c2 = st.columns(2) f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min), help="Measures readability. Higher scores mean the text is easier to read. Scores of 60-70 are considered plain English.") g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max), help="Estimates the U.S. school grade level needed to understand the text. A score of 8.0 means an eighth grader can read it. Lower scores are easier to read.") w_min = c1.number_input("Min Chunk Words", value=int(manager.target_min_chunk_words)) w_max = c2.number_input("Max Chunk Words", value=int(manager.target_max_chunk_words)) if st.form_submit_button("Set New Targets", use_container_width=True): manager.set_targets(f_min, g_max, w_min, w_max) st.session_state.status_message = "Content targets have been updated." st.rerun() st.subheader("📋 Final Compiled Document") st.text_area("Final Markdown Output", manager.get_final_markdown(), height=300, key="final_markdown") # --- Main Page Layout --- st.title("📝 Content Chunk Editor") st.caption("Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's [work on content chunking](https://wordlift.io/blog/en/googles-ai-mode-product-pages/).
A tool to fetch, chunk, and refine web content for AI synthesis. Best experienced on desktop.", unsafe_allow_html=True) url_input = st.text_input("Enter a webpage URL to start", key="url_input") with st.expander("⚠️ Important Information", expanded=False): st.warning( """ **Early Draft:** This is an early version of the application. You may encounter bugs or incomplete features. """, icon="🛠️" ) st.warning( """ **Restrictive Bot Policy:** This tool fetches content using automated requests. If a target website blocks bots, the app may time out or fail to retrieve content. """, icon="🤖" ) if st.button("Process URL", use_container_width=True, type="primary"): if url_input: with st.spinner("Fetching and chunking content..."): markdown = processor.fetch_and_convert_to_markdown(url_input) if "Error" in markdown: st.session_state.status_message = markdown manager.set_chunks([]) st.session_state.selected_chunk_id = None else: chunks = processor.parse_markdown_into_chunks(markdown) manager.set_chunks(chunks) if chunks: st.session_state.status_message = f"Successfully processed {len(chunks)} chunks." st.session_state.selected_chunk_id = chunks[0]['id'] else: st.session_state.status_message = "Could not extract any content chunks." st.session_state.selected_chunk_id = None st.rerun() if st.session_state.status_message: st.toast(st.session_state.status_message) st.session_state.status_message = "" chunks = manager.get_chunks() if not chunks: st.write("Process a URL to begin editing content chunks, or adjust settings in the sidebar.") with st.expander("Chunking Strategy Examples"): st.write("See how different websites structure their content, affecting chunking quality.") st.error("**Bad Chunking Example (Few Structural Headers)**") st.markdown(""" * [Wikipedia: Markdown](https://en.wikipedia.org/wiki/Markdown) """) st.success("**Good Chunking Examples (Clear, Hierarchical Headers)**") st.markdown(""" * [The Blog Starter](https://www.theblogstarter.com/) * [Google Safety Blog](https://blog.google/technology/safety-security/google-survey-digital-security-2025/) * [HubSpot: What is a Blog?](https://blog.hubspot.com/marketing/what-is-a-blog) """) else: chunk_ids = [c['id'] for c in chunks] if st.session_state.selected_chunk_id not in chunk_ids: st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None if st.session_state.selected_chunk_id is not None: chunk_options = {c['id']: c['title'] for c in chunks} selected_id = st.selectbox( "Select a chunk to edit", options=chunk_ids, format_func=lambda x: f"Chunk {x}: {chunk_options.get(x, 'N/A')}", index=chunk_ids.index(st.session_state.selected_chunk_id) ) if selected_id != st.session_state.selected_chunk_id: st.session_state.selected_chunk_id = selected_id st.rerun() selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id) if selected_chunk: editor_col, preview_col = st.columns(2) with editor_col: st.markdown(f"**Editing: {selected_chunk['title']}**") st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True) edited_content = st.text_area( "Chunk Content (Markdown)", value=selected_chunk['content'], height=400, key=f"editor_{selected_chunk['id']}" ) b_col1, b_col2, _ = st.columns([1, 1, 3]) if b_col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"): manager.update_chunk_content(selected_chunk['id'], edited_content) st.session_state.status_message = "Chunk updated successfully!" st.rerun() if b_col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"): manager.delete_chunk(selected_chunk['id']) st.session_state.status_message = "Chunk deleted." remaining_chunks = manager.get_chunks() st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None st.rerun() with preview_col: st.markdown("**Live Preview**") with st.container(height=525, border=True): st.markdown(edited_content, unsafe_allow_html=True)