import re
from typing import List, Optional

import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Whitespace that directly follows sentence-ending punctuation.  Splitting on
# the whitespace (rather than on the punctuation itself) keeps tokens such as
# "3.14" or "example.com" in one piece.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")

# Checkpoint used when the caller does not supply a summarizer.
_SUMMARIZATION_MODEL = "sshleifer/distilbart-cnn-12-6"


def fetch_webpage_content(url: str) -> Optional[str]:
    """Fetch the content of a webpage.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text, or ``None`` if the request failed
        (connection error, timeout, or a 4xx/5xx status).
    """
    try:
        # Give up after 10 seconds instead of hanging on an unresponsive host.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    return response.text


def parse_and_segment_content(html_content, max_chunk: int = 500) -> List[str]:
    """Parse and segment HTML content into manageable chunks.

    The text of every ``<h1>`` and ``<p>`` element is concatenated, split into
    sentences, and the sentences are packed greedily into chunks.

    Args:
        html_content: Raw HTML markup; ``None`` or an empty string yields ``[]``.
        max_chunk: Maximum number of whitespace-separated words per chunk.
            A single sentence longer than this is kept whole in its own chunk
            rather than being cut mid-sentence.

    Returns:
        A list of chunk strings, each made of whole sentences joined by
        single spaces.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, "html.parser")
    text = " ".join(element.text for element in soup.find_all(["h1", "p"]))

    print("Doing segmentation")
    chunks: List[List[str]] = []
    for sentence in _SENTENCE_BOUNDARY.split(text):
        words = sentence.split()
        if not words:
            continue
        # Open a new chunk for the first sentence, or when adding this
        # sentence would push the current chunk past the word budget.
        if not chunks or len(chunks[-1]) + len(words) > max_chunk:
            chunks.append([])
        chunks[-1].extend(words)

    return [" ".join(chunk) for chunk in chunks]


def summarize_text(chunks, summarizer=None) -> str:
    """Summarize the given text chunks.

    Args:
        chunks: Text chunks to summarize, e.g. the output of
            ``parse_and_segment_content``.
        summarizer: Optional callable with the interface of a ``transformers``
            summarization pipeline (called with the chunk text and returning
            ``[{"summary_text": ...}]``).  When omitted, a pipeline for the
            default model is created; pass one in to reuse an already-loaded
            model across calls.

    Returns:
        The per-chunk summaries joined by single spaces, or a fixed message
        when ``chunks`` is empty.  A chunk that fails to summarize contributes
        a placeholder sentence instead of aborting the whole run.
    """
    if not chunks:
        return "No content to summarize."

    if summarizer is None:
        summarizer = pipeline("summarization", model=_SUMMARIZATION_MODEL)

    summaries = []
    print("Summarizing content")
    for chunk in chunks:
        try:
            summary = summarizer(
                chunk, max_length=50, min_length=30, do_sample=False
            )
            summaries.append(summary[0]["summary_text"])
        except Exception as e:
            # Deliberately broad: keep the flow even if one chunk fails.
            print(f"Error in summarization: {e}")
            summaries.append("Error summarizing text.")
    return " ".join(summaries)


# Example usage
# url = "https://example.com"
# html_content = fetch_webpage_content(url)
# if html_content:
#     chunks = parse_and_segment_content(html_content)
#     summary = summarize_text(chunks)
#     print(summary)
# else:
#     print("Failed to fetch or parse webpage content.")