import streamlit as st
from googlesearch import search
import requests
from bs4 import BeautifulSoup

import chunk  # Local helper module (chunk.py) providing chunk_text and save_and_download_chunked_data


# Perform a Google search and return the first two result links
def google_search(query):
    try:
        search_results = search(query, num_results=2)  # Generator of result URLs
        # Pull the first two results and drop any missing entries
        first_two_links = [
            link
            for link in (next(search_results, None), next(search_results, None))
            if link
        ]
        return first_two_links
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None


# Fetch the raw HTML of a webpage
def fetch_webpage_content(url):
    try:
        response = requests.get(url, timeout=10)  # Timeout so a slow site cannot hang the app
        response.raise_for_status()  # Raise on HTTP error status codes
        return response.text
    except Exception as e:
        st.error(f"Failed to fetch the webpage content: {e}")
        return None


# Extract readable text from webpage content using BeautifulSoup
def scrape_text(webpage_content):
    try:
        soup = BeautifulSoup(webpage_content, "html.parser")
        for script in soup(["script", "style"]):
            script.decompose()  # Remove non-content elements
        text = soup.get_text()  # Get raw text
        lines = (line.strip() for line in text.splitlines())  # Strip each line
        # Split on double spaces so multi-word phrases stay together
        phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = "\n".join(phrase for phrase in phrases if phrase)  # Drop empty fragments
        return text
    except Exception as e:
        st.error(f"Failed to scrape text from webpage content: {e}")
        return None


# Streamlit app UI
st.title("Search and Chunk Webpage Content")

# Input field for search query
query = st.text_input("Enter search query", "")

# Button to trigger search
if st.button("Search"):
    if query:
        first_two_links = google_search(query)  # Get first two links
        if first_two_links:
            for i, link in enumerate(first_two_links, 1):
                st.success(f"Link {i}: [Click here]({link})")  # Display the link

                # Fetch webpage content
                webpage_content = fetch_webpage_content(link)
                if webpage_content:
                    # Scrape text from webpage content
                    scraped_text = scrape_text(webpage_content)
                    if scraped_text:  # Ensure scraped_text is not empty
                        st.write(f"Scraped Content for Link {i}:")
                        st.text(scraped_text[:500])  # Preview the first 500 characters

                        # Chunk the scraped text using chunk.py
                        chunked_text = chunk.chunk_text(scraped_text)
                        if chunked_text:  # Ensure chunked_text is not empty
                            st.write(f"Chunked Data for Link {i}:")
                            for chunk_part in chunked_text:
                                st.write(chunk_part)  # Display each chunk

                            # Save and download chunked data using chunk.py
                            chunk.save_and_download_chunked_data(
                                chunked_text, file_name=f"chunked_data_link_{i}.txt"
                            )
                        else:
                            st.warning("No chunked data available")
                    else:
                        st.warning("No content scraped from this link")
        else:
            st.warning("No results found")
    else:
        st.error("Please enter a query")
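

# ======================================================================
# chunk.py — reference sketch (assumption). The script above imports a
# local chunk module whose contents are not shown here. This is a
# minimal, hypothetical implementation consistent with how it is called:
# chunk_text splits text into fixed-size word chunks, and
# save_and_download_chunked_data writes the chunks to a file and offers
# a Streamlit download button. The real chunk.py may differ; adjust the
# chunk_size default and file handling to match your actual module.
# ======================================================================

import streamlit as st


def chunk_text(text, chunk_size=200):
    """Split text into chunks of roughly chunk_size words each."""
    words = text.split()
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]


def save_and_download_chunked_data(chunks, file_name="chunked_data.txt"):
    """Save chunks to disk and expose them via a Streamlit download button."""
    data = "\n\n".join(chunks)
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(data)
    # Distinct file_name values keep the auto-generated widget keys unique
    # when this is called once per link inside the loop above.
    st.download_button(
        label=f"Download {file_name}",
        data=data,
        file_name=file_name,
        mime="text/plain",
    )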