# query-app / app.py
# Makima57's picture
# Update app.py
# 05f7c2a verified
import streamlit as st
from googlesearch import search
import requests
from bs4 import BeautifulSoup
import chunk # Importing the chunk module
# Function to perform Google search and return the first two links
def google_search(query):
    """Run a Google search for *query* and return up to two result URLs.

    Returns a list of at most two URL strings (possibly fewer if the search
    produced fewer results), or None if the search raised.
    """
    try:
        # Materialize the results and slice: the previous paired
        # next(search_results, None) calls could return [None, None]
        # (later rendered as "Link 1: None") and would raise TypeError
        # if search() ever returned a list instead of a generator.
        search_results = search(query, num_results=2)
        return list(search_results)[:2]
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return None
# Function to fetch webpage content
def fetch_webpage_content(url):
    """GET *url* and return the response body as text, or None on failure.

    Errors are reported to the Streamlit UI rather than raised.
    """
    try:
        # Explicit timeout: requests.get() has none by default, so a
        # stalled server would otherwise hang the Streamlit script forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise for 4xx/5xx status codes
        return response.text
    except Exception as e:
        st.error(f"Failed to fetch the webpage content: {e}")
        return None
# Function to scrape text from webpage content using BeautifulSoup
def scrape_text(webpage_content):
    """Extract the readable text from an HTML document.

    Removes <script>/<style> elements, then normalizes the remaining text
    to one non-empty phrase per line. Returns None if parsing fails.
    """
    try:
        soup = BeautifulSoup(webpage_content, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()  # Drop non-visible script/style content
        text = soup.get_text()  # Raw text, original whitespace preserved
        lines = (line.strip() for line in text.splitlines())  # Strip each line
        # Split on double spaces (layout gaps), not single spaces: the old
        # line.split(" ") put every individual word on its own output line.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # 'piece' (not 'chunk') avoids shadowing the imported chunk module.
        text = '\n'.join(piece for piece in chunks if piece)
        return text
    except Exception as e:
        st.error(f"Failed to scrape text from webpage content: {e}")
        return None
# --- Streamlit app UI ---
# Flat script: Streamlit re-runs this file top-to-bottom on every interaction.
st.title("Search and Chunk Webpage Content")

# Input field for search query (empty default).
query = st.text_input("Enter search query", "")

# Button to trigger search; the body runs only on the rerun where it was clicked.
if st.button("Search"):
    if query:
        first_two_links = google_search(query)  # Up to two result URLs, or None on error
        if first_two_links:
            for i, link in enumerate(first_two_links, 1):
                st.success(f"Link {i}: [Click here]({link})")  # Display link as markdown
                # Fetch raw HTML for this result; None on request failure.
                webpage_content = fetch_webpage_content(link)
                if webpage_content:
                    # Extract visible text from the fetched HTML.
                    scraped_text = scrape_text(webpage_content)
                    if scraped_text:  # Ensure scraped_text is not empty
                        st.write(f"Scraped Content for Link {i}:")
                        st.text(scraped_text[:500])  # Preview only the first 500 characters
                        # Chunk the scraped text using the local chunk.py helper.
                        # NOTE(review): chunk.chunk_text's contract isn't visible here —
                        # assumed to return an iterable of text chunks; confirm in chunk.py.
                        chunked_text = chunk.chunk_text(scraped_text)
                        if chunked_text:  # Ensure chunked_text is not empty
                            st.write(f"Chunked Data for Link {i}:")
                            for chunk_part in chunked_text:
                                st.write(chunk_part)  # Display each chunk
                            # Offer the chunked data as a downloadable file (per-link name).
                            chunk.save_and_download_chunked_data(chunked_text, file_name=f"chunked_data_link_{i}.txt")
                        else:
                            st.warning("No chunked data available")
                    else:
                        st.warning("No content scraped from this link")
        else:
            st.warning("No results found")
    else:
        st.error("Please enter a query")