Spaces:

Makima57
/

query-app

Sleeping

App Files Files Community

query-app / app.py

Makima57

Update app.py

05f7c2a verified 10 months ago

raw

history blame contribute delete

3.53 kB

	import streamlit as st
	from googlesearch import search
	import requests
	from bs4 import BeautifulSoup
	import chunk # Importing the chunk module

	# Function to perform Google search and return the first two links
	def google_search(query):
	    """Return up to the first two result links for *query*.

	    Args:
	        query: Search string handed to ``search``.

	    Returns:
	        A list holding at most two links. The list is empty when the
	        search produced nothing, so callers can test it for truthiness.
	        ``None`` is returned when the search raised; the error has then
	        already been displayed with ``st.error``.
	    """
	    from itertools import islice

	    try:
	        search_results = search(query, num_results=2)
	        # islice works on any iterable and simply stops when the results
	        # run out, so the list is never padded with None placeholders.
	        return [link for link in islice(search_results, 2) if link]
	    except Exception as e:
	        # UI boundary: surface the failure in the page instead of crashing.
	        st.error(f"An error occurred: {e}")
	        return None

	# Function to fetch webpage content
	def fetch_webpage_content(url, timeout=10):
	    """Download *url* and return the response body as text.

	    Args:
	        url: Address to request with ``requests.get``.
	        timeout: Value forwarded to the ``timeout`` argument of
	            ``requests.get``. Without it a server that never answers
	            would block the app indefinitely.

	    Returns:
	        ``response.text`` on success, or ``None`` when the request raised
	        or the server answered with an error status; the failure has then
	        already been displayed with ``st.error``.
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()  # Turn 4xx/5xx answers into exceptions
	        return response.text
	    except Exception as e:
	        # UI boundary: surface the failure in the page instead of crashing.
	        st.error(f"Failed to fetch the webpage content: {e}")
	        return None

	# Function to scrape text from webpage content using BeautifulSoup
	def scrape_text(webpage_content):
	    """Extract the visible text of an HTML document.

	    ``<script>`` and ``<style>`` elements are removed before the text is
	    read. Each line of the text is stripped and split on single spaces;
	    the non-empty pieces are joined with newlines.

	    Args:
	        webpage_content: HTML markup to parse with ``html.parser``.

	    Returns:
	        The cleaned text, or ``None`` when parsing or cleaning raised; the
	        failure has then already been displayed with ``st.error``.
	    """
	    try:
	        document = BeautifulSoup(webpage_content, 'html.parser')
	        for unwanted in document.find_all(["script", "style"]):
	            unwanted.decompose()
	        pieces = []
	        for raw_line in document.get_text().splitlines():
	            # NOTE(review): splitting on a single space puts every word on
	            # its own output line; the common form of this recipe splits on
	            # two spaces. Confirm which one is intended.
	            for fragment in raw_line.strip().split(" "):
	                cleaned = fragment.strip()
	                if cleaned:
	                    pieces.append(cleaned)
	        return '\n'.join(pieces)
	    except Exception as e:
	        st.error(f"Failed to scrape text from webpage content: {e}")
	        return None

	# Streamlit app UI
	st.title("Search and Chunk Webpage Content")

	# Input field for search query
	query = st.text_input("Enter search query", "")

	# Button to trigger search
	if st.button("Search"):
	    if not query:
	        st.error("Please enter a query")
	    else:
	        # google_search may return None (it reports its own error) or a
	        # list containing None placeholders; keep only usable links so a
	        # missing result is neither displayed nor fetched.
	        first_two_links = [link for link in (google_search(query) or []) if link]
	        if not first_two_links:
	            st.warning("No results found")

	        for i, link in enumerate(first_two_links, 1):
	            st.success(f"Link {i}: [Click here]({link})")  # Display link

	            # Fetch webpage content; fetch errors are reported inside
	            # fetch_webpage_content, so there is nothing more to show here.
	            webpage_content = fetch_webpage_content(link)
	            if not webpage_content:
	                continue

	            # Scrape text from webpage content
	            scraped_text = scrape_text(webpage_content)
	            if not scraped_text:
	                st.warning("No content scraped from this link")
	                continue

	            st.write(f"Scraped Content for Link {i}:")
	            st.text(scraped_text[:500])  # Preview: first 500 characters only

	            # Chunk the scraped text using the project's chunk module
	            # (presumably returns an iterable of chunks; verify in chunk.py).
	            chunked_text = chunk.chunk_text(scraped_text)
	            if not chunked_text:
	                st.warning("No chunked data available")
	                continue

	            st.write(f"Chunked Data for Link {i}:")
	            for chunk_part in chunked_text:
	                st.write(chunk_part)  # Display each chunk

	            # Save and download chunked data once per link, after all of
	            # its chunks have been displayed.
	            chunk.save_and_download_chunked_data(chunked_text, file_name=f"chunked_data_link_{i}.txt")