|
from itertools import islice

import requests
import streamlit as st
from bs4 import BeautifulSoup
from googlesearch import search

# NOTE(review): presumably the project-local chunk.py (this file calls
# chunk.chunk_text / chunk.save_and_download_chunked_data) - confirm.
import chunk
|
|
|
|
|
def google_search(query):
    """Return up to the first two result links for ``query``.

    Calls ``search(query, num_results=2)`` and keeps at most two non-empty
    results.

    Args:
        query: Search string handed to ``search``.

    Returns:
        A list holding zero, one or two links. The list is empty when the
        search yields nothing, so callers can test it for truthiness.
        ``None`` is returned when the search raises; the error is shown
        with ``st.error``.
    """
    try:
        # islice accepts any iterable and stops after two items, so this
        # does not depend on ``search`` returning an iterator and never
        # pads the result with None when fewer than two links exist.
        top_results = islice(search(query, num_results=2), 2)
        # Drop empty entries so a result-less search gives a falsy list.
        return [link for link in top_results if link]
    except Exception as e:  # UI boundary: report instead of crashing the app
        st.error(f"An error occurred: {e}")
        return None
|
|
|
|
|
def fetch_webpage_content(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address requested with ``requests.get``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a server that never answers would block the app
            indefinitely.

    Returns:
        ``response.text`` on success. ``None`` when the request raises or
        the server answers with an error status; the failure is shown with
        ``st.error``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are reported below.
        response.raise_for_status()
        return response.text
    except Exception as e:  # UI boundary: report instead of crashing the app
        st.error(f"Failed to fetch the webpage content: {e}")
        return None
|
|
|
|
|
def scrape_text(webpage_content):
    """Extract the visible text of an HTML document.

    ``<script>`` and ``<style>`` elements are removed before the text is
    read. Every line is stripped and split on single spaces, and the
    non-empty pieces are joined with newlines.

    Args:
        webpage_content: HTML markup to parse with ``html.parser``.

    Returns:
        The cleaned text, or ``None`` if parsing or cleaning raises (the
        error is shown with ``st.error``).
    """
    try:
        document = BeautifulSoup(webpage_content, 'html.parser')

        # Script and style bodies are not page text; remove them entirely.
        for tag in document.find_all(["script", "style"]):
            tag.decompose()

        pieces = []
        for raw_line in document.get_text().splitlines():
            # NOTE(review): splitting on one space puts every word on its
            # own output line - confirm this is intended rather than a
            # two-space split.
            for fragment in raw_line.strip().split(" "):
                fragment = fragment.strip()
                if fragment:
                    pieces.append(fragment)

        return '\n'.join(pieces)
    except Exception as e:
        st.error(f"Failed to scrape text from webpage content: {e}")
        return None
|
|
|
|
|
# Streamlit page: search for a query, then fetch, scrape and chunk the text
# of each result link.
st.title("Search and Chunk Webpage Content")

query = st.text_input("Enter search query", "")

if st.button("Search"):
    if not query:
        st.error("Please enter a query")
    else:
        first_two_links = google_search(query)
        # google_search may return None (on error) or a list containing
        # empty entries; keep only usable links so nothing empty is
        # rendered or fetched and the "no results" warning is reachable.
        usable_links = [link for link in (first_two_links or []) if link]

        if not usable_links:
            st.warning("No results found")

        for i, link in enumerate(usable_links, 1):
            st.success(f"Link {i}: [Click here]({link})")

            webpage_content = fetch_webpage_content(link)
            if not webpage_content:
                # Nothing to scrape for this link; move on to the next one.
                continue

            scraped_text = scrape_text(webpage_content)
            if not scraped_text:
                st.warning("No content scraped from this link")
                continue

            st.write(f"Scraped Content for Link {i}:")
            # Show only the first 500 characters as a preview.
            st.text(scraped_text[:500])

            chunked_text = chunk.chunk_text(scraped_text)
            if not chunked_text:
                st.warning("No chunked data available")
                continue

            st.write(f"Chunked Data for Link {i}:")
            for chunk_part in chunked_text:
                st.write(chunk_part)

            chunk.save_and_download_chunked_data(
                chunked_text, file_name=f"chunked_data_link_{i}.txt"
            )
|
|
|
|
|
|
|
|