# Spaces: Running — Hugging Face Spaces status banner captured when this
# file was exported; not part of the program.
import re

import requests
import streamlit as st
from bs4 import BeautifulSoup
from googletrans import Translator
from langdetect import detect
from requests.sessions import Session
def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
    """Fetch *url*, extract its visible text, and translate it to English.

    Parameters
    ----------
    url : str
        Page to scrape.
    query_selector : str, optional
        CSS selector; when given, only the matching elements' text is used.
    email, password, login_url : str, optional
        When all three are given, a login POST is sent to *login_url* first
        so the subsequent fetch of *url* is authenticated.

    Returns
    -------
    str or None
        The cleaned, sentence-wise translated text, or ``None`` on failure
        (an error message is shown in the Streamlit UI).
    """
    try:
        session = Session()
        # Authenticate first if credentials were supplied; the session
        # keeps any cookies the login sets.
        if email and password and login_url:
            login_data = {
                'email': email,
                'password': password,
                # Include other necessary fields as required by the website
            }
            login_response = session.post(login_url, data=login_data)
            login_response.raise_for_status()
        # BUG FIX: the original only fetched `url` in the no-auth branch, so
        # with credentials it parsed the *login response* and never visited
        # the target page. Always GET the requested URL here.
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Strip non-visible / boilerplate tags. BUG FIX: "header" is no
        # longer stripped here — the original removed it first, which made
        # the soup.find("header") fallback below dead code.
        for tag in soup(["script", "style", "meta", "link", "noscript",
                         "footer", "aside", "nav", "img"]):
            tag.extract()
        if query_selector:
            # Restrict extraction to the user-supplied CSS selector.
            elements = soup.select(query_selector)
            text_content = " ".join(element.get_text() for element in elements)
        else:
            # Default: page header text followed by all paragraph text.
            header = soup.find("header")
            header_text = header.get_text() if header else ""
            paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))
            text_content = f"{header_text}\n\n{paragraph_text}"
        # Collapse all runs of whitespace to single spaces.
        visible_text = re.sub(r'\s+', ' ', text_content).strip()
        # Translate sentence-by-sentence so mixed-language pages work.
        translator = Translator()
        sentences = re.split(r'(?<=[.!?]) +', visible_text)
        translated_sentences = []
        for sentence in sentences:
            try:
                if detect(sentence) != 'en':
                    translated_sentences.append(
                        translator.translate(sentence, dest='en').text)
                else:
                    translated_sentences.append(sentence)
            except Exception:
                # langdetect raises on empty/ambiguous input and translation
                # can fail on network errors — keep the original sentence.
                translated_sentences.append(sentence)
        return ' '.join(translated_sentences)
    except Exception as e:
        # Top-level boundary: surface the failure in the UI, signal with None.
        st.error(f"Error occurred while scraping the data: {e}")
        return None
def main():
    """Render the Streamlit form and run the scraper when requested."""
    st.title("Web Data Scraper")
    # NOTE(review): the emoji in the labels below look mojibake-garbled
    # ("πβοΈ", "π§") — preserved byte-for-byte; confirm intended glyphs.
    url_input = st.text_input("Enter the URL πβοΈ:", "")
    query_selector = st.text_input("Enter a query selector (optional):", "")
    email = st.text_input("Email (if authentication required):", "")
    password = st.text_input("Password (if authentication required):", "", type="password")
    login_url = st.text_input("Enter the login URL (if authentication required):", "")
    # Guard clauses replace the original nested if/else pyramid.
    if not st.button("Load Data π§"):
        return
    if not url_input:
        st.warning("Please enter a valid URL.")
        return
    # Empty strings mean "not provided" — pass None so the scraper skips them.
    scraped = scrape_visible_text_from_url(
        url=url_input,
        query_selector=query_selector or None,
        email=email or None,
        password=password or None,
        login_url=login_url or None,
    )
    if scraped:
        st.success("Data text successfully scraped!")
        st.subheader("Scraped Text:")
        st.write(scraped)
    else:
        st.warning("Failed to load data from the URL.")


if __name__ == "__main__":
    main()