Web-Scraper / app.py
Charbel Malo
Scraper+
46f7221 verified
raw
history blame
3.74 kB
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
from requests.sessions import Session
from langdetect import detect
from googletrans import Translator
# Tags stripped from the parsed page before any text is collected.
_NON_CONTENT_TAGS = [
    "script", "style", "meta", "link", "noscript",
    "header", "footer", "aside", "nav", "img",
]


def _fetch_page(url, email=None, password=None, login_url=None, timeout=30):
    """Return the raw body of *url*, logging in first when credentials are given."""
    # The context manager closes the session's pooled connections on exit.
    with Session() as session:
        if email and password and login_url:
            login_data = {
                'email': email,
                'password': password,
                # Include other necessary fields as required by the website
            }
            login_response = session.post(login_url, data=login_data, timeout=timeout)
            login_response.raise_for_status()
        # Always fetch the requested page. The login response is only used to
        # establish the session cookies; it is not the page being scraped.
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content


def _extract_visible_text(html, query_selector=None):
    """Return whitespace-normalized text from *html*, optionally limited to a CSS selector."""
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(_NON_CONTENT_TAGS):
        tag.extract()

    if query_selector:
        nodes = soup.select(query_selector)
    else:
        # <header> is part of _NON_CONTENT_TAGS and is already gone at this
        # point, so the default mode collects paragraph text only.
        nodes = soup.find_all("p")

    joined = " ".join(node.get_text() for node in nodes)
    return re.sub(r'\s+', ' ', joined).strip()


def _translate_to_english(text):
    """Translate each non-English sentence of *text* to English, best effort."""
    if not text:
        # Nothing to detect or translate; skip building a Translator.
        return text

    translator = Translator()
    translated_sentences = []
    for sentence in re.split(r'(?<=[.!?]) +', text):
        try:
            if detect(sentence) != 'en':
                result = translator.translate(sentence, dest='en').text
            else:
                result = sentence
        except Exception:
            # Deliberate best effort: if detection or translation fails for a
            # sentence, keep it unchanged rather than failing the whole scrape.
            result = sentence
        translated_sentences.append(result)
    return ' '.join(translated_sentences)


def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None,
                                 login_url=None, timeout=30):
    """Scrape the visible text of a web page and return it in English.

    When ``email``, ``password`` and ``login_url`` are all provided, a login
    form with ``email`` and ``password`` fields is posted to ``login_url``
    first, and ``url`` is then fetched with the same session.

    Args:
        url: Address of the page to scrape.
        query_selector: Optional CSS selector. When given, only the text of
            matching elements is returned; otherwise the text of all ``<p>``
            elements is used.
        email: Optional login e-mail.
        password: Optional login password.
        login_url: Optional URL the login form is posted to.
        timeout: Value passed as ``timeout`` to each HTTP request.

    Returns:
        The extracted text with whitespace collapsed and non-English sentences
        translated, or ``None`` if any step failed. On failure the error is
        shown with ``st.error``.
    """
    try:
        html = _fetch_page(url, email, password, login_url, timeout)
        visible_text = _extract_visible_text(html, query_selector)
        return _translate_to_english(visible_text)
    except Exception as e:
        # UI boundary: surface the problem to the user instead of crashing the app.
        st.error(f"Error occurred while scraping the data: {e}")
        return None
def main():
    """Render the scraper form and show the scraped text when the button is pressed."""
    st.title("Web Data Scraper")
    url_input = st.text_input("Enter the URL 👉✍️:", "")
    query_selector = st.text_input("Enter a query selector (optional):", "")
    email = st.text_input("Email (if authentication required):", "")
    password = st.text_input("Password (if authentication required):", "", type="password")
    login_url = st.text_input("Enter the login URL (if authentication required):", "")

    if not st.button("Load Data 🧈"):
        return

    # Treat a whitespace-only entry the same as an empty one.
    target_url = url_input.strip()
    if not target_url:
        st.warning("Please enter a valid URL.")
        return

    # Empty optional fields are passed as None so the scraper skips them.
    data = scrape_visible_text_from_url(
        url=target_url,
        query_selector=query_selector or None,
        email=email or None,
        password=password or None,
        login_url=login_url or None,
    )

    if data:
        st.success("Data text successfully scraped!")
        st.subheader("Scraped Text:")
        st.write(data)
    elif data is None:
        # The scraper returns None on error, after reporting it itself.
        st.warning("Failed to load data from the URL.")
    else:
        # An empty string means the page loaded but contained no matching text.
        st.warning("The page loaded, but no visible text was found.")
# Script entry point: build the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()