Spaces: Runtime error

Upload 2 files

- app.py (+56, -0)
- requirements.txt (+60, -0)

app.py (ADDED)
@@ -0,0 +1,56 @@
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re

# Function to scrape only visible text from the given URL
def scrape_visible_text_from_url(url):
    try:
        # Time out after 10s so a hung request doesn't block the app
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get the header content first: the cleanup below removes
        # <header> tags, so it must be captured before they are stripped
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        # Remove script, style, and other non-visible tags
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join([p.get_text() for p in paragraph_content])

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Collapse runs of whitespace and newlines into single spaces
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None

# Streamlit UI
def main():
    st.title("Web Data Scraper")

    # Get the URL from the user
    url_input = st.text_input("Enter the URL of the web page:", "")

    if st.button("Scrape Visible Text"):
        if url_input:
            # Extract visible text from the URL
            data = scrape_visible_text_from_url(url_input)
            if data:
                st.success("Visible text successfully scraped!")
                st.subheader("Scraped Text:")
                st.write(data)
            else:
                st.warning("Failed to scrape visible text from the URL.")
        else:
            st.warning("Please enter a valid URL.")

if __name__ == "__main__":
    main()
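A quick way to sanity-check the scraper without launching the Streamlit UI is to import the function directly. A minimal sketch, assuming app.py is in the working directory and using https://example.com as a placeholder URL:

    # Ad-hoc check of scrape_visible_text_from_url outside the app.
    # The URL is a placeholder; substitute any reachable page.
    from app import scrape_visible_text_from_url

    text = scrape_visible_text_from_url("https://example.com")
    print(text[:300] if text else "No visible text scraped.")

To run the app itself, install the pinned dependencies and start it with "streamlit run app.py".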
requirements.txt (ADDED)
@@ -0,0 +1,60 @@
aiohttp==3.8.5
aiosignal==1.3.1
altair==5.0.1
async-timeout==4.0.2
attrs==23.1.0
beautifulsoup4==4.12.2
blinker==1.6.2
bs4==0.0.1
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
decorator==5.1.1
frozenlist==1.4.0
gitdb==4.0.10
GitPython==3.1.32
idna==3.4
importlib-metadata==6.8.0
Jinja2==3.1.2
jsonschema==4.18.4
jsonschema-specifications==2023.7.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
mdurl==0.1.2
multidict==6.0.4
numpy==1.25.2
openai==0.27.8
packaging==23.1
pandas==2.0.3
Pillow==9.5.0
protobuf==4.23.4
pyarrow==12.0.1
pydeck==0.8.0
Pygments==2.15.1
Pympler==1.0.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
pytz-deprecation-shim==0.1.0.post0
referencing==0.30.0
requests==2.31.0
rich==13.5.2
rpds-py==0.9.2
six==1.16.0
smmap==5.0.0
soupsieve==2.4.1
streamlit==1.25.0
tenacity==8.2.2
toml==0.10.2
toolz==0.12.0
tornado==6.3.2
tqdm==4.65.0
typing_extensions==4.7.1
tzdata==2023.3
tzlocal==4.3.1
urllib3==2.0.4
validators==0.20.0
watchdog==3.0.0
yarl==1.9.2
zipp==3.16.2
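Only three of these pins are imported directly by app.py: streamlit, requests, and beautifulsoup4 (bs4==0.0.1 is just a stub package that depends on beautifulsoup4). The rest appear to be the transitive output of a pip freeze. A leaner requirements.txt, assuming pip is left to resolve the transitive dependencies itself, could be reduced to:

    streamlit==1.25.0
    requests==2.31.0
    beautifulsoup4==4.12.2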