Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from io import BytesIO | |
| import pandas as pd | |
| from selenium import webdriver | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from selenium.webdriver.common.by import By | |
| from selenium.common.exceptions import WebDriverException | |
| from io import BytesIO | |
| from bs4 import BeautifulSoup | |
def main():
    """Render the page: a title, a URL input and a "Proceed" button.

    When the button is pressed, a blank URL produces a warning; any other
    value is handed to ``visualize``.
    """
    st.title("Website Content Extractor")

    # Strip surrounding whitespace so that an input consisting only of
    # spaces is treated as empty instead of being sent to the browser.
    url = st.text_input("Enter a URL:", "").strip()

    if st.button("Proceed"):
        if not url:
            st.warning("URL is empty.")
        else:
            visualize(url)
def visualize(url):
    """Fetch data for *url* via ``take_webdata`` and display the results.

    Shows the dropdown element returned by ``take_webdata`` as an HTML code
    block, followed by the extracted link when there is one. A warning is
    shown for whichever of the two is missing. Any exception raised along
    the way is reported with ``st.error`` instead of propagating.
    """
    try:
        with st.spinner("loading website data ..."):
            # The first element (full page HTML) is not displayed.
            _page_html, dropdown_markup, standings_href = take_webdata(url)

            # Guard clause: nothing to show when the dropdown is missing.
            if not dropdown_markup:
                st.warning("tidak ditemukan.")
                return

            st.code(dropdown_markup, language='html')

            if standings_href:
                st.code(standings_href, language='html')
            else:
                st.warning("tidak ditemukan.")
    except Exception as e:
        st.error(f"Error: {e}")
def take_webdata(url):
    """Load *url* in headless Chrome and look for the "KLASEMEN" menu link.

    Args:
        url: Address of the page to open.

    Returns:
        A 3-tuple ``(html, target_dropdown, urlx)``:

        * ``html`` - the document's ``outerHTML`` as returned by the browser.
        * ``target_dropdown`` - the ``div.dropdown-menu`` element whose
          ``aria-labelledby`` is ``navbar-match``, or ``None`` if absent.
        * ``urlx`` - the ``href`` of the ``a.dropdown-item`` inside that
          dropdown whose string is exactly ``KLASEMEN``, or ``None`` when
          the dropdown or the link is not found.

    Raises:
        WebDriverException: Propagated from Selenium when the browser
            session fails. The caller unpacks three values, so returning a
            single value on failure (as before) could never work.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Bind the name before the try block so the cleanup below is safe even
    # when creating the driver itself raises.
    wd = None
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)  # Adjust the window size here
        wd.get(url)
        # NOTE(review): an implicit wait applies to element lookups, and none
        # are performed through the driver here - confirm whether an explicit
        # wait for the menu is needed before reading the DOM.
        wd.implicitly_wait(15)
        html = wd.execute_script("return document.documentElement.outerHTML;")
    finally:
        if wd is not None:
            wd.quit()

    # Parsing needs only the captured markup, so it runs after the browser
    # has been released.
    soup = BeautifulSoup(html, "html.parser")
    target_dropdown = soup.find(
        'div',
        class_='dropdown-menu',
        attrs={'aria-labelledby': 'navbar-match'},
    )

    # Default to None so the return below never hits an unbound name.
    urlx = None
    if target_dropdown:
        klasemenlink = target_dropdown.find(
            'a', class_='dropdown-item', string='KLASEMEN'
        )
        if klasemenlink:
            urlx = klasemenlink.get('href')
    else:
        print("Dropdown menu tidak ditemukan")

    return html, target_dropdown, urlx
# Script entry point: build the page only when this file is run directly.
if __name__ == "__main__":
    main()