|
|
|
|
|
import os |
|
|
import time |
|
|
import tempfile |
|
|
import shutil |
|
|
import logging |
|
|
import subprocess |
|
|
import socket |
|
|
import random |
|
|
from selenium import webdriver |
|
|
from selenium.webdriver.common.by import By |
|
|
from selenium.webdriver.support.ui import WebDriverWait |
|
|
from selenium.webdriver.support import expected_conditions as EC |
|
|
from selenium.webdriver.chrome.options import Options |
|
|
from selenium.webdriver.chrome.service import Service |
|
|
|
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
# Port handed to the ChromeDriver service; drawn once at import time from the
# inclusive range 4444-4544.
WEBDRIVER_PORT = random.randint(4444, 4544)

# Root logging setup: INFO level, timestamped records, written both to
# /tmp/webscraper.log and to a default StreamHandler.
logging.basicConfig(
    handlers=[logging.FileHandler('/tmp/webscraper.log'), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Point MPLCONFIGDIR at a freshly created temporary directory and log its path.
os.environ['MPLCONFIGDIR'] = tempfile.mkdtemp()
logger.info(f"Matplotlib config directory set to: {os.environ['MPLCONFIGDIR']}")
|
|
|
|
|
def verify_chrome_installation():
    """Check that the ``google-chrome-stable`` binary can actually be run.

    Runs ``google-chrome-stable --version`` and logs the reported version.

    Returns:
        bool: True when the command starts and exits with status 0; False when
        it cannot be launched or exits with a non-zero status.
    """
    try:
        result = subprocess.run(
            ['google-chrome-stable', '--version'],
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing or non-executable binary.
        logger.exception("Chrome verification failed")
        return False

    # Without check=True a failing command does not raise, so the exit status
    # has to be inspected explicitly.
    if result.returncode != 0:
        logger.error(
            f"Chrome verification failed (exit code {result.returncode}): "
            f"{result.stderr}"
        )
        return False

    logger.info(f"Chrome version: {result.stdout}")
    return True
|
|
|
|
|
def verify_chromedriver():
    """Check that ChromeDriver at ``/usr/local/bin/chromedriver`` is runnable.

    If none of the execute bits are set on the file, its mode is changed to
    0o755 first; then ``chromedriver --version`` is run and the reported
    version is logged.

    Returns:
        bool: True when the binary runs and exits with status 0; False when it
        is missing, cannot be made executable, cannot be launched, or exits
        with a non-zero status.
    """
    chromedriver_path = '/usr/local/bin/chromedriver'
    try:
        # Repair permissions BEFORE executing: a file without execute bits
        # cannot be launched, so checking afterwards would never get the
        # chance to fix anything.
        mode = os.stat(chromedriver_path).st_mode
        if not mode & 0o111:
            logger.warning("ChromeDriver not executable, fixing permissions...")
            os.chmod(chromedriver_path, 0o755)
            logger.info("Permissions updated")

        result = subprocess.run(
            [chromedriver_path, '--version'],
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing file, a failed chmod and a failed launch.
        logger.exception("ChromeDriver verification failed")
        return False

    # Without check=True a failing command does not raise.
    if result.returncode != 0:
        logger.error(
            f"ChromeDriver verification failed (exit code {result.returncode}): "
            f"{result.stderr}"
        )
        return False

    logger.info(f"ChromeDriver version: {result.stdout}")
    return True
|
|
|
|
|
def create_service():
    """Build the Selenium ``Service`` that launches ChromeDriver.

    The ChromeDriver binary is resolved from ``PATH`` (falling back to
    ``/usr/local/bin/chromedriver``). The service port starts at
    ``WEBDRIVER_PORT`` and is advanced past ports that already accept
    connections on 127.0.0.1. ChromeDriver is asked to log verbosely to
    ``chromedriver.log`` inside the temporary directory.

    Returns:
        Service: the configured service, or a default ``Service()`` if
        constructing the configured one fails.

    Raises:
        FileNotFoundError: if no ChromeDriver binary exists at the resolved
            path.
    """
    chromedriver_path = shutil.which('chromedriver') or '/usr/local/bin/chromedriver'
    if not os.path.exists(chromedriver_path):
        raise FileNotFoundError(f"ChromeDriver missing at {chromedriver_path}")

    # A successful connect (return value 0) means something is already
    # listening on the port, so move on and probe the next one. The search is
    # bounded so it cannot loop forever.
    port = WEBDRIVER_PORT
    for _ in range(10):
        with socket.socket() as probe:
            probe.settimeout(1)
            if probe.connect_ex(('127.0.0.1', port)) != 0:
                break
        port += 1
    else:
        logger.warning(f"No free port confirmed; using unverified port {port}")

    # Prefer the platform temp directory for the driver log, falling back to
    # /tmp when it is not writable.
    log_dir = tempfile.gettempdir()
    if not os.access(log_dir, os.W_OK):
        log_dir = '/tmp'
    log_path = os.path.join(log_dir, 'chromedriver.log')

    try:
        return Service(
            executable_path=chromedriver_path,
            port=port,
            service_args=[
                '--verbose',
                f'--log-path={log_path}',
            ],
        )
    except Exception as e:
        # Best effort: fall back to Selenium's defaults rather than failing.
        logger.error(f"Service creation failed: {str(e)}")
        return Service()
|
|
|
|
|
def verify_critical_path():
    """Print a diagnostic report about the ChromeDriver and Chrome binaries.

    Reports whether ``/usr/local/bin/chromedriver`` and
    ``/usr/bin/google-chrome-stable`` exist and are executable, then runs each
    with ``--version``. This is diagnostics only: a binary that is missing,
    cannot be launched, or exits non-zero is reported on stdout and never
    raised to the caller.

    Returns:
        None
    """
    chromedriver = '/usr/local/bin/chromedriver'
    chrome = '/usr/bin/google-chrome-stable'

    print(f"ChromeDriver exists: {os.path.exists(chromedriver)}")
    print(f"Chrome exists: {os.path.exists(chrome)}")

    print(f"ChromeDriver executable: {os.access(chromedriver, os.X_OK)}")
    print(f"Chrome executable: {os.access(chrome, os.X_OK)}")

    # Test each binary independently so a failure of the first does not hide
    # the state of the second. OSError covers a missing or non-executable
    # file, which check=True alone does not turn into CalledProcessError.
    for binary in (chromedriver, chrome):
        try:
            subprocess.run([binary, "--version"], check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            print(f"Binary test failed: {e}")
|
|
|
|
|
def _build_chrome_options():
    """Return the Chrome ``Options`` used for each headless browser session."""
    options = Options()
    for argument in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--start-maximized",
        "--disable-blink-features=AutomationControlled",
        "--disable-notifications",
        # NOTE(review): a fixed debugging port presumably prevents two Chrome
        # instances from running at the same time - confirm before enabling
        # concurrent requests.
        "--remote-debugging-port=9222",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    ):
        options.add_argument(argument)

    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_experimental_option("prefs", {
        "profile.default_content_setting_values.notifications": 2
    })
    return options


def setup_selenium():
    """Create and return a headless Chrome WebDriver.

    Builds the Chrome options, runs the ChromeDriver verification and path
    diagnostics (results are logged or printed, not enforced), creates the
    driver service and starts Chrome. If Chrome fails to start, the versions
    of both binaries are probed once more for the log before the error is
    re-raised.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.

    Raises:
        RuntimeError: if any step fails; the original exception is attached
            as ``__cause__``.
    """
    chromedriver_path = "/usr/local/bin/chromedriver"
    try:
        logger.info("Initializing Selenium Chrome driver...")
        options = _build_chrome_options()

        logger.info("Verifying ChromeDriver...")
        verifying_chromedriver = verify_chromedriver()
        logger.info(f"ChromeDriver verification: {verifying_chromedriver}")

        service = create_service()
        logger.info("Service created")

        verify_critical_path()

        try:
            logger.info("Attempting Chrome initialization...")
            driver = webdriver.Chrome(
                options=options,
                service=service
            )
        except Exception as e:
            logger.error(f"Initialization failed completely: {str(e)}")
            # Extra diagnostics only: their outcome never replaces the
            # original start-up error, which is re-raised below.
            try:
                subprocess.run([chromedriver_path, "--version"], check=True)
                subprocess.run(["/usr/bin/google-chrome-stable", "--version"], check=True)
            except Exception as sub_e:
                logger.error(f"Subprocess check failed: {str(sub_e)}")
            raise

        logger.info("Successfully initialized Chrome driver!")
        return driver

    except Exception as e:
        logger.error("Possible causes:")
        logger.error("- Missing or incompatible ChromeDriver")
        logger.error("- Chrome binary not found")
        logger.error("- Insufficient permissions")
        logger.error("- Missing system dependencies")
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Failed to initialize WebDriver: {str(e)}") from e
|
|
|
|
|
def check_selenium_environment():
    """Smoke-test the Selenium setup by starting and quitting one driver.

    Returns:
        bool: True if a driver could be created and shut down; False if
        either step raised an exception (the error is logged).
    """
    try:
        driver = setup_selenium()
        driver.quit()
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate; keep the
        # traceback in the log instead of discarding the reason.
        logger.exception("Selenium environment check raised an error")
        return False
    return True
|
|
|
|
|
# Abort at import time when a browser session cannot be started at all.
if not check_selenium_environment():
    logger.critical("Selenium environment check failed!")
    # exit() is a helper injected by the site module for interactive use and
    # is not guaranteed to exist; raising SystemExit always works.
    raise SystemExit(1)
|
|
|
|
|
def form_input_text(word):
    """Build the chat prompt asking for example sentences that use *word*.

    Args:
        word: the word to embed in the prompt.

    Returns:
        str: the prompt text, requesting a table with a simple and a compound
        sentence.
    """
    logger.info(f"Generating input text for word: {word}")
    prompt = (
        f"use the word '{word}' based on the following sentences types, "
        "give output in a table format, no quotes around sentences, "
        "no repeating the same sentence: Simple Sentence, Compound Sentence"
    )
    return prompt
|
|
|
|
|
def fetch_sentences(input_text):
    """Submit *input_text* to the Copilot chat page and scrape the reply table.

    Opens https://copilot.microsoft.com/chat in a fresh driver, types the
    prompt into the ``userInput`` element, clicks the "Submit message" button
    and reads the ``<td>`` cell texts of the first ``<table>`` found.

    Args:
        input_text: the prompt to send.

    Returns:
        list[list[str]]: one list of cell texts per table row; rows without
        any ``<td>`` cell are dropped.

    Raises:
        RuntimeError: if the driver cannot be created.
        Exception: Selenium errors (for example a wait timing out) propagate
            to the caller; the driver is always quit first.
    """
    driver = None
    generated_sentences = []
    try:
        logger.info("Starting sentence fetching process...")
        logger.debug(f"Input text: {input_text[:100]}...")

        # No implicit wait is configured: Selenium advises against mixing
        # implicit and explicit waits, and an implicit wait would also stall
        # every find_elements() call below that matches nothing.
        driver = setup_selenium()

        logger.info("Navigating to target URL...")
        driver.get("https://copilot.microsoft.com/chat")
        print("...after: driver.get('https://copilot.microsoft.com/chat')")

        logger.info("Waiting for page to load...")
        WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        # Fixed pause; presumably lets the page finish rendering - TODO confirm.
        time.sleep(10)

        textarea = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, "userInput"))
        )
        textarea.clear()
        textarea.send_keys(input_text)
        print("...after: sending")
        time.sleep(20)

        # Wait until the submit button can actually be clicked instead of
        # clicking whatever element happens to be present.
        button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@title='Submit message']"))
        )
        button.click()
        print("...after: button.click()")
        print("Text inserted and button pressed successfully!")

        # Fixed pause; presumably gives the reply time to be generated - TODO confirm.
        time.sleep(20)

        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//table"))
        )

        rows = table.find_elements(By.TAG_NAME, "tr")
        print(len(rows))

        generated_sentences = [
            [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]
            for row in rows
        ]
    finally:
        if driver:
            driver.quit()

    # Drop rows that produced no <td> cells, regardless of how many rows
    # there are.
    return [cells for cells in generated_sentences if cells]
|
|
|
|
|
def scrape_website(url):
    """Gradio callback: fetch example sentences for the fixed word 'go'.

    Args:
        url: value of the UI text box.
            NOTE(review): this argument is never read in the body - confirm
            whether it was meant to drive the request.

    Returns:
        The rows returned by ``fetch_sentences`` on success, otherwise an
        error message string.
    """
    try:
        chrome_ok = verify_chrome_installation()
        if chrome_ok:
            prompt = form_input_text('go')
            return fetch_sentences(prompt)
        return "Error: Chrome not properly installed"
    except RuntimeError as runtime_err:
        logger.error(f"Runtime error: {str(runtime_err)}")
        return f"System error: {str(runtime_err)}"
    except Exception as other_err:
        logger.error(f"Unexpected error: {str(other_err)}")
        return f"Unexpected error occurred: {str(other_err)}"
|
|
|
|
|
# Gradio UI: a single text box in, a single text box out, backed by
# scrape_website.
iface = gr.Interface(
    title="Web Scraper with Selenium & Chrome",
    description="Enter a URL to scrape its content using headless Chrome browser.",
    fn=scrape_website,
    inputs=[gr.Textbox(label="URL to scrape", placeholder="https://example.com")],
    outputs=gr.Textbox(label="Scraped Content"),
)

# Enable the request queue, then serve on 0.0.0.0:7860 with share=False.
iface.queue().launch(share=False, server_port=7860, server_name="0.0.0.0")
|
|
|