broadfield-dev committed on
Commit
d2ba6e0
·
verified ·
1 Parent(s): 10dbcf2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -48
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  os.system("playwright install")
3
- # app.py (Final, Working Async Version with All Bugs Fixed)
4
 
5
  import gradio as gr
6
  from playwright.async_api import async_playwright
@@ -9,23 +9,101 @@ import urllib.parse
9
  import os
10
  from itertools import cycle
11
  import uuid
 
12
 
13
  # --- 1. GLOBAL RESOURCES & STATE ---
14
- P = None
15
- BROWSER = None
16
- REVOLVER = None
17
- LIVE_CONTEXTS = {}
18
- APP_STARTED = False
19
-
20
  SEARCH_ENGINES = {
21
- "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
22
- "Google": "https://www.google.com/search?q={query}",
23
- "Bing": "https://www.bing.com/search?q={query}",
24
- "Brave": "https://search.brave.com/search?q={query}",
25
  "Ecosia": "https://www.ecosia.org/search?q={query}"
26
  }
27
 
28
- # --- 2. PLAIN DATA STATE CLASSES ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  class TabState:
30
  def __init__(self, tab_id, proxy_used="Direct Connection"):
31
  self.id, self.url, self.title = tab_id, "about:blank", "New Tab"
@@ -50,39 +128,21 @@ class CredentialRevolver:
50
  def get_next(self): return next(self.proxy_cycler) if self.proxy_cycler else None
51
  def count(self): return len(self.proxies) if self.proxies else 0
52
 
53
- # --- 3. CORE ASYNC LOGIC & FORMATTING ---
54
def _process_element_to_markdown(element):
    """Recursively render a BeautifulSoup node as Markdown text.

    Args:
        element: A ``NavigableString`` or a tag taken from the parsed document.

    Returns:
        The Markdown string for ``element`` and all of its descendants.
    """
    if isinstance(element, NavigableString):
        return element.strip()

    tag = element.name
    if tag is None:
        return ''
    if tag == 'hr':
        return "\n---\n"
    if tag == 'pre':
        # Render from the raw text so a nested <code> does not open a second
        # fence inside this one.
        return f"\n```\n{element.get_text().strip()}\n```\n"

    inner_text = ''.join(_process_element_to_markdown(child) for child in element.children)
    text = inner_text.strip()

    if tag in ('p', 'div', 'article', 'section'):
        return f"\n{text}\n"

    # h4-h6 are all rendered as level-four headings.
    heading_prefixes = {'h1': '#', 'h2': '##', 'h3': '###', 'h4': '####', 'h5': '####', 'h6': '####'}
    if tag in heading_prefixes:
        return f"\n{heading_prefixes[tag]} {text}\n"

    if tag == 'li':
        return f"* {text}\n"
    if tag in ('ul', 'ol'):
        return f"\n{inner_text}\n"
    if tag in ('strong', 'b'):
        return f"**{text}**"
    if tag in ('em', 'i'):
        return f"*{text}*"
    if tag == 'code':
        # Inline code: a fenced block here would break the surrounding sentence.
        return f"`{text}`"
    if tag == 'a':
        href = element.get('href', '')
        # An anchor without a target is just text; "[text]()" is a dead link.
        return f"[{text}]({href})" if href else text

    # Unrecognised tags (span, etc.) contribute only their children's text.
    return inner_text
71
-
72
def _format_html_to_markdown(soup):
    """Render the main content of a parsed page as Markdown.

    Uses the ``<main>`` element when present, otherwise ``<body>``. Navigation,
    header/footer, sidebar, form, script and style elements inside the chosen
    node are removed from the soup (in place) before rendering.

    Args:
        soup: Parsed document.

    Returns:
        The Markdown text, or a fixed message when neither element exists.
    """
    root = soup.find('main')
    if not root:
        root = soup.find('body')
    if not root:
        return "Could not find main body content."

    for unwanted in root.select('nav, footer, header, aside, form, script, style'):
        unwanted.decompose()

    return _process_element_to_markdown(root)
77
 
 
78
  async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
79
  log = f"▶️ Navigating to {url}..."; live_page = LIVE_CONTEXTS[tab_state.id]["page"]
80
  try:
81
  await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
82
  tab_state.url = live_page.url; tab_state.title = await live_page.title() or "No Title"
83
  log += f"\n✅ Arrived at: {tab_state.url}"
 
84
  html_content = await live_page.content(); soup = BeautifulSoup(html_content, 'lxml')
85
- tab_state.parsed_text = _format_html_to_markdown(soup)
 
 
 
 
86
  tab_state.links = [{'text': link.get_text(strip=True) or "[No Link Text]", 'url': urllib.parse.urljoin(tab_state.url, link['href'])} for link in soup.find_all('a', href=True) if link.get('href', '').startswith('http')]
87
  log += f"\n🔗 Found {len(tab_state.links)} links."
88
  except Exception as e:
@@ -123,23 +183,23 @@ async def handle_action(browser_state: BrowserState, search_engine: str, action:
123
  else: log = "No action taken."
124
  return browser_state, log
125
 
126
- # ** CRITICAL BUG FIX: `NameError` is fixed by defining this function before it is called **
127
  def update_ui_components(browser_state: BrowserState):
128
  active_tab = browser_state.get_active_tab()
129
  if not active_tab: return {page_content: gr.Markdown("No active tabs."), url_textbox: "", links_display: "", tab_selector: gr.Radio(choices=[])}
130
  tab_choices = [(f"Tab {i}: {t.title[:25]}... ({t.proxy_used})", t.id) for i, t in enumerate(browser_state.tabs)]
131
  links_md = "### 🔗 Links on Page\n" + ('\n'.join(f"{i}. [{link['text'][:80]}]({link['url']})" for i, link in enumerate(active_tab.links[:25])) if active_tab.links else "_No links found._")
132
- page_md = f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text}"
 
133
  return {
134
  page_content: gr.Markdown(page_md),
135
  url_textbox: gr.Textbox(value=active_tab.url), links_display: gr.Markdown(links_md),
136
  tab_selector: gr.Radio(choices=tab_choices, value=active_tab.id, label="Active Tabs"),
137
  }
138
 
139
- # --- 4. GRADIO UI AND EVENT HANDLING ---
140
  with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
141
  browser_state = gr.State(BrowserState())
142
- gr.Markdown("# 🛰️ Real Browser Demo v2.1")
143
  with gr.Row():
144
  with gr.Column(scale=4):
145
  with gr.Row(): url_textbox = gr.Textbox(label="Enter URL or Search Query", interactive=True, scale=4); go_btn = gr.Button("Go", variant="primary", scale=1)
@@ -176,12 +236,17 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
176
  async def on_switch_tab(state, search_engine, value): return await master_handler(state, search_engine, "switch_tab", value)
177
 
178
  outputs = [browser_state, *all_outputs]
179
- demo.load(on_load, [browser_state, search_engine_selector], outputs)
180
- go_btn.click(on_go_click, [browser_state, search_engine_selector, url_textbox], outputs, show_progress="full")
181
- url_textbox.submit(on_go_click, [browser_state, search_engine_selector, url_textbox], outputs, show_progress="full")
182
- click_btn.click(on_click_link, [browser_state, search_engine_selector, click_num_box], outputs, show_progress="full")
183
- new_tab_btn.click(on_new_tab, [browser_state, search_engine_selector], outputs, show_progress="full")
184
- close_tab_btn.click(on_close_tab, [browser_state, search_engine_selector], outputs)
185
- tab_selector.input(on_switch_tab, [browser_state, search_engine_selector, tab_selector], outputs)
 
 
 
 
 
186
 
187
  demo.launch()
 
1
  import os
2
  os.system("playwright install")
3
+ # app.py (Final Version with Advanced Formatting)
4
 
5
  import gradio as gr
6
  from playwright.async_api import async_playwright
 
9
  import os
10
  from itertools import cycle
11
  import uuid
12
+ import re
13
 
14
  # --- 1. GLOBAL RESOURCES & STATE ---
15
+ P, BROWSER, REVOLVER, LIVE_CONTEXTS, APP_STARTED = None, None, None, {}, False
 
 
 
 
 
16
  SEARCH_ENGINES = {
17
+ "DuckDuckGo": "https://duckduckgo.com/html/?q={query}", "Google": "https://www.google.com/search?q={query}",
18
+ "Bing": "https://www.bing.com/search?q={query}", "Brave": "https://search.brave.com/search?q={query}",
 
 
19
  "Ecosia": "https://www.ecosia.org/search?q={query}"
20
  }
21
 
22
+ # --- 2. NEW: ADVANCED HTML-TO-MARKDOWN CONVERTER ---
23
class HTML_TO_MARKDOWN_CONVERTER:
    """Convert a parsed HTML page into readable Markdown.

    The soup handed to the constructor is modified in place: ``convert``
    removes non-content elements from it before rendering.
    """

    # CSS selectors for page chrome that should not appear in the rendered text.
    _SELECTORS_TO_REMOVE = (
        'nav', 'footer', 'header', 'aside', 'form', 'script', 'style', 'svg', 'button', 'input', 'textarea',
        '[role="navigation"]', '[role="search"]', '[id*="comment"]', '[class*="comment-"]',
        '[id*="sidebar"]', '[class*="sidebar"]', '[id*="related"]', '[class*="related"]',
        '[id*="share"]', '[class*="share"]', '[id*="social"]', '[class*="social"]',
        '[id*="cookie"]', '[class*="cookie"]',
    )

    # Markdown prefix per heading tag; h4-h6 are all rendered at level four.
    _HEADING_PREFIXES = {'h1': '#', 'h2': '##', 'h3': '###', 'h4': '####', 'h5': '####', 'h6': '####'}

    def __init__(self, soup: BeautifulSoup, base_url: str):
        """Store the document and the URL used to resolve relative references.

        Args:
            soup: Parsed document to convert (will be mutated by ``convert``).
            base_url: URL of the page; relative ``href``/``src`` values are
                resolved against it.
        """
        self.soup = soup
        self.base_url = base_url

    def _cleanup_html(self):
        """Remove non-content elements and HTML comments from the soup in place."""
        # Local import: only this method needs the Comment type.
        from bs4 import Comment

        for selector in self._SELECTORS_TO_REMOVE:
            for element in self.soup.select(selector):
                element.decompose()

        # Comments are NavigableString subclasses, so without this they would
        # be emitted as if they were visible page text.
        for comment in self.soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

    def convert(self):
        """Clean the document and return its main content as Markdown.

        The content root is the first of ``<main>``, ``<article>`` or
        ``<body>`` that exists after cleanup.

        Returns:
            The Markdown text, or a fixed message when no content root exists.
        """
        self._cleanup_html()
        content_node = self.soup.find('main') or self.soup.find('article') or self.soup.find('body')
        if not content_node:
            return "Could not find main content."
        markdown = self._process_node(content_node)
        # Fragments are joined with spaces, which leaves whitespace-only lines
        # between blocks. Trim them so runs of blank lines are truly empty and
        # can be collapsed by callers.
        return re.sub(r'[ \t]+\n', '\n', markdown)

    def _process_node(self, element):
        """Recursively render one node (text or tag) as Markdown.

        Args:
            element: A ``NavigableString`` or tag from the document.

        Returns:
            The Markdown for the node and its descendants; ``''`` when the
            node contributes nothing visible.
        """
        if isinstance(element, NavigableString):
            # Collapse runs of whitespace/newlines into single spaces.
            return re.sub(r'\s+', ' ', element.strip())

        name = element.name
        if not name:
            return ''

        # Elements rendered from their attributes or raw text are handled
        # before recursing, so their children are not processed needlessly.
        if name == 'hr':
            return "\n\n---\n\n"
        if name == 'img':
            src = element.get('src', '')
            if not src:
                # Resolving an empty src would point the image at the page itself.
                return ''
            alt = element.get('alt', 'Image').strip()
            full_src = urllib.parse.urljoin(self.base_url, src)
            return f"\n\n![{alt}]({full_src})\n\n"
        if name == 'pre':
            # Keep the text verbatim: stripping each inner string would glue
            # tokens together and lose indentation in highlighted code.
            code = element.get_text().strip('\n')
            return f"\n```\n{code}\n```\n\n"
        if name == 'table':
            return self._table_to_markdown(element)

        # Process children first; drop empty fragments so they do not add
        # stray spaces between siblings.
        fragments = (self._process_node(child) for child in element.children)
        inner_md = " ".join(fragment for fragment in fragments if fragment).strip()
        if not inner_md:
            # Nothing visible inside: avoid empty headings, bullets and links.
            return ''

        # Block-level tags add newlines.
        if name in ('p', 'div', 'section'):
            return f"\n\n{inner_md}\n\n"
        if name in self._HEADING_PREFIXES:
            return f"\n\n{self._HEADING_PREFIXES[name]} {inner_md}\n\n"
        if name == 'li':
            return f"* {inner_md}\n"
        if name in ('ul', 'ol'):
            return f"\n{inner_md}\n"
        if name == 'blockquote':
            quoted = inner_md.replace("\n", "\n> ")
            return f"> {quoted}\n\n"

        # Inline tags.
        if name == 'code':
            return f"`{inner_md}`"
        if name in ('strong', 'b'):
            return f"**{inner_md}**"
        if name in ('em', 'i'):
            return f"*{inner_md}*"
        if name == 'a':
            href = element.get('href', '')
            if not href:
                # A named anchor without a target is plain text.
                return inner_md
            full_href = urllib.parse.urljoin(self.base_url, href)
            return f"[{inner_md}]({full_href})"

        # Unrecognised tags (span, etc.) contribute only their inner Markdown.
        return inner_md

    def _table_to_markdown(self, table):
        """Render a ``<table>`` as a pipe table, with or without ``<tbody>``."""
        header = None
        rows = []
        for tr in table.find_all('tr'):
            if tr.find_parent('table') is not table:
                # Row of a nested table: its text is already part of the outer cell.
                continue
            cells = [self._cell_text(cell) for cell in tr.find_all(['th', 'td'], recursive=False)]
            if not cells:
                continue
            is_header_row = tr.find('td', recursive=False) is None
            if header is None and not rows and is_header_row:
                header = [f"**{text}**" if text else "" for text in cells]
            else:
                rows.append(cells)

        if header is None and not rows:
            return ''

        # Size every line from the widest row so ragged tables stay aligned.
        width = max(len(cells) for cells in [header or [], *rows])

        def _line(cells):
            padded = list(cells) + [''] * (width - len(cells))
            return "| " + " | ".join(padded) + " |"

        # A pipe table needs a header line; emit a blank one when the table has no <th> row.
        lines = [_line(header or []), _line(['---'] * width)]
        lines.extend(_line(cells) for cells in rows)
        return "\n\n" + "\n".join(lines) + "\n\n"

    @staticmethod
    def _cell_text(cell):
        """Return a table cell's text on one line, with pipes escaped."""
        text = re.sub(r'\s+', ' ', cell.get_text(" ", strip=True))
        return text.replace('|', '\\|')
104
+
105
+
106
+ # --- 3. PLAIN DATA STATE CLASSES ---
107
  class TabState:
108
  def __init__(self, tab_id, proxy_used="Direct Connection"):
109
  self.id, self.url, self.title = tab_id, "about:blank", "New Tab"
 
128
  def get_next(self): return next(self.proxy_cycler) if self.proxy_cycler else None
129
  def count(self): return len(self.proxies) if self.proxies else 0
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
+ # --- 4. CORE ASYNC LOGIC ---
133
  async def _fetch_and_update_tab_state(tab_state: TabState, url: str):
134
  log = f"▶️ Navigating to {url}..."; live_page = LIVE_CONTEXTS[tab_state.id]["page"]
135
  try:
136
  await live_page.goto(url, wait_until='domcontentloaded', timeout=30000)
137
  tab_state.url = live_page.url; tab_state.title = await live_page.title() or "No Title"
138
  log += f"\n✅ Arrived at: {tab_state.url}"
139
+
140
  html_content = await live_page.content(); soup = BeautifulSoup(html_content, 'lxml')
141
+
142
+ # Use the new advanced converter
143
+ converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=tab_state.url)
144
+ tab_state.parsed_text = converter.convert()
145
+
146
  tab_state.links = [{'text': link.get_text(strip=True) or "[No Link Text]", 'url': urllib.parse.urljoin(tab_state.url, link['href'])} for link in soup.find_all('a', href=True) if link.get('href', '').startswith('http')]
147
  log += f"\n🔗 Found {len(tab_state.links)} links."
148
  except Exception as e:
 
183
  else: log = "No action taken."
184
  return browser_state, log
185
 
 
186
  def update_ui_components(browser_state: BrowserState):
187
  active_tab = browser_state.get_active_tab()
188
  if not active_tab: return {page_content: gr.Markdown("No active tabs."), url_textbox: "", links_display: "", tab_selector: gr.Radio(choices=[])}
189
  tab_choices = [(f"Tab {i}: {t.title[:25]}... ({t.proxy_used})", t.id) for i, t in enumerate(browser_state.tabs)]
190
  links_md = "### 🔗 Links on Page\n" + ('\n'.join(f"{i}. [{link['text'][:80]}]({link['url']})" for i, link in enumerate(active_tab.links[:25])) if active_tab.links else "_No links found._")
191
+ # Clean up excessive newlines for final display
192
+ page_md = re.sub(r'\n{3,}', '\n\n', f"# {active_tab.title}\n**URL:** {active_tab.url}\n\n---\n\n{active_tab.parsed_text}").strip()
193
  return {
194
  page_content: gr.Markdown(page_md),
195
  url_textbox: gr.Textbox(value=active_tab.url), links_display: gr.Markdown(links_md),
196
  tab_selector: gr.Radio(choices=tab_choices, value=active_tab.id, label="Active Tabs"),
197
  }
198
 
199
+ # --- 5. GRADIO UI AND EVENT HANDLING ---
200
  with gr.Blocks(theme=gr.themes.Soft(), title="Real Browser Demo") as demo:
201
  browser_state = gr.State(BrowserState())
202
+ gr.Markdown("# 🛰️ Real Browser Demo v2.2 (Advanced Formatting)")
203
  with gr.Row():
204
  with gr.Column(scale=4):
205
  with gr.Row(): url_textbox = gr.Textbox(label="Enter URL or Search Query", interactive=True, scale=4); go_btn = gr.Button("Go", variant="primary", scale=1)
 
236
  async def on_switch_tab(state, search_engine, value): return await master_handler(state, search_engine, "switch_tab", value)
237
 
238
  outputs = [browser_state, *all_outputs]
239
+ go_inputs = [browser_state, search_engine_selector, url_textbox]
240
+ click_inputs = [browser_state, search_engine_selector, click_num_box]
241
+ tab_inputs = [browser_state, search_engine_selector]
242
+ switch_inputs = [browser_state, search_engine_selector, tab_selector]
243
+
244
+ demo.load(on_load, tab_inputs, outputs)
245
+ go_btn.click(on_go_click, go_inputs, outputs, show_progress="full")
246
+ url_textbox.submit(on_go_click, go_inputs, outputs, show_progress="full")
247
+ click_btn.click(on_click_link, click_inputs, outputs, show_progress="full")
248
+ new_tab_btn.click(on_new_tab, tab_inputs, outputs, show_progress="full")
249
+ close_tab_btn.click(on_close_tab, tab_inputs, outputs)
250
+ tab_selector.input(on_switch_tab, switch_inputs, outputs)
251
 
252
  demo.launch()