Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -177,10 +177,13 @@ async def chat(
|
|
177 |
|
178 |
def extract_text_from_webpage(html_content):
|
179 |
"""Extracts visible text from HTML content using BeautifulSoup."""
|
180 |
-
soup = BeautifulSoup(html_content)
|
181 |
-
|
|
|
182 |
tag.extract()
|
183 |
-
|
|
|
|
|
184 |
|
185 |
async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
|
186 |
"""Fetches a URL and extracts text asynchronously."""
|
@@ -245,15 +248,19 @@ async def web_search_and_extract(
|
|
245 |
|
246 |
def extract_text_from_webpage2(html_content):
|
247 |
"""Extracts visible text from HTML content using BeautifulSoup."""
|
248 |
-
soup = BeautifulSoup(html_content)
|
249 |
-
|
|
|
250 |
tag.extract()
|
251 |
-
|
|
|
|
|
252 |
|
253 |
-
def fetch_and_extract2(url, max_chars):
|
254 |
"""Fetches a URL and extracts text using threading."""
|
|
|
255 |
try:
|
256 |
-
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0
|
257 |
response.raise_for_status()
|
258 |
html_content = response.text
|
259 |
visible_text = extract_text_from_webpage2(html_content)
|
@@ -267,19 +274,20 @@ def fetch_and_extract2(url, max_chars):
|
|
267 |
@app.get("/api/websearch-and-extract-threading")
|
268 |
def web_search_and_extract_threading(
|
269 |
q: str,
|
270 |
-
max_results: int =
|
271 |
timelimit: Optional[str] = None,
|
272 |
safesearch: str = "moderate",
|
273 |
region: str = "wt-wt",
|
274 |
backend: str = "html",
|
275 |
-
max_chars: int =
|
276 |
-
extract_only: bool = True
|
|
|
277 |
):
|
278 |
"""
|
279 |
Searches using WEBS, extracts text from the top results using threading, and returns both.
|
280 |
"""
|
281 |
try:
|
282 |
-
with WEBS() as webs:
|
283 |
# Perform WEBS search
|
284 |
search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
|
285 |
timelimit=timelimit, backend=backend, max_results=max_results)
|
@@ -289,7 +297,7 @@ def web_search_and_extract_threading(
|
|
289 |
threads = []
|
290 |
for result in search_results:
|
291 |
if 'href' in result:
|
292 |
-
thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars)))
|
293 |
threads.append(thread)
|
294 |
thread.start()
|
295 |
|
|
|
177 |
|
178 |
def extract_text_from_webpage(html_content):
    """Extract the visible text from HTML content using BeautifulSoup.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The text that remains after removing ``script``, ``style``,
        ``header``, ``footer`` and ``nav`` elements. Each text fragment is
        stripped of surrounding whitespace and fragments are joined with a
        single space.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove unwanted tags before collecting text.
    for tag in soup(["script", "style", "header", "footer", "nav"]):
        tag.extract()
    # strip=True without a separator joins the text of adjacent elements with
    # nothing in between ("<p>Hello</p><p>World</p>" -> "HelloWorld"), so an
    # explicit separator keeps neighbouring words apart.
    visible_text = soup.get_text(separator=" ", strip=True)
    return visible_text
|
187 |
|
188 |
async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
|
189 |
"""Fetches a URL and extracts text asynchronously."""
|
|
|
248 |
|
249 |
def extract_text_from_webpage2(html_content):
    """Extract the visible text from HTML content using BeautifulSoup.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The text left once ``script``, ``style``, ``header``, ``footer`` and
        ``nav`` elements have been removed. Text fragments are individually
        stripped and joined with a single space.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove unwanted tags so their contents never reach the extracted text.
    for tag in soup(["script", "style", "header", "footer", "nav"]):
        tag.extract()
    # Pass a separator alongside strip=True: without one, text from adjacent
    # elements is concatenated directly and words run together.
    visible_text = soup.get_text(separator=" ", strip=True)
    return visible_text
|
258 |
|
259 |
+
def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
|
260 |
"""Fetches a URL and extracts text using threading."""
|
261 |
+
proxies = {'http': proxy, 'https': proxy} if proxy else None
|
262 |
try:
|
263 |
+
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, proxies=proxies)
|
264 |
response.raise_for_status()
|
265 |
html_content = response.text
|
266 |
visible_text = extract_text_from_webpage2(html_content)
|
|
|
274 |
@app.get("/api/websearch-and-extract-threading")
|
275 |
def web_search_and_extract_threading(
|
276 |
q: str,
|
277 |
+
max_results: int = 3,
|
278 |
timelimit: Optional[str] = None,
|
279 |
safesearch: str = "moderate",
|
280 |
region: str = "wt-wt",
|
281 |
backend: str = "html",
|
282 |
+
max_chars: int = 6000,
|
283 |
+
extract_only: bool = True,
|
284 |
+
proxy: Optional[str] = None
|
285 |
):
|
286 |
"""
|
287 |
Searches using WEBS, extracts text from the top results using threading, and returns both.
|
288 |
"""
|
289 |
try:
|
290 |
+
with WEBS(proxy=proxy) as webs:
|
291 |
# Perform WEBS search
|
292 |
search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
|
293 |
timelimit=timelimit, backend=backend, max_results=max_results)
|
|
|
297 |
threads = []
|
298 |
for result in search_results:
|
299 |
if 'href' in result:
|
300 |
+
thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars, proxy)))
|
301 |
threads.append(thread)
|
302 |
thread.start()
|
303 |
|