Shreyas094 committed
Commit 2769ea6
1 Parent(s): 426506c
Update app.py
app.py CHANGED
@@ -2,10 +2,34 @@ import gradio as gr
 import requests
 import time
 import random
+from bs4 import BeautifulSoup
+import trafilatura
 
+def extract_content_bs4(url):
+    try:
+        response = requests.get(url, timeout=10)
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # This is a simple extraction and might need to be adjusted based on the structure of the websites you're scraping
+        paragraphs = soup.find_all('p')
+        content = ' '.join([p.text for p in paragraphs])
+
+        return content[:1000] + "..." if len(content) > 1000 else content
+    except Exception as e:
+        return f"Error extracting content: {str(e)}"
+
+def extract_content_trafilatura(url):
+    try:
+        downloaded = trafilatura.fetch_url(url)
+        content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
+
+        return content[:1000] + "..." if content and len(content) > 1000 else content
+    except Exception as e:
+        return f"Error extracting content: {str(e)}"
+
-def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10):
+def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10, use_trafilatura=False):
     """
-    Perform a search using the Searx API with error handling, retry logic, and limited results.
+    Perform a search using the Searx API with error handling, retry logic, limited results, and content extraction.
     """
     search_endpoint = f"{instance_url}/search"
     params = {
@@ -16,7 +40,7 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
         'time_range': '',
         'engines': '',
         'safesearch': '0',
-        'results': str(num_results)
+        'results': str(num_results)
     }
 
     headers = {
@@ -42,12 +66,14 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
         for idx, result in enumerate(data['results'][:num_results], start=1):
             title = result.get('title', 'No Title')
             url = result.get('url', 'No URL')
-            snippet = result.get('content', 'No Description')
 
-            #
-
+            # Extract content using the selected method
+            if use_trafilatura:
+                content = extract_content_trafilatura(url)
+            else:
+                content = extract_content_bs4(url)
 
-            formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{snippet}\n\n"
+            formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{content}\n\n"
 
         return formatted_results
     except requests.exceptions.RequestException as e:
@@ -64,9 +90,9 @@ def create_gradio_interface():
     Creates and returns the Gradio interface.
     """
     with gr.Blocks() as demo:
-        gr.Markdown("# 🕵️‍♂️ Private Search with Searx and
+        gr.Markdown("# 🕵️‍♂️ Private Search with Searx and Content Extraction")
         gr.Markdown(
-            "This application allows you to perform private searches using the [Searx](https://searx.org/) metasearch engine."
+            "This application allows you to perform private searches using the [Searx](https://searx.org/) metasearch engine and extract content from the results."
         )
         with gr.Row():
             with gr.Column():
@@ -94,23 +120,25 @@ def create_gradio_interface():
                     step=1,
                     label="Number of Results"
                 )
+                use_trafilatura = gr.Checkbox(label="Use Trafilatura for extraction (instead of BeautifulSoup)")
                 search_button = gr.Button("Search")
             with gr.Column():
                 results = gr.Markdown("### Search Results will appear here...")
 
-        def perform_search(q, url, cats, num):
-            return search_searx(q, instance_url=url, categories=cats, num_results=int(num))
+        def perform_search(q, url, cats, num, use_traf):
+            return search_searx(q, instance_url=url, categories=cats, num_results=int(num), use_trafilatura=use_traf)
 
         search_button.click(
            perform_search,
-            inputs=[query, instance_url, categories, num_results],
+            inputs=[query, instance_url, categories, num_results, use_trafilatura],
            outputs=results
         )
 
         gr.Markdown(
             """
             ---
-            **Note:** This application uses the Searx metasearch engine to fetch results from multiple sources while preserving your privacy.
+            **Note:** This application uses the Searx metasearch engine to fetch results from multiple sources while preserving your privacy.
+            It then attempts to extract content from the original sources, which may be subject to the terms of service of those websites.
             """
         )
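For anyone reviewing the change locally, here is a minimal standalone sketch (not part of the commit) of the two extraction paths the new `use_trafilatura` checkbox switches between; the target URL is a placeholder:

```python
# Sketch of the two extraction strategies added in this commit.
# Assumes `pip install requests beautifulsoup4 trafilatura`; the URL is a placeholder.
import requests
import trafilatura
from bs4 import BeautifulSoup

url = "https://example.com"

# BeautifulSoup path (extract_content_bs4): naive join of all <p> text.
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.content, "html.parser")
bs4_text = " ".join(p.text for p in soup.find_all("p"))

# Trafilatura path (extract_content_trafilatura): boilerplate-aware main-content
# extraction; extract() returns None when it finds nothing, hence the fallback.
downloaded = trafilatura.fetch_url(url)
traf_text = trafilatura.extract(downloaded, include_comments=False, include_tables=False) or ""

print("bs4:        ", bs4_text[:200])
print("trafilatura:", traf_text[:200])
```

Note that both helpers in the commit truncate their output at 1000 characters, which keeps the Markdown results pane readable but clips long articles mid-sentence.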