Shreyas094 committed
Commit 2769ea6 • 1 Parent(s): 426506c

Update app.py

Files changed (1): app.py (+41 -13)
app.py CHANGED
@@ -2,10 +2,34 @@ import gradio as gr
 import requests
 import time
 import random
+from bs4 import BeautifulSoup
+import trafilatura
 
-def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10):
+def extract_content_bs4(url):
+    try:
+        response = requests.get(url, timeout=10)
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # This is a simple extraction and might need to be adjusted based on the structure of the websites you're scraping
+        paragraphs = soup.find_all('p')
+        content = ' '.join([p.text for p in paragraphs])
+
+        return content[:1000] + "..." if len(content) > 1000 else content
+    except Exception as e:
+        return f"Error extracting content: {str(e)}"
+
+def extract_content_trafilatura(url):
+    try:
+        downloaded = trafilatura.fetch_url(url)
+        content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
+
+        return content[:1000] + "..." if content and len(content) > 1000 else content
+    except Exception as e:
+        return f"Error extracting content: {str(e)}"
+
+def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10, use_trafilatura=False):
     """
-    Perform a search using the Searx API with error handling, retry logic, and limited results.
+    Perform a search using the Searx API with error handling, retry logic, limited results, and content extraction.
     """
     search_endpoint = f"{instance_url}/search"
     params = {
@@ -16,7 +40,7 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
         'time_range': '',
         'engines': '',
         'safesearch': '0',
-        'results': str(num_results)  # Limit the number of results
+        'results': str(num_results)
     }
 
     headers = {
@@ -42,12 +66,14 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
         for idx, result in enumerate(data['results'][:num_results], start=1):
             title = result.get('title', 'No Title')
             url = result.get('url', 'No URL')
-            snippet = result.get('content', 'No Description')
 
-            # Try to get a longer snippet if available
-            long_content = result.get('long_content', snippet)
+            # Extract content using the selected method
+            if use_trafilatura:
+                content = extract_content_trafilatura(url)
+            else:
+                content = extract_content_bs4(url)
 
-            formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{long_content}\n\n"
+            formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{content}\n\n"
 
         return formatted_results
     except requests.exceptions.RequestException as e:
@@ -64,9 +90,9 @@ def create_gradio_interface():
     Creates and returns the Gradio interface.
     """
     with gr.Blocks() as demo:
-        gr.Markdown("# 🕵️‍♂️ Private Search with Searx and Gradio")
+        gr.Markdown("# 🕵️‍♂️ Private Search with Searx and Content Extraction")
         gr.Markdown(
-            "This application allows you to perform private searches using the [Searx](https://searx.org/) metasearch engine."
+            "This application allows you to perform private searches using the [Searx](https://searx.org/) metasearch engine and extract content from the results."
         )
         with gr.Row():
             with gr.Column():
@@ -94,23 +120,25 @@ def create_gradio_interface():
                     step=1,
                     label="Number of Results"
                 )
+                use_trafilatura = gr.Checkbox(label="Use Trafilatura for extraction (instead of BeautifulSoup)")
                 search_button = gr.Button("Search")
             with gr.Column():
                 results = gr.Markdown("### Search Results will appear here...")
 
-        def perform_search(q, url, cats, num):
-            return search_searx(q, instance_url=url, categories=cats, num_results=int(num))
+        def perform_search(q, url, cats, num, use_traf):
+            return search_searx(q, instance_url=url, categories=cats, num_results=int(num), use_trafilatura=use_traf)
 
         search_button.click(
             perform_search,
-            inputs=[query, instance_url, categories, num_results],
+            inputs=[query, instance_url, categories, num_results, use_trafilatura],
             outputs=results
         )
 
         gr.Markdown(
             """
             ---
-            **Note:** This application uses the Searx metasearch engine to fetch results from multiple sources while preserving your privacy.
+            **Note:** This application uses the Searx metasearch engine to fetch results from multiple sources while preserving your privacy.
+            It then attempts to extract content from the original sources, which may be subject to the terms of service of those websites.
             """
         )
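For context on the request that `search_searx` builds: SearXNG-style instances may simply ignore an unrecognized `results` parameter, which is presumably why the code also slices `data['results'][:num_results]` later on. A minimal sketch of the same request with `requests` follows; the instance URL is a placeholder, and `'format': 'json'` is assumed to appear in the elided part of the params dict (the JSON API must be enabled on the instance).

```python
import requests

params = {
    "q": "privacy-friendly search",  # the user's query
    "categories": "general",
    "format": "json",                # assumed; not visible in this hunk
    "safesearch": "0",
}
# Placeholder instance URL, not an endorsement of a specific instance.
resp = requests.get("https://searx.example.org/search", params=params, timeout=10)
resp.raise_for_status()
for r in resp.json().get("results", [])[:10]:  # the slice is what reliably caps the count
    print(r.get("title"), "-", r.get("url"))
```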
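One cost worth flagging in the results loop: every result now triggers a full page fetch, so `num_results=10` means ten extra HTTP round-trips executed serially. If that becomes slow, a thread pool overlapping the I/O is the usual fix; a sketch, not part of this commit:

```python
from concurrent.futures import ThreadPoolExecutor

def extract_all(urls, extractor, max_workers=5):
    # Run the chosen extractor (extract_content_bs4 or extract_content_trafilatura)
    # over all result URLs concurrently; map() preserves input order.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(extractor, urls))

# Example: contents = extract_all([r["url"] for r in results], extract_content_bs4)
```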
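The Gradio wiring follows the standard Blocks pattern: each component listed in `inputs` is passed positionally to the handler, so the new `use_trafilatura` checkbox arrives in `perform_search` as a plain bool. A minimal self-contained sketch of that pattern (component names here are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    query = gr.Textbox(label="Query")
    use_traf = gr.Checkbox(label="Use Trafilatura")
    out = gr.Markdown()

    def handler(q, flag):
        # `flag` is the checkbox state, passed positionally from `inputs`.
        method = "Trafilatura" if flag else "BeautifulSoup"
        return f"Would search for **{q}** using {method}."

    gr.Button("Search").click(handler, inputs=[query, use_traf], outputs=out)

# demo.launch()
```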
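Finally, the docstring promises retry logic and the module imports `time` and `random`, but the retry body itself falls outside the changed hunks. For readers following along, a typical shape consistent with those imports and the `max_retries` parameter is exponential backoff with jitter; this is a sketch only, not the committed implementation:

```python
import random
import time
import requests

def get_with_retries(url, params, headers, max_retries=3):
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, params=params, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.exceptions.RequestException:
            if attempt == max_retries - 1:
                raise  # out of retries; let the caller's handler format the error
            time.sleep(2 ** attempt + random.random())  # backoff with jitter
```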