Prathmesh48 That1BrainCell committed on
Commit
fa1e477
1 Parent(s): df221cb

Github Change (#2)

Browse files

- Github Change (77d38bcd36c7163322108d71b656a1bccd75bcb9)


Co-authored-by: Aditya Metkar <That1BrainCell@users.noreply.huggingface.co>

Files changed (1) hide show
  1. app.py +327 -172
app.py CHANGED
@@ -1,172 +1,327 @@
1
- # file: app.py
2
-
3
- import gradio as gr
4
- import requests
5
- import json
6
- import concurrent.futures
7
- from concurrent.futures import ThreadPoolExecutor
8
- from langchain_community.document_loaders import PyPDFLoader
9
- from langdetect import detect_langs
10
- from PyPDF2 import PdfReader
11
- from io import BytesIO
12
- import logging
13
- from dotenv import load_dotenv
14
- import os
15
-
16
# Pull variables from a local .env file into the process environment
# before anything below reads it.
load_dotenv()

# Module-level state shared by the helpers below.
data = False
seen = set()  # links that process_link() has already been given

# Search endpoint queried by get_links() and the reference product that
# process_link() passes to relevant().
main_url = "https://similar-products-api.vercel.app/search/all"
main_product = "Samsung Galaxy"

# Hugging Face inference endpoint and auth header used by relevant().
API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}

logging.basicConfig(level=logging.INFO)
27
-
28
def get_links(product, timeout=60):
    """Fetch the link data for *product* from the search endpoint.

    Args:
        product: Product name sent as the ``product`` query parameter.
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        The decoded JSON body when the endpoint answers with HTTP 200.
        An empty dict on any other status, on a network failure, or when
        the body is not valid JSON.
    """
    params = {
        "API_KEY": "12345",
        "product": f"{product}",
    }
    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(main_url, params=params, timeout=timeout)
    except requests.RequestException as e:
        logging.error(f"Link retrieval error: {e}")
        return {}

    if response.status_code != 200:
        return {}

    try:
        return response.json()
    except ValueError as e:
        # A 200 response with a non-JSON body is treated like "no results".
        logging.error(f"Link retrieval error: {e}")
        return {}
39
-
40
def language_preprocess(text):
    """Tell whether the first language candidate for *text* is English.

    Returns:
        True when the first entry returned by ``detect_langs`` has the
        language code ``'en'``; False otherwise, including when detection
        raises (the error is logged).
    """
    try:
        first_candidate = detect_langs(text)[0]
        return first_candidate.lang == 'en'
    except Exception as e:
        logging.error(f"Language detection error: {e}")
        return False
48
-
49
def relevant(product, similar_product, content):
    """Ask the hosted model whether *content* relates to both products.

    Args:
        product: Name of the reference product.
        similar_product: Name of the product the content was found for.
        content: Extracted document text; only the first 700 characters
            are sent to the model.

    Returns:
        True when the model's generated text starts with "true" or "yes"
        (case-insensitive); False for any other answer or on any error
        (the error is logged).
    """
    try:
        payload = {"inputs": f'''Do you think that the given content is similar to {similar_product} and {product}, just Respond True or False \nContent for similar product: {content[:700]}'''}
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        output = response.json()
        # bool() of any non-empty string is True, so the answer text must be
        # interpreted instead of cast: "False" has to mean False.
        answer = str(output[0]['generated_text']).strip().lower()
        return answer.startswith(('true', 'yes'))
    except Exception as e:
        logging.error(f"Relevance checking error: {e}")
        return False
58
-
59
def download_pdf(url, timeout=10):
    """Download *url* and hand back its body as an in-memory binary stream.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        A ``BytesIO`` wrapping the response body, or None when the request
        fails or the server answers with an error status (logged).
    """
    try:
        reply = requests.get(url, timeout=timeout)
        reply.raise_for_status()
        body = reply.content
        return BytesIO(body)
    except requests.RequestException as e:
        logging.error(f"PDF download error: {e}")
        return None
67
-
68
def extract_text_from_pages(pdf_file, pages):
    """Extract the text of selected pages from a PDF.

    Args:
        pdf_file: File-like object (or path) accepted by ``PdfReader``.
        pages: Iterable of zero-based page indexes to read. Indexes at or
            beyond the page count are skipped with a warning.

    Returns:
        The text of the requested pages, each followed by a newline.
        If reading or extraction fails, a fixed placeholder string is
        returned instead (process_link() passes that result straight to
        the language check).
    """
    try:
        # Opening the reader inside the try keeps a corrupt PDF on the same
        # logged-failure path as a page that fails to extract.
        reader = PdfReader(pdf_file)
        page_count = len(reader.pages)

        parts = []
        for page_num in pages:
            if page_num < page_count:
                # extract_text() may yield None for a page without text.
                page_text = reader.pages[page_num].extract_text() or ""
                parts.append(page_text + "\n")
            else:
                logging.warning(f"Page {page_num} does not exist in the document.")
        return "".join(parts)
    except Exception as e:
        logging.error(f"PDF text extraction error: {e}")
        return 'हे चालत नाही'
82
-
83
def process_link(link, similar_product):
    """Vet one link and return it only if its PDF passes every check.

    A link is processed at most once: links already recorded in the
    module-level ``seen`` set are rejected immediately. Otherwise the PDF
    is downloaded, text is taken from pages 0, 2 and 4, and the link is
    kept when that text passes the language check and the relevance
    check against ``main_product``.

    Returns:
        *link* when all checks pass, otherwise None. Unexpected errors are
        logged and also yield None.
    """
    if link in seen:
        return None
    seen.add(link)

    try:
        pdf_file = download_pdf(link)
        if not pdf_file:
            return None
        text = extract_text_from_pages(pdf_file, [0, 2, 4])
        if language_preprocess(text) and relevant(main_product, similar_product, text):
            return link
    except Exception as e:
        logging.error(f"Error processing link: {e}")
    return None
97
-
98
def filtering(urls, similar_product):
    """Run process_link() over *urls* in a thread pool and keep the hits.

    Args:
        urls: Iterable of links to vet.
        similar_product: Product name forwarded to process_link().

    Returns:
        A list of the non-None results, in the order the worker tasks
        finished (not necessarily the input order).
    """
    kept = []
    with ThreadPoolExecutor() as pool:
        pending = [pool.submit(process_link, url, similar_product) for url in urls]
        for finished in concurrent.futures.as_completed(pending):
            outcome = finished.result()
            if outcome is not None:
                kept.append(outcome)
    return kept
107
-
108
def wikipedia_url(product, timeout=10):
    """Look up Wikipedia article URLs for *product* via the opensearch API.

    Args:
        product: Search term.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The fourth element of the opensearch response (read here as the
        list of URLs) when it is present and non-empty; otherwise an
        empty list. Network errors, HTTP error statuses and undecodable
        bodies are logged and also yield an empty list.
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "opensearch",
        "search": product,
        "limit": 5,
        "namespace": 0,
        "format": "json"
    }
    try:
        response = requests.get(api_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        logging.error(f"Error fetching Wikipedia URLs: {e}")
        return []

    if data and len(data) > 3 and data[3]:
        return data[3]
    return []
128
-
129
def preprocess_initial(product):
    """Return whatever get_links() yields for *product*, unmodified."""
    raw_links = get_links(product)
    return raw_links
131
-
132
def preprocess_filter(product, data):
    """Filter the links of every similar product in *data*, in place.

    The entry keyed by *product* itself is left untouched. Every other
    entry is replaced by a flat list of links, followed by the results of
    wikipedia_url() for that product.

    Two entry shapes are handled:

    * a list of single-key dicts whose first dict is keyed
      ``'duckduckgo'``: links under ``'duckduckgo'``, ``'google'`` and
      ``'archive'`` go through filtering(); links under any other key are
      kept as they are;
    * anything else: the entry itself is passed to filtering() as the
      list of links.

    Args:
        product: Name of the product whose own entry is skipped.
        data: Mapping of product name to link data; mutated and returned.

    Returns:
        The same *data* object.
    """
    filtered_engines = {'duckduckgo', 'google', 'archive'}

    for similar_product in data:
        if similar_product == product:
            continue

        entries = data[similar_product]
        # An empty entry has no first element to inspect; treat it as a
        # flat (and empty) list of links instead of raising IndexError.
        first = entries[0] if entries else None
        grouped = bool(first) and list(first)[0] == 'duckduckgo'

        if grouped:
            kept = []
            for item in entries:
                engine = list(item)[0]
                if engine in filtered_engines:
                    kept += filtering(item[engine], similar_product)
                else:
                    kept += item[engine]
        else:
            kept = filtering(entries, similar_product)

        data[similar_product] = kept + wikipedia_url(similar_product)

    logging.info('Filtering completed')
    return data
154
-
155
def main(product_name):
    """Button handler: return the unfiltered link data for *product_name*."""
    initial_links = preprocess_initial(product_name)
    return initial_links
157
-
158
def filter_links(product_name, initial_data):
    """Button handler: return *initial_data* after preprocess_filter()."""
    filtered = preprocess_filter(product_name, initial_data)
    return filtered
160
-
161
# Two-step UI: the first button fetches the raw link data, the second
# button filters whatever the first step produced.
with gr.Blocks() as demo:
    product_name = gr.Textbox(label="Product Name")

    get_links_btn = gr.Button("Get Links")
    initial_links_output = gr.JSON()

    filter_btn = gr.Button("Filter Links")
    filtered_links_output = gr.JSON()

    get_links_btn.click(
        fn=main,
        inputs=product_name,
        outputs=initial_links_output,
    )
    # The filter step consumes the JSON shown by the first step.
    filter_btn.click(
        fn=filter_links,
        inputs=[product_name, initial_links_output],
        outputs=filtered_links_output,
    )

if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, render_template
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from googlesearch import search
5
+ from duckduckgo_search import DDGS
6
+ import concurrent.futures
7
+ import re
8
+
9
+
10
# Flask application object that the route decorators below attach to.
app = Flask(__name__)

# Value a caller must send as API_KEY for the search routes to answer.
API_KEY_DEFAULT = '12345'
13
+
14
+ # Function to search DuckDuckGo
15
def duckduckgo_search(query):
    """Search DuckDuckGo for PDF manuals matching *query*.

    The search text is the query followed by " manual filetype:pdf" and
    at most five results are requested.

    Returns:
        A list with the ``href`` of each result. Best-effort: any failure
        is printed and an empty list is returned.
    """
    try:
        results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
        return [res['href'] for res in results]
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still work.
        print(f'DuckDuckGo search failed: {exc}')
        return []
21
+
22
+ # Function to search Google
23
def google_search(query):
    """Collect PDF manual links for *query* from two Google sources.

    First the Custom Search JSON API is queried; then the ``googlesearch``
    helper is queried and only results ending in ".pdf" are kept. The two
    sources are independent: a failure in one is printed and the other
    still contributes.

    Returns:
        A list of links (API results first). May contain duplicates when
        both sources return the same URL.
    """
    import os  # local import: the module's import block does not provide os

    links = []

    # NOTE(security): credentials belong in the environment. The literals
    # below are kept only as a backward-compatible fallback; they have been
    # published with the source and should be rotated and then removed.
    api_key = os.environ.get('GOOGLE_API_KEY', 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c')
    search_engine_id = os.environ.get('GOOGLE_CSE_ID', 'c4ca951b9fc6949cb')

    try:
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": api_key,
                "cx": search_engine_id,
                "q": query + " manual filetype:pdf",
            },
            timeout=10,
        )
        results = response.json()
        for item in results.get('items', []):
            links.append(item['link'])
    except Exception as exc:
        print(f'Google Custom Search failed: {exc}')

    try:
        extension = "ext:pdf"
        for result in search(query + " manual " + extension, num_results=5):
            if result.endswith('.pdf'):
                links.append(result)
    except Exception as exc:
        print(f'googlesearch lookup failed: {exc}')

    return links
55
+
56
+ # Function to search Internet Archive
57
def archive_search(query):
    """Collect PDF links from Internet Archive items matching *query*.

    The advanced-search endpoint is asked for up to 50 items matching
    the query followed by " manual". For each item the download listing
    page is fetched (in a thread pool) and scanned for links: ".pdf"
    links are collected, and ".iso" links are followed one level to
    collect the ".pdf" links on that page.

    Returns:
        A list of PDF URLs. If the search request itself fails, or its
        response lacks the expected structure, the error is printed and
        an empty list is returned. Failures while scanning a single item
        are printed and do not affect the other items.
    """
    request_timeout = 15  # seconds; keeps one stalled host from hanging a worker

    pdf_files = []  # appended to by the worker threads below

    def extract_pdf_from_iso(iso_url):
        """Collect the .pdf links found on the page at *iso_url*."""
        iso_response = requests.get(iso_url, timeout=request_timeout)
        if iso_response.status_code != 200:
            return
        iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
        for link in iso_soup.find_all('a', href=True):
            href = link['href']
            if href.endswith('.pdf'):
                # presumably these hrefs are protocol-relative ("//host/...");
                # TODO confirm against a live listing
                pdf_files.append('https:' + href)

    def extract_hyperlinks(url):
        """Collect .pdf links on the page at *url*; follow .iso links."""
        response = requests.get(url, timeout=request_timeout)
        if response.status_code != 200:
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.endswith('.pdf'):
                pdf_files.append(url + '/' + href)
            if href.endswith('.iso'):
                extract_pdf_from_iso(url + '/' + href + '/')

    def process_doc(doc):
        """Scan the download listing of one search result."""
        identifier = doc.get('identifier', 'N/A')
        extract_hyperlinks(f"https://archive.org/download/{identifier}")

    try:
        response = requests.get(
            "https://archive.org/advancedsearch.php",
            params={
                'q': f'{query} manual',
                'fl[]': ['identifier', 'title', 'format'],
                'rows': 50,
                'page': 1,
                'output': 'json',
            },
            timeout=request_timeout,
        )
        data = response.json()
        docs = data['response']['docs']

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(process_doc, doc) for doc in docs]

            # Surface per-item failures without aborting the whole search.
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    print(f'Generated an exception: {exc}')

        return pdf_files

    except Exception as exc:
        print(f'Archive search failed: {exc}')
        return []
132
+
133
def github_search(query):
    """Search GitHub code for Markdown files matching *query*.

    Returns:
        A list of raw-content URLs derived from each result's
        ``html_url`` (the "/blob" segment is removed and the host is
        switched to "raw.github..."). Best-effort: any failure, including
        a response without an ``items`` field, is printed and yields an
        empty list.
    """
    import os  # local import: the module's import block does not provide os

    try:
        # NOTE(security): the token belongs in the environment. The literal
        # is kept only as a backward-compatible fallback; it has been
        # published with the source and should be revoked and then removed.
        token = os.environ.get('GITHUB_TOKEN', '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')
        headers = {
            'Authorization': f'Token {token}'
        }

        # Passing the query through params lets requests encode characters
        # such as '&' or '#' that would otherwise corrupt the URL.
        response = requests.get(
            "https://api.github.com/search/code",
            params={'q': f'{query} extension:md'},
            headers=headers,
            timeout=10,
        )
        data = response.json()

        return [
            item['html_url'].replace('/blob', '').replace('//github', '//raw.github')
            for item in data['items']
        ]

    except Exception as exc:
        print(f'GitHub search failed: {exc}')
        return []
152
+
153
+
154
+ #Similarity Check
155
+
156
def extract_similar_products(query):
    """Ask the DuckDuckGo chat endpoint for products similar to *query*.

    The reply is scanned for numbered list lines ("1. Name", "2. Name:
    description", ...); for each such line the text before the first
    ": " is taken as the product name.

    Returns:
        At most five product names, in reply order. An empty list when
        the chat request fails, the reply is not text, or no numbered
        lines are found.
    """
    try:
        results = DDGS().chat(f'{query} Similar Products')
    except Exception as exc:
        # The similar-product lookup is an optional extra: a failure here
        # must not take down the search route that called it.
        print(f'Similar-product lookup failed: {exc}')
        return []

    if not isinstance(results, str):
        return []

    pattern = r'^\d+\.\s(.+)$'
    matches = re.findall(pattern, results, re.MULTILINE)
    matches = [item.split(': ')[0] for item in matches]
    print(matches)

    return matches[:5]
165
+
166
+
167
+ # Define API routes -------------------------------------------------------
168
+
169
@app.route('/')
def home():
    """Serve the page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
172
+
173
+
174
@app.route('/search/google', methods=['GET', 'POST'])
def search_google():
    """Google search for a product and its similar products.

    Reads ``API_KEY`` and ``product`` from the JSON body (POST) or the
    query string (GET).

    Returns:
        JSON mapping each product name to its list of links; 401 with an
        error object when the key is wrong; 400 when no product is given.
    """
    if request.method == 'POST':
        # silent=True: a missing or malformed JSON body yields None
        # instead of raising; anything that is not an object is ignored.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        api_key = data.get('API_KEY')
        product = data.get('product')
    else:
        product = request.args.get('product')
        api_key = request.args.get('API_KEY')

    # Check the key before any outbound request is made on the caller's behalf.
    if api_key != API_KEY_DEFAULT:
        return jsonify({'error': 'Invalid API key'}), 401
    if not product:
        return jsonify({'error': 'Missing product'}), 400

    results = {product: google_search(product)}
    for p in extract_similar_products(product):
        results[p] = google_search(p)
    return jsonify(results)
194
+
195
@app.route('/search/duckduckgo', methods=['GET', 'POST'])
def search_duckduckgo():
    """DuckDuckGo search for a product and its similar products.

    Reads ``API_KEY`` and ``product`` from the JSON body (POST) or the
    query string (GET).

    Returns:
        JSON mapping each product name to its list of links; 401 with an
        error object when the key is wrong; 400 when no product is given.
    """
    if request.method == 'POST':
        # silent=True: a missing or malformed JSON body yields None
        # instead of raising; anything that is not an object is ignored.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        api_key = data.get('API_KEY')
        product = data.get('product')
    else:
        product = request.args.get('product')
        api_key = request.args.get('API_KEY')

    # Check the key before any outbound request is made on the caller's behalf.
    if api_key != API_KEY_DEFAULT:
        return jsonify({'error': 'Invalid API key'}), 401
    if not product:
        return jsonify({'error': 'Missing product'}), 400

    results = {product: duckduckgo_search(product)}
    for p in extract_similar_products(product):
        results[p] = duckduckgo_search(p)
    return jsonify(results)
216
+
217
+
218
@app.route('/search/archive', methods=['GET', 'POST'])
def search_archive():
    """Internet Archive search for a product and its similar products.

    Reads ``API_KEY`` and ``product`` from the JSON body (POST) or the
    query string (GET). The requested product is searched first; the
    similar products are then searched concurrently in a thread pool.

    Returns:
        JSON mapping each product name to its list of links; 401 with an
        error object when the key is wrong; 400 when no product is given.
    """
    if request.method == 'POST':
        # silent=True: a missing or malformed JSON body yields None
        # instead of raising; anything that is not an object is ignored.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        api_key = data.get('API_KEY')
        product = data.get('product')
    else:
        product = request.args.get('product')
        api_key = request.args.get('API_KEY')

    # Check the key before any outbound request is made on the caller's behalf.
    if api_key != API_KEY_DEFAULT:
        return jsonify({'error': 'Invalid API key'}), 401
    if not product:
        return jsonify({'error': 'Missing product'}), 400

    similar_products = extract_similar_products(product)
    results = {product: archive_search(product)}

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_product = {
            executor.submit(archive_search, p): p for p in similar_products
        }

        # Store each result under the name it was submitted for, without
        # rebinding the outer `product` variable.
        for future in concurrent.futures.as_completed(future_to_product):
            results[future_to_product[future]] = future.result()

    return jsonify(results)
252
+
253
+
254
@app.route('/search/github', methods=['GET', 'POST'])
def search_github():
    """GitHub search for a product and its similar products.

    Reads ``API_KEY`` and ``product`` from the JSON body (POST) or the
    query string (GET).

    Returns:
        JSON mapping each product name to its list of links; 401 with an
        error object when the key is wrong; 400 when no product is given.
    """
    if request.method == 'POST':
        # silent=True: a missing or malformed JSON body yields None
        # instead of raising; anything that is not an object is ignored.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        api_key = data.get('API_KEY')
        product = data.get('product')
    else:
        product = request.args.get('product')
        api_key = request.args.get('API_KEY')

    # Check the key before any outbound request is made on the caller's behalf.
    if api_key != API_KEY_DEFAULT:
        return jsonify({'error': 'Invalid API key'}), 401
    if not product:
        return jsonify({'error': 'Missing product'}), 400

    results = {product: github_search(product)}
    for p in extract_similar_products(product):
        results[p] = github_search(p)
    return jsonify(results)
275
+
276
+
277
@app.route('/search/all', methods=['GET', 'POST'])
def search_all():
    """Run every search engine for a product and its similar products.

    Reads ``API_KEY`` and ``product`` from the JSON body (POST) or the
    query string (GET). The requested product is searched first; the
    similar products are then searched concurrently in a thread pool.

    Returns:
        JSON mapping each product name to a list of four single-key
        objects, in the order ``duckduckgo``, ``google``, ``github``,
        ``archive``, each holding that engine's list of links; 401 with
        an error object when the key is wrong; 400 when no product is
        given.
    """
    if request.method == 'POST':
        # silent=True: a missing or malformed JSON body yields None
        # instead of raising; anything that is not an object is ignored.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        api_key = data.get('API_KEY')
        product = data.get('product')
    else:
        product = request.args.get('product')
        api_key = request.args.get('API_KEY')

    # Check the key before any outbound request is made on the caller's behalf.
    if api_key != API_KEY_DEFAULT:
        return jsonify({'error': 'Invalid API key'}), 401
    if not product:
        return jsonify({'error': 'Missing product'}), 400

    # Engine order is part of the response shape; keep it fixed.
    engines = (
        ('duckduckgo', duckduckgo_search),
        ('google', google_search),
        ('github', github_search),
        ('archive', archive_search),
    )

    def search_product(p):
        """Query every engine for *p*, one single-key dict per engine."""
        return [{name: run_search(p)} for name, run_search in engines]

    similar_products = extract_similar_products(product)
    results = {product: search_product(product)}

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_product = {
            executor.submit(search_product, p): p for p in similar_products
        }

        # Store each result under the name it was submitted for, without
        # rebinding the outer `product` variable.
        for future in concurrent.futures.as_completed(future_to_product):
            results[future_to_product[future]] = future.result()

    return jsonify(results)
323
+
324
+ # Run the Flask app
325
if __name__ == '__main__':
    import os  # local import: the module's import block does not provide os

    # Debug mode turns on the interactive debugger, which lets anyone who
    # can reach the server run code on it. Keep it off unless the developer
    # opts in explicitly with FLASK_DEBUG=1 (or "true").
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(debug=debug_enabled)
327
+