# file: app.py
import gradio as gr
import requests
import logging
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO

from langchain_community.document_loaders import WebBaseLoader
from langdetect import detect_langs
from PyPDF2 import PdfReader
from dotenv import load_dotenv

load_dotenv()
# Links already processed across requests, used to avoid duplicate work.
seen = set()

main_url = "https://similar-products-api.vercel.app/search/all"
main_product = "Samsung Galaxy"

# Hugging Face Inference API endpoint used for the relevance check.
API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}

logging.basicConfig(level=logging.INFO)
def get_links(product):
    """Fetch candidate links for products similar to `product` from the search API."""
    params = {
        "API_KEY": "12345",
        "product": f"{product}",
    }
    response = requests.get(main_url, params=params, timeout=30)
    if response.status_code == 200:
        return response.json()
    return {}
def language_preprocess(text):
    """Return True if `text` is detected as English."""
    try:
        return detect_langs(text)[0].lang == 'en'
    except Exception as e:
        logging.error(f"Language detection error: {e}")
        return False
def relevant(product, similar_product, content):
    """Ask the LLM whether `content` is relevant to both products."""
    try:
        payload = {
            "inputs": f"Do you think that the given content is similar to {similar_product} "
                      f"and {product}, just Respond True or False \n"
                      f"Content for similar product: {content[:700]}"
        }
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        output = response.json()
        # bool() on any non-empty string is True, so the model's answer must be
        # compared against "true" rather than relying on truthiness.
        return output[0]['generated_text'].strip().lower() == 'true'
    except Exception as e:
        logging.error(f"Relevance checking error: {e}")
        return False
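# The parsing above assumes the Inference API response shape for
# text2text-generation models, i.e. a list with one dict per input:
#   [{"generated_text": "True"}]
# While the model is cold-loading, the API instead returns an error dict,
# which the except branch above treats as "not relevant".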
def download_pdf(url, timeout=10):
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return BytesIO(response.content)
    except requests.RequestException as e:
        logging.error(f"PDF download error: {e}")
        return None
def extract_text_from_pdf(pdf_file, pages):
    """Extract text from the given page numbers of a PDF file object."""
    extracted_text = ""
    try:
        reader = PdfReader(pdf_file)
        for page_num in pages:
            if page_num < len(reader.pages):
                extracted_text += reader.pages[page_num].extract_text() + "\n"
            else:
                logging.warning(f"Page {page_num} does not exist in the document.")
        return extracted_text
    except Exception as e:
        logging.error(f"PDF text extraction error: {e}")
        # Return an empty string so the language filter rejects this document,
        # rather than passing a sentinel message downstream.
        return ""
def extract_text_online(link):
    """Load a web page and return the text of its first three chunks."""
    loader = WebBaseLoader(link)
    pages = loader.load_and_split()
    text = ''
    for page in pages[:3]:
        text += page.page_content
    return text
def process_link(link, similar_product):
    """Download one link, extract its text, and return the link if it passes
    the language and relevance filters; otherwise return None."""
    if link in seen:
        return None
    seen.add(link)
    try:
        # Markdown links are fetched as web pages; everything else is assumed
        # to be a PDF.
        if link.endswith('.md'):
            text = extract_text_online(link)
        else:
            pdf_file = download_pdf(link)
            if pdf_file is None:
                return None
            text = extract_text_from_pdf(pdf_file, [0, 2, 4])
        if language_preprocess(text):
            if relevant(main_product, similar_product, text):
                return link
    except Exception as e:
        logging.error(f"Link processing error for {link}: {e}")
    return None
def filtering(urls, similar_product):
    """Run process_link over all URLs in parallel and keep the ones that pass."""
    res = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_link, link, similar_product): link for link in urls}
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                res.append(result)
    return res
def wikipedia_url(product):
    """Return up to 5 Wikipedia article URLs matching the product name."""
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "opensearch",
        "search": product,
        "limit": 5,
        "namespace": 0,
        "format": "json"
    }
    try:
        response = requests.get(api_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        # An opensearch response is [query, titles, descriptions, urls].
        if data and len(data) > 3 and len(data[3]) > 0:
            return data[3]
        return []
    except requests.RequestException as e:
        logging.error(f"Error fetching Wikipedia URLs: {e}")
        return []
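# A sketch of the payload shape that preprocess_filter() below walks, inferred
# from its branching logic rather than from the search API's documentation.
# Each similar product maps either to a plain list of URLs, or to a list of
# single-key dicts grouped by search source, with 'duckduckgo' first:
#
#   {
#       "Samsung Galaxy S23": [
#           {"duckduckgo": ["https://...", ...]},
#           {"google": ["https://...", ...]},
#           {"archive": ["https://...", ...]},
#           {"other_source": ["https://...", ...]},
#       ],
#       "Samsung Galaxy A54": ["https://...", ...],
#   }
#
# URLs under 'duckduckgo', 'google', and 'archive' are filtered for language
# and relevance; URLs from any other source key are kept as-is.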
def preprocess_initial(product):
    return get_links(product)
def preprocess_filter(product, data):
    """Filter each similar product's links for language and relevance, then
    append Wikipedia URLs for that product."""
    for similar_product in data:
        entries = data[similar_product]
        # Source-grouped form: a list of single-key dicts, 'duckduckgo' first.
        if next(iter(entries[0])) == 'duckduckgo':
            filtered_sources = {'duckduckgo', 'google', 'archive'}
            temp = []
            for item in entries:
                source = next(iter(item))
                if source in filtered_sources:
                    temp += filtering(item[source], similar_product)
                else:
                    temp += item[source]
            data[similar_product] = temp
        else:
            # Plain list of URLs.
            data[similar_product] = filtering(entries, similar_product)
        data[similar_product] += wikipedia_url(similar_product)
    logging.info('Filtering completed')
    return data
def main(product_name):
    return preprocess_initial(product_name)

def filter_links(product_name, initial_data):
    return preprocess_filter(product_name, initial_data)
with gr.Blocks() as demo:
    product_name = gr.Textbox(label="Product Name")
    get_links_btn = gr.Button("Get Links")
    initial_links_output = gr.JSON()
    filter_btn = gr.Button("Filter Links")
    filtered_links_output = gr.JSON()

    get_links_btn.click(fn=main, inputs=product_name, outputs=initial_links_output)
    filter_btn.click(fn=filter_links, inputs=[product_name, initial_links_output], outputs=filtered_links_output)
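# To run locally, the Hugging Face token is read from the environment (via
# load_dotenv above), e.g. from a .env file next to app.py containing:
#   HUGGINGFACE_API_TOKEN=hf_xxx   # placeholder value
# Then start the app with: python app.py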
if __name__ == "__main__":
    demo.launch()