Spaces:
Sleeping
Sleeping
import os | |
# Disable tokenizers parallelism to avoid deadlocks | |
os.environ["TOKENIZERS_PARALLELISM"] = "false" | |
import streamlit as st | |
import requests | |
from bs4 import BeautifulSoup | |
from transformers import pipeline | |
from fpdf import FPDF | |
import pandas as pd | |
import torch | |
from transformers import pipeline, AutoTokenizer, AutoModel | |
from tqdm import tqdm | |
from concurrent.futures import ProcessPoolExecutor | |
from summarizer import Summarizer | |
import re | |
def parse_html_file(file_path):
    """Read an HTML file from disk and parse it with BeautifulSoup.

    Args:
        file_path: Path of the HTML file; its content is decoded as UTF-8.

    Returns:
        The parsed ``BeautifulSoup`` tree (built with ``"html.parser"``), or
        ``None`` when the file cannot be opened or decoded. In that case the
        error is printed.
    """
    # Only the file read is guarded, and only against I/O and decoding
    # failures, so unrelated programming errors are not silently swallowed.
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"An error occurred: {e}")
        return None
    return BeautifulSoup(html_content, "html.parser")
def scrape_amazon_product(url):
    """Download an Amazon product page and extract its key fields.

    The raw HTML is saved as ``temp.html`` in the current working directory
    and then parsed with ``parse_html_file``.

    Side effects:
        Sets the module-level global ``global_file_path`` to the path of the
        saved HTML file and, on success, ``revList`` to the list of review
        texts.

    Args:
        url: URL of the product page to fetch.

    Returns:
        A dict with the keys ``product_name``, ``category`` and
        ``product_description`` (each ``None`` when the element is not found
        on the page) and ``Reviews`` (list of review body strings).
        Returns ``None`` when the page cannot be downloaded, saved or parsed.
    """
    global revList, global_file_path

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }

    file_path = os.path.join(os.getcwd(), "temp.html")
    global_file_path = file_path

    # Stop on any download failure: parsing a stale or missing temp.html
    # would either report another product's data or crash on a None soup.
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None
    if response.status_code != 200:
        print(f"Failed to download HTML. Status code: {response.status_code}")
        return None

    try:
        with open(file_path, 'wb') as file:
            file.write(response.content)
    except OSError as e:
        print(f"An error occurred: {e}")
        return None

    soup = parse_html_file(file_path)
    if soup is None:
        return None

    product_name_element = soup.find('span', {'id': 'productTitle'})
    product_name = product_name_element.text.strip() if product_name_element else None

    # The last matching category link is used as the product's category.
    categories = soup.find_all('a', {'class': 'a-link-normal a-color-tertiary'})
    category = categories[-1].text.strip() if categories else None

    product_description_element = soup.find('div', {'id': 'productDescription'})
    product_description = product_description_element.text.strip() if product_description_element else None

    review_elements = soup.find_all('span', {'data-hook': 'review-body'})
    reviews = [review.text.strip() for review in review_elements]
    revList = reviews

    return {
        'product_name': product_name,
        'category': category,
        'product_description': product_description,
        'Reviews': reviews
    }
def CalcReviews(reviews):
    """Split reviews into positive and negative groups and summarize each.

    Each review is summarized with ``Summarizer`` (``num_sentences=1``). A
    review counts as positive when that summary contains "good" or "great"
    in any letter case; every other review counts as negative.

    Args:
        reviews: Iterable of review text strings.

    Returns:
        A dict with the keys:
            ``positive_reviews``: list of reviews classified as positive.
            ``negative_reviews``: list of all remaining reviews.
            ``Ratio of Positive to Negative Reviews``: positive count divided
                by negative count (divided by 1 when there are no negatives).
            ``positive_summary`` / ``negative_summary``: ``Summarizer`` output
                (``num_sentences=3``) over the newline-joined reviews of each
                group.
    """
    positive_keywords = ("good", "great")
    positive_reviews = []
    negative_reviews = []
    model = Summarizer()

    for review in tqdm(reviews):
        summary = model(review, num_sentences=1)
        # Compare in lowercase so "Good"/"Great" at the start of a sentence
        # is recognized as well.
        lowered_summary = summary.lower()
        if any(keyword in lowered_summary for keyword in positive_keywords):
            positive_reviews.append(review)
        else:
            negative_reviews.append(review)

    # Fall back to a divisor of 1 to avoid dividing by zero.
    ratio = len(positive_reviews) / (len(negative_reviews) if negative_reviews else 1)
    positive_summary = model("\n".join(positive_reviews), num_sentences=3)
    negative_summary = model("\n".join(negative_reviews), num_sentences=3)

    return {
        'positive_reviews': positive_reviews,
        'negative_reviews': negative_reviews,
        'Ratio of Positive to Negative Reviews': ratio,
        'positive_summary': positive_summary,
        'negative_summary': negative_summary
    }
def _pdf_safe_text(value):
    """Return *value* as a string limited to Latin-1 characters.

    ``None`` becomes an empty string. Characters that cannot be encoded as
    Latin-1 are replaced with "?".
    """
    text = "" if value is None else str(value)
    return text.encode("latin-1", "replace").decode("latin-1")


def generate_pdf(product_data, review_data):
    """Write a one-page-style product report to ``product_report.pdf``.

    Args:
        product_data: Dict with the keys ``product_name``, ``category`` and
            ``product_description``.
        review_data: Dict with the keys ``positive_reviews``,
            ``negative_reviews``, ``Ratio of Positive to Negative Reviews``,
            ``positive_summary`` and ``negative_summary``.

    Returns:
        The path of the written PDF file (``"product_report.pdf"``).
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Scraped text may be None or contain characters (emoji, curly quotes)
    # that the built-in Arial font cannot encode, so every string is passed
    # through _pdf_safe_text before it is written.
    def add_line(text, align="L"):
        pdf.cell(200, 10, txt=_pdf_safe_text(text), ln=True, align=align)

    def add_paragraph(text):
        pdf.multi_cell(0, 10, txt=_pdf_safe_text(text))

    add_line("Product Report", align="C")
    add_line(f"Product Name: {product_data['product_name']}")
    add_line(f"Category: {product_data['category']}")
    add_line("Product Description:")
    add_paragraph(product_data['product_description'])

    add_line("Reviews Summary")
    add_line(f"Number of Positive Reviews: {len(review_data['positive_reviews'])}")
    add_line(f"Number of Negative Reviews: {len(review_data['negative_reviews'])}")
    add_line(f"Positive to Negative Ratio: {review_data['Ratio of Positive to Negative Reviews']}")

    add_line("Summary of Positive Reviews")
    add_paragraph(review_data['positive_summary'])
    add_line("Summary of Negative Reviews")
    add_paragraph(review_data['negative_summary'])

    pdf_file_path = "product_report.pdf"
    pdf.output(pdf_file_path)
    return pdf_file_path
def get_answer(question, pdf_path):
    """Upload a PDF to ChatPDF and ask a single question about it.

    The API key is read from the ``CHATPDF_API_KEY`` environment variable;
    when it is not set, the placeholder ``YOUR_API_KEY`` is used.

    Args:
        question: The user's question, sent as one "user" message.
        pdf_path: Path of the PDF file to upload.

    Returns:
        The ``content`` field of the chat response, or ``None`` when the
        upload or the chat request fails (an error is shown via ``st.error``).
    """
    api_key = os.environ.get("CHATPDF_API_KEY", "YOUR_API_KEY")
    # NOTE(review): ChatPDF's public API docs appear to use an "x-api-key"
    # header and a "/v1/sources/add-file" upload endpoint - confirm against
    # the current documentation before relying on the values below.
    auth_headers = {'Authorization': f'Bearer {api_key}'}

    # The upload is multipart: requests must generate the Content-Type (with
    # its boundary) itself, so the JSON content type is not sent here.
    # The context manager makes sure the PDF handle is closed again.
    try:
        with open(pdf_path, 'rb') as pdf_file:
            response1 = requests.post(
                'https://api.chatpdf.com/v1/sources',
                headers=auth_headers,
                files={'file': pdf_file},
                timeout=60,
            )
    except (OSError, requests.RequestException):
        st.error("Failed to upload PDF to ChatPDF.")
        return None

    if response1.status_code != 200:
        st.error("Failed to upload PDF to ChatPDF.")
        return None
    source_id = response1.json()['sourceId']

    data = {
        'sourceId': source_id,
        'messages': [
            {
                'role': "user",
                'content': question,
            }
        ]
    }
    json_headers = {**auth_headers, 'Content-Type': 'application/json'}
    try:
        response = requests.post(
            'https://api.chatpdf.com/v1/chats/message',
            headers=json_headers,
            json=data,
            timeout=60,
        )
    except requests.RequestException:
        st.error("Failed to get response from ChatPDF.")
        return None

    if response.status_code == 200:
        return response.json()['content']
    st.error("Failed to get response from ChatPDF.")
    return None
# Streamlit application: scrape the product page, show the review analysis,
# build a PDF report and answer questions about it.
st.title("Amazon Product Insights Dashboard")

# URL input
url = st.text_input("Enter Amazon Product URL:")
if url:
    product_data = scrape_amazon_product(url)
    if product_data:
        st.header(product_data['product_name'])
        st.subheader("Product Description")
        st.write(product_data['product_description'])
        st.subheader("Reviews")
        st.write(product_data['Reviews'])

        review_data = CalcReviews(product_data['Reviews'])
        # st.metric takes the label and the value as separate arguments;
        # the value shown is the number of reviews in each group.
        st.metric("Number of Positive Reviews", len(review_data['positive_reviews']))
        st.metric("Number of Negative Reviews", len(review_data['negative_reviews']))
        # The ratio is a number, so it is formatted instead of concatenated.
        st.write(f"Positive to Negative Ratio : {review_data['Ratio of Positive to Negative Reviews']}")
        st.subheader("Summary of Positive Reviews")
        st.write(review_data['positive_summary'])
        st.subheader("Summary of Negative Reviews")
        st.write(review_data['negative_summary'])

        # Generate PDF
        pdf_path = generate_pdf(product_data, review_data)

        # Chatbot interaction
        st.subheader("Chat with the Product")
        user_question = st.text_input("Ask a question about the product:")
        if user_question:
            response = get_answer(user_question, pdf_path)
            # get_answer reports its own errors and returns None on failure.
            if response is not None:
                st.write(response)
    else:
        st.error("Could not retrieve product data from the given URL.")