import os

# Disable tokenizers parallelism to avoid deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import streamlit as st
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
from tqdm import tqdm
from summarizer import Summarizer


@st.cache_data
def parse_html_file(file_path):
    """Read a saved HTML file and return a BeautifulSoup parse tree."""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, "html.parser")
        return soup
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


@st.cache_data
def scrape_amazon_product(url):
    """Download an Amazon product page and extract its name, category,
    description, and review texts."""
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/122.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            with open("temp.html", 'wb') as file:
                file.write(response.content)
        else:
            print(f"Failed to download HTML. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    file_path = os.path.join(os.getcwd(), "temp.html")
    soup = parse_html_file(file_path)
    if soup is None:
        return None

    product_name_element = soup.find('span', {'id': 'productTitle'})
    product_name = product_name_element.text.strip() if product_name_element else None

    categories = soup.find_all('a', {'class': 'a-link-normal a-color-tertiary'})
    category = categories[-1].text.strip() if categories else None

    product_description_element = soup.find('div', {'id': 'productDescription'})
    product_description = product_description_element.text.strip() if product_description_element else None

    review_elements = soup.find_all('span', {'data-hook': 'review-body'})
    reviews = [review.text.strip() for review in review_elements]

    return {
        'product_name': product_name,
        'category': category,
        'product_description': product_description,
        'Reviews': reviews
    }


def CalcReviews(reviews):
    """Split reviews into positive and negative buckets based on a one-sentence
    extractive summary, then summarize each bucket."""
    positive_reviews = []
    negative_reviews = []
    model = Summarizer()

    for review in tqdm(reviews):
        summary = model(review, num_sentences=1)
        if "good" in summary or "great" in summary:
            positive_reviews.append(review)
        else:
            negative_reviews.append(review)

    ratio = len(positive_reviews) / (len(negative_reviews) if negative_reviews else 1)
    positive_summary = model("\n".join(positive_reviews), num_sentences=3)
    negative_summary = model("\n".join(negative_reviews), num_sentences=3)

    return {
        'positive_reviews': positive_reviews,
        'negative_reviews': negative_reviews,
        'Ratio of Positive to Negative Reviews': ratio,
        'positive_summary': positive_summary,
        'negative_summary': negative_summary
    }


def generate_pdf(product_data, review_data):
    """Render the scraped product data and review summaries into a PDF report."""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    pdf.cell(200, 10, txt="Product Report", ln=True, align="C")
    pdf.cell(200, 10, txt=f"Product Name: {product_data['product_name']}", ln=True, align="L")
    pdf.cell(200, 10, txt=f"Category: {product_data['category']}", ln=True, align="L")
    pdf.cell(200, 10, txt="Product Description:", ln=True, align="L")
    pdf.multi_cell(0, 10, txt=product_data['product_description'] or "")

    pdf.cell(200, 10, txt="Reviews Summary", ln=True, align="L")
    pdf.cell(200, 10, txt=f"Number of Positive Reviews: {len(review_data['positive_reviews'])}", ln=True, align="L")
    pdf.cell(200, 10, txt=f"Number of Negative Reviews: {len(review_data['negative_reviews'])}", ln=True, align="L")
    pdf.cell(200, 10, txt=f"Positive to Negative Ratio: {review_data['Ratio of Positive to Negative Reviews']}", ln=True, align="L")
    pdf.cell(200, 10, txt="Summary of Positive Reviews", ln=True, align="L")
    pdf.multi_cell(0, 10, txt=review_data['positive_summary'])
    pdf.cell(200, 10, txt="Summary of Negative Reviews", ln=True, align="L")
    pdf.multi_cell(0, 10, txt=review_data['negative_summary'])

    pdf_file_path = "product_report.pdf"
    pdf.output(pdf_file_path)
    return pdf_file_path


def get_answer(question, pdf_path):
    """Upload the report PDF to ChatPDF and ask a question about it."""
    auth_headers = {'Authorization': 'Bearer YOUR_API_KEY'}

    # Upload the PDF; let requests set the multipart Content-Type itself.
    with open(pdf_path, 'rb') as pdf_file:
        response1 = requests.post(
            'https://api.chatpdf.com/v1/sources',
            headers=auth_headers,
            files={'file': pdf_file})
    if response1.status_code == 200:
        source_id = response1.json()['sourceId']
    else:
        st.error("Failed to upload PDF to ChatPDF.")
        return None

    data = {
        'sourceId': source_id,
        'messages': [
            {
                'role': "user",
                'content': question,
            }
        ]
    }
    response = requests.post(
        'https://api.chatpdf.com/v1/chats/message',
        headers={**auth_headers, 'Content-Type': 'application/json'},
        json=data)
    if response.status_code == 200:
        return response.json()['content']
    else:
        st.error("Failed to get response from ChatPDF.")
        return None


# Streamlit application
st.title("Amazon Product Insights Dashboard")

# URL input
url = st.text_input("Enter Amazon Product URL:")
if url:
    product_data = scrape_amazon_product(url)
    if product_data:
        st.header(product_data['product_name'])
        st.subheader("Product Description")
        st.write(product_data['product_description'])
        st.subheader("Reviews")
        st.write(product_data['Reviews'])

        review_data = CalcReviews(product_data['Reviews'])
        st.metric("Number of Positive Reviews", len(review_data['positive_reviews']))
        st.metric("Number of Negative Reviews", len(review_data['negative_reviews']))
        st.write(f"Positive to Negative Ratio: {review_data['Ratio of Positive to Negative Reviews']:.2f}")
        st.subheader("Summary of Positive Reviews")
        st.write(review_data['positive_summary'])
        st.subheader("Summary of Negative Reviews")
        st.write(review_data['negative_summary'])

        # Generate PDF report from the scraped data
        pdf_path = generate_pdf(product_data, review_data)

        # Chatbot interaction
        st.subheader("Chat with the Product")
        user_question = st.text_input("Ask a question about the product:")
        if user_question:
            response = get_answer(user_question, pdf_path)
            st.write(response)