# IntelMentorship / app.py
import os
# Disable tokenizers parallelism to avoid deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import streamlit as st
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from fpdf import FPDF
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
from summarizer import Summarizer
import re
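# Parse a locally saved HTML file into a BeautifulSoup tree (result cached by Streamlit).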
@st.cache_data
def parse_html_file(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()
        soup = BeautifulSoup(html_content, "html.parser")
        return soup
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
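# Download an Amazon product page, save it to temp.html, and extract the product
# title, category, description, and review texts from the saved markup.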
@st.cache_data
def scrape_amazon_product(url):
    global revList
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            with open("temp.html", 'wb') as file:
                file.write(response.content)
        else:
            print(f"Failed to download HTML. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")
    current_directory = os.getcwd()
    file_name = "temp.html"
    file_path = os.path.join(current_directory, file_name)
    global global_file_path
    global_file_path = file_path
    soup = parse_html_file(file_path)
    if soup is None:
        return None
    product_name_element = soup.find('span', {'id': 'productTitle'})
    product_name = product_name_element.text.strip() if product_name_element else None
    categories = soup.find_all('a', {'class': 'a-link-normal a-color-tertiary'})
    category = categories[-1].text.strip() if categories else None
    product_description_element = soup.find('div', {'id': 'productDescription'})
    product_description = product_description_element.text.strip() if product_description_element else None
    review_elements = soup.find_all('span', {'data-hook': 'review-body'})
    reviews = [review.text.strip() for review in review_elements]
    revList = reviews
    return {
        'product_name': product_name,
        'category': category,
        'product_description': product_description,
        'Reviews': reviews
    }
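# Label each review positive or negative with a keyword heuristic on one-sentence
# extractive summaries, then produce a three-sentence summary for each group.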
def CalcReviews(reviews):
    positive_reviews = []
    negative_reviews = []
    model = Summarizer()
    for review in tqdm(reviews):
        summary = model(review, num_sentences=1)
        # Simple keyword check on the summary decides the sentiment bucket.
        if "good" in summary or "great" in summary:
            positive_reviews.append(review)
        else:
            negative_reviews.append(review)
    # Guard against division by zero when there are no negative reviews.
    ratio = len(positive_reviews) / (len(negative_reviews) if negative_reviews else 1)
    positive_summary = model("\n".join(positive_reviews), num_sentences=3)
    negative_summary = model("\n".join(negative_reviews), num_sentences=3)
    return {
        'positive_reviews': positive_reviews,
        'negative_reviews': negative_reviews,
        'Ratio of Positive to Negative Reviews': ratio,
        'positive_summary': positive_summary,
        'negative_summary': negative_summary
    }
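# Render the scraped product data and review analysis into product_report.pdf.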
def generate_pdf(product_data, review_data):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Product Report", ln=True, align="C")
    pdf.cell(200, 10, txt=f"Product Name: {product_data['product_name']}", ln=True, align="L")
    pdf.cell(200, 10, txt=f"Category: {product_data['category']}", ln=True, align="L")
    pdf.cell(200, 10, txt="Product Description:", ln=True, align="L")
    pdf.multi_cell(0, 10, txt=product_data['product_description'])
    pdf.cell(200, 10, txt="Reviews Summary", ln=True, align="L")
    pdf.cell(200, 10, txt=f"Number of Positive Reviews: {len(review_data['positive_reviews'])}", ln=True, align="L")
    pdf.cell(200, 10, txt=f"Number of Negative Reviews: {len(review_data['negative_reviews'])}", ln=True, align="L")
    pdf.cell(200, 10, txt=f"Positive to Negative Ratio: {review_data['Ratio of Positive to Negative Reviews']}", ln=True, align="L")
    pdf.cell(200, 10, txt="Summary of Positive Reviews", ln=True, align="L")
    pdf.multi_cell(0, 10, txt=review_data['positive_summary'])
    pdf.cell(200, 10, txt="Summary of Negative Reviews", ln=True, align="L")
    pdf.multi_cell(0, 10, txt=review_data['negative_summary'])
    pdf_file_path = "product_report.pdf"
    pdf.output(pdf_file_path)
    return pdf_file_path
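# Upload the generated PDF to ChatPDF and ask a question about it
# (requires a valid API key in place of YOUR_API_KEY).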
def get_answer(question, pdf_path):
    headers = {
        'Authorization': 'Bearer YOUR_API_KEY',
        'Content-Type': 'application/json'
    }
    # For the file upload, omit the JSON Content-Type so requests can set the
    # multipart boundary itself; close the file handle once the upload is done.
    upload_headers = {'Authorization': headers['Authorization']}
    with open(pdf_path, 'rb') as pdf_file:
        files = {'file': pdf_file}
        response1 = requests.post(
            'https://api.chatpdf.com/v1/sources', headers=upload_headers, files=files)
    if response1.status_code == 200:
        source_id = response1.json()['sourceId']
    else:
        st.error("Failed to upload PDF to ChatPDF.")
        return None
    data = {
        'sourceId': source_id,
        'messages': [
            {
                'role': "user",
                'content': question,
            }
        ]
    }
    response = requests.post(
        'https://api.chatpdf.com/v1/chats/message', headers=headers, json=data)
    if response.status_code == 200:
        return response.json()['content']
    else:
        st.error("Failed to get response from ChatPDF.")
        return None
# Streamlit application
st.title("Amazon Product Insights Dashboard")
# URL input
url = st.text_input("Enter Amazon Product URL:")
if url:
    product_data = scrape_amazon_product(url)
    if product_data:
        st.header(product_data['product_name'])
        st.subheader("Product Description")
        st.write(product_data['product_description'])
        st.subheader("Reviews")
        st.write(product_data['Reviews'])
        review_data = CalcReviews(product_data['Reviews'])
        st.metric("Number of Positive Reviews", len(review_data['positive_reviews']))
        st.metric("Number of Negative Reviews", len(review_data['negative_reviews']))
        st.write(f"Positive to Negative Ratio: {review_data['Ratio of Positive to Negative Reviews']}")
        st.subheader("Summary of Positive Reviews")
        st.write(review_data['positive_summary'])
        st.subheader("Summary of Negative Reviews")
        st.write(review_data['negative_summary'])
        # Generate PDF
        pdf_path = generate_pdf(product_data, review_data)
        # Chatbot interaction
        st.subheader("Chat with the Product")
        user_question = st.text_input("Ask a question about the product:")
        if user_question:
            response = get_answer(user_question, pdf_path)
            st.write(response)