import grequests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import spacy
from collections import Counter
from transformers import pipeline
from bert_regression import get_ratings_dic
import os
from langchain.llms import OpenAI
import gradio as gr
# The OpenAI API key must come from the environment (e.g. a Space secret);
# never hard-code secrets in source.
assert os.environ.get("OPENAI_API_KEY"), "Set the OPENAI_API_KEY environment variable"
# Locally saved models: a custom spaCy pipeline plus fine-tuned sentiment and zero-shot classifiers
nlp = spacy.load('spacy_model')
sentiment_pipeline = pipeline("sentiment-analysis", model='my_sentiment_model')
classifier = pipeline(task="zero-shot-classification", model="my_zero_shot")
custom_headers = {
    # Request English pages only, to filter out non-English reviews
    "Accept-language": "en;q=1.0",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}
def get_soup(response):
    if response.status_code != 200:
        print(f"Error fetching page: HTTP {response.status_code}")
        return None
    return BeautifulSoup(response.text, "html.parser")
def get_soup_reviews(soup):
    review_elements = soup.select("div.review")
    scraped_reviews = []
    for review in review_elements:
        r_content_element = review.select_one("span.review-text")
        if r_content_element is None:
            continue  # Skip reviews without a text body
        scraped_reviews.append(r_content_element.text.replace('\n', ''))
    return scraped_reviews
def scrape_reviews(base_url):
    all_reviews = []
    star_ratings = ['one', 'two', 'three', 'four', 'five']
    for star in tqdm(star_ratings):
        page_number = 1
        while True:
            url = f"{base_url}&filterByStar={star}_star&pageNumber={page_number}"
            response = grequests.get(url, headers=custom_headers).send().response
            soup = get_soup(response)
            if not soup:
                break  # Move on to the next star rating if the page cannot be fetched
            reviews = get_soup_reviews(soup)
            all_reviews.extend(reviews)
            # Amazon returns a valid page for any pageNumber, so we stop when the
            # "Next page" button is disabled rather than waiting for an error.
            next_page_element = soup.find("li", class_="a-disabled a-last")
            if next_page_element:
                break  # Last page reached for this star rating
            page_number += 1
    return all_reviews
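# A minimal usage sketch; the URL below is an illustrative placeholder, not a
# real product page:
#   base_url = "https://www.amazon.com/product-reviews/ASIN/?reviewerType=all_reviews"
#   reviews = scrape_reviews(base_url)  # review texts across all five star filters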
def remove_links(review):
    # Strip bare http(s) URLs from a review
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)
def preprocess_data(df):
    df.rename(columns={'content': 'Text'}, inplace=True)
    df.Text = df.Text.astype(str)
    df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)  # Strip HTML tags
    df['Text'] = df['Text'].apply(remove_links)
    return df
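# Illustrative example (assumed input, not taken from real data):
#   df = pd.DataFrame({'Text': ['Great value!<br>More at https://example.com/x']})
#   preprocess_data(df).Text[0]  # -> 'Great value!More at '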
def get_noun_ver_adj(reviews):
    # Extract (noun, root verb, [modifiers], adjective) tuples from each review
    noun_ver_adj = []
    for i in tqdm(range(reviews.shape[0])):
        sente = nlp(reviews.iloc[i])
        for token in sente:
            noun = adj = adverb = adv_verb = neg = ''
            if token.dep_ == 'ROOT':
                for child in token.children:
                    if child.pos_ == 'NOUN':
                        noun = child.text
                    elif child.pos_ == 'ADJ':
                        adj = child.text
                        for other_child in child.children:
                            if other_child.pos_ == 'ADV':
                                adverb = other_child.text
                    elif child.pos_ == 'ADV':
                        adv_verb = child.text
                    elif child.pos_ == 'PART':
                        neg = child.text
                if noun and adj:
                    if adverb:
                        noun_ver_adj.append((noun, token.text, adverb, adj))
                    elif adv_verb and neg:
                        noun_ver_adj.append((noun, token.text, adv_verb, neg, adj))
                    elif neg:
                        noun_ver_adj.append((noun, token.text, neg, adj))
                    else:
                        noun_ver_adj.append((noun, token.text, adj))
    return noun_ver_adj
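# A hypothetical example of the extraction (the exact output depends on the
# spaCy parse of the sentence):
#   get_noun_ver_adj(pd.Series(["The battery is really good"]))
#   # -> [('battery', 'is', 'really', 'good')]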
def get_most_common_noun(noun_ver_adj):
    # Count lemmatized nouns and keep the five most frequent as the review topics
    element_counts_lemma_noun = Counter(nlp(item[0].lower())[0].lemma_ for item in noun_ver_adj)
    return [noun for noun, _ in element_counts_lemma_noun.most_common(5)]
def get_insights(topic, noun_ver_adj):
    # Classify the extracted phrases for one topic and collect the most common
    # positive and negative adjectives (each phrase ends with its adjective)
    list_tuples = [' '.join(x) for x in noun_ver_adj if nlp(x[0].lower())[0].lemma_ == topic]
    results = sentiment_pipeline(list_tuples)
    pos_adj = []
    neg_adj = []
    for sentence, result in zip(list_tuples, results):
        if result['label'] == 'POSITIVE':
            pos_adj.append(sentence.rsplit(None, 1)[-1].lower())
        else:
            neg_adj.append(sentence.rsplit(None, 1)[-1].lower())
    most_common_pos_adj = [adj for adj, _ in Counter(pos_adj).most_common(5)]
    most_common_neg_adj = [adj for adj, _ in Counter(neg_adj).most_common(5)]
    return most_common_pos_adj, most_common_neg_adj
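# Usage sketch (the tuples are illustrative assumptions):
#   nva = [('battery', 'is', 'good'), ('battery', 'was', 'not', 'great')]
#   get_insights('battery', nva)  # -> (['good'], ['great'])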
def get_df_all_topics_sent(reviews, sentiment, most_common_noun, threshold=0.6):
    # Get the dataframe of all topics with the corresponding sentiment (positive or negative)
    reviews_list = reviews.to_list()
    hypothesis = f'This product review reflects a {sentiment} sentiment of the {{}}'
    df_sent = classifier(reviews_list, most_common_noun, hypothesis_template=hypothesis, multi_label=True)
    df_sent = pd.DataFrame(df_sent)
    df_sent = df_sent.set_index('sequence').apply(pd.Series.explode).reset_index()
    df_sent = df_sent[df_sent['scores'] >= threshold]
    return df_sent
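# Illustrative shape of the result (values are assumptions): after exploding,
# one row per (review, topic) pair above the confidence threshold, e.g.
#   sequence                      labels     scores
#   'The battery lasts forever'   battery    0.93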
def get_both_df(reviews, most_common_noun):
    # Build positive and negative dataframes, then drop rows from each where the
    # opposite sentiment scored higher for the same review/topic pair
    df_pos = get_df_all_topics_sent(reviews, 'positive', most_common_noun)
    df_neg = get_df_all_topics_sent(reviews, 'negative', most_common_noun)
    merged_df = pd.merge(df_pos, df_neg, on=['sequence', 'labels'], suffixes=('_pos', '_neg'))
    to_remove_pos = merged_df[merged_df.scores_pos < merged_df.scores_neg][['sequence', 'labels']]
    indexes_pos_to_remove = df_pos.reset_index().merge(
        to_remove_pos, on=['sequence', 'labels'], how='inner').set_index('index').index
    to_remove_neg = merged_df[merged_df.scores_pos > merged_df.scores_neg][['sequence', 'labels']]
    indexes_neg_to_remove = df_neg.reset_index().merge(
        to_remove_neg, on=['sequence', 'labels'], how='inner').set_index('index').index
    df_pos.drop(index=indexes_pos_to_remove, inplace=True)
    df_neg.drop(index=indexes_neg_to_remove, inplace=True)
    return df_pos, df_neg
def get_df_sent_topic(topic, df_all_topic_sentim):
    # Get the reviews of a specific topic for the given sentiment
    df_topic = df_all_topic_sentim[df_all_topic_sentim.labels == topic].copy()
    df_topic.drop(columns=['labels', 'scores'], inplace=True)
    return df_topic
def get_percentages_topic(topic, df_all_topic_pos, df_all_topic_neg):
    # Get percentages of positive and negative reviews for the given topic
    df_pos = get_df_sent_topic(topic, df_all_topic_pos)
    df_neg = get_df_sent_topic(topic, df_all_topic_neg)
    total = df_pos.shape[0] + df_neg.shape[0]
    pos_perc = round(df_pos.shape[0] / total * 100, 2)
    neg_perc = round(df_neg.shape[0] / total * 100, 2)
    return pos_perc, neg_perc
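# Worked example: with 30 positive and 10 negative reviews for a topic, the
# split is 30/40 = 75.0% positive and 10/40 = 25.0% negative.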
def get_df_adjectives(sentiment, reviews, topic, df_all_topic_sent, noun_ver_adj, threshold=0.6):
    # Zero-shot classify a topic's reviews against its most common adjectives and
    # return the share (in %) of each adjective together with its label
    pos_adj, neg_adj = get_insights(topic, noun_ver_adj)
    adj = pos_adj if sentiment == 'positive' else neg_adj
    hypothesis = f'The {sentiment} sentiment representing the product {topic} is {{}}'
    df_topic = get_df_sent_topic(topic, df_all_topic_sent)
    df_adj = classifier(df_topic.sequence.tolist(), adj, hypothesis_template=hypothesis, multi_label=True)
    df_adj = pd.DataFrame(df_adj)
    df_adj = df_adj.set_index('sequence').apply(pd.Series.explode).reset_index()
    df_adj = df_adj[df_adj['scores'] >= threshold]
    proportions = df_adj.labels.value_counts(normalize=True)
    return (proportions.values.round(2) * 100).astype(int), proportions.index.values.astype(str)
def get_topics_adjectives(most_common_noun, noun_ver_adj):
    # Map each top topic to its (positive adjectives, negative adjectives) pair
    return {noun: get_insights(noun, noun_ver_adj) for noun in most_common_noun}
def generate_feedback(dic, temperature=0.9):
    text = f"""Create a summary addressed to a business owner about the reviews of their product.
We provide the main topics of the reviews with their main attributes.
For each topic, which are the keys of the dictionary, the first list contains positive adjectives and the second negative ones.
Start the text with: 'Dear business owner,'
Create a subsection for each topic, explaining the positive attributes first, in this format:
topic :
positive feedback : sentences explaining the positive feedback
negative feedback : sentences explaining the negative feedback
Finish the text by signing with this company name: 'The Topic Magnet'.
Feel free to use plenty of blank lines.
: {dic}
"""
    llm = OpenAI(temperature=temperature, max_tokens=1000)
    generated_text = llm(text)
    return generated_text.strip()
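# Usage sketch with an assumed topics dictionary:
#   dic = {'battery': (['good', 'reliable'], ['heavy'])}
#   print(generate_feedback(dic))  # LLM-written summary signed 'The Topic Magnet'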
def get_reviews(url):
    # End-to-end pipeline: scrape, clean, extract topics, and generate the summary
    df = pd.DataFrame({'Text': scrape_reviews(url)})
    df = preprocess_data(df)
    reviews = df.Text
    noun_ver_adj = get_noun_ver_adj(reviews)
    most_common_noun = get_most_common_noun(noun_ver_adj)
    dic1 = get_topics_adjectives(most_common_noun, noun_ver_adj)
    dic2 = get_ratings_dic(df)
    generated_text = generate_feedback(dic1)
    return dic2, generated_text
if __name__ == '__main__':
    interface = gr.Interface(
        fn=get_reviews,
        inputs=gr.Textbox(),
        outputs=[gr.Textbox(label='Real ratings'), gr.Textbox(label='Actionable insights')],
        title='The Topic Magnet',
        description='Enter the URL of your Amazon reviews to get real ratings and valuable insights',
    )
    interface.queue().launch()