import grequests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import spacy
from collections import Counter
from transformers import pipeline
from bert_regression import get_ratings_dic
import os
from langchain.llms import OpenAI
import gradio as gr
os.environ["OPENAI_API_KEY"] = "sk-proj-flLYFFvadHYqGvN4u5l5T3BlbkFJ9dzQB92UqD08RaA7tYIM" | |
nlp = spacy.load('spacy_model')
sentiment_pipeline = pipeline("sentiment-analysis", model='my_sentiment_model')
classifier = pipeline(task="zero-shot-classification", model="my_zero_shot")
custom_headers = {
    # Request English content only, which filters out non-English reviews
    "Accept-language": "en;q=1.0",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}
def get_soup(response):
    if response.status_code != 200:
        print(f"Error in getting webpage: status code {response.status_code}")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    return soup
def get_soup_reviews(soup):
    review_elements = soup.select("div.review")
    scraped_reviews = []
    for review in review_elements:
        r_content_element = review.select_one("span.review-text")
        if r_content_element is None:
            continue  # Skip reviews whose text element is missing
        preprocessed_review = r_content_element.text.replace('\n', '')
        scraped_reviews.append(preprocessed_review)
    return scraped_reviews
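# Note: the "div.review" / "span.review-text" selectors assume Amazon's review
# markup at the time of writing and may need updating if the page changes.
# A quick, hypothetical sanity check for the selectors:
#
#   soup = get_soup(grequests.get(some_review_page_url, headers=custom_headers).send().response)
#   print(len(soup.select("div.review")))  # expect roughly 10 per review page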
def scrape_reviews(base_url):
    all_reviews = []
    star_ratings = ['one', 'two', 'three', 'four', 'five']
    for star in tqdm(star_ratings):
        page_number = 1
        while True:
            url = f"{base_url}&filterByStar={star}_star&pageNumber={page_number}"
            # .send() dispatches the request and blocks until the response arrives
            response = grequests.get(url, headers=custom_headers).send().response
            soup = get_soup(response)
            if not soup:
                break  # Move on to the next star rating if the page cannot be parsed
            reviews = get_soup_reviews(soup)
            all_reviews.extend(reviews)
            # Note: Amazon serves a valid page for any pageNumber, so we stop
            # based on the state of the "Next page" button instead: the element
            # below only appears once that button is disabled, i.e. on the last page
            next_page_element = soup.find("li", class_="a-disabled a-last")
            if next_page_element:
                break  # Exit loop: we have reached the last page for this star rating
            page_number += 1
    return all_reviews
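# Usage sketch (hypothetical ASIN, not from the original source); base_url must
# already contain a '?' since the star filter is appended with '&':
#
#   base_url = "https://www.amazon.com/product-reviews/B0EXAMPLE1?reviewerType=all_reviews"
#   all_reviews = scrape_reviews(base_url)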
def remove_links(review):
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)
def preprocess_data(df):
    df.rename(columns={'content': 'Text'}, inplace=True)
    df.Text = df.Text.astype(str)
    df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)  # strip HTML tags
    df['Text'] = df['Text'].apply(remove_links)
    return df
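# Minimal illustration of the cleaning steps (hypothetical rows):
#
#   demo = pd.DataFrame({'Text': ['Great <br>battery', 'See https://ex.ample now']})
#   preprocess_data(demo).Text.tolist()  # -> ['Great battery', 'See  now']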
def get_noun_ver_adj(reviews):
    # Extract (noun, root verb, [adverb/negation,] adjective) tuples from each review
    noun_ver_adj = []
    for i in tqdm(range(reviews.shape[0])):
        sente = nlp(reviews.iloc[i])
        for token in sente:
            noun = adj = adverb = adv_verb = neg = ''
            if token.dep_ == 'ROOT':
                for child in token.children:
                    if child.pos_ == 'NOUN':
                        noun = child.text
                    elif child.pos_ == 'ADJ':
                        adj = child.text
                        for other_child in child.children:
                            if other_child.pos_ == 'ADV':
                                adverb = other_child.text
                    elif child.pos_ == 'ADV':
                        adv_verb = child.text
                    elif child.pos_ == 'PART':
                        neg = child.text
                if noun and adj:
                    if adverb:
                        noun_ver_adj.append((noun, token.text, adverb, adj))
                    elif adv_verb and neg:
                        noun_ver_adj.append((noun, token.text, adv_verb, neg, adj))
                    elif neg:
                        noun_ver_adj.append((noun, token.text, neg, adj))
                    else:
                        noun_ver_adj.append((noun, token.text, adj))
    return noun_ver_adj
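# Illustrative output (the exact parse depends on the spaCy model): for
# "The screen is really bright", the ROOT "is" has the children "screen" (NOUN)
# and "bright" (ADJ), and "bright" has the ADV child "really", so the function
# would append ('screen', 'is', 'really', 'bright').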
def get_most_common_noun(noun_ver_adj):
    # Count lemmatized nouns and keep the five most frequent as topics
    element_counts_lemma_noun = Counter(nlp(item[0].lower())[0].lemma_ for item in noun_ver_adj)
    return [noun for noun, _ in element_counts_lemma_noun.most_common(5)]
def get_insights(topic, noun_ver_adj):
    # Run sentiment analysis on the extracted phrases for a topic and collect
    # the most common positive and negative adjectives
    list_tuples = [' '.join(x) for x in noun_ver_adj if nlp(x[0].lower())[0].lemma_ == topic]
    results = sentiment_pipeline(list_tuples)
    pos = 0
    neg = 0
    pos_adj = []
    neg_adj = []
    for sentence, result in zip(list_tuples, results):
        if result['label'] == 'POSITIVE':
            pos += 1
            pos_adj.append(sentence.rsplit(None, 1)[-1].lower())
        else:
            neg += 1
            neg_adj.append(sentence.rsplit(None, 1)[-1].lower())
    most_common_pos_adj = [adj for adj, _ in Counter(pos_adj).most_common(5)]
    most_common_neg_adj = [adj for adj, _ in Counter(neg_adj).most_common(5)]
    return most_common_pos_adj, most_common_neg_adj
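# Illustrative call, assuming 'battery' came out of get_most_common_noun:
#
#   pos_adj, neg_adj = get_insights('battery', noun_ver_adj)
#   # e.g. pos_adj -> ['great', 'long'], neg_adj -> ['weak', 'slow']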
def get_df_all_topics_sent(reviews, sentiment, most_common_noun, threshold=0.6):
    # Get the dataframe of all topics with the corresponding sentiment (positive or negative)
    reviews_list = reviews.to_list()
    hypothesis = f'This product review reflects a {sentiment} sentiment of the {{}}'
    df_sent = classifier(reviews_list, most_common_noun, hypothesis_template=hypothesis, multi_label=True)
    df_sent = pd.DataFrame(df_sent)
    df_sent = df_sent.set_index('sequence').apply(pd.Series.explode).reset_index()
    df_sent = df_sent[df_sent['scores'] >= threshold]
    return df_sent
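# After exploding, df_sent holds one (sequence, labels, scores) row per
# review/topic pair, e.g. (illustrative values):
#
#   sequence                 labels   scores
#   "Battery dies fast..."   battery  0.91
#   "Battery dies fast..."   screen   0.32   <- dropped by the 0.6 threshold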
def get_both_df(reviews, most_common_noun):
    # Build both sentiment dataframes, then drop each (review, topic) pair from
    # whichever dataframe gives it the lower score
    df_pos = get_df_all_topics_sent(reviews, 'positive', most_common_noun)
    df_neg = get_df_all_topics_sent(reviews, 'negative', most_common_noun)
    merged_df = pd.merge(df_pos, df_neg, on=['sequence', 'labels'], suffixes=('_pos', '_neg'))
    to_remove_pos = merged_df[merged_df.scores_pos < merged_df.scores_neg][['sequence', 'labels']]
    indexes_pos_to_remove = df_pos.reset_index().merge(to_remove_pos, on=['sequence', 'labels'], how='inner').set_index(
        'index').index
    to_remove_neg = merged_df[merged_df.scores_pos > merged_df.scores_neg][['sequence', 'labels']]
    indexes_neg_to_remove = df_neg.reset_index().merge(to_remove_neg, on=['sequence', 'labels'], how='inner').set_index(
        'index').index
    df_pos.drop(index=indexes_pos_to_remove, inplace=True)
    df_neg.drop(index=indexes_neg_to_remove, inplace=True)
    return df_pos, df_neg
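# Example of the deduplication (illustrative scores): if "Battery dies fast..."
# scores 0.85 for 'battery' in df_neg but 0.65 in df_pos, the pair is kept in
# df_neg only, so each review/topic pair ends up with a single sentiment.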
def get_df_sent_topic(topic, df_all_topic_sentim):
    # Get the reviews of a specific topic corresponding to the given sentiment
    df_topic = df_all_topic_sentim[df_all_topic_sentim.labels == topic].copy()
    df_topic.drop(columns=['labels', 'scores'], inplace=True)
    return df_topic
def get_percentages_topic(topic, df_all_topic_pos, df_all_topic_neg):
    # Get percentages of positive and negative reviews for the given topic
    df_pos = get_df_sent_topic(topic, df_all_topic_pos)
    df_neg = get_df_sent_topic(topic, df_all_topic_neg)
    total = df_pos.shape[0] + df_neg.shape[0]
    pos_perc = round(df_pos.shape[0] / total * 100, 2)
    neg_perc = round(df_neg.shape[0] / total * 100, 2)
    return pos_perc, neg_perc
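# Example (illustrative counts): 30 positive and 10 negative reviews for a
# topic give get_percentages_topic(...) == (75.0, 25.0).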
def get_df_adjectives(sentiment, reviews, topic, df_all_topic_sent, noun_ver_adj, threshold=0.6):
    # Score the topic's reviews against its most common adjectives and return
    # the share (in %) of each adjective alongside the adjective labels
    if sentiment == 'positive':
        adj = get_insights(topic, noun_ver_adj)[0]
    else:
        adj = get_insights(topic, noun_ver_adj)[1]
    hypothesis = f'The {sentiment} sentiment representing the product {topic} is {{}}'
    df_topic = get_df_sent_topic(topic, df_all_topic_sent)
    df_adj = classifier(df_topic.sequence.tolist(), adj, hypothesis_template=hypothesis, multi_label=True)
    df_adj = pd.DataFrame(df_adj)
    df_adj = df_adj.set_index('sequence').apply(pd.Series.explode).reset_index()
    df_adj = df_adj[df_adj['scores'] >= threshold]
    proportions = df_adj.labels.value_counts(normalize=True)
    return (proportions.values.round(2) * 100).astype(int), proportions.index.values.astype(str)
def get_topics_adjectives(most_common_noun, noun_ver_adj):
    # Map each of the five topics to its (positive, negative) adjective lists
    dic = {}
    for noun in most_common_noun:
        dic[noun] = get_insights(noun, noun_ver_adj)
    return dic
def generate_feedback(dic, temperature=0.9):
    text = f"""Create a summary addressed to a business owner of a product about its reviews.
    We provide the main topics of the reviews with their main attributes.
    For each topic, which are the keys of the dictionary, the first list contains positive adjectives and the second negative ones.
    Start the text with: 'Dear business owner,'
    Create a subpart for each topic and explain in the first part of each topic the positive attributes by writing:
    topic :
    positive feedback : sentences explaining the positive feedback
    negative feedback : sentences explaining the negative feedback
    Finish the text by signing with this company name: 'The Topic Magnet'.
    Feel free to add plenty of line breaks.
    : {dic}
    """
    llm = OpenAI(temperature=temperature, max_tokens=1000)
    generated_text = llm(text)
    return generated_text.strip()
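# Sketch of the dictionary shape generate_feedback expects (hypothetical values):
#
#   dic = {'battery': (['long', 'reliable'], ['slow']),
#          'screen': (['bright'], ['dim', 'fragile'])}
#   print(generate_feedback(dic))  # requires a valid OPENAI_API_KEY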
def get_reviews(url):
    df = pd.DataFrame({'Text': scrape_reviews(url)})
    print('Scraping done')
    df = preprocess_data(df)
    print('Preprocessing done')
    reviews = df.Text
    noun_ver_adj = get_noun_ver_adj(reviews)
    print('Dependency parsing done')
    most_common_noun = get_most_common_noun(noun_ver_adj)
    print('Topic extraction done')
    dic1 = get_topics_adjectives(most_common_noun, noun_ver_adj)
    print('Adjective extraction done')
    dic2 = get_ratings_dic(df)
    print('Rating prediction done')
    generated_text = generate_feedback(dic1)
    print('Feedback generation done')
    return dic2, generated_text
if __name__ == '__main__':
    interface = gr.Interface(
        fn=get_reviews,
        inputs=gr.Textbox(),
        outputs=[gr.Textbox(label='Real ratings'), gr.Textbox(label='Actionable insights')],
        title='The Topic Magnet',
        description='Enter the URL of your Amazon reviews to get real ratings and valuable insights',
    )
    interface.queue().launch()