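"""The Topic Magnet: turn Amazon product reviews into actionable insights.

Pipeline: scrape reviews for each star rating, clean the text, extract
(noun, verb, adjective) patterns with a local spaCy model ('spacy_model'),
score them with a fine-tuned sentiment model ('my_sentiment_model'),
attach topics via zero-shot classification ('my_zero_shot'), predict
ratings with a BERT regressor, summarize everything with an OpenAI LLM,
and serve the flow through a Gradio interface.
"""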
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import spacy
from collections import Counter
from transformers import pipeline
from bert_regression import get_ratings_dic
import os
from langchain.llms import OpenAI
import gradio as gr

# Read the OpenAI API key from the environment; never commit a real key to source control.
assert "OPENAI_API_KEY" in os.environ, "Set the OPENAI_API_KEY environment variable"


nlp = spacy.load('spacy_model')
sentiment_pipeline = pipeline("sentiment-analysis", model='my_sentiment_model')
classifier = pipeline(task="zero-shot-classification", model="my_zero_shot")


custom_headers = {
    # Request English content to eliminate non-English reviews
    "Accept-language": "en;q=1.0",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}


def get_soup(response):
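    """Parse an HTTP response into BeautifulSoup; return None on a non-200 status."""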
    if response.status_code != 200:
        print(f"Error fetching page: HTTP {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    return soup


def get_soup_reviews(soup):
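    """Extract the review texts from a parsed review page, stripping newlines."""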
    review_elements = soup.select("div.review")

    scraped_reviews = []

    for review in review_elements:
        r_content_element = review.select_one("span.review-text")
        if r_content_element is None:
            continue  # Skip reviews without a text body
        preprocessed_review = r_content_element.text.replace('\n', '')

        scraped_reviews.append(preprocessed_review)

    return scraped_reviews


def scrape_reviews(base_url):
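    """Scrape every review for each star filter (one to five), page by page."""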
    all_reviews = []
    star_ratings = ['one', 'two', 'three', 'four', 'five']

    for star in tqdm(star_ratings):
        page_number = 1

        while True:
            url = f"{base_url}&filterByStar={star}_star&&pageNumber={page_number}"
            response = requests.get(url, headers=custom_headers)
            soup = get_soup(response)

            if not soup:
                break  # Move to the next star rating if the page cannot be parsed

            reviews = get_soup_reviews(soup)
            all_reviews.extend(reviews)

            # Amazon serves a valid page for any pageNumber, so we stop when the
            # "Next page" button is disabled rather than waiting for an error response.
            next_page_element = soup.find("li", class_="a-disabled a-last")
            if next_page_element:
                break  # Last page reached: the "Next page" button is disabled

            page_number += 1

    return all_reviews


def remove_links(review):
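    """Strip http(s) URLs from a review string."""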
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)


def preprocess_data(df):
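    """Normalize the review column: cast to str, strip HTML tags and links."""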
    df.rename(columns={'content': 'Text'}, inplace=True)
    df.Text = df.Text.astype(str)
    df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
    df['Text'] = df['Text'].apply(remove_links)
    return df


def get_noun_ver_adj(reviews):
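    """Collect (noun, root verb[, adverb][, negation], adjective) tuples.

    For each review, look at the children of the sentence ROOT and keep the
    combinations that pair a noun with an adjective, optionally with an
    adverb or a negation particle.
    """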
    noun_ver_adj = []
    for i in tqdm(range(reviews.shape[0])):
        sente = nlp(reviews.iloc[i])
        for token in sente:
            noun = adj = adverb = adv_verb = neg = ''
            if token.dep_ == 'ROOT':
                for child in token.children:
                    if child.pos_ == 'NOUN':
                        noun = child.text
                    elif child.pos_ == 'ADJ':
                        adj = child.text
                        for other_child in child.children:
                            if other_child.pos_ == 'ADV':
                                adverb = other_child.text
                    elif child.pos_ == 'ADV':
                        adv_verb = child.text
                    elif child.pos_ == 'PART':
                        neg = child.text
                if noun and adj:
                    if adverb:
                        noun_ver_adj.append((noun, token.text, adverb, adj))
                    elif adv_verb and neg:
                        noun_ver_adj.append((noun, token.text, adv_verb, neg, adj))
                    elif neg:
                        noun_ver_adj.append((noun, token.text, neg, adj))
                    else:
                        noun_ver_adj.append((noun, token.text, adj))
    return noun_ver_adj


def get_most_common_noun(noun_ver_adj):
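    """Return the five most frequent noun lemmas across the extracted tuples."""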
    element_counts_lemma_noun = Counter(nlp(item[0].lower())[0].lemma_ for item in noun_ver_adj)
    return [noun for noun, _ in element_counts_lemma_noun.most_common(5)]


def get_insights(topic, noun_ver_adj):
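    """Return the five most common positive and negative adjectives for a topic,
    as judged by the sentiment pipeline on the extracted word tuples."""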
    list_tuples = [' '.join(x) for x in noun_ver_adj if nlp(x[0].lower())[0].lemma_ == topic]
    results = sentiment_pipeline(list_tuples)
    pos = 0
    neg = 0
    pos_adj = []
    neg_adj = []
    for sentence, result in zip(list_tuples, results):
        if result['label'] == 'POSITIVE':
            pos += 1
            pos_adj.append(sentence.rsplit(None, 1)[-1].lower())
        else:
            neg += 1
            neg_adj.append(sentence.rsplit(None, 1)[-1].lower())
    most_common_pos_adj = list(map(lambda x: x[0], Counter(pos_adj).most_common(5)))
    most_common_neg_adj = list(map(lambda x: x[0], Counter(neg_adj).most_common(5)))
    return most_common_pos_adj, most_common_neg_adj


def get_df_all_topics_sent(reviews, sentiment, most_common_noun, threshold=0.6):
    # Get the dataframe of all topics with the corresponding sentiment (positive or negative)
    reviews_list = reviews.to_list()
    hypothesis = f'This product review reflects a {sentiment} sentiment of the {{}}'
    df_sent = classifier(reviews_list, most_common_noun, hypothesis_template=hypothesis, multi_label=True)
    df_sent = pd.DataFrame(df_sent)
    df_sent = df_sent.set_index('sequence').apply(pd.Series.explode).reset_index()
    df_sent = df_sent[df_sent['scores'] >= threshold]
    return df_sent


def get_both_df(reviews, most_common_noun):
    # Build the positive and negative topic dataframes, then keep each
    # (review, topic) pair only on the side where its zero-shot score is higher.
    df_pos = get_df_all_topics_sent(reviews, 'positive', most_common_noun)
    print('Positive topic classification done')
    df_neg = get_df_all_topics_sent(reviews, 'negative', most_common_noun)
    merged_df = pd.merge(df_pos, df_neg, on=['sequence', 'labels'], suffixes=('_pos', '_neg'))
    to_remove_pos = merged_df[merged_df.scores_pos < merged_df.scores_neg][['sequence', 'labels']]
    indexes_pos_to_remove = df_pos.reset_index().merge(to_remove_pos, on=['sequence', 'labels'], how='inner').set_index(
        'index').index
    to_remove_neg = merged_df[merged_df.scores_pos > merged_df.scores_neg][['sequence', 'labels']]
    indexes_neg_to_remove = df_neg.reset_index().merge(to_remove_neg, on=['sequence', 'labels'], how='inner').set_index(
        'index').index
    df_pos.drop(index=indexes_pos_to_remove, inplace=True)
    df_neg.drop(index=indexes_neg_to_remove, inplace=True)
    return df_pos, df_neg


def get_df_sent_topic(topic, df_all_topic_sentim):
    # get the reviews of a specific topic corresponding to the given sentiment
    df_topic = df_all_topic_sentim[df_all_topic_sentim.labels == topic].copy()
    df_topic.drop(columns=['labels', 'scores'], inplace=True)
    return df_topic


def get_percentages_topic(topic, df_all_topic_pos, df_all_topic_neg):
    # get percentages of positive and negative reviews for the given topic
    df_pos = get_df_sent_topic(topic, df_all_topic_pos)
    df_neg = get_df_sent_topic(topic, df_all_topic_neg)
    pos_perc = round(df_pos.shape[0] / (df_pos.shape[0] + df_neg.shape[0]) * 100, 2)
    neg_perc = round(df_neg.shape[0] / (df_pos.shape[0] + df_neg.shape[0]) * 100, 2)
    return pos_perc, neg_perc


def get_df_adjectives(sentiment, reviews, topic, df_all_topic_sent, noun_ver_adj, threshold=0.6):
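    """Score how well each common adjective describes the topic's reviews via
    zero-shot classification; return (percentages, adjective labels)."""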
    pos_adj, neg_adj = get_insights(topic, noun_ver_adj)
    adj = pos_adj if sentiment == 'positive' else neg_adj
    hypothesis = f'The {sentiment} sentiment representing the product {topic} is {{}}'
    df_topic = get_df_sent_topic(topic, df_all_topic_sent)
    df_adj = classifier(df_topic.sequence.tolist(), adj, hypothesis_template=hypothesis, multi_label=True)
    df_adj = pd.DataFrame(df_adj)
    df_adj = df_adj.set_index('sequence').apply(pd.Series.explode).reset_index()
    df_adj = df_adj[df_adj['scores'] >= threshold]
    return (df_adj.labels.value_counts(normalize=True).values.round(2) * 100).astype(int), df_adj.labels.value_counts(
        normalize=True).index.values.astype(str)


def get_topics_adjectives(most_common_noun, noun_ver_adj):
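    """Map each common noun (topic) to its (positive, negative) adjective lists."""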
    dic = {}
    for noun in most_common_noun:
        dic[noun] = get_insights(noun, noun_ver_adj)
    return dic


def generate_feedback(dic, temperature=0.9):
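    """Ask the LLM for an owner-facing summary of the per-topic adjectives."""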
    text = f"""Create a summary addressed to a business owner about the reviews of their product.
We provide the main topics of the reviews with their main attributes.
For each topic, which are the keys of the dictionary, the first list contains positive adjectives and the second negative ones.
Start the text with: 'Dear business owner,'
Create a subsection for each topic and present it as follows:
topic:
positive feedback: sentences explaining the positive feedback
negative feedback: sentences explaining the negative feedback
Finish the text by signing with this company name: 'The Topic Magnet'.
Feel free to use plenty of line breaks.
: {dic}
    """
    llm = OpenAI(temperature=temperature, max_tokens=1000)
    generated_text = llm(text)
    return generated_text.strip()




def get_reviews(url):
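    """Run the full pipeline for an Amazon reviews URL: scrape, preprocess,
    extract topics and adjectives, predict ratings, and generate feedback."""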
    df = pd.DataFrame({'Text': scrape_reviews(url)})
    print('Scraping done')
    df = preprocess_data(df)
    print('Preprocessing done')
    reviews = df.Text
    noun_ver_adj = get_noun_ver_adj(reviews)
    print('Dependency parsing done')
    most_common_noun = get_most_common_noun(noun_ver_adj)
    dic1 = get_topics_adjectives(most_common_noun, noun_ver_adj)
    print('Topic insights done')
    dic2 = get_ratings_dic(df)
    print('Ratings prediction done')
    generated_text = generate_feedback(dic1)
    print('Feedback generation done')
    return dic2, generated_text




if __name__ == '__main__':
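    # Launch the Gradio app: paste an Amazon reviews URL, get back predicted
    # ratings and an LLM-written summary of actionable insights.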
    interface = gr.Interface(
        fn=get_reviews,
        inputs=gr.Textbox(),
        outputs=[gr.Textbox(label='Real ratings'), gr.Textbox(label='Actionable insights')],
        title='The Topic Magnet',
        description='Enter the URL of your Amazon reviews page to get real ratings and valuable insights',
    )
    interface.queue().launch()