dfinel committed
Commit 21cb43a
1 Parent(s): 2fef160

Upload folder using huggingface_hub

Files changed (7)
  1. README.md +2 -8
  2. app.py +274 -0
  3. bert_regression.py +73 -0
  4. requirements.txt +22 -0
  5. scraper.py +79 -0
  6. training_bert.py +118 -0
  7. transformers_models.py +12 -0
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
-title: Deployment Final Project
-emoji: 🏢
-colorFrom: pink
-colorTo: green
+title: deployment_final_project
+app_file: app.py
 sdk: gradio
 sdk_version: 4.26.0
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,274 @@
+ import grequests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import re
+ from tqdm import tqdm
+ import spacy
+ from collections import Counter
+ from transformers import pipeline
+ from flask import Flask
+ from bert_regression import get_ratings_dic
+ import os
+ from langchain.llms import OpenAI
+ import gradio as gr
+
+
+ # Provide the OpenAI key through an environment variable / Space secret; never hardcode a real key.
+ os.environ["OPENAI_API_KEY"] = "sk-..."
+
+ app = Flask(__name__)
+
+ nlp = spacy.load('../topic_magnet/spacy_model')
+ sentiment_pipeline = pipeline("sentiment-analysis", model='my_sentiment_model')
+ classifier = pipeline(task="zero-shot-classification", model="my_zero_shot")
+
+ product_url = 'https://www.amazon.co.uk/product-reviews/B0B21DW5DL/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_review'
+ custom_headers = {
+     # Eliminating non-English reviews
+     "Accept-language": "en;q=1.0",
+     "Accept-Encoding": "gzip, deflate, br",
+     "Cache-Control": "max-age=0",
+     "Connection": "keep-alive",
+     "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
+ }
+
+
+ def get_soup(response):
+     if response.status_code != 200:
+         print("Error in getting webpage")
+         return None
+
+     soup = BeautifulSoup(response.text, "html.parser")
+     return soup
+
+
+ def get_soup_reviews(soup):
+     review_elements = soup.select("div.review")
+
+     scraped_reviews = []
+
+     for review in review_elements:
+         r_content_element = review.select_one("span.review-text")
+         r_content = r_content_element.text if r_content_element else None
+         if r_content is None:
+             continue  # Skip reviews whose text element is missing
+         preprocessed_review = r_content.replace('\n', '')
+
+         scraped_reviews.append(preprocessed_review)
+
+     return scraped_reviews
+
+
+ def scrape_reviews(base_url):
+     all_reviews = []
+     star_ratings = ['one', 'two', 'three', 'four', 'five']
+
+     for star in tqdm(star_ratings):
+         page_number = 1
+
+         while True:
+             url = f"{base_url}&filterByStar={star}_star&&pageNumber={page_number}"
+             response = grequests.get(url, headers=custom_headers).send().response
+             soup = get_soup(response)
+
+             if not soup:
+                 break  # Move on to the next star rating if the page cannot be parsed
+
+             reviews = get_soup_reviews(soup)
+             all_reviews.extend(reviews)
+
+             # Note: Amazon returns a valid page for any pageNumber, so scraping stops
+             # when the disabled "Next page" button appears.
+             next_page_element = soup.find("li", class_="a-disabled a-last")
+             if next_page_element:
+                 break  # Exit the loop once the "Next page" button is disabled
+
+             page_number += 1
+
+     return all_reviews
+
+
+ def remove_links(review):
+     pattern = r'\bhttps?://\S+'
+     return re.sub(pattern, '', review)
+
+
+ def preprocess_data(df):
+     df.rename(columns={'content': 'Text'}, inplace=True)
+     df.Text = df.Text.astype(str)
+     df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
+     df['Text'] = df['Text'].apply(remove_links)
+     return df
+
+
+ def get_noun_ver_adj(reviews):
+     noun_ver_adj = []
+     for i in tqdm(range(reviews.shape[0])):
+         sente = nlp(reviews.iloc[i])
+         for token in sente:
+             noun = adj = adverb = adv_verb = neg = ''
+             if token.dep_ == 'ROOT':
+                 for child in token.children:
+                     if child.pos_ == 'NOUN':
+                         noun = child.text
+                     elif child.pos_ == 'ADJ':
+                         adj = child.text
+                         for other_child in child.children:
+                             if other_child.pos_ == 'ADV':
+                                 adverb = other_child.text
+                     elif child.pos_ == 'ADV':
+                         adv_verb = child.text
+                     elif child.pos_ == 'PART':
+                         neg = child.text
+                 if noun and adj:
+                     if adverb:
+                         noun_ver_adj.append((noun, token.text, adverb, adj))
+                     elif adv_verb and neg:
+                         noun_ver_adj.append((noun, token.text, adv_verb, neg, adj))
+                     elif neg:
+                         noun_ver_adj.append((noun, token.text, neg, adj))
+                     else:
+                         noun_ver_adj.append((noun, token.text, adj))
+     return noun_ver_adj
+
+
+ def get_most_common_noun(noun_ver_adj):
+     element_counts_lemma_noun = Counter(nlp(item[0].lower())[0].lemma_ for item in noun_ver_adj)
+     most_common_noun = list(map(lambda x: x[0], element_counts_lemma_noun.most_common(10)))
+     return most_common_noun[:5]
+
+
+ def get_insights(topic, noun_ver_adj):
+     list_tuples = [' '.join(x) for x in noun_ver_adj if nlp(x[0].lower())[0].lemma_ == topic]
+     results = sentiment_pipeline(list_tuples)
+     pos = 0
+     neg = 0
+     pos_adj = []
+     neg_adj = []
+     for sentence, result in zip(list_tuples, results):
+         if result['label'] == 'POSITIVE':
+             pos += 1
+             pos_adj.append(sentence.rsplit(None, 1)[-1].lower())
+         else:
+             neg += 1
+             neg_adj.append(sentence.rsplit(None, 1)[-1].lower())
+     most_common_pos_adj = list(map(lambda x: x[0], Counter(pos_adj).most_common(5)))
+     most_common_neg_adj = list(map(lambda x: x[0], Counter(neg_adj).most_common(5)))
+     return most_common_pos_adj, most_common_neg_adj
+
+
+ def get_df_all_topics_sent(reviews, sentiment, most_common_noun, threshold=0.6):
+     # Get the dataframe of all topics with the corresponding sentiment (positive or negative)
+     reviews_list = reviews.to_list()
+     hypothesis = f'This product review reflects a {sentiment} sentiment of the {{}}'
+     df_sent = classifier(reviews_list, most_common_noun, hypothesis_template=hypothesis, multi_label=True)
+     df_sent = pd.DataFrame(df_sent)
+     df_sent = df_sent.set_index('sequence').apply(pd.Series.explode).reset_index()
+     df_sent = df_sent[df_sent['scores'] >= threshold]
+     return df_sent
+
+
+ def get_both_df(reviews, most_common_noun):
+     # Build both dataframes, then drop from each one the (review, topic) rows whose score is higher in the other
+     df_pos = get_df_all_topics_sent(reviews, 'positive', most_common_noun)
+     print('done')
+     df_neg = get_df_all_topics_sent(reviews, 'negative', most_common_noun)
+     merged_df = pd.merge(df_pos, df_neg, on=['sequence', 'labels'], suffixes=('_pos', '_neg'))
+     to_remove_pos = merged_df[merged_df.scores_pos < merged_df.scores_neg][['sequence', 'labels']]
+     indexes_pos_to_remove = df_pos.reset_index().merge(to_remove_pos, on=['sequence', 'labels'], how='inner').set_index(
+         'index').index
+     to_remove_neg = merged_df[merged_df.scores_pos > merged_df.scores_neg][['sequence', 'labels']]
+     indexes_neg_to_remove = df_neg.reset_index().merge(to_remove_neg, on=['sequence', 'labels'], how='inner').set_index(
+         'index').index
+     df_pos.drop(index=indexes_pos_to_remove, inplace=True)
+     df_neg.drop(index=indexes_neg_to_remove, inplace=True)
+     return df_pos, df_neg
+
+
+ def get_df_sent_topic(topic, df_all_topic_sentim):
+     # Get the reviews of a specific topic for the given sentiment
+     df_topic = df_all_topic_sentim[df_all_topic_sentim.labels == topic].copy()
+     df_topic.drop(columns=['labels', 'scores'], inplace=True)
+     return df_topic
+
+
+ def get_percentages_topic(topic, df_all_topic_pos, df_all_topic_neg):
+     # Get the percentages of positive and negative reviews for the given topic
+     df_pos = get_df_sent_topic(topic, df_all_topic_pos)
+     df_neg = get_df_sent_topic(topic, df_all_topic_neg)
+     pos_perc = round(df_pos.shape[0] / (df_pos.shape[0] + df_neg.shape[0]) * 100, 2)
+     neg_perc = round(df_neg.shape[0] / (df_pos.shape[0] + df_neg.shape[0]) * 100, 2)
+     return pos_perc, neg_perc
+
+
+ def get_df_adjectives(sentiment, reviews, topic, df_all_topic_sent, noun_ver_adj, threshold=0.6):
+     reviews_list = reviews.to_list()
+     if sentiment == 'positive':
+         adj = get_insights(topic, noun_ver_adj)[0]
+     else:
+         adj = get_insights(topic, noun_ver_adj)[1]
+     hypothesis = f'The {sentiment} sentiment representing the product {topic} is {{}}'
+     df_topic = get_df_sent_topic(topic, df_all_topic_sent)
+     df_adj = classifier(df_topic.sequence.tolist(), adj, hypothesis_template=hypothesis, multi_label=True)
+     df_adj = pd.DataFrame(df_adj)
+     df_adj = df_adj.set_index('sequence').apply(pd.Series.explode).reset_index()
+     df_adj = df_adj[df_adj['scores'] >= threshold]
+     return (df_adj.labels.value_counts(normalize=True).values.round(2) * 100).astype(int), df_adj.labels.value_counts(
+         normalize=True).index.values.astype(str)
+
+
+ def get_topics_adjectives(most_common_noun, noun_ver_adj):
+     dic = {}
+     for i in range(5):
+         dic[most_common_noun[i]] = get_insights(most_common_noun[i], noun_ver_adj)
+     return dic
+
+
+ def generate_feedback(dic, temperature=0.9):
+     text = f"""Create a summary addressed to a business owner of a product about its reviews.
+     We provide the main topics of the reviews with their main attributes.
+     For each topic, which are the keys of the dictionary, the first list contains positive adjectives and the second negative ones.
+     Start the text by: 'Dear business owner,'
+     You have to create a subpart for each topic and explain in the first part of each topic the positive attributes by writing:
+     topic :
+     positive feedbacks : sentences explaining the positive feedbacks
+     negative feedbacks : sentences explaining the negative feedbacks
+     Finish the text by signing with this company name: 'The Topic Magnet'.
+     Feel free to put many line breaks
+     : {dic}
+     """
+     llm = OpenAI(temperature=temperature, max_tokens=1000)
+     generated_text = llm(text)
+     # return generated_text.strip().replace('\n', ' ')
+     return generated_text.strip()
+
+
+ # @app.route('/get_reviews', methods=['GET'])
+
+ def get_reviews(url):
+     df = pd.DataFrame({'Text': scrape_reviews(url)})
+     df = preprocess_data(df)
+     reviews = df.Text
+     noun_ver_adj = get_noun_ver_adj(reviews)
+     most_common_noun = get_most_common_noun(noun_ver_adj)
+     dic1 = get_topics_adjectives(most_common_noun, noun_ver_adj)
+     dic2 = get_ratings_dic(df)
+     generated_text = generate_feedback(dic1)
+     # return jsonify(data1=dic1, data2=dic2, data3=generated_text)
+     return dic2, generated_text
+
+
+ # gr.Interface(fn=get_reviews, inputs=gr.Textbox(), outputs=gr.Textbox(), title='The Topic Magnet',
+ #              description='Enter the url of your Amazon reviews to get real ratings and valuable insights').launch(share=True)
+
+ # print(get_reviews(url))
+
+ if __name__ == '__main__':
+     interface = gr.Interface(fn=get_reviews, inputs=gr.Textbox(),
+                              outputs=[gr.Textbox(label='Real ratings'), gr.Textbox(label='Actionable insights')],
+                              title='The Topic Magnet',
+                              description='Enter the url of your Amazon reviews to get real ratings and valuable insights')
+     interface.launch(share=True)
+     # app.run(host='0.0.0.0', debug=True, port=5000)
+
+
+ # print(most_common_noun)
+ # print(get_insights(most_common_noun[0], noun_ver_adj))
+
+ # dfs_topics = get_both_df(reviews, most_common_noun)
+ # df_all_topic_pos = dfs_topics[0]
+ # df_all_topic_neg = dfs_topics[1]
+ # print(get_percentages_topic(most_common_noun[0], df_all_topic_pos, df_all_topic_neg))
+ # print(get_df_adjectives('positive', reviews, most_common_noun[0], noun_ver_adj))
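For context, the pipeline wrapped by the Gradio interface can also be exercised directly from Python. A minimal sketch, assuming the fine-tuned models referenced above are present on disk and reusing the sample product URL from this file; the printed values are purely illustrative:

    from app import get_reviews

    sample_url = ('https://www.amazon.co.uk/product-reviews/B0B21DW5DL/'
                  'ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_review')
    ratings, insights = get_reviews(sample_url)
    print(ratings)   # e.g. {1: '4.1 %', 2: '2.3 %', ...} from bert_regression.get_ratings_dic
    print(insights)  # LLM-generated summary addressed to the business owner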
bert_regression.py ADDED
@@ -0,0 +1,73 @@
+ import pandas as pd
+ import numpy as np
+ from datasets import Dataset
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch
+ from itertools import chain
+ import re
+
+
+ def remove_links(review):
+     pattern = r'\bhttps?://\S+'
+     return re.sub(pattern, '', review)
+
+
+ # df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
+ # df = df.loc[:, ['Text']].iloc[:1000]
+ # df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
+ # df['Text'] = df['Text'].apply(remove_links)
+
+ model = AutoModelForSequenceClassification.from_pretrained(
+     '../topic_magnet/bert_regr_other_pretrained', num_labels=1)
+ tokenizer = AutoTokenizer.from_pretrained(
+     '../topic_magnet/bert_regr_other_pretrained')
+
+
+ def preprocess_function_regr(examples):
+     return tokenizer(examples["Text"], truncation=True, max_length=64, padding='max_length')
+
+
+ def get_predictions(reviews):
+     # new_test = pd.DataFrame(reviews)
+     new_ds_regr = Dataset.from_pandas(reviews)
+     new_ds_regr_tok = new_ds_regr.map(preprocess_function_regr, remove_columns=['Text'])
+     input_ids = torch.tensor(new_ds_regr_tok['input_ids'])
+     token_type_ids = torch.tensor(new_ds_regr_tok['token_type_ids'])
+     attention_mask = torch.tensor(new_ds_regr_tok['attention_mask'])
+     with torch.no_grad():
+         outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
+     predictions = outputs.logits
+     return predictions
+
+
+ def get_ratings_perc(reviews):
+     preds = get_predictions(reviews)
+     predictions_list = list(chain.from_iterable(preds.tolist()))
+     predictions_array = np.clip(predictions_list, 1, 5)
+     predictions_array = [round(x) for x in predictions_array]
+     values, counts = np.unique(predictions_array, return_counts=True)
+     total = counts.sum()
+     # Map each predicted star (1-5) to its percentage of the reviews; stars that never occur get no entry
+     return {int(v): c / total * 100 for v, c in zip(values, counts)}
+
+
+ def get_ratings_dic(reviews):
+     ratings_perc = get_ratings_perc(reviews)
+     dic = {}
+     for i in range(1, 6):
+         dic[i] = f'{round(ratings_perc.get(i, 0.0), 2)} %'
+     return dic
+
+
+ # print(get_ratings_dic(df))
+
+
+ # new_test = pd.DataFrame(df.loc[:, 'Text'].iloc[:1000])
+ # new_ds_regr = Dataset.from_pandas(new_test)
+ # new_ds_regr_tok = new_ds_regr.map(preprocess_function_regr, remove_columns=['Text'])
+ #
+ # input_ids = torch.tensor(new_ds_regr_tok['input_ids'])
+ # token_type_ids = torch.tensor(new_ds_regr_tok['token_type_ids'])
+ # attention_mask = torch.tensor(new_ds_regr_tok['attention_mask'])
+ # with torch.no_grad():
+ #     outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
+ #     predictions = outputs.logits
+ #
+ # predictions_list = list(chain.from_iterable(predictions.tolist()))
+ # predictions_array = np.clip(predictions_list, 1, 5)
+ # predictions_array = [round(x) for x in predictions_array]
+ # print(np.unique(predictions_array, return_counts=True))
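For reference, a minimal way to run this module on a handful of reviews might be the sketch below; the sample texts and the printed percentages are purely illustrative, and the model and tokenizer are loaded from the paths hardcoded above:

    import pandas as pd
    from bert_regression import get_ratings_dic

    sample = pd.DataFrame({'Text': [
        'Great crisps, arrived fresh and on time.',
        'Half the packets were crushed, disappointing.',
    ]})
    print(get_ratings_dic(sample))
    # e.g. {1: '0.0 %', 2: '50.0 %', 3: '0.0 %', 4: '0.0 %', 5: '50.0 %'}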
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ beautifulsoup4==4.12.3
+ datasets==2.18.0
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
+ fastapi==0.110.1
+ Flask==3.0.3
+ gevent==24.2.1
+ gradio==4.26.0
+ grequests==0.7.0
+ huggingface-hub==0.22.2
+ langchain==0.1.16
+ matplotlib==3.8.4
+ numpy==1.26.4
+ openai==1.17.0
+ pandas==2.2.1
+ requests==2.31.0
+ spacy==3.7.4
+ tokenizers==0.15.2
+ torch==2.2.2
+ tqdm==4.66.2
+ transformers==4.39.3
+
+
scraper.py ADDED
@@ -0,0 +1,79 @@
+ import grequests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import time
+ import csv
+ from tqdm import tqdm
+
+ # product_url = "https://www.amazon.co.uk/Smiths-Savoury-Snacks-Favourites-24/product-reviews/B07X2M1D16/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
+ product_url = 'https://www.amazon.co.uk/product-reviews/B0B21DW5DL/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_review'
+ custom_headers = {
+     # Eliminating non-English reviews
+     "Accept-language": "en;q=1.0",
+     "Accept-Encoding": "gzip, deflate, br",
+     "Cache-Control": "max-age=0",
+     "Connection": "keep-alive",
+     "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
+ }
+
+
+ def get_soup(response):
+     if response.status_code != 200:
+         print("Error in getting webpage")
+         return None
+
+     soup = BeautifulSoup(response.text, "html.parser")
+     return soup
+
+
+ def get_reviews(soup):
+     review_elements = soup.select("div.review")
+
+     scraped_reviews = []
+
+     for review in review_elements:
+         r_content_element = review.select_one("span.review-text")
+         r_content = r_content_element.text if r_content_element else None
+         if r_content is None:
+             continue  # Skip reviews whose text element is missing
+         preprocessed_review = r_content.replace('\n', '')
+
+         scraped_reviews.append(preprocessed_review)
+
+     return scraped_reviews
+
+
+ def scrape_reviews(base_url):
+     all_reviews = []
+     star_ratings = ['one', 'two', 'three', 'four', 'five']
+
+     for star in tqdm(star_ratings):
+         page_number = 1
+
+         while True:
+             url = f"{base_url}&filterByStar={star}_star&&pageNumber={page_number}"
+             response = grequests.get(url, headers=custom_headers).send().response
+             soup = get_soup(response)
+
+             if not soup:
+                 break  # Move on to the next star rating if the page cannot be parsed
+
+             reviews = get_reviews(soup)
+             all_reviews.extend(reviews)
+
+             # Note: Amazon returns a valid page for any pageNumber, so scraping stops
+             # when the disabled "Next page" button appears.
+             next_page_element = soup.find("li", class_="a-disabled a-last")
+             if next_page_element:
+                 break  # Exit the loop once the "Next page" button is disabled
+
+             page_number += 1
+
+     return all_reviews
+
+
+ print(scrape_reviews(product_url))
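When run directly, the script only prints the scraped reviews. Since it already imports pandas and csv, a natural continuation, shown here as an illustrative sketch appended to the script (the output file name is assumed), is to persist them for the downstream models:

    reviews = scrape_reviews(product_url)
    pd.DataFrame({'Text': reviews}).to_csv('reviews.csv', index=False)  # hypothetical output path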
training_bert.py ADDED
@@ -0,0 +1,118 @@
+ import pandas as pd
+ import numpy as np
+ import re
+ from sklearn.model_selection import GroupShuffleSplit
+
+
+ def remove_links(review):
+     pattern = r'\bhttps?://\S+'
+     return re.sub(pattern, '', review)
+
+
+ df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
+ df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
+ df['Text'] = df['Text'].apply(remove_links)
+
+
+ # Group-aware splits so that reviews of the same product never end up in two different splits
+ splitter_temp = GroupShuffleSplit(test_size=.40, n_splits=1, random_state=42)
+ split_temp = splitter_temp.split(df[:100000], groups=df[:100000]['ProductId'])
+ train_inds, temp_inds = next(split_temp)
+
+ train = df.iloc[train_inds]
+ temp = df.iloc[temp_inds]
+
+ splitter_val = GroupShuffleSplit(test_size=.50, n_splits=1, random_state=42)
+ split_val = splitter_val.split(temp, groups=temp['ProductId'])
+ val_inds, test_inds = next(split_val)
+
+ val = temp.iloc[val_inds]
+ test = temp.iloc[test_inds]
+
+ X_train = train.drop(columns='Score')
+ y_train = train.Score
+
+ X_val = val.drop(columns='Score')
+ y_val = val.Score
+
+ X_test = test.drop(columns='Score')
+ y_test = test.Score
+
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ base_model = 'bert-base-cased'
+ learning_rate = 2e-5
+ max_length = 64
+ batch_size = 32
+ epochs = 5
+ nbr_samples = 10000
+ tokenizer_regr = AutoTokenizer.from_pretrained(base_model)
+ model_regr = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=1)
+
+ X_train_bert = X_train.iloc[:nbr_samples].copy()
+ del X_train_bert['ProductId']
+ X_train_bert['label'] = y_train.iloc[:nbr_samples].astype(float)
+
+ X_val_bert = X_val.iloc[:nbr_samples].copy()
+ del X_val_bert['ProductId']
+ X_val_bert['label'] = y_val.iloc[:nbr_samples].astype(float)
+
+ from datasets import Dataset
+ ds_train_regr = Dataset.from_pandas(X_train_bert)
+ ds_val_regr = Dataset.from_pandas(X_val_bert)
+
+
+ def preprocess_function_regr(examples):
+     return tokenizer_regr(examples["Text"], truncation=True, max_length=64, padding='max_length')
+
+
+ ds_train_regr_tok = ds_train_regr.map(preprocess_function_regr, remove_columns=['Text'])
+ ds_val_regr_tok = ds_val_regr.map(preprocess_function_regr, remove_columns=['Text'])
+
+ from sklearn.metrics import mean_absolute_error
+ from sklearn.metrics import mean_squared_error
+ from sklearn.metrics import r2_score
+
+
+ def compute_metrics_for_regression(eval_pred):
+     logits, labels = eval_pred
+     labels = labels.reshape(-1, 1)
+
+     mse = mean_squared_error(labels, logits)
+     mae = mean_absolute_error(labels, logits)
+     r2 = r2_score(labels, logits)
+     single_squared_errors = ((logits - labels).flatten() ** 2).tolist()
+     # "Accuracy" here is the fraction of predictions within 0.5 of the true score (squared error < 0.25)
+     accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)
+
+     return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}
+
+
+ from transformers import TrainingArguments
+
+ output_dir = ".."
+
+ training_args = TrainingArguments(
+     output_dir=output_dir,
+     learning_rate=learning_rate,
+     per_device_train_batch_size=batch_size,
+     per_device_eval_batch_size=batch_size,
+     num_train_epochs=epochs,
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     metric_for_best_model="accuracy",
+     load_best_model_at_end=True,
+     weight_decay=0.01,
+ )
+
+ from transformers import Trainer
+ import torch
+
+
+ class RegressionTrainer(Trainer):
+     def compute_loss(self, model, inputs, return_outputs=False):
+         labels = inputs.pop("labels")
+         outputs = model(**inputs)
+         logits = outputs[0][:, 0]
+         loss = torch.nn.functional.mse_loss(logits, labels)
+         return (loss, outputs) if return_outputs else loss
+
+
+ trainer = RegressionTrainer(
+     model=model_regr,
+     args=training_args,
+     train_dataset=ds_train_regr_tok,
+     eval_dataset=ds_val_regr_tok,
+     compute_metrics=compute_metrics_for_regression
+ )
+
+ trainer.train()
+
+ tokenizer_regr.save_pretrained('.')
+ model_regr.save_pretrained('.')
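The script carves out a held-out test split (X_test, y_test) but stops after saving the model. A minimal, illustrative way to score that split, using only objects already defined in the script (the nbr_samples cap is reused as an assumption), might be:

    X_test_bert = X_test.iloc[:nbr_samples].copy()
    del X_test_bert['ProductId']
    X_test_bert['label'] = y_test.iloc[:nbr_samples].astype(float)
    ds_test_regr_tok = Dataset.from_pandas(X_test_bert).map(preprocess_function_regr, remove_columns=['Text'])
    print(trainer.evaluate(ds_test_regr_tok))  # mse / mae / r2 / accuracy on unseen products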
transformers_models.py ADDED
@@ -0,0 +1,12 @@
+ from transformers import pipeline
+ import spacy
+
+ nlp = spacy.load('en_core_web_sm')
+ sentiment_pipeline = pipeline("sentiment-analysis", model='distilbert/distilbert-base-uncased-finetuned-sst-2-english')
+ classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")
+
+ nlp.to_disk('spacy_model')
+
+ sentiment_pipeline.save_pretrained('my_sentiment_model')
+
+ classifier.save_pretrained('my_zero_shot')
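These saved artifacts are what app.py reloads at startup. For reference, the loading side looks roughly like the sketch below; the paths are the ones used in app.py and are relative, so they depend on where the app is run from:

    import spacy
    from transformers import pipeline

    nlp = spacy.load('../topic_magnet/spacy_model')  # written above via nlp.to_disk('spacy_model')
    sentiment_pipeline = pipeline("sentiment-analysis", model='my_sentiment_model')
    classifier = pipeline(task="zero-shot-classification", model="my_zero_shot")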