Upload folder using huggingface_hub
- README.md +2 -8
- app.py +274 -0
- bert_regression.py +73 -0
- requirements.txt +22 -0
- scraper.py +79 -0
- training_bert.py +118 -0
- transformers_models.py +12 -0
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: pink
-colorTo: green
+title: deployment_final_project
+app_file: app.py
 sdk: gradio
 sdk_version: 4.26.0
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,274 @@
import grequests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import spacy
from collections import Counter
from transformers import pipeline
from flask import Flask
from bert_regression import get_ratings_dic
import os
from langchain.llms import OpenAI
import gradio as gr


# The original commit hardcoded a real OpenAI key here; redacted. Supply your own key
# through the environment or a Space secret instead of committing it.
os.environ["OPENAI_API_KEY"] = "sk-REDACTED"

app = Flask(__name__)

nlp = spacy.load('../topic_magnet/spacy_model')
sentiment_pipeline = pipeline("sentiment-analysis", model='my_sentiment_model')
classifier = pipeline(task="zero-shot-classification", model="my_zero_shot")

product_url = 'https://www.amazon.co.uk/product-reviews/B0B21DW5DL/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_review'
custom_headers = {
    # Eliminating non-English reviews
    "Accept-language": "en;q=1.0",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}


def get_soup(response):
    if response.status_code != 200:
        print("Error in getting webpage")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    return soup


def get_soup_reviews(soup):
    review_elements = soup.select("div.review")

    scraped_reviews = []

    for review in review_elements:
        r_content_element = review.select_one("span.review-text")
        r_content = r_content_element.text if r_content_element else None
        if r_content is None:
            continue  # skip reviews with no text body
        preprocessed_review = r_content.replace('\n', '')

        scraped_reviews.append(preprocessed_review)

    return scraped_reviews


def scrape_reviews(base_url):
    all_reviews = []
    star_ratings = ['one', 'two', 'three', 'four', 'five']

    for star in tqdm(star_ratings):
        page_number = 1

        while True:
            url = f"{base_url}&filterByStar={star}_star&&pageNumber={page_number}"
            response = grequests.get(url, headers=custom_headers).send().response
            soup = get_soup(response)

            if not soup:
                break  # Skip to the next star rating if the page could not be parsed

            reviews = get_soup_reviews(soup)
            all_reviews.extend(reviews)

            # Note: Amazon returns a valid page for any pageNumber,
            # so we stop scraping when the "Next page" button is disabled.
            next_page_element = soup.find("li", class_="a-disabled a-last")
            if next_page_element:
                break  # Exit loop if the disabled "Next page" element is found

            page_number += 1

    return all_reviews


def remove_links(review):
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)


def preprocess_data(df):
    df.rename(columns={'content': 'Text'}, inplace=True)
    df.Text = df.Text.astype(str)
    df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
    df['Text'] = df['Text'].apply(remove_links)
    return df


def get_noun_ver_adj(reviews):
    # Collect (noun, root verb, ..., adjective) tuples from each review's dependency parse
    noun_ver_adj = []
    for i in tqdm(range(reviews.shape[0])):
        sente = nlp(reviews.iloc[i])
        for token in sente:
            noun = adj = adverb = adv_verb = neg = ''
            if token.dep_ == 'ROOT':
                for child in token.children:
                    if child.pos_ == 'NOUN':
                        noun = child.text
                    elif child.pos_ == 'ADJ':
                        adj = child.text
                        for other_child in child.children:
                            if other_child.pos_ == 'ADV':
                                adverb = other_child.text
                    elif child.pos_ == 'ADV':
                        adv_verb = child.text
                    elif child.pos_ == 'PART':
                        neg = child.text
                if noun and adj:
                    if adverb:
                        noun_ver_adj.append((noun, token.text, adverb, adj))
                    elif adv_verb and neg:
                        noun_ver_adj.append((noun, token.text, adv_verb, neg, adj))
                    elif neg:
                        noun_ver_adj.append((noun, token.text, neg, adj))
                    else:
                        noun_ver_adj.append((noun, token.text, adj))
    return noun_ver_adj


def get_most_common_noun(noun_ver_adj):
    element_counts_lemma_noun = Counter(nlp(item[0].lower())[0].lemma_ for item in noun_ver_adj)
    most_common_noun = list(map(lambda x: x[0], element_counts_lemma_noun.most_common(10)))
    return most_common_noun[:5]


def get_insights(topic, noun_ver_adj):
    list_tuples = [' '.join(x) for x in noun_ver_adj if nlp(x[0].lower())[0].lemma_ == topic]
    results = sentiment_pipeline(list_tuples)
    pos = 0
    neg = 0
    pos_adj = []
    neg_adj = []
    for sentence, result in zip(list_tuples, results):
        if result['label'] == 'POSITIVE':
            pos += 1
            pos_adj.append(sentence.rsplit(None, 1)[-1].lower())
        else:
            neg += 1
            neg_adj.append(sentence.rsplit(None, 1)[-1].lower())
    most_common_pos_adj = list(map(lambda x: x[0], Counter(pos_adj).most_common(5)))
    most_common_neg_adj = list(map(lambda x: x[0], Counter(neg_adj).most_common(5)))
    return most_common_pos_adj, most_common_neg_adj


def get_df_all_topics_sent(reviews, sentiment, most_common_noun, threshold=0.6):
    # Get the dataframe of all topics with the corresponding sentiment (positive or negative)
    reviews_list = reviews.to_list()
    hypothesis = f'This product review reflects a {sentiment} sentiment of the {{}}'
    df_sent = classifier(reviews_list, most_common_noun, hypothesis_template=hypothesis, multi_label=True)
    df_sent = pd.DataFrame(df_sent)
    df_sent = df_sent.set_index('sequence').apply(pd.Series.explode).reset_index()
    df_sent = df_sent[df_sent['scores'] >= threshold]
    return df_sent


def get_both_df(reviews, most_common_noun):
    # Get both dataframes and drop the rows whose score is higher in the opposite sentiment
    df_pos = get_df_all_topics_sent(reviews, 'positive', most_common_noun)
    df_neg = get_df_all_topics_sent(reviews, 'negative', most_common_noun)
    merged_df = pd.merge(df_pos, df_neg, on=['sequence', 'labels'], suffixes=('_pos', '_neg'))
    to_remove_pos = merged_df[merged_df.scores_pos < merged_df.scores_neg][['sequence', 'labels']]
    indexes_pos_to_remove = df_pos.reset_index().merge(to_remove_pos, on=['sequence', 'labels'], how='inner').set_index('index').index
    to_remove_neg = merged_df[merged_df.scores_pos > merged_df.scores_neg][['sequence', 'labels']]
    # Fixed: the original filtered df_neg with to_remove_pos, so negative rows were never dropped
    indexes_neg_to_remove = df_neg.reset_index().merge(to_remove_neg, on=['sequence', 'labels'], how='inner').set_index('index').index
    df_pos.drop(index=indexes_pos_to_remove, inplace=True)
    df_neg.drop(index=indexes_neg_to_remove, inplace=True)
    return df_pos, df_neg


def get_df_sent_topic(topic, df_all_topic_sentim):
    # Get the reviews of a specific topic corresponding to the given sentiment
    df_topic = df_all_topic_sentim[df_all_topic_sentim.labels == topic].copy()
    df_topic.drop(columns=['labels', 'scores'], inplace=True)
    return df_topic


def get_percentages_topic(topic, df_all_topic_pos, df_all_topic_neg):
    # Get percentages of positive and negative reviews for the given topic
    df_pos = get_df_sent_topic(topic, df_all_topic_pos)
    df_neg = get_df_sent_topic(topic, df_all_topic_neg)
    pos_perc = round(df_pos.shape[0] / (df_pos.shape[0] + df_neg.shape[0]) * 100, 2)
    neg_perc = round(df_neg.shape[0] / (df_pos.shape[0] + df_neg.shape[0]) * 100, 2)
    return pos_perc, neg_perc


def get_df_adjectives(sentiment, reviews, topic, df_all_topic_sent, noun_ver_adj, threshold=0.6):
    reviews_list = reviews.to_list()
    if sentiment == 'positive':
        adj = get_insights(topic, noun_ver_adj)[0]
    else:
        adj = get_insights(topic, noun_ver_adj)[1]
    hypothesis = f'The {sentiment} sentiment representing the product {topic} is {{}}'
    df_topic = get_df_sent_topic(topic, df_all_topic_sent)
    df_adj = classifier(df_topic.sequence.tolist(), adj, hypothesis_template=hypothesis, multi_label=True)
    df_adj = pd.DataFrame(df_adj)
    df_adj = df_adj.set_index('sequence').apply(pd.Series.explode).reset_index()
    df_adj = df_adj[df_adj['scores'] >= threshold]
    return (df_adj.labels.value_counts(normalize=True).values.round(2) * 100).astype(int), \
        df_adj.labels.value_counts(normalize=True).index.values.astype(str)


def get_topics_adjectives(most_common_noun, noun_ver_adj):
    dic = {}
    for i in range(5):
        dic[most_common_noun[i]] = get_insights(most_common_noun[i], noun_ver_adj)
    return dic


def generate_feedback(dic, temperature=0.9):
    text = f"""Create a summary addressed to a business owner of a product about its reviews.
    We provide the main topics of the reviews with their main attributes.
    For each topic, which are the keys of the dictionary, the first list contains positive adjectives and the second negative ones.
    Start the text with: 'Dear business owner,'
    Create a subpart for each topic and explain first the positive attributes, writing:
    topic:
    positive feedback: sentences explaining the positive feedback
    negative feedback: sentences explaining the negative feedback
    Finish the text by signing with this company name: 'The Topic Magnet'.
    Feel free to add plenty of line breaks.
    : {dic}
    """
    llm = OpenAI(temperature=temperature, max_tokens=1000)
    generated_text = llm(text)
    return generated_text.strip()


# @app.route('/get_reviews', methods=['GET'])
def get_reviews(url):
    df = pd.DataFrame({'Text': scrape_reviews(url)})
    df = preprocess_data(df)
    reviews = df.Text
    noun_ver_adj = get_noun_ver_adj(reviews)
    most_common_noun = get_most_common_noun(noun_ver_adj)
    dic1 = get_topics_adjectives(most_common_noun, noun_ver_adj)
    dic2 = get_ratings_dic(df)
    generated_text = generate_feedback(dic1)
    # return jsonify(data1=dic1, data2=dic2, data3=generated_text)
    return dic2, generated_text


if __name__ == '__main__':
    interface = gr.Interface(
        fn=get_reviews,
        inputs=gr.Textbox(),
        outputs=[gr.Textbox(label='Real ratings'), gr.Textbox(label='Actionable insights')],
        title='The Topic Magnet',
        description='Enter the URL of your Amazon reviews to get real ratings and valuable insights',
    )
    interface.launch(share=True)
    # app.run(host='0.0.0.0', debug=True, port=5000)


# Exploratory calls kept from development:
# print(most_common_noun)
# print(get_insights(most_common_noun[0], noun_ver_adj))
# dfs_topics = get_both_df(reviews, most_common_noun)
# df_all_topic_pos = dfs_topics[0]
# df_all_topic_neg = dfs_topics[1]
# print(get_percentages_topic(most_common_noun[0], df_all_topic_pos, df_all_topic_neg))
# print(get_df_adjectives('positive', reviews, most_common_noun[0], noun_ver_adj))
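For reference, generate_feedback consumes the dictionary built by get_topics_adjectives: topic lemmas as keys and a (positive_adjectives, negative_adjectives) pair as values. A minimal sketch of calling it in isolation, with invented topics and adjectives, assuming a valid OPENAI_API_KEY is set in the environment:

# Hypothetical input: same shape as the dict returned by get_topics_adjectives.
example_topics = {
    "battery": (["long", "reliable"], ["slow"]),
    "screen": (["bright", "sharp"], ["fragile"]),
}

# generate_feedback turns this dict into a prompt and asks the LLM for a per-topic summary.
print(generate_feedback(example_topics, temperature=0.7))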
bert_regression.py
ADDED
@@ -0,0 +1,73 @@
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from itertools import chain
import re


def remove_links(review):
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)


# df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
# df = df.loc[:, ['Text']].iloc[:1000]
# df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
# df['Text'] = df['Text'].apply(remove_links)

# Fine-tuned BERT regression head: a single output neuron predicting the star rating
model = AutoModelForSequenceClassification.from_pretrained(
    '../topic_magnet/bert_regr_other_pretrained', num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(
    '../topic_magnet/bert_regr_other_pretrained')


def preprocess_function_regr(examples):
    return tokenizer(examples["Text"], truncation=True, max_length=64, padding='max_length')


def get_predictions(reviews):
    new_ds_regr = Dataset.from_pandas(reviews)
    new_ds_regr_tok = new_ds_regr.map(preprocess_function_regr, remove_columns=['Text'])
    input_ids = torch.tensor(new_ds_regr_tok['input_ids'])
    token_type_ids = torch.tensor(new_ds_regr_tok['token_type_ids'])
    attention_mask = torch.tensor(new_ds_regr_tok['attention_mask'])
    with torch.no_grad():
        outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        predictions = outputs.logits
    return predictions


def get_ratings_perc(reviews):
    preds = get_predictions(reviews)
    predictions_list = list(chain.from_iterable(preds.tolist()))
    predictions_array = np.clip(predictions_list, 1, 5)
    predictions_array = [round(x) for x in predictions_array]
    # Count each star rating explicitly so a rating that never occurs still maps to 0 %
    # (the original indexed np.unique counts, which misaligns when a rating is missing)
    counts = np.array([predictions_array.count(star) for star in range(1, 6)])
    ratings_perc = counts / counts.sum() * 100
    return ratings_perc


def get_ratings_dic(reviews):
    ratings_perc = get_ratings_perc(reviews)
    dic = {}
    for i in range(1, 6):
        dic[i] = f'{ratings_perc[i - 1].round(2)} %'
    return dic

# print(get_ratings_dic(df))
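As a quick sanity check, get_ratings_dic works on any DataFrame with a Text column; the two reviews below are invented for illustration, and the resulting percentages depend entirely on the fine-tuned checkpoint:

import pandas as pd
from bert_regression import get_ratings_dic

# Invented reviews, purely for illustration.
sample = pd.DataFrame({"Text": [
    "Absolutely love this product, works perfectly.",
    "Stopped working after two days, very disappointed.",
]})

# Returns {1: '... %', ..., 5: '... %'}: the share of reviews predicted at each star rating.
print(get_ratings_dic(sample))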
requirements.txt
ADDED
@@ -0,0 +1,22 @@
beautifulsoup4==4.12.3
datasets==2.18.0
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
fastapi==0.110.1
Flask==3.0.3
gevent==24.2.1
gradio==4.26.0
grequests==0.7.0
huggingface-hub==0.22.2
langchain==0.1.16
matplotlib==3.8.4
numpy==1.26.4
openai==1.17.0
pandas==2.2.1
requests==2.31.0
spacy==3.7.4
tokenizers==0.15.2
torch==2.2.2
tqdm==4.66.2
transformers==4.39.3
scraper.py
ADDED
@@ -0,0 +1,79 @@
import grequests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
from tqdm import tqdm

# product_url = "https://www.amazon.co.uk/Smiths-Savoury-Snacks-Favourites-24/product-reviews/B07X2M1D16/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
product_url = 'https://www.amazon.co.uk/product-reviews/B0B21DW5DL/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_review'
custom_headers = {
    # Eliminating non-English reviews
    "Accept-language": "en;q=1.0",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
}


def get_soup(response):
    if response.status_code != 200:
        print("Error in getting webpage")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    return soup


def get_reviews(soup):
    review_elements = soup.select("div.review")

    scraped_reviews = []

    for review in review_elements:
        r_content_element = review.select_one("span.review-text")
        r_content = r_content_element.text if r_content_element else None
        if r_content is None:
            continue  # skip reviews with no text body
        preprocessed_review = r_content.replace('\n', '')

        scraped_reviews.append(preprocessed_review)

    return scraped_reviews


def scrape_reviews(base_url):
    all_reviews = []
    star_ratings = ['one', 'two', 'three', 'four', 'five']

    for star in tqdm(star_ratings):
        page_number = 1

        while True:
            url = f"{base_url}&filterByStar={star}_star&&pageNumber={page_number}"
            response = grequests.get(url, headers=custom_headers).send().response
            soup = get_soup(response)

            if not soup:
                break  # Skip to the next star rating if the page could not be parsed

            reviews = get_reviews(soup)
            all_reviews.extend(reviews)

            # Note: Amazon returns a valid page for any pageNumber,
            # so we stop scraping when the "Next page" button is disabled.
            next_page_element = soup.find("li", class_="a-disabled a-last")
            if next_page_element:
                break  # Exit loop if the disabled "Next page" element is found

            page_number += 1

    return all_reviews


print(scrape_reviews(product_url))
training_bert.py
ADDED
@@ -0,0 +1,118 @@
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import GroupShuffleSplit


def remove_links(review):
    pattern = r'\bhttps?://\S+'
    return re.sub(pattern, '', review)


df = pd.read_csv('/Users/danfinel/Downloads/Reviews.csv')
df['Text'] = df['Text'].str.replace(r'<[^>]*>', '', regex=True)
df['Text'] = df['Text'].apply(remove_links)


# Group-aware 60/20/20 split so reviews of the same product stay in the same split
splitter_temp = GroupShuffleSplit(test_size=.40, n_splits=1, random_state=42)
split_temp = splitter_temp.split(df[:100000], groups=df[:100000]['ProductId'])
train_inds, temp_inds = next(split_temp)

train = df.iloc[train_inds]
temp = df.iloc[temp_inds]

splitter_val = GroupShuffleSplit(test_size=.50, n_splits=1, random_state=42)
split_val = splitter_val.split(temp, groups=temp['ProductId'])
val_inds, test_inds = next(split_val)

val = temp.iloc[val_inds]
test = temp.iloc[test_inds]

X_train = train.drop(columns='Score')
y_train = train.Score

X_val = val.drop(columns='Score')
y_val = val.Score

X_test = test.drop(columns='Score')
y_test = test.Score

from transformers import AutoTokenizer, AutoModelForSequenceClassification

base_model = 'bert-base-cased'
learning_rate = 2e-5
max_length = 64
batch_size = 32
epochs = 5
nbr_samples = 10000
tokenizer_regr = AutoTokenizer.from_pretrained(base_model)
# num_labels=1 turns the classification head into a single-output regression head
model_regr = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=1)

X_train_bert = X_train.iloc[:nbr_samples]
del X_train_bert['ProductId']
X_train_bert['label'] = y_train.iloc[:nbr_samples].astype(float)

X_val_bert = X_val.iloc[:nbr_samples]
del X_val_bert['ProductId']
X_val_bert['label'] = y_val.iloc[:nbr_samples].astype(float)

from datasets import Dataset

ds_train_regr = Dataset.from_pandas(X_train_bert)
ds_val_regr = Dataset.from_pandas(X_val_bert)


def preprocess_function_regr(examples):
    return tokenizer_regr(examples["Text"], truncation=True, max_length=64, padding='max_length')


ds_train_regr_tok = ds_train_regr.map(preprocess_function_regr, remove_columns=['Text'])
ds_val_regr_tok = ds_val_regr.map(preprocess_function_regr, remove_columns=['Text'])

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    single_squared_errors = ((logits - labels).flatten() ** 2).tolist()
    # "Accuracy" here: fraction of predictions within 0.5 stars of the true score
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}


from transformers import TrainingArguments

output_dir = ".."

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=0.01,
)

from transformers import Trainer
import torch


class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss


# Use the custom MSE trainer (the original instantiated the plain Trainer, leaving RegressionTrainer unused)
trainer = RegressionTrainer(
    model=model_regr,
    args=training_args,
    train_dataset=ds_train_regr_tok,
    eval_dataset=ds_val_regr_tok,
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

tokenizer_regr.save_pretrained('.')
model_regr.save_pretrained('.')  # from_pt is not a save_pretrained argument, so it was dropped
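The script never evaluates the held-out test split; a possible follow-up, mirroring the column handling used for the train and validation sets (a sketch only, not part of the commit):

# Sketch: score the trained regressor on the untouched test split.
X_test_bert = X_test.iloc[:nbr_samples].copy()
del X_test_bert['ProductId']
X_test_bert['label'] = y_test.iloc[:nbr_samples].astype(float)

ds_test_regr_tok = Dataset.from_pandas(X_test_bert).map(
    preprocess_function_regr, remove_columns=['Text'])

print(trainer.evaluate(ds_test_regr_tok))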
transformers_models.py
ADDED
@@ -0,0 +1,12 @@
from transformers import pipeline
import spacy

# Download the base models once and save local copies that app.py loads at startup
nlp = spacy.load('en_core_web_sm')
sentiment_pipeline = pipeline("sentiment-analysis", model='distilbert/distilbert-base-uncased-finetuned-sst-2-english')
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

nlp.to_disk('spacy_model')

sentiment_pipeline.save_pretrained('my_sentiment_model')

classifier.save_pretrained('my_zero_shot')
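For context, app.py reloads these locally saved copies at startup; a minimal reload sketch (the paths match the save calls above, though the Space may resolve them relative to a different working directory):

import spacy
from transformers import pipeline

# Reload the artifacts written by transformers_models.py.
nlp = spacy.load('spacy_model')
sentiment_pipeline = pipeline("sentiment-analysis", model='my_sentiment_model')
classifier = pipeline(task="zero-shot-classification", model="my_zero_shot")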