|
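"""Feature-engineering pipeline for Yelp-style review data.

Preprocessor derives behavioural, temporal, and text-based features from a
merged review/user/business DataFrame and, in run_pipeline(), assembles a
heuristic "fake" label from the resulting columns.
"""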
from loguru import logger
import pandas as pd
import numpy as np
import json
import ast
import re
import os
import time
from datetime import datetime
from collections import defaultdict

import requests
from pymongo import MongoClient
from tqdm import tqdm

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

import torch
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
|
|
|
class Preprocessor: |
|
    def __init__(self, df):
        self.df = df
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # RoBERTa is used throughout, even where method names say "bert".
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaModel.from_pretrained('roberta-base').to(self.device)
        self.model.eval()
        self.stop_words = set(stopwords.words('english'))

    def get_bert_embedding(self, text):
        """Return the mean-pooled last-hidden-state embedding of `text`."""
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
|
|
|
def preprocess_text(self,text): |
|
return text if pd.notna(text) else "" |
|
|
|
|
|
def calculate_duration(self, time_range): |
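        """Length of an opening-hours range such as '9:0-17:0', in hours.

        Ranges that end before they start are assumed to wrap past midnight,
        so 24 hours are added to keep the duration non-negative.
        """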
|
if not isinstance(time_range, str) or "-" not in time_range: |
|
return None |
|
start_str, end_str = time_range.split('-') |
|
start_str = start_str.strip() + ':00' if len(start_str.split(':')) == 1 else start_str.strip() |
|
end_str = end_str.strip() + ':00' if len(end_str.split(':')) == 1 else end_str.strip() |
|
try: |
|
start = datetime.strptime(start_str, '%H:%M') |
|
end = datetime.strptime(end_str, '%H:%M') |
|
duration = (end - start).total_seconds() / 3600 |
|
return duration if duration >= 0 else duration + 24 |
|
except ValueError: |
|
return None |
|
def calculate_sentiment_severity(self, text): |
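        """Split TextBlob polarity into weighted positive/negative severities.

        Positive polarity feeds good_severity (weight 0.7), negative polarity
        feeds bad_severity (weight 0.3); neutral or empty text yields 0/0.
        """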
|
if pd.isna(text) or not text.strip(): |
|
return pd.Series({"good_severity": 0.0, "bad_severity": 0.0}) |
|
|
|
|
|
blob = TextBlob(text) |
|
polarity = blob.sentiment.polarity |
|
|
|
|
|
good_weight = 0.7 |
|
bad_weight = 0.3 |
|
|
|
if polarity > 0: |
|
good_severity = good_weight * polarity |
|
bad_severity = 0.0 |
|
elif polarity < 0: |
|
good_severity = 0.0 |
|
bad_severity = bad_weight * abs(polarity) |
|
else: |
|
good_severity = 0.0 |
|
bad_severity = 0.0 |
|
|
|
return pd.Series({"good_severity": good_severity, "bad_severity": bad_severity}) |
|
|
|
|
|
def get_avg_duration(self, hours_str): |
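        """Average daily opening duration parsed from a stringified hours dict (via ast.literal_eval)."""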
|
if pd.isna(hours_str) or not isinstance(hours_str, str): |
|
return pd.NA |
|
try: |
|
hours_dict = ast.literal_eval(hours_str) |
|
if not hours_dict: |
|
return pd.NA |
|
durations = [self.calculate_duration(time_range) for time_range in hours_dict.values()] |
|
valid_durations = [d for d in durations if d is not None] |
|
return sum(valid_durations) / len(valid_durations) if valid_durations else pd.NA |
|
except (ValueError, SyntaxError, ZeroDivisionError): |
|
return pd.NA |
|
|
|
|
|
def calculate_time_since_last_review(self): |
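        """Add time_since_last_review_user: hours elapsed since each user's most recent review."""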
|
present_date = datetime.now() |
|
user_latest_timestamp = {} |
|
|
|
|
|
self.df["review_date"] = pd.to_datetime(self.df["review_date"]) |
|
|
|
|
|
for user_id in self.df["user_id"].unique(): |
|
latest_date = self.df[self.df["user_id"] == user_id]["review_date"].max() |
|
|
|
if not isinstance(latest_date, datetime): |
|
latest_date = latest_date.to_pydatetime() |
|
|
|
hours_difference = (present_date - latest_date).total_seconds() / 3600 |
|
user_latest_timestamp[user_id] = hours_difference |
|
|
|
|
|
self.df["time_since_last_review_user"] = self.df["user_id"].map(user_latest_timestamp) |
|
|
|
def calculate_time_since_last_review_business(self): |
|
present_date = datetime.now() |
|
|
|
|
|
self.df["review_date"] = pd.to_datetime(self.df["review_date"]) |
|
|
|
|
|
business_latest_timestamp = {} |
|
|
|
|
|
for business_id in self.df["business_id"].unique(): |
|
|
|
latest_date = self.df[self.df["business_id"] == business_id]["review_date"].max() |
|
|
|
|
|
if not isinstance(latest_date, datetime): |
|
latest_date = latest_date.to_pydatetime() |
|
|
|
|
|
hours_difference = (present_date - latest_date).total_seconds() / 3600 |
|
business_latest_timestamp[business_id] = hours_difference |
|
|
|
|
|
self.df["time_since_last_review_business"] = self.df["business_id"].map(business_latest_timestamp) |
|
|
|
|
|
|
|
def calculate_user_account_age(self): |
|
present_date = datetime.now() |
|
|
|
|
|
self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"]) |
|
|
|
|
|
self.df["user_account_age"] = (present_date - self.df["yelping_since"]).dt.days |
|
|
|
|
|
def calculate_avg_time_between_reviews(self): |
|
|
|
self.df["review_date"] = pd.to_datetime(self.df["review_date"]) |
|
|
|
|
|
self.df = self.df.sort_values(["user_id", "review_date"]) |
|
|
|
|
|
def calculate_avg_time(group): |
|
if len(group) == 1: |
|
return 0 |
|
|
|
diffs = group["review_date"].diff().dt.total_seconds() / 3600 |
|
|
|
return diffs.dropna().mean() |
|
|
|
|
|
avg_time_per_user = self.df.groupby("user_id").apply(calculate_avg_time) |
|
|
|
|
|
self.df["average_time_between_reviews"] = self.df["user_id"].map(avg_time_per_user) |
|
|
|
|
|
def calculate_user_degree(self): |
|
|
|
user_business_counts = self.df.groupby("user_id")["business_id"].nunique() |
|
|
|
|
|
self.df["user_degree"] = self.df["user_id"].map(user_business_counts) |
|
|
|
|
|
def calculate_business_degree(self): |
|
|
|
business_user_counts = self.df.groupby("business_id")["user_id"].nunique() |
|
|
|
|
|
self.df["business_degree"] = self.df["business_id"].map(business_user_counts) |
|
|
|
|
|
def calculate_rating_variance_user(self): |
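        """Despite the name, this maps each user to their modal (most frequent) star rating."""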
|
|
|
user_rating_mode = self.df.groupby("user_id")["review_stars"].agg(lambda x: x.mode()[0]) |
|
|
|
|
|
self.df["rating_variance_user"] = self.df["user_id"].map(user_rating_mode) |
|
|
|
|
|
def calculate_user_review_burst_count(self): |
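        """Maximum number of reviews a user posted within any 20-day window of their own reviews."""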
|
|
|
self.df["review_date"] = pd.to_datetime(self.df["review_date"]) |
|
|
|
|
|
self.df = self.df.sort_values(["user_id", "review_date"]) |
|
|
|
|
|
def calculate_burst_count(group): |
|
if len(group) <= 1: |
|
return 0 |
|
|
|
|
|
dates = group["review_date"] |
|
|
|
|
|
burst_counts = [] |
|
for i, date in enumerate(dates): |
|
|
|
window_end = date + pd.Timedelta(days=20) |
|
count = ((dates >= date) & (dates <= window_end)).sum() |
|
burst_counts.append(count) |
|
|
|
|
|
return max(burst_counts) |
|
|
|
|
|
user_burst_counts = self.df.groupby("user_id").apply(calculate_burst_count) |
|
|
|
|
|
self.df["user_review_burst_count"] = self.df["user_id"].map(user_burst_counts) |
|
|
|
|
|
def calculate_business_review_burst_count(self): |
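        """Maximum number of reviews a business received within any 10-day window."""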
|
|
|
self.df["review_date"] = pd.to_datetime(self.df["review_date"]) |
|
|
|
|
|
self.df = self.df.sort_values(["business_id", "review_date"]) |
|
|
|
|
|
def calculate_burst_count(group): |
|
if len(group) <= 1: |
|
return 0 |
|
|
|
|
|
dates = group["review_date"] |
|
|
|
|
|
burst_counts = [] |
|
for i, date in enumerate(dates): |
|
|
|
window_end = date + pd.Timedelta(days=10) |
|
count = ((dates >= date) & (dates <= window_end)).sum() |
|
burst_counts.append(count) |
|
|
|
|
|
return max(burst_counts) |
|
|
|
|
|
business_burst_counts = self.df.groupby("business_id").apply(calculate_burst_count) |
|
|
|
|
|
self.df["business_review_burst_count"] = self.df["business_id"].map(business_burst_counts) |
|
|
|
|
|
def calculate_temporal_similarity(self): |
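        """Average gap in hours between a user's reviews posted on their most frequent weekday(s)."""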
|
self.df["review_date"] = pd.to_datetime(self.df["review_date"]) |
|
|
|
|
|
self.df["day_of_week"] = self.df["review_date"].dt.dayofweek |
|
|
|
|
|
def calculate_avg_hours_on_frequent_days(group): |
|
frequent_days = group["day_of_week"].mode().tolist() |
|
|
|
if len(group) <= 1: |
|
return 0 |
|
|
|
frequent_reviews = group[group["day_of_week"].isin(frequent_days)] |
|
|
|
if len(frequent_reviews) <= 1: |
|
return 0 |
|
|
|
frequent_reviews = frequent_reviews.sort_values("review_date") |
|
diffs = frequent_reviews["review_date"].diff().dt.total_seconds() / 3600 |
|
|
|
return diffs.dropna().mean() |
|
|
|
|
|
avg_hours_per_user = self.df.groupby("user_id").apply(calculate_avg_hours_on_frequent_days) |
|
|
|
|
|
self.df["temporal_similarity"] = self.df["user_id"].map(avg_hours_per_user) |
|
|
|
|
|
self.df = self.df.drop(columns=["day_of_week"]) |
|
|
|
|
|
def calculate_rating_deviation_from_business_average(self): |
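        """Signed difference between each review's stars and the business's mean star rating."""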
|
|
|
business_avg_rating = self.df.groupby("business_id")["review_stars"].mean() |
|
|
|
|
|
self.df["business_avg_rating"] = self.df["business_id"].map(business_avg_rating) |
|
|
|
|
|
self.df["rating_deviation_from_business_average"] = ( |
|
self.df["review_stars"] - self.df["business_avg_rating"] |
|
) |
|
|
|
|
|
self.df = self.df.drop(columns=["business_avg_rating"]) |
|
|
|
def calculate_review_like_ratio(self): |
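        """Share of each user's reviews rated 4 stars or higher."""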
|
|
|
self.df["is_liked"] = (self.df["review_stars"] >= 4).astype(int) |
|
|
|
|
|
user_like_ratio = self.df.groupby("user_id")["is_liked"].mean() |
|
|
|
|
|
self.df["review_like_ratio"] = self.df["user_id"].map(user_like_ratio) |
|
|
|
|
|
self.df = self.df.drop(columns=["is_liked"]) |
|
|
|
def calculate_latest_checkin_hours(self): |
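        """Hours between the business's latest check-in and the user's yelping_since date (0 when check-ins are missing)."""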
|
self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"]) |
|
|
|
|
|
def get_latest_checkin(checkin_list): |
|
if not checkin_list or pd.isna(checkin_list): |
|
return None |
|
if isinstance(checkin_list, str): |
|
checkin_dates = checkin_list.split(", ") |
|
else: |
|
checkin_dates = checkin_list |
|
return pd.to_datetime(checkin_dates).max() |
|
|
|
|
|
self.df["latest_checkin_date"] = self.df["checkin_date"].apply(get_latest_checkin) |
|
|
|
|
|
self.df["latest_checkin_hours"] = ( |
|
(self.df["latest_checkin_date"] - self.df["yelping_since"]) |
|
.dt.total_seconds() / 3600 |
|
) |
|
|
|
|
|
self.df = self.df.drop(columns=["latest_checkin_date"]) |
|
self.df["latest_checkin_hours"].fillna(0,inplace=True) |
|
|
|
|
|
def compute_pronoun_density(self, text): |
|
text = self.preprocess_text(text) |
|
if not text: |
|
return 0 |
|
words = word_tokenize(text.lower()) |
|
pos_tags = nltk.pos_tag(words) |
|
pronouns = sum(1 for word, pos in pos_tags if pos in ['PRP', 'PRP$'] and word in ['i', 'we']) |
|
return pronouns / len(words) if words else 0 |
|
|
|
def compute_avg_sentence_length(self, text): |
|
text = self.preprocess_text(text) |
|
if not text: |
|
return 0 |
|
sentences = sent_tokenize(text) |
|
return sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if sentences else 0 |
|
|
|
def compute_excessive_punctuation(self, text): |
|
text = self.preprocess_text(text) |
|
return len(re.findall(r'[!?.]{2,}', text)) |
|
|
|
def compute_sentiment_polarity(self, text): |
|
text = self.preprocess_text(text) |
|
return TextBlob(text).sentiment.polarity if text else 0 |
|
|
|
def compute_code_switching_flag(self, text): |
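        """Flag (0/1) reviews that appear to mix English with another language or script."""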
|
text = self.preprocess_text(text) |
|
if not text: |
|
return 0 |
|
|
|
tokens = self.tokenizer.tokenize(text.lower()) |
|
if not tokens: |
|
return 0 |
|
|
|
english_words = self.stop_words |
|
token_set = set(tokens) |
|
english_count = sum(1 for token in tokens if token in english_words) |
|
|
|
non_english_pattern = re.compile(r'[^\x00-\x7F]') |
|
has_non_ascii = 1 if non_english_pattern.search(text) else 0 |
|
|
|
english_ratio = english_count / len(tokens) if tokens else 0 |
|
|
|
        # Tokens outside the English stopword list that contain non-ASCII characters.
        # (The original check for "##" targeted BERT WordPiece markers, which RoBERTa's
        # byte-level BPE never produces, so it always evaluated to zero.)
        non_english_tokens = sum(
            1 for token in token_set
            if token not in english_words and non_english_pattern.search(token)
        )
|
|
|
|
|
|
|
|
|
if 0.1 < english_ratio < 0.9 and (has_non_ascii or non_english_tokens > 0): |
|
return 1 |
|
return 0 |
|
|
|
|
|
def batch_tokenize(self, texts, batch_size=32, max_length=512): |
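        """Tokenize texts with RoBERTa in batches and return the padded input_ids on self.device."""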
|
tokenized_outputs = [] |
|
for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing with RoBERTa on GPU"): |
|
batch_texts = texts[i:i + batch_size] |
|
valid_texts = [self.preprocess_text(t) for t in batch_texts] |
|
|
|
inputs = self.tokenizer(valid_texts, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length) |
|
tokenized_outputs.append(inputs['input_ids'].to(self.device)) |
|
|
|
return torch.cat(tokenized_outputs, dim=0) |
|
|
|
def compute_grammar_error_score(self, texts, tokenized_ids): |
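        """Ratio of out-of-vocabulary, non-stopword tokens per review.

        Note: tokens reconstructed via convert_ids_to_tokens are by definition in the
        tokenizer vocabulary, so in practice this score tends to stay at 0.
        """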
|
print("Computing grammar error scores...") |
|
error_scores = np.zeros(len(texts), dtype=float) |
|
|
|
vocab_set = set(self.tokenizer.get_vocab().keys()) |
|
for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Grammar Errors")): |
|
if input_ids.sum() == 0: |
|
continue |
|
tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True) |
|
unknown_count = sum(1 for token in tokens if token not in vocab_set and token not in self.stop_words) |
|
total_count = len([t for t in tokens if t not in self.stop_words]) |
|
error_scores[i] = unknown_count / total_count if total_count > 0 else 0 |
|
|
|
return error_scores |
|
|
|
def compute_repetitive_words_count(self, texts, tokenized_ids): |
|
print("Computing repetitive words counts...") |
|
rep_counts = np.zeros(len(texts), dtype=int) |
|
|
|
for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Repetition")): |
|
if input_ids.sum() == 0: |
|
continue |
|
tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True) |
|
valid_tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2] |
|
if valid_tokens: |
|
token_counts = {} |
|
for token in valid_tokens: |
|
token_counts[token] = token_counts.get(token, 0) + 1 |
|
rep_counts[i] = sum(1 for count in token_counts.values() if count > 1) |
|
|
|
return rep_counts |
|
|
|
def preprocess_text_for_similarity(self, text): |
|
if pd.isna(text) or not text.strip(): |
|
return [] |
|
return [w for w in word_tokenize(str(text).lower()) if w not in self.stop_words] |
|
|
|
def batch_encode_words(self, texts, batch_size=32, max_length=512): |
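        """Encode each text as a fixed-length tensor of word ids (stopwords removed), plus the shared vocabulary."""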
|
word_lists = [self.preprocess_text_for_similarity(t) for t in tqdm(texts, desc="Tokenizing Texts")] |
|
vocab = {word: idx + 1 for idx, word in enumerate(set.union(*[set(w) for w in word_lists if w]))} |
|
|
|
encoded_batches = [] |
|
for i in tqdm(range(0, len(word_lists), batch_size), desc="Encoding Words on GPU"): |
|
batch_words = word_lists[i:i + batch_size] |
|
encoded = np.zeros((len(batch_words), max_length), dtype=np.int64) |
|
for j, words in enumerate(batch_words): |
|
if words: |
|
word_ids = [vocab.get(w, 0) for w in words][:max_length] |
|
encoded[j, :len(word_ids)] = word_ids |
|
encoded_tensor = torch.tensor(encoded, dtype=torch.int64).to(self.device) |
|
encoded_batches.append(encoded_tensor) |
|
|
|
return torch.cat(encoded_batches, dim=0), vocab |
|
|
|
def compute_similarity_to_other_reviews(self, batch_size=32, max_length=512): |
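        """Jaccard similarity between each review's word-id set and the pooled words of all other users' reviews."""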
|
all_texts = self.df["review_text"].tolist() |
|
all_users = self.df["user_id"].tolist() |
|
all_review_ids = self.df["review_id"].tolist() |
|
|
|
encoded_words, vocab = self.batch_encode_words(all_texts, batch_size, max_length) |
|
|
|
similarity_scores = {rid: 0.0 for rid in all_review_ids} |
|
for i, (review_id, user_id) in enumerate(tqdm(zip(all_review_ids, all_users), desc="Computing Similarities on GPU")): |
|
if pd.isna(review_id) or pd.isna(user_id): |
|
continue |
|
|
|
current_words = encoded_words[i] |
|
if current_words.sum() == 0: |
|
continue |
|
|
|
other_indices = torch.tensor([j for j, u in enumerate(all_users) if u != user_id and pd.notna(u)], |
|
dtype=torch.long).to(self.device) |
|
if not other_indices.numel(): |
|
continue |
|
|
|
other_words = encoded_words[other_indices] |
|
current_set = torch.unique(current_words[current_words > 0]) |
|
other_flat = other_words[other_words > 0] |
|
|
|
if other_flat.numel() == 0: |
|
continue |
|
|
|
other_set = torch.unique(other_flat) |
|
intersection = torch.sum(torch.isin(current_set, other_set)).float() |
|
union = torch.unique(torch.cat([current_set, other_set])).numel() |
|
similarity = intersection / union if union > 0 else 0.0 |
|
|
|
similarity_scores[review_id] = similarity.item() |
|
return pd.Series(similarity_scores, index=all_review_ids) |
|
|
|
    def calculate_friend_count(self):
        friends = []
        for v in self.df["friends"]:
            if isinstance(v, str):
                friends.append(len(v.split(",")))
            else:
                # Non-string values (NaN, None, numeric placeholders) count as zero friends.
                friends.append(0)
        self.df["friends"] = friends
|
|
|
def count_elite_years(self, elite): |
|
if pd.isna(elite): |
|
return 0 |
|
return len(str(elite).split(",")) |
|
|
|
    def transform_elite_status(self):
        # Users with more than one elite year are flagged as 1, all others as 0.
        self.df["elite"] = self.df["elite"].apply(lambda x: self.count_elite_years(x) > 1).astype(int)
|
|
|
|
|
def calculate_review_useful_funny_cool(self): |
|
self.df["review_useful"] = pd.to_numeric(self.df["review_useful"], errors='coerce').fillna(0) |
|
self.df["review_funny"] = pd.to_numeric(self.df["review_funny"], errors='coerce').fillna(0) |
|
self.df["review_cool"] = pd.to_numeric(self.df["review_cool"], errors='coerce').fillna(0) |
|
self.df["review_useful_funny_cool"] = ( |
|
self.df["review_useful"] + |
|
self.df["review_funny"] + |
|
self.df["review_cool"] |
|
) |
|
self.df["review_useful_funny_cool"] = self.df["review_useful_funny_cool"].fillna(0).astype(int) |
|
|
|
|
|
def calculate_user_useful_funny_cool(self): |
|
self.df["user_useful_funny_cool"] = ( |
|
self.df["user_useful"] + |
|
self.df["user_funny"] + |
|
self.df["user_cool"] |
|
) |
|
self.df["user_useful_funny_cool"] = self.df["user_useful_funny_cool"].fillna(0).astype(int) |
|
|
|
def compute_fake_score(self, row): |
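        """Heuristic label: 1 if the row accumulates at least 3 suspicion points across text, behavioural, and temporal signals, else 0."""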
|
suspicion_points = 0 |
|
|
|
|
|
if row["pronoun_density"] < 0.01: |
|
suspicion_points += 1 |
|
if row["avg_sentence_length"] < 5 or row["avg_sentence_length"] > 30: |
|
suspicion_points += 1 |
|
if row["grammar_error_score"] > 5: |
|
suspicion_points += 1 |
|
if row["repetitive_words_count"] > 5: |
|
suspicion_points += 1 |
|
if row["code_switching_flag"] == 1: |
|
suspicion_points += 1 |
|
if row["excessive_punctuation_count"] > 3: |
|
suspicion_points += 1 |
|
if abs(row["sentiment_polarity"]) > 0.8: |
|
suspicion_points += 1 |
|
|
|
|
|
if row["similarity_to_other_reviews"] > 0.8: |
|
suspicion_points += 1 |
|
if row["user_review_burst_count"] > 5: |
|
suspicion_points += 1 |
|
if row["business_review_burst_count"] > 5: |
|
suspicion_points += 1 |
|
if abs(row["rating_deviation_from_business_average"]) > 2: |
|
suspicion_points += 1 |
|
if row["review_like_ratio"] > 0.9 or row["review_like_ratio"] < 0.1: |
|
suspicion_points += 1 |
|
|
|
|
|
if row["user_account_age"] < 30: |
|
suspicion_points += 1 |
|
if row["average_time_between_reviews"] < 24: |
|
suspicion_points += 1 |
|
if row["user_degree"] < 2: |
|
suspicion_points += 1 |
|
if row["time_since_last_review_user"] < 24: |
|
suspicion_points += 1 |
|
|
|
|
|
return 1 if suspicion_points >= 3 else 0 |
|
|
|
|
|
def run_pipeline(self): |
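        """Derive all engineered feature columns in place and return the enriched DataFrame."""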
|
|
|
|
|
|
|
logger.info("FINALYZING HOURS COLUMN ...") |
|
self.df["hours"] = self.df["hours"].apply(self.get_avg_duration) |
|
self.df["hours"] = self.df["hours"].fillna(0) |
|
print(self.df["hours"][:10]) |
|
print(self.df["hours"].isnull().sum()) |
|
|
|
|
|
|
|
|
|
logger.info("FINALYZING ATTRIBUTES COLUMN ...") |
|
self.df.drop("attributes",axis=1,inplace=True) |
|
|
|
|
|
|
|
logger.info("CREATING time_since_last_review_user COLUMN ...") |
|
self.calculate_time_since_last_review() |
|
print(np.unique(self.df["time_since_last_review_user"] )) |
|
|
|
|
|
logger.info("CREATING time_since_last_review_business COLUMN ...") |
|
self.calculate_time_since_last_review_business() |
|
print(np.unique(self.df["time_since_last_review_business"] )) |
|
|
|
|
|
|
|
logger.info("CREATING user_account_age COLUMN ...") |
|
self.calculate_user_account_age() |
|
print(np.unique(self.df["user_account_age"] )) |
|
|
|
|
|
|
|
logger.info("CREATING average_time_between_reviews COLUMN ...") |
|
self.calculate_avg_time_between_reviews() |
|
print(np.unique(self.df["average_time_between_reviews"] )) |
|
|
|
|
|
|
|
logger.info("CREATING user_degree COLUMN ...") |
|
self.calculate_user_degree() |
|
print(np.unique(self.df["user_degree"] )) |
|
|
|
|
|
logger.info("CREATING business_degree COLUMN ...") |
|
self.calculate_business_degree() |
|
print(np.unique(self.df["business_degree"] )) |
|
|
|
|
|
logger.info("CREATING rating_variance_user COLUMN ...") |
|
self.calculate_rating_variance_user() |
|
print(np.unique(self.df["rating_variance_user"] )) |
|
|
|
|
|
|
|
logger.info("CREATING user_review_burst_count COLUMN ...") |
|
self.calculate_user_review_burst_count() |
|
print(np.unique(self.df["user_review_burst_count"] )) |
|
|
|
|
|
logger.info("CREATING business_review_burst_count COLUMN ...") |
|
self.calculate_business_review_burst_count() |
|
print(np.unique(self.df["business_review_burst_count"] )) |
|
|
|
|
|
|
|
logger.info("CREATING temporal_similarity COLUMN ...") |
|
self.calculate_temporal_similarity() |
|
print(np.unique(self.df["temporal_similarity"] )) |
|
|
|
|
|
|
|
logger.info("CREATING rating_deviation_from_business_average COLUMN ...") |
|
self.calculate_rating_deviation_from_business_average() |
|
print(np.unique(self.df["rating_deviation_from_business_average"] )) |
|
|
|
|
|
|
|
logger.info("CREATING review_like_ratio COLUMN ...") |
|
self.calculate_review_like_ratio() |
|
print(np.unique(self.df["review_like_ratio"] )) |
|
|
|
|
|
|
|
logger.info("CREATING latest_checkin_hours COLUMN ...") |
|
self.calculate_latest_checkin_hours() |
|
print(np.unique(self.df["latest_checkin_hours"] )) |
|
|
|
|
|
|
|
|
|
logger.info("CREATING pronoun_density COLUMN ...") |
|
self.df["pronoun_density"] = self.df["review_text"].apply(self.compute_pronoun_density) |
|
print(np.unique(self.df["pronoun_density"] )) |
|
|
|
logger.info("CREATING avg_sentence_length COLUMN ...") |
|
self.df["avg_sentence_length"] = self.df["review_text"].apply(self.compute_avg_sentence_length) |
|
print(np.unique(self.df["avg_sentence_length"] )) |
|
|
|
logger.info("CREATING excessive_punctuation_count COLUMN ...") |
|
self.df["excessive_punctuation_count"] = self.df["review_text"].apply(self.compute_excessive_punctuation) |
|
print(np.unique(self.df["excessive_punctuation_count"] )) |
|
|
|
logger.info("CREATING sentiment_polarity COLUMN ...") |
|
self.df["sentiment_polarity"] = self.df["review_text"].apply(self.compute_sentiment_polarity) |
|
print(np.unique(self.df["sentiment_polarity"] )) |
|
|
|
logger.info("CREATING good_severity and bad_severity COLUMNS ...") |
|
severity_scores = self.df["review_text"].apply(self.calculate_sentiment_severity) |
|
self.df[["good_severity", "bad_severity"]] = severity_scores |
|
print(np.unique(self.df["good_severity"] )) |
|
print(np.unique(self.df["bad_severity"] )) |
|
|
|
|
|
logger.info("CREATING code_switching_flag COLUMN ...") |
|
self.df["code_switching_flag"] = self.df["review_text"].apply(self.compute_code_switching_flag) |
|
print(np.unique(self.df["code_switching_flag"] )) |
|
|
|
|
|
all_texts = self.df["review_text"].tolist() |
|
tokenized_ids = self.batch_tokenize(all_texts, batch_size=32, max_length=512) |
|
|
|
logger.info("CREATING grammar_error_score COLUMN ...") |
|
self.df["grammar_error_score"] = self.compute_grammar_error_score(all_texts, tokenized_ids) |
|
print(np.unique(self.df["grammar_error_score"] )) |
|
|
|
|
|
logger.info("CREATING repetitive_words_count COLUMN ...") |
|
self.df["repetitive_words_count"] = self.compute_repetitive_words_count(all_texts, tokenized_ids) |
|
print(np.unique(self.df["repetitive_words_count"] )) |
|
|
|
|
|
|
|
logger.info("CREATING similarity_to_other_reviews COLUMN ...") |
|
similarity_scores = self.compute_similarity_to_other_reviews(batch_size=32, max_length=512) |
|
self.df["similarity_to_other_reviews"] = self.df["review_id"].map(similarity_scores) |
|
|
|
print(np.unique(self.df["similarity_to_other_reviews"] )) |
|
|
|
|
|
|
|
logger.info("CREATING friends COLUMN ...") |
|
self.calculate_friend_count() |
|
print(self.df["friends"].value_counts()) |
|
|
|
logger.info("CREATING elite COLUMN ...") |
|
self.transform_elite_status() |
|
print(self.df["elite"].value_counts()) |
|
|
|
|
|
logger.info("CREATING review_useful_funny_cool COLUMN ...") |
|
self.calculate_review_useful_funny_cool() |
|
print(self.df["review_useful_funny_cool"].value_counts()) |
|
|
|
|
|
logger.info("CREATING user_useful_funny_cool COLUMN ...") |
|
self.calculate_user_useful_funny_cool() |
|
print(self.df["user_useful_funny_cool"].value_counts()) |
|
|
|
|
|
logger.info("CREATING LABEL COLUMN ...") |
|
self.df["fake"] = self.df.apply(self.compute_fake_score, axis=1) |
|
print(self.df["fake"].value_counts()) |
|
|
|
|
|
logger.info("SEEING NULL VALUES IN FINAL COLUMNS.....") |
|
print(set(self.df.isnull().sum().values)) |
|
|
|
|
|
|
|
|
|
return self.df |
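

# Minimal usage sketch (not part of the original pipeline): it assumes a
# pre-merged review/user/business CSV containing the columns referenced above
# (review_text, review_date, review_id, user_id, business_id, hours,
# checkin_date, yelping_since, friends, elite, ...). The file names below are
# hypothetical; adjust the loading step to your data source.
if __name__ == "__main__":
    df = pd.read_csv("merged_reviews.csv")          # hypothetical input path
    preprocessor = Preprocessor(df)
    features_df = preprocessor.run_pipeline()
    features_df.to_csv("review_features.csv", index=False)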
|