## Load the dataset

In [None]:
import pandas as pd
import zipfile

In [None]:
# Location of the zipped Amazon shoe-review dump on the mounted Drive
zip_file_path = '/content/drive/MyDrive/Amazon_shoe_review/amazon_reviews_us_Shoes_v1_00.tsv (1).zip'

# The archive only needs to stay open while its TSV member is being parsed
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    # Show which members the archive contains
    print(archive.namelist())

    # Parse the tab-separated member straight out of the archive.
    # NOTE(review): on_bad_lines='skip' drops unparsable rows silently, so the
    # row count printed below may be lower than the number of lines in the file.
    with archive.open('amazon_reviews_us_Shoes_v1_00.tsv') as member:
        df = pd.read_csv(
            member,
            sep='\t',
            on_bad_lines='skip',
        )

# Report (rows, columns) of the loaded frame
print(df.shape)

['amazon_reviews_us_Shoes_v1_00.tsv']
(4358820, 15)


In [None]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,18069663,R3P2HIOQCIN5ZU,B000XB31C0,265024781,Minnetonka Men's Double Deerskin Softsole Mocc...,Shoes,1,0,0,N,Y,.,Do not buy: really didn't start to wear them u...,2015-08-31
1,US,16251825,R12VVR0WH5Q24V,B00CFYZH5W,259035853,Teva Men's Pajaro Flip-Flop,Shoes,5,0,0,N,Y,super flip flop,provides great cushion as well as archsupport,2015-08-31
2,US,20381037,RNCCKB6TV5EEF,B00S8JNN3Q,666066660,Anne Klein Perfect Pair Wristlet,Shoes,4,0,0,N,Y,Great clutch purse!,It's perfect if you need something small for c...,2015-08-31
3,US,108364,R2NZXYIVCGB13W,B00XFBPOQG,448483263,adidas Men's 10K Lifestyle Runner Sneaker,Shoes,5,0,6,N,Y,Badass,Getting what u see,2015-08-31
4,US,45449350,R2EQ1TG9IT3OEQ,B00SW64Y9W,7853171,OverBling Sneakers for Men Casual Men Shoes Ge...,Shoes,3,0,0,N,Y,Three Stars,small,2015-08-31


In [None]:
# List the column labels available in the loaded frame.
df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [None]:
# Count the reviews per star rating to see how the classes are distributed.
df.value_counts('star_rating')

star_rating
5    2635010
4     846239
3     403443
2     242379
1     231749
Name: count, dtype: int64

## Balance the dataset

The dataset is quite unbalanced: most reviews have a 4- or 5-star rating. Let's balance it by keeping only 20,000 reviews from each rating class.

In [None]:
# Create an empty DataFrame that has the same column labels as df.
balanced_df = pd.DataFrame(columns=df.columns)

In [None]:
# Display balanced_df at this point in the notebook.
balanced_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date


In [None]:
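# NOTE: earlier attempt, superseded by the next cell, which additionally
# filters out rows whose review_body is missing. Kept for reference only.
# for stars in range(1,6):
#   data = df[df['star_rating'] == stars][:20000]
#   balanced_df = pd.concat([balanced_df, data])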

In [None]:
# Number of reviews to keep for every star rating (1-5).
REVIEWS_PER_CLASS = 20000

# Rows without review text cannot be used as model input, so exclude them.
has_review_text = df['review_body'].notna()

# Take the first REVIEWS_PER_CLASS usable reviews of each rating, in the order
# they appear in df, and concatenate them in a single call.
# balanced_df is rebuilt from df here instead of being appended to its previous
# value, so re-running this cell cannot duplicate rows, and no concat is done
# repeatedly inside a loop.
balanced_df = pd.concat(
    [
        df[(df['star_rating'] == stars) & has_review_text].head(REVIEWS_PER_CLASS)
        for stars in range(1, 6)
    ],
    ignore_index=True,
)

In [None]:
# Check the (rows, columns) of the balanced subset.
balanced_df.shape

(100000, 15)

In [None]:
# Preview the first rows of the balanced subset.
balanced_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,18069663,R3P2HIOQCIN5ZU,B000XB31C0,265024781,Minnetonka Men's Double Deerskin Softsole Mocc...,Shoes,1,0,0,N,Y,.,Do not buy: really didn't start to wear them u...,2015-08-31
1,US,12334573,R9BHBB06QD6TM,B008NCHMBW,138572112,Foot Sox Original Sanitary Disposable Try on S...,Shoes,1,2,2,N,Y,Tissue paper is this thicker than this. It rip...,Tissue paper is this thicker than this. It ri...,2015-08-31
2,US,230032,RFN9JY10X50F3,B005VPABXA,598684491,Comfy Feet Womens Snooki House Slippers,Shoes,1,1,1,N,Y,One Star,Not what I expected would not order again,2015-08-31
3,US,9088293,R13749PRFL9JKT,B003E7QNTI,580898588,Ariat Women's Krista Pull-on Steel Toe Western...,Shoes,1,0,0,N,Y,not wide width at all,These boots were order as the 9c which is the ...,2015-08-31
4,US,22138690,RE9KJGO100EDW,B00SENYY98,470125824,"Women's Fun Design, Lightweight & Comfortable ...",Shoes,1,0,0,N,Y,giant and cheep,"Cheep, made in China although they advertised ...",2015-08-31


In [None]:
# Verify that every star rating now has the same number of reviews.
balanced_df.value_counts('star_rating')

star_rating
1    20000
2    20000
3    20000
4    20000
5    20000
Name: count, dtype: int64

## Convert the pandas DataFrame to a Hugging Face dataset

In [None]:
!pip install datasets



In [None]:
!pip install --upgrade pyarrow



In [None]:
from datasets import Dataset

We don't need all the columns. We'll keep only the star_rating and the review_body.

In [None]:
# Keep only the label and text columns. The explicit .copy() makes shoe_df an
# independent frame, so assigning to its columns later does not raise pandas'
# SettingWithCopyWarning and cannot affect balanced_df.
shoe_df = balanced_df[['star_rating', 'review_body']].copy()

In [None]:
# Preview the two-column frame.
shoe_df.head()

Unnamed: 0,star_rating,review_body
0,1,Do not buy: really didn't start to wear them u...
1,1,Tissue paper is this thicker than this. It ri...
2,1,Not what I expected would not order again
3,1,These boots were order as the 9c which is the ...
4,1,"Cheep, made in China although they advertised ..."


In [None]:
# Class counts before the labels are shifted.
shoe_df.value_counts('star_rating')

star_rating
1    20000
2    20000
3    20000
4    20000
5    20000
Name: count, dtype: int64

In [None]:
# Adjust the star_rating to start from 0 (ratings 1-5 become labels 0-4).
# The shifted column is always computed from balanced_df, which shoe_df was
# sliced from and shares its index with, so re-running this cell does not
# subtract 1 a second time. assign() returns a new frame instead of writing
# into a slice, which avoids pandas' SettingWithCopyWarning.
shoe_df = shoe_df.assign(star_rating=balanced_df['star_rating'] - 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shoe_df['star_rating'] = shoe_df['star_rating'] - 1


In [None]:
# Class counts after the shift; the labels should now run from 0 to 4.
shoe_df.value_counts('star_rating')

star_rating
0    20000
1    20000
2    20000
3    20000
4    20000
Name: count, dtype: int64

In [None]:
# Preview the frame with the shifted labels.
shoe_df.head()

Unnamed: 0,star_rating,review_body
0,0,Do not buy: really didn't start to wear them u...
1,0,Tissue paper is this thicker than this. It ri...
2,0,Not what I expected would not order again
3,0,These boots were order as the 9c which is the ...
4,0,"Cheep, made in China although they advertised ..."


In [None]:
# Build a Hugging Face Dataset from the two-column frame.
# preserve_index=False keeps the pandas index out of the dataset's features.
hf_dataset = Dataset.from_pandas(shoe_df, preserve_index=False)

In [None]:
# Show the dataset's features and row count.
hf_dataset

Dataset({
    features: ['star_rating', 'review_body'],
    num_rows: 100000
})

In [None]:
# Rename the columns to 'labels' (target) and 'text' (input).
# NOTE: this cell can only run once per hf_dataset, because the original
# column names no longer exist afterwards.
hf_dataset = (
    hf_dataset
    .rename_column('star_rating', 'labels')
    .rename_column('review_body', 'text')
)

In [None]:
# Inspect the first example to confirm the renamed fields.
hf_dataset[0]

{'labels': 0,
 'text': "Do not buy: really didn't start to wear them until May of 2016. Junk, they are falling apart. The outer sole is so thin that although I wear them almost completely in the house on rugs the inner padding is showing through in the heel. My previous pair from the same company lasted 5 years before I threw them out.  I'm sorry I didn't wear them more often when I first got them as I would have returned them immediately"}

In [None]:
# Hold out 10% of the examples as a test split. Rows are shuffled first, and
# the fixed seed makes the split reproducible.
dataset_split = hf_dataset.train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=42,
)

In [None]:
# Show the resulting splits and their sizes.
dataset_split

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
})

In [None]:
# Save the dataset
# Writes both splits of the DatasetDict under one directory on the mounted Drive.
dataset_split.save_to_disk("/content/drive/MyDrive/Amazon_shoe_review/shoe_hfdataset")

Saving the dataset (0/1 shards):   0%|          | 0/90000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Save the dataset in csv format
# Each split is written to its own CSV file on the mounted Drive.
dataset_split['train'].to_csv('/content/drive/MyDrive/Amazon_shoe_review/shoe_dataset_train.csv')
# This is the cell's last expression, so its return value is what the cell displays.
dataset_split['test'].to_csv('/content/drive/MyDrive/Amazon_shoe_review/shoe_dataset_test.csv')

Creating CSV from Arrow format:   0%|          | 0/90 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

1770307

## Push the dataset to the Hugging Face Hub

In [None]:
from huggingface_hub import notebook_login

In [None]:
# Show the Hugging Face login widget.
# NOTE(review): a token with write access is presumably required for the push below — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload both splits to a dataset repository on the Hub.
dataset_split.push_to_hub(repo_id='amazon_shoe_review')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/90 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mazed/amazon_shoe_review/commit/769013166f5d610c3eacbf65e7e3fc60c2de0ebc', commit_message='Upload dataset', commit_description='', oid='769013166f5d610c3eacbf65e7e3fc60c2de0ebc', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from datasets import load_dataset

# Reload the dataset from the Hub to check the upload round-trips.
# The id is namespace-qualified and matches the repository in the commit URL
# printed by the push above.
shoe_dataset = load_dataset("mazed/amazon_shoe_review")

Downloading readme:   0%|          | 0.00/456 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Show the splits and sizes of the dataset loaded back from the Hub.
shoe_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 90000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
})