In [1]:
import io
import os
import pickle as pk

import cupy as cp
import librosa
import numpy as np
import pandas as pd
import scipy.stats
import soundfile as sf
import torch
from moviepy.editor import VideoFileClip
from tqdm import tqdm

# Pick the CUDA device when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


Statistical Features  
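A first easy step is to compute summary statistics of the frequencies of each signal: the mean, standard deviation, minimum, maximum, median and quartiles (plus the interquartile range, skewness, kurtosis and mode). This can be done with NumPy-style array operations (the cell below uses CuPy, a GPU counterpart of NumPy, together with SciPy for skewness, kurtosis and mode), and these statistics usually add useful signal to the feature extraction.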

In [2]:
def describe_freq(freqs):
    """Summarise a collection of frequency values with descriptive statistics.

    Mean, standard deviation, extrema, median and quartiles are computed with
    CuPy; skewness, kurtosis and mode come from SciPy on a host copy of the
    same data.

    Parameters
    ----------
    freqs : array-like
        Numeric values accepted by ``cp.asarray``. Intended for 1-D input.

    Returns
    -------
    list
        ``[mean, std, max, min, median, skew, kurtosis, q1, q3, mode, iqr]``.
    """
    freqs = cp.asarray(freqs)
    # SciPy works on host memory: copy once and reuse the copy for all three
    # SciPy calls instead of transferring the array separately for each.
    host_freqs = cp.asnumpy(freqs)

    mean = cp.mean(freqs)
    std = cp.std(freqs)
    maxv = cp.amax(freqs)
    minv = cp.amin(freqs)
    median = cp.median(freqs)
    skew = scipy.stats.skew(host_freqs)
    kurt = scipy.stats.kurtosis(host_freqs)
    q1 = cp.quantile(freqs, 0.25)
    q3 = cp.quantile(freqs, 0.75)
    # Older SciPy returns the mode of a 1-D input as a length-1 array, recent
    # releases return a scalar. ``atleast_1d`` makes ``[0]`` valid for both,
    # where the previous ``[0][0]`` raised on the scalar form.
    mode = np.atleast_1d(scipy.stats.mode(host_freqs)[0])[0]
    iqr = q3 - q1

    return [mean.get(), std.get(), maxv.get(), minv.get(), median.get(), skew, kurt, q1.get(), q3.get(), mode, iqr.get()]

In [3]:
def get_features(x, sr):
    """Compute a fixed-order vector of audio descriptors for one signal.

    Every descriptor is produced by librosa, which operates on host (NumPy)
    arrays, so the signal is converted to a single host array up front and all
    reductions are done on the host. Earlier revisions moved the signal to the
    GPU and copied it back before each librosa call, which only added
    transfers. Values may differ from that path at floating-point rounding
    level.

    Parameters
    ----------
    x : array-like or torch.Tensor
        Audio samples.
    sr : number
        Sampling rate passed through to the librosa calls that accept ``sr``.

    Returns
    -------
    list of float
        ``[rmse, zcr, tempo, spectral_centroid, spectral_bandwidth,
        spectral_contrast, spectral_flatness, spectral_rolloff]`` followed by
        the per-coefficient mean of the MFCC matrix (one value per row).
    """
    # One host-side copy of the signal, shared by every librosa call below.
    samples = torch.as_tensor(x).detach().cpu().numpy()

    # ``librosa.beat.tempo`` was deprecated in favour of ``librosa.feature.tempo``
    # and is absent from newer releases; use whichever this install provides.
    tempo_fn = getattr(librosa.feature, "tempo", None) or librosa.beat.tempo

    rmse = librosa.feature.rms(y=samples)[0].mean()
    zcr = librosa.feature.zero_crossing_rate(y=samples)[0].mean()
    tempo = tempo_fn(y=samples, sr=sr)[0]
    mfcc = librosa.feature.mfcc(y=samples, sr=sr).mean(axis=1)  # mean over frames, per coefficient
    spec_cen = librosa.feature.spectral_centroid(y=samples, sr=sr).mean()
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=samples, sr=sr).mean()
    spectral_contrast = librosa.feature.spectral_contrast(y=samples, sr=sr).mean()
    spectral_flatness = librosa.feature.spectral_flatness(y=samples).mean()
    spectral_rolloff = librosa.feature.spectral_rolloff(y=samples, sr=sr).mean()

    scalars = [rmse, zcr, tempo, spec_cen, spectral_bandwidth, spectral_contrast, spectral_flatness, spectral_rolloff]
    # Plain Python floats, scalar descriptors first and MFCC means after.
    return [float(value) for value in scalars] + [float(coeff) for coeff in mfcc]

In [4]:
def extract_features(file_path):
    """Extract the audio feature vector from one video file.

    The clip's audio is read frame by frame, written to an in-memory WAV,
    reloaded with librosa at its native sampling rate and passed to
    ``get_features``.

    Parameters
    ----------
    file_path : str
        Path of the video file to open with ``VideoFileClip``.

    Returns
    -------
    list or None
        The list returned by ``get_features``, or ``None`` when the file has
        no audio track or any step raises (a message is printed in both cases).
    """
    video_clip = None
    try:
        # Load video file
        video_clip = VideoFileClip(file_path)
        audio = video_clip.audio
        if audio is None:
            print(f"Error encountered while parsing file: {file_path}, no audio track found")
            return None

        fps = audio.fps
        # NOTE(review): ``flatten()`` turns the collected frames into one 1-D
        # stream. If each frame carries several channels this interleaves them
        # into a single signal - confirm that is intended. Left unchanged so
        # feature values stay comparable with earlier runs.
        audio_samples = np.array(list(audio.iter_frames(fps=fps, dtype="float32"))).flatten()

        # Round-trip through an in-memory WAV so librosa decodes the samples.
        buffer = io.BytesIO()
        sf.write(buffer, audio_samples, fps, format="wav")
        buffer.seek(0)
        x, sr = librosa.load(buffer, sr=None)
        return get_features(x, sr)

    except Exception as e:
        print(f"Error encountered while parsing file: {file_path}, {e}")
        return None

    finally:
        # Release the clip's resources on every exit path, including failures.
        if video_clip is not None:
            video_clip.close()

In [5]:
def load_data(real_dir, fake_dir, real_files, fake_files):
    """Build a labelled feature table from REAL and FAKE media files.

    Each file is passed to ``extract_features``; files for which it returns
    ``None`` are skipped. REAL rows come first, then FAKE rows.

    Parameters
    ----------
    real_dir, fake_dir : str
        Directories joined with the corresponding file names.
    real_files, fake_files : sequence of str
        File names to process from each directory.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed file: 8 scalar descriptors,
        ``mfcc1`` .. ``mfcc20`` and ``label`` (0 for REAL, 1 for FAKE).
    """
    columns = ["rmse", "zcr", "tempo", "spectral_centroid", "spectral_bandwidth",
               "spectral_contrast", "spectral_flatness", "spectral_rolloff"] + \
              [f"mfcc{i}" for i in range(1, 21)] + ["label"]

    # (directory, file names, label) - label 0 for REAL, 1 for FAKE
    sources = [(real_dir, real_files, 0), (fake_dir, fake_files, 1)]

    data = []
    total_files = len(real_files) + len(fake_files)
    # The context manager closes the bar even if the run is interrupted or raises.
    with tqdm(total=total_files, desc="Processing files", unit="file") as pbar:
        for directory, file_names, label in sources:
            for file_name in file_names:
                features = extract_features(os.path.join(directory, file_name))
                if features is not None:
                    data.append(features + [label])
                pbar.update(1)

    return pd.DataFrame(data, columns=columns)


In [6]:
# Directories passed to load_data as the REAL and FAKE source folders.
# NOTE(review): absolute, machine-specific paths - adjust them before running elsewhere.
real_audio_dir = r"H:\.shortcut-targets-by-id\1jH_pc6mMj0Iu8wLS1r0vggMWpVElJvOU\SIH2024_DATASET\REAL"
fake_audio_dir = r"H:\.shortcut-targets-by-id\1jH_pc6mMj0Iu8wLS1r0vggMWpVElJvOU\SIH2024_DATASET\FAKE"

In [None]:
# Load the pickled REAL and FAKE file lists.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
with open(
    r"H:\.shortcut-targets-by-id\1jH_pc6mMj0Iu8wLS1r0vggMWpVElJvOU\SIH2024_DATASET\real_files.pkl",
    "rb",
) as f:
    real_files = pk.load(f)

with open(
    r"H:\.shortcut-targets-by-id\1jH_pc6mMj0Iu8wLS1r0vggMWpVElJvOU\SIH2024_DATASET\fake_files.pkl",
    "rb",
) as f:
    fake_files = pk.load(f)

In [None]:
# Number of entries in each loaded file list.
len(real_files), len(fake_files)

(19154, 99992)

In [None]:
# Balance the classes by keeping only as many FAKE entries as there are REAL ones.
# This keeps the first entries of the list in their stored order; it is not a random sample.
fake_files = fake_files[:len(real_files)]

In [None]:
# Re-check the list sizes after truncating fake_files.
len(real_files), len(fake_files)

(19154, 19154)

In [None]:
# Extract features for the first 2000 entries of each list (4000 files in total).
# NOTE(review): 2000 is hard-coded here; consider a named constant if it needs tuning.
df = load_data(real_audio_dir, fake_audio_dir, real_files[:2000], fake_files[:2000])

Processing files:  17%|█▋        | 671/4000 [1:19:37<4:35:56,  4.97s/file] 

In [None]:
# Show the last rows of the feature table.
df.tail()

Unnamed: 0,rmse,zcr,tempo,spectral_centroid,spectral_bandwidth,spectral_contrast,spectral_flatness,spectral_rolloff,mfcc1,mfcc2,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
5,0.004624,0.025053,129.199219,2725.983254,5010.822943,14.822473,0.002854,4820.49492,-534.778259,154.150742,...,8.461435,-5.363853,1.651735,1.570598,-6.969818,-1.332273,-7.264575,-2.166896,-5.390424,1
6,0.012205,0.040296,123.046875,3647.104615,5343.519738,16.671819,0.007903,8357.563553,-421.535065,121.641014,...,16.492485,-15.264863,5.351438,-6.834963,-6.844149,2.524184,-9.907133,2.443203,-3.203485,1
7,0.000486,0.06573,123.046875,4911.11856,5816.15461,13.167884,0.02047,12992.775671,-651.358948,105.40844,...,22.212151,-8.999311,9.15981,-1.134552,0.878308,-4.592861,6.159277,-8.804791,4.221607,1
8,0.010587,0.044573,126.048018,3769.014655,5425.975753,16.238748,0.00802,8702.531203,-423.674591,125.309708,...,17.190102,-19.386557,2.690195,-8.97252,-8.547749,3.633717,-7.594123,5.063034,-3.646331,1
9,0.001556,0.048985,126.048018,3916.497123,5451.384648,14.959555,0.011601,8986.764496,-614.185364,123.651947,...,16.776917,-9.418891,1.858516,-3.961122,-3.926236,-5.990383,3.210501,-8.581244,4.236759,1


In [None]:
# for file in file_names:

#     clean_file = file.split("/")[-1]
#     video_clip = VideoFileClip(file)
#     audio = video_clip.audio
#     fps = audio.fps
#     audio_samples = np.array(list(audio.iter_frames(fps=fps, dtype="float32"))).flatten()
#     buffer = io.BytesIO()
#     sf.write(buffer, audio_samples, fps, format='wav')
#     buffer.seek(0)
#     x, sr = librosa.load(buffer, sr=None)
#     label = json.load(open("train_sample_videos/metadata.json"))[clean_file]['label']
#     new_row = pd.DataFrame([[clean_file] + get_features(x, sr) + [label]], columns=column_ames)
#     df = pd.concat([df, new_row], ignore_index=True)

In [None]:
# Write the feature table to a CSV in the current working directory, without the index column.
df.to_csv( "full_features.csv", index=False)

OSError: Cannot save file into a non-existent directory: '\content\drive\MyDrive\SIH2024_DATASET'