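"""Download and preprocess fake-news datasets.

Fetches the Kaggle fake-and-real-news dataset and the LIAR dataset,
maps both onto a shared (text, label) schema (label 1 = fake/false,
0 = real/true), and writes a combined CSV under data/processed/.
"""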
import logging
import os
import zipfile
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DatasetDownloader:
    """Downloads the raw datasets and converts them to a common schema."""

    def __init__(self):
        # Resolve the project root relative to this file (assumes the
        # script lives two directories below the root, e.g. <root>/src/data/).
        self.project_root = Path(__file__).parent.parent.parent
        self.raw_data_dir = self.project_root / "data" / "raw"
        self.processed_data_dir = self.project_root / "data" / "processed"

        # Create directories if they don't exist
        os.makedirs(self.raw_data_dir, exist_ok=True)
        os.makedirs(self.processed_data_dir, exist_ok=True)

    def download_kaggle_dataset(self):
        """Download the fake-and-real-news dataset from Kaggle."""
        logger.info("Downloading dataset from Kaggle...")

        dataset_id = "clmentbisaillon/fake-and-real-news-dataset"

        try:
            # Imported here rather than at module level: the kaggle
            # package authenticates on import and raises if no
            # ~/.kaggle/kaggle.json is configured, which would otherwise
            # keep the rest of the script from running at all.
            import kaggle

            kaggle.api.dataset_download_files(
                dataset_id,
                path=self.raw_data_dir,
                unzip=True,
            )
            logger.info("Successfully downloaded dataset from Kaggle")
        except Exception as e:
            logger.error(f"Error downloading from Kaggle: {e}")
            logger.info(
                "Please download the dataset manually from: "
                "https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset"
            )

    def download_liar(self):
        """Download and extract the LIAR dataset."""
        logger.info("Downloading LIAR dataset...")

        url = "https://www.cs.ucsb.edu/~william/data/liar_dataset.zip"
        output_path = self.raw_data_dir / "liar_dataset.zip"

        if not output_path.exists():
            try:
                response = requests.get(url, stream=True, timeout=60)
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                # Stream the download to disk with a progress bar.
                with open(output_path, 'wb') as f, tqdm(
                    desc="Downloading LIAR dataset",
                    total=total_size,
                    unit='iB',
                    unit_scale=True,
                ) as pbar:
                    for data in response.iter_content(chunk_size=1024):
                        size = f.write(data)
                        pbar.update(size)

                # Extract the zip file
                with zipfile.ZipFile(output_path, 'r') as zip_ref:
                    zip_ref.extractall(self.raw_data_dir / "liar")
            except Exception as e:
                logger.error(f"Error downloading LIAR dataset: {e}")
                logger.info(
                    "Please download the LIAR dataset manually from: "
                    "https://www.cs.ucsb.edu/~william/data/liar_dataset.zip"
                )

    def process_kaggle_dataset(self):
        """Label, merge, and save the Kaggle fake/real news files."""
        logger.info("Processing Kaggle dataset...")

        fake_path = self.raw_data_dir / "Fake.csv"
        true_path = self.raw_data_dir / "True.csv"
        if not (fake_path.exists() and true_path.exists()):
            logger.error("Kaggle dataset not found! Run download_kaggle_dataset first.")
            return

        fake_df = pd.read_csv(fake_path)
        true_df = pd.read_csv(true_path)

        # Add binary labels: 1 for fake, 0 for real.
        fake_df['label'] = 1
        true_df['label'] = 0

        # Combine datasets and save.
        combined_df = pd.concat([fake_df, true_df], ignore_index=True)
        combined_df.to_csv(self.processed_data_dir / "kaggle_processed.csv", index=False)
        logger.info(f"Saved {len(combined_df)} articles from Kaggle dataset")

    def process_liar(self):
        """Convert the LIAR TSV to the common (text, label) schema."""
        logger.info("Processing LIAR dataset...")

        liar_file = self.raw_data_dir / "liar" / "train.tsv"
        if not liar_file.exists():
            logger.error("LIAR dataset not found! Run download_liar first.")
            return

        # The TSV ships without a header row, so name the columns here.
        df = pd.read_csv(liar_file, sep='\t', header=None)
        df.columns = [
            'id', 'label', 'statement', 'subject', 'speaker',
            'job_title', 'state_info', 'party_affiliation',
            'barely_true', 'false', 'half_true', 'mostly_true',
            'pants_on_fire', 'venue',
        ]

        # Collapse the six-way LIAR ratings to binary: the truth-leaning
        # labels (true, mostly-true, half-true) map to 0, the rest to 1.
        label_map = {
            'true': 0,
            'mostly-true': 0,
            'half-true': 0,
            'barely-true': 1,
            'false': 1,
            'pants-fire': 1,
        }
        df['label'] = df['label'].map(label_map)

        # Keep only the columns used downstream, under common names.
        df = df[['statement', 'label', 'subject', 'speaker', 'party_affiliation']]
        df.columns = ['text', 'label', 'subject', 'speaker', 'party']

        df.to_csv(self.processed_data_dir / "liar_processed.csv", index=False)
        logger.info(f"Saved {len(df)} statements from LIAR dataset")

    def combine_datasets(self):
        """Merge the processed datasets into a single (text, label) CSV."""
        logger.info("Combining datasets...")

        kaggle_path = self.processed_data_dir / "kaggle_processed.csv"
        liar_path = self.processed_data_dir / "liar_processed.csv"
        if not (kaggle_path.exists() and liar_path.exists()):
            logger.error("Processed datasets not found! Run the processing steps first.")
            return

        kaggle_df = pd.read_csv(kaggle_path)
        liar_df = pd.read_csv(liar_path)

        # Concatenate on the columns the two datasets share.
        combined_df = pd.concat([
            kaggle_df[['text', 'label']],
            liar_df[['text', 'label']],
        ], ignore_index=True)

        combined_df.to_csv(self.processed_data_dir / "combined_dataset.csv", index=False)
        logger.info(f"Combined dataset contains {len(combined_df)} entries")


def main():
    downloader = DatasetDownloader()

    # Download datasets
    downloader.download_kaggle_dataset()
    downloader.download_liar()

    # Process datasets
    downloader.process_kaggle_dataset()
    downloader.process_liar()

    # Combine datasets
    downloader.combine_datasets()

    logger.info("Dataset preparation completed!")


if __name__ == "__main__":
    main()
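
# Example invocation (a sketch: the file name and location below are
# assumptions, since the script resolves project_root on the premise that it
# sits two levels below the repo root, and Kaggle credentials are expected at
# ~/.kaggle/kaggle.json):
#
#     python src/data/download_data.py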