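"""Convert the scientific_corpus_325M.jsonl corpus to a single Arrow file and
upload it, together with a generated README.md, to the Hugging Face Hub."""
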
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
import pyarrow as pa
from datasets import Dataset, Features, Value
from dotenv import load_dotenv
from huggingface_hub import HfApi

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('debug_upload.log', mode='w')
    ]
)

REPO_ID = "Allanatrix/Scientific_Research_Tokenized"
JSONL_SRC = Path(r"C:\Users\kunya\PycharmProjects\DataVolt\Tokenization\scientific_corpus_325M.jsonl")
ARROW_PATH = Path("scientific_corpus_325M.arrow")
README_PATH = Path("README.md")


def debug_jsonl_head(jsonl_path, n=5):
    """Log the first n raw lines of the JSONL file for schema inspection."""
    logging.info(f"Printing the first {n} lines of {jsonl_path} for schema inspection:")
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i in range(n):
                line = f.readline()
                if not line:
                    break
                logging.info(f"Line {i + 1}: {line.strip()}")
    except Exception as e:
        logging.error(f"Failed to read JSONL head: {e}")


def infer_features_from_sample(jsonl_path, n=100):
    """Sample the first n JSON lines and log the Python type(s) observed for each field."""
    import json
    from collections import defaultdict

    types = defaultdict(set)
    try:
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= n:
                    break
                obj = json.loads(line)
                for k, v in obj.items():
                    types[k].add(type(v).__name__)
        logging.info(f"Inferred field types from first {n} lines: {dict(types)}")
    except Exception as e:
        logging.error(f"Failed to infer features: {e}")


def convert_jsonl_to_arrow(jsonl_path, arrow_path):
    """Load the JSONL corpus with datasets and save it as a single Arrow file."""
    try:
        logging.info(f"Converting {jsonl_path} to Arrow format at {arrow_path} ...")
        if not jsonl_path.exists():
            logging.error(f"JSONL source file does not exist: {jsonl_path}")
            print(f"\n❌ JSONL source file does not exist: {jsonl_path}")
            raise FileNotFoundError(f"JSONL source file does not exist: {jsonl_path}")
        logging.info(f"File size: {jsonl_path.stat().st_size} bytes")
        debug_jsonl_head(jsonl_path, n=5)
        infer_features_from_sample(jsonl_path, n=100)

        # Try loading a small sample first for debugging
        try:
            sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]")
            logging.info(f"Sample loaded: {len(sample_dataset)} rows, columns: {sample_dataset.column_names}")
        except Exception as sample_e:
            logging.error(f"Failed to load sample from JSONL: {sample_e}", exc_info=True)
            print("\n❌ Failed to load sample from JSONL. See debug_upload.log for details.")
            # If the schema is known, retry with explicit features, e.g.:
            #   features = Features({'url': Value('string'), 'pubmed_id': Value('string')})
            #   sample_dataset = Dataset.from_json(str(jsonl_path), split="train[:1000]", features=features)
            raise

        # Now load the full dataset
        dataset = Dataset.from_json(str(jsonl_path))
        logging.info(f"Full dataset loaded: {len(dataset)} rows, columns: {dataset.column_names}")

        # Write the dataset's underlying Arrow table to a single .arrow file
        # using pyarrow's IPC stream writer.
        table = dataset.data.table
        with pa.OSFile(str(arrow_path), "wb") as sink:
            with pa.ipc.new_stream(sink, table.schema) as writer:
                writer.write_table(table)
        logging.info(f"Saved Arrow dataset with {len(dataset):,} rows.")
        return dataset
    except Exception as e:
        logging.error(f"An error occurred while generating the dataset: {e}", exc_info=True)
        print("\n❌ Failed to convert JSONL to Arrow. See debug_upload.log for details.")
        raise


def create_readme(dataset):
    """Write a minimal dataset card describing the uploaded dataset."""
    content = f"""# Scientific Research Tokenized Dataset

- **Examples**: {len(dataset):,}
- **Columns**: {dataset.column_names}
- **Updated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Usage

```python
from datasets import load_dataset

ds = load_dataset("{REPO_ID}")
```
"""
    with open(README_PATH, "w", encoding="utf-8") as f:
        f.write(content)
    logging.info("README.md created.")


def upload_to_hf():
    """Upload the Arrow file and README.md to the Hugging Face dataset repo."""
    api = HfApi()
    logging.info("Uploading Arrow file to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(ARROW_PATH),
        path_in_repo=ARROW_PATH.name,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Upload Arrow dataset"
    )
    logging.info("Uploading README.md to HuggingFace Hub ...")
    api.upload_file(
        path_or_fileobj=str(README_PATH),
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        commit_message="Update README"
    )
    logging.info("Upload complete.")


def upload_to_huggingface(*args, **kwargs):
    """Alias for upload_to_hf to match expected import in Main_2.py"""
    return upload_to_hf(*args, **kwargs)


def cleanup():
    """Remove the locally generated Arrow file and README.md."""
    if ARROW_PATH.exists():
        ARROW_PATH.unlink()
    if README_PATH.exists():
        README_PATH.unlink()
    logging.info("Cleaned up local files.")


def main():
    try:
        if not HF_TOKEN:
            print("❌ HF_TOKEN not found in environment. Please set it in your .env file.")
            return
        dataset = convert_jsonl_to_arrow(JSONL_SRC, ARROW_PATH)
        create_readme(dataset)
        upload_to_hf()
        print(f"\n🎉 SUCCESS! View at: https://huggingface.co/datasets/{REPO_ID}")
    except Exception as e:
        logging.error(f"Process failed: {e}")
        print("\n❌ Upload failed. See debug_upload.log for details.")
        sys.exit(1)
    finally:
        cleanup()


if __name__ == "__main__":
    main()