Spaces:

Duplicated from tfrere/research-article-template

HuggingFaceTB
/

smol-training-playbook

Running on CPU Upgrade

App Files Files Community

smol-training-playbook / app /scripts /notion-importer /index.mjs

tfrere's picture

tfrere HF Staff

update pipe

d06ea3c about 2 months ago

history blame contribute delete

19.4 kB

	#!/usr/bin/env node

	import { config } from 'dotenv';
	import { join, dirname, basename } from 'path';
	import { fileURLToPath } from 'url';
	import { copyFileSync, existsSync, mkdirSync, readFileSync, writeFileSync, readdirSync, statSync, unlinkSync } from 'fs';
	import { convertNotionToMarkdown } from './notion-converter.mjs';
	import { convertToMdx } from './mdx-converter.mjs';
	import { Client } from '@notionhq/client';

	// Load environment variables from .env file (but don't override existing ones)
	config({ override: false });

	const __filename = fileURLToPath(import.meta.url);
	const __dirname = dirname(__filename);

	// Default configuration
	const DEFAULT_INPUT = join(__dirname, 'input', 'pages.json');
	const DEFAULT_OUTPUT = join(__dirname, 'output');
	const ASTRO_CONTENT_PATH = join(__dirname, '..', '..', 'src', 'content', 'article.mdx');
	const ASTRO_ASSETS_PATH = join(__dirname, '..', '..', 'src', 'content', 'assets', 'image');
	const ASTRO_BIB_PATH = join(__dirname, '..', '..', 'src', 'content', 'bibliography.bib');
	const STATIC_BIB_PATH = join(__dirname, 'static', 'bibliography.bib');

	function parseArgs() {
	const args = process.argv.slice(2);
	const config = {
	input: DEFAULT_INPUT,
	output: DEFAULT_OUTPUT,
	clean: false,
	notionOnly: false,
	mdxOnly: false,
	token: process.env.NOTION_TOKEN,
	pageId: process.env.NOTION_PAGE_ID
	};

	for (const arg of args) {
	if (arg.startsWith('--input=')) {
	config.input = arg.split('=')[1];
	} else if (arg.startsWith('--output=')) {
	config.output = arg.split('=')[1];
	} else if (arg.startsWith('--token=')) {
	config.token = arg.split('=')[1];
	} else if (arg.startsWith('--page-id=')) {
	config.pageId = arg.split('=')[1];
	} else if (arg === '--clean') {
	config.clean = true;
	} else if (arg === '--notion-only') {
	config.notionOnly = true;
	} else if (arg === '--mdx-only') {
	config.mdxOnly = true;
	}
	}

	return config;
	}

	function showHelp() {
	console.log(`
	🚀 Notion to MDX Toolkit

	Usage:
	node index.mjs [options]

	Options:
	--input=PATH Input pages configuration file (default: input/pages.json)
	--output=PATH Output directory (default: output/)
	--token=TOKEN Notion API token (or set NOTION_TOKEN env var)
	--clean Clean output directory before processing
	--notion-only Only convert Notion to Markdown (skip MDX conversion)
	--mdx-only Only convert existing Markdown to MDX
	--help, -h Show this help

	Environment Variables:
	NOTION_TOKEN Your Notion integration token

	Examples:
	# Full conversion workflow
	NOTION_TOKEN=your_token node index.mjs --clean

	# Only convert Notion pages to Markdown
	node index.mjs --notion-only --token=your_token

	# Only convert existing Markdown to MDX
	node index.mjs --mdx-only

	# Custom paths
	node index.mjs --input=my-pages.json --output=converted/ --token=your_token

	Configuration File Format (pages.json):
	{
	"pages": [
	{
	"id": "your-notion-page-id",
	"title": "Page Title",
	"slug": "page-slug"
	}
	]
	}

	Workflow:
	1. Notion → Markdown (with media download)
	2. Markdown → MDX (with Astro components)
	3. Copy to Astro content directory
	`);
	}

	function ensureDirectory(dir) {
	if (!existsSync(dir)) {
	mkdirSync(dir, { recursive: true });
	}
	}

	async function cleanDirectory(dir) {
	if (existsSync(dir)) {
	const { execSync } = await import('child_process');
	execSync(`rm -rf "${dir}"/*`, { stdio: 'inherit' });
	}
	}

	function readPagesConfig(inputFile) {
	try {
	const content = readFileSync(inputFile, 'utf8');
	return JSON.parse(content);
	} catch (error) {
	console.error(`❌ Error reading pages config: ${error.message}`);
	return { pages: [] };
	}
	}

	/**
	* Create a temporary pages.json from NOTION_PAGE_ID environment variable
	* Extracts title and generates slug from the Notion page
	*/
	async function createPagesConfigFromEnv(pageId, token, outputPath) {
	try {
	console.log('🔍 Fetching page info from Notion API...');
	const notion = new Client({ auth: token });
	const page = await notion.pages.retrieve({ page_id: pageId });

	// Extract title
	let title = 'Article';
	if (page.properties.title && page.properties.title.title && page.properties.title.title.length > 0) {
	title = page.properties.title.title[0].plain_text;
	} else if (page.properties.Name && page.properties.Name.title && page.properties.Name.title.length > 0) {
	title = page.properties.Name.title[0].plain_text;
	}

	// Generate slug from title
	const slug = title
	.toLowerCase()
	.replace(/[^\w\s-]/g, '')
	.replace(/\s+/g, '-')
	.replace(/-+/g, '-')
	.trim();

	console.log(` ✅ Found page: "${title}" (slug: ${slug})`);

	// Create pages config
	const pagesConfig = {
	pages: [{
	id: pageId,
	title: title,
	slug: slug
	}]
	};

	// Write to temporary file
	writeFileSync(outputPath, JSON.stringify(pagesConfig, null, 4));
	console.log(` ✅ Created temporary pages config`);

	return pagesConfig;
	} catch (error) {
	console.error(`❌ Error fetching page from Notion: ${error.message}`);
	throw error;
	}
	}

	/**
	* Final cleanup function to remove exclude tags and unused imports
	* @param {string} content - MDX content
	* @returns {string} - Cleaned content
	*/
	function cleanupExcludeTagsAndImports(content) {
	let cleanedContent = content;
	let removedCount = 0;
	const removedImageVariables = new Set();

	// First, extract image variable names from exclude blocks before removing them
	const excludeBlocks = cleanedContent.match(/<exclude>[\s\S]*?<\/exclude>/g) \|\| [];
	excludeBlocks.forEach(match => {
	const imageMatches = match.match(/src=\{([^}]+)\}/g);
	if (imageMatches) {
	imageMatches.forEach(imgMatch => {
	const varName = imgMatch.match(/src=\{([^}]+)\}/)?.[1];
	if (varName) {
	removedImageVariables.add(varName);
	}
	});
	}
	});

	// Remove <exclude> tags and everything between them (including multiline)
	cleanedContent = cleanedContent.replace(/<exclude>[\s\S]*?<\/exclude>/g, (match) => {
	removedCount++;
	return '';
	});

	// Remove unused image imports that were only used in exclude blocks
	if (removedImageVariables.size > 0) {
	removedImageVariables.forEach(varName => {
	// Check if the variable is still used elsewhere in the content after removing exclude blocks
	const remainingUsage = cleanedContent.includes(`{${varName}}`) \|\| cleanedContent.includes(`src={${varName}}`);

	if (!remainingUsage) {
	// Remove import lines for unused image variables
	// Pattern: import VarName from './assets/image/filename';
	const importPattern = new RegExp(`import\\s+${varName.replace(/[.+?^${}()\|[\]\\]/g, '\\$&')}\\s+from\\s+['"][^'"]+['"];?\\s`, 'g');
	cleanedContent = cleanedContent.replace(importPattern, '');
	console.log(` 🗑️ Removed unused import: ${varName}`);
	}
	});
	}

	if (removedCount > 0) {
	console.log(` 🧹 Final cleanup: removed ${removedCount} exclude block(s) and ${removedImageVariables.size} unused import(s)`);
	}

	// Ensure there's always a blank line after imports before content starts
	// Find the last import line and ensure there's a blank line before the next non-empty line
	const lines = cleanedContent.split('\n');
	let lastImportIndex = -1;

	// Find the last import line
	for (let i = 0; i < lines.length; i++) {
	if (lines[i].trim().startsWith('import ') && lines[i].trim().endsWith(';')) {
	lastImportIndex = i;
	}
	}

	// If we found imports, ensure there's a blank line after the last one
	if (lastImportIndex >= 0) {
	// Find the next non-empty line after the last import
	let nextNonEmptyIndex = lastImportIndex + 1;
	while (nextNonEmptyIndex < lines.length && lines[nextNonEmptyIndex].trim() === '') {
	nextNonEmptyIndex++;
	}

	// If there's no blank line between the last import and next content, add one
	if (nextNonEmptyIndex > lastImportIndex + 1) {
	// There are already blank lines, this is fine
	} else {
	// No blank line, add one
	lines.splice(nextNonEmptyIndex, 0, '');
	}

	cleanedContent = lines.join('\n');
	}

	return cleanedContent;
	}

	function copyToAstroContent(outputDir) {
	console.log('📋 Copying MDX files to Astro content directory...');

	try {
	// Ensure Astro directories exist
	mkdirSync(dirname(ASTRO_CONTENT_PATH), { recursive: true });
	mkdirSync(ASTRO_ASSETS_PATH, { recursive: true });

	// Copy MDX file
	const files = readdirSync(outputDir);
	const mdxFiles = files.filter(file => file.endsWith('.mdx'));
	if (mdxFiles.length > 0) {
	const mdxFile = join(outputDir, mdxFiles[0]); // Take the first MDX file
	// Read and write instead of copy to avoid EPERM issues
	let mdxContent = readFileSync(mdxFile, 'utf8');

	// Apply final cleanup to ensure no exclude tags or unused imports remain
	mdxContent = cleanupExcludeTagsAndImports(mdxContent);

	writeFileSync(ASTRO_CONTENT_PATH, mdxContent);
	console.log(` ✅ Copied and cleaned MDX to ${ASTRO_CONTENT_PATH}`);
	}

	// Copy images from both media and external-images directories
	const imageExtensions = ['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.tiff', '.html'];
	let totalImageCount = 0;

	function copyImagesRecursively(dir, sourceName) {
	if (!existsSync(dir)) return;

	const files = readdirSync(dir);
	for (const file of files) {
	const filePath = join(dir, file);
	const stat = statSync(filePath);

	if (stat.isDirectory()) {
	copyImagesRecursively(filePath, sourceName);
	} else if (imageExtensions.some(ext => file.toLowerCase().endsWith(ext))) {
	const filename = basename(filePath);
	const destPath = join(ASTRO_ASSETS_PATH, filename);

	try {
	// Validate image by checking file size and basic structure
	const stats = statSync(filePath);
	if (stats.size === 0) {
	console.log(` ⚠️ Skipping empty image: ${filename}`);
	return;
	}

	// Try to copy and validate the result
	copyFileSync(filePath, destPath);

	// Additional validation - check if the copied file has reasonable size
	const destStats = statSync(destPath);
	if (destStats.size === 0) {
	console.log(` ❌ Failed to copy corrupted image: ${filename}`);
	// Remove the empty file
	try {
	unlinkSync(destPath);
	} catch (e) { }
	return;
	}

	console.log(` ✅ Copied ${sourceName}: ${filename} (${destStats.size} bytes)`);
	totalImageCount++;
	} catch (error) {
	console.log(` ❌ Failed to copy ${filename}: ${error.message}`);
	}
	}
	}
	}

	// Copy images from media directory (Notion images)
	const mediaDir = join(outputDir, 'media');
	copyImagesRecursively(mediaDir, 'Notion image');

	// Copy images from external-images directory (downloaded external images)
	const externalImagesDir = join(outputDir, 'external-images');
	copyImagesRecursively(externalImagesDir, 'external image');

	if (totalImageCount > 0) {
	console.log(` ✅ Copied ${totalImageCount} total image(s) to ${ASTRO_ASSETS_PATH}`);
	}

	// Always update image paths and filter problematic references in MDX file
	if (existsSync(ASTRO_CONTENT_PATH)) {
	const mdxContent = readFileSync(ASTRO_CONTENT_PATH, 'utf8');
	let updatedContent = mdxContent.replace(/\.\/media\//g, './assets/image/');
	// Remove the subdirectory from image paths since we copy images directly to assets/image/
	updatedContent = updatedContent.replace(/\.\/assets\/image\/[^\/]+\//g, './assets/image/');

	// Check which images actually exist and remove references to missing/corrupted ones
	const imageReferences = updatedContent.match(/\.\/assets\/image\/[^\s\)]+/g) \|\| [];
	const existingImages = existsSync(ASTRO_ASSETS_PATH) ? readdirSync(ASTRO_ASSETS_PATH).filter(f =>
	['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.tiff'].some(ext => f.toLowerCase().endsWith(ext))
	) : [];

	for (const imgRef of imageReferences) {
	const filename = basename(imgRef);
	if (!existingImages.includes(filename)) {
	console.log(` ⚠️ Removing reference to missing/corrupted image: ${filename}`);
	// Remove the entire image reference (both Image component and markdown syntax)
	updatedContent = updatedContent.replace(
	new RegExp(`<Image[^>]src=["']${imgRef.replace(/[.+?^${}()\|[\]\\]/g, '\\$&')}["'][^>]*\/?>`, 'g'),
	''
	);
	updatedContent = updatedContent.replace(
	new RegExp(`!\\[.?\\]\\(${imgRef.replace(/[.+?^${}()\|[\]\\]/g, '\\$&')}\\)`, 'g'),
	''
	);
	}
	}

	writeFileSync(ASTRO_CONTENT_PATH, updatedContent);
	console.log(` ✅ Updated image paths and filtered problematic references in MDX file`);
	}

	// Copy static bibliography.bib if it exists, otherwise create empty
	if (existsSync(STATIC_BIB_PATH)) {
	const bibContent = readFileSync(STATIC_BIB_PATH, 'utf8');
	writeFileSync(ASTRO_BIB_PATH, bibContent);
	console.log(` ✅ Copied static bibliography from ${STATIC_BIB_PATH}`);
	} else {
	writeFileSync(ASTRO_BIB_PATH, '');
	console.log(` ✅ Created empty bibliography (no static file found)`);
	}

	} catch (error) {
	console.warn(` ⚠️ Failed to copy to Astro: ${error.message}`);
	}
	}


	async function main() {
	const args = process.argv.slice(2);

	if (args.includes('--help') \|\| args.includes('-h')) {
	showHelp();
	process.exit(0);
	}

	const config = parseArgs();

	console.log('🚀 Notion to MDX Toolkit');
	console.log('========================');

	try {
	// Prepare input config file
	let inputConfigFile = config.input;
	let pageIdFromEnv = null;

	// If NOTION_PAGE_ID is provided via env var, create temporary pages.json
	if (config.pageId && config.token) {
	console.log('✨ Using NOTION_PAGE_ID from environment variable');
	const tempConfigPath = join(config.output, '.temp-pages.json');
	ensureDirectory(config.output);
	await createPagesConfigFromEnv(config.pageId, config.token, tempConfigPath);
	inputConfigFile = tempConfigPath;
	pageIdFromEnv = config.pageId;
	} else if (!existsSync(config.input)) {
	console.error(`❌ No NOTION_PAGE_ID environment variable and no pages.json found at: ${config.input}`);
	console.log('💡 Either set NOTION_PAGE_ID env var or create input/pages.json');
	process.exit(1);
	}

	// Always clean output directory to avoid conflicts with previous imports
	console.log('🧹 Cleaning output directory to avoid conflicts...');
	await cleanDirectory(config.output);

	// Clean assets/image directory and ensure proper permissions
	console.log('🧹 Cleaning assets/image directory and setting permissions...');
	if (existsSync(ASTRO_ASSETS_PATH)) {
	await cleanDirectory(ASTRO_ASSETS_PATH);
	} else {
	ensureDirectory(ASTRO_ASSETS_PATH);
	}

	// Ensure proper permissions for assets directory
	const { execSync } = await import('child_process');
	try {
	execSync(`chmod -R 755 "${ASTRO_ASSETS_PATH}"`, { stdio: 'inherit' });
	console.log(' ✅ Set permissions for assets/image directory');
	} catch (error) {
	console.log(' ⚠️ Could not set permissions (non-critical):', error.message);
	}

	if (config.mdxOnly) {
	// Only convert existing Markdown to MDX
	console.log('📝 MDX conversion only mode');
	await convertToMdx(config.output, config.output);
	copyToAstroContent(config.output);

	} else if (config.notionOnly) {
	// Only convert Notion to Markdown
	console.log('📄 Notion conversion only mode');
	await convertNotionToMarkdown(inputConfigFile, config.output, config.token);

	} else {
	// Full workflow
	console.log('🔄 Full conversion workflow');

	// Step 1: Convert Notion to Markdown
	console.log('\n📄 Step 1: Converting Notion pages to Markdown...');
	await convertNotionToMarkdown(inputConfigFile, config.output, config.token);

	// Step 2: Convert Markdown to MDX with Notion metadata
	console.log('\n📝 Step 2: Converting Markdown to MDX...');
	const pagesConfig = readPagesConfig(inputConfigFile);
	const firstPage = pagesConfig.pages && pagesConfig.pages.length > 0 ? pagesConfig.pages[0] : null;
	const pageId = pageIdFromEnv \|\| (firstPage ? firstPage.id : null);
	await convertToMdx(config.output, config.output, pageId, config.token);

	// Step 3: Copy to Astro content directory
	console.log('\n📋 Step 3: Copying to Astro content directory...');
	copyToAstroContent(config.output);
	}

	console.log('\n🎉 Conversion completed successfully!');

	} catch (error) {
	console.error('❌ Error:', error.message);
	process.exit(1);
	}
	}

	// Export functions for use as module
	export { convertNotionToMarkdown, convertToMdx };

	// Run CLI if called directly
	if (import.meta.url === `file://${process.argv[1]}`) {
	main();
	}