Spaces:
Running
Running
| document.addEventListener('DOMContentLoaded', function() { | |
// References to the page elements this script reads from and writes to.
const uploadBtn = document.getElementById('uploadBtn');
const fileInput = document.getElementById('fileInput');
const filePreviewList = document.getElementById('filePreviewList');
const filePreviewContainer = document.getElementById('filePreviewContainer');
const processBtn = document.getElementById('processBtn');
const outputFormat = document.getElementById('outputFormat');
const resultsContainer = document.getElementById('resultsContainer');
const resultsSection = document.getElementById('resultsSection');
const downloadAllBtn = document.getElementById('downloadAllBtn');

// Files picked in the file input, and the results produced by the last
// click on the process button.
let files = [];
let processedResults = [];

// Location of the PDF.js worker script.
// NOTE: `isEvalSupported` is a per-document getDocument() option, not a
// GlobalWorkerOptions field, so it is not assigned here.
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';

// Handle file selection: the button forwards its click to the file input,
// and a changed selection rebuilds the preview list.
uploadBtn.addEventListener('click', () => fileInput.click());
fileInput.addEventListener('change', handleFileSelection);
/**
 * Rebuilds the file preview list from the file input's current selection.
 *
 * Stores the selection in the shared `files` array, renders one preview card
 * per file and shows the preview container, or hides it when the selection
 * is empty.
 *
 * @param {Event} e - `change` event whose target exposes a `files` list.
 */
function handleFileSelection(e) {
  files = Array.from(e.target.files);
  filePreviewList.innerHTML = '';

  if (files.length === 0) {
    filePreviewContainer.classList.add('hidden');
    return;
  }

  files.forEach((file, index) => {
    filePreviewList.appendChild(createFilePreview(file, index));
  });
  filePreviewContainer.classList.remove('hidden');

  // Render the icons only after every card is attached, so the
  // `data-feather` placeholder of the last card is replaced as well.
  feather.replace();
}
/**
 * Builds the preview card for one selected file: an icon and the file name
 * on the left, the formatted size on the right.
 *
 * @param {File} file - Selected file; `name`, `size` and `type` are read.
 * @param {number} index - Position in the selection (currently unused).
 * @returns {HTMLDivElement} Detached card element; the caller attaches it.
 */
function createFilePreview(file, index) {
  const fileIcon = document.createElement('i');
  fileIcon.dataset.feather = getFileIcon(file);

  const icon = document.createElement('div');
  icon.className = 'bg-gray-200 p-2 rounded-full mr-3';
  icon.appendChild(fileIcon);

  const fileName = document.createElement('span');
  fileName.className = 'font-medium text-gray-800';
  // textContent (not innerHTML) so the file name is never parsed as markup.
  fileName.textContent = file.name;

  const fileInfo = document.createElement('div');
  fileInfo.className = 'flex items-center';
  fileInfo.appendChild(icon);
  fileInfo.appendChild(fileName);

  const fileSize = document.createElement('span');
  fileSize.className = 'text-gray-500 text-sm';
  fileSize.textContent = formatFileSize(file.size);

  const card = document.createElement('div');
  card.className = 'file-card bg-gray-50 rounded-lg p-4 flex items-center justify-between';
  card.appendChild(fileInfo);
  card.appendChild(fileSize);

  // NOTE(review): the card is still detached at this point, so this call can
  // only affect placeholders already in the document, not this card's icon.
  feather.replace();
  return card;
}
/**
 * Picks the Feather icon name for a file based on its MIME type.
 *
 * @param {{type?: string}} file - File-like object; a missing type is
 *   treated as an empty string.
 * @returns {string} `'image'`, `'file-text'` (Word/Excel-like types) or
 *   `'file'` (PDFs and everything else).
 */
function getFileIcon(file) {
  const mimeType = file.type ?? '';
  const mentions = (...keywords) => keywords.some((keyword) => mimeType.includes(keyword));

  if (mentions('pdf')) return 'file';
  if (mentions('word', 'document', 'excel', 'spreadsheet')) return 'file-text';
  if (mentions('image')) return 'image';
  return 'file';
}
/**
 * Formats a byte count as a short human-readable string using 1024-based
 * units, with at most two decimals (trailing zeros dropped).
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} E.g. `'0 Bytes'`, `'1.5 KB'`, `'5 TB'`. Zero, negative
 *   and non-finite inputs yield `'0 Bytes'`; sizes beyond the largest unit
 *   are expressed in that unit.
 */
function formatFileSize(bytes) {
  if (!Number.isFinite(bytes) || bytes <= 0) return '0 Bytes';

  const k = 1024;
  const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB'];
  const rawExponent = Math.floor(Math.log(bytes) / Math.log(k));
  // Clamp so fractional inputs (< 1 byte) and huge inputs still map to a unit.
  const exponent = Math.min(Math.max(rawExponent, 0), sizes.length - 1);

  return `${parseFloat((bytes / k ** exponent).toFixed(2))} ${sizes[exponent]}`;
}
// Process files: run every selected file through processFile(), render each
// result as soon as it is ready, and restore the button when done.
processBtn.addEventListener('click', async function() {
  /**
   * Registers the bundled Turkish traineddata once per page.
   * Guarded because not every Tesseract.js build exposes `addLanguageData`;
   * calling it unconditionally would throw after each run.
   */
  function loadTurkishLanguageData() {
    if (window.tesseractTurDataLoaded) return;
    if (typeof Tesseract === 'undefined' || typeof Tesseract.addLanguageData !== 'function') {
      console.warn('Tesseract.addLanguageData is not available; skipping Turkish language data preload');
      return;
    }
    Tesseract.addLanguageData('tur', {
      data: '/static/tesseract/tur.traineddata.gz'
    });
    window.tesseractTurDataLoaded = true;
  }

  if (files.length === 0) {
    alert('Please select at least one file');
    return;
  }

  resultsContainer.innerHTML = '';
  processedResults = [];
  processBtn.disabled = true;
  processBtn.innerHTML = '<i data-feather="loader" class="spinner mr-2"></i> Processing...';
  feather.replace();

  try {
    // Language data is registered before any OCR can start.
    loadTurkishLanguageData();

    // Files are processed one after another; the first failure stops the run.
    for (const file of files) {
      const result = await processFile(file);
      processedResults.push(result);
      displayResult(result);
    }
  } catch (error) {
    console.error('Error processing files:', error);
    alert('An error occurred while processing files: ' + error.message);
  } finally {
    // Results rendered before a failure stay visible instead of remaining
    // inside a hidden section.
    if (processedResults.length > 0) {
      resultsSection.classList.remove('hidden');
    }
    processBtn.disabled = false;
    processBtn.innerHTML = '<i data-feather="cpu" class="mr-2"></i> Process Files';
    feather.replace();
  }
});
/**
 * Extracts the text of one file and renders it in the output format
 * currently selected in the `outputFormat` control.
 *
 * Routing is by MIME type with a file-extension fallback. Spreadsheets are
 * tested before Word documents because the .xlsx MIME type
 * ("...officedocument.spreadsheetml.sheet") also contains "document".
 *
 * @param {File} file - File to process; `name`, `type` and `size` are read.
 * @returns {Promise<{fileName: string, content: string, format: string}>}
 * @throws {Error} When the file matches none of the supported kinds.
 */
async function processFile(file) {
  const format = outputFormat.value;
  const mimeType = file.type ?? '';
  const lowerName = (file.name ?? '').toLowerCase();
  const mimeMentions = (...keywords) => keywords.some((keyword) => mimeType.includes(keyword));
  const hasExtension = (...extensions) => extensions.some((ext) => lowerName.endsWith(ext));

  let content;
  if (mimeMentions('pdf')) {
    content = await extractTextFromPDF(file);
  } else if (mimeMentions('excel', 'spreadsheet') || hasExtension('.xlsx', '.xls')) {
    content = await extractTextFromExcel(file);
  } else if (mimeMentions('word', 'document') || hasExtension('.docx', '.doc')) {
    content = await extractTextFromWord(file);
  } else if (mimeMentions('image')) {
    content = await extractTextFromImage(file);
  } else {
    throw new Error('Unsupported file type: ' + file.type);
  }

  // Extractors may return structured data (e.g. per-sheet objects);
  // serialise anything that is not already a string.
  const cleanContent = typeof content === 'string' ? content : JSON.stringify(content, null, 2);

  let formattedContent;
  if (format === 'json') {
    formattedContent = JSON.stringify(
      {
        fileName: file.name,
        fileType: file.type,
        fileSize: file.size,
        content: cleanContent,
        extractedAt: new Date().toISOString()
      },
      null,
      2
    );
  } else if (format === 'markdown') {
    formattedContent = `# ${file.name}\n\n${cleanContent}`;
  } else if (format === 'formatted') {
    formattedContent = cleanContent
      .replace(/([.!?])\s*/g, '$1\n\n') // Paragraph break after sentence punctuation
      .replace(/\n{3,}/g, '\n\n') // Remove excessive line breaks
      .replace(/([A-ZÇĞİÖŞÜ][a-zçğıöşü]+)\s+/g, '$1 '); // Single space after capitalised words
  } else {
    // Plain text: content is passed through unchanged.
    formattedContent = cleanContent;
  }

  return {
    fileName: file.name,
    content: formattedContent,
    format: format
  };
}
/**
 * Extracts text from a PDF file.
 *
 * Strategy:
 *  1. Position-aware extraction of every page via PDF.js, followed by
 *     decodeTurkishText().
 *  2. If that yields fewer than 50 characters, OCR via enhancedOCRFallback().
 *  3. improveTextQuality() on whichever text was obtained.
 *  4. If any of the above throws, a simplified extraction of the first five
 *     pages is attempted before giving up.
 *
 * @param {Blob} file - PDF file; read with `file.arrayBuffer()`.
 * @returns {Promise<string>} Extracted text.
 * @throws {Error} `'Failed to read PDF file'` when the bytes cannot be read,
 *   or `'Failed to extract text from PDF: …'` when every strategy fails.
 */
async function extractTextFromPDF(file) {
  const CMAP_URL = 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/';
  const STANDARD_FONT_URL = 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/';
  const MIN_USEFUL_LENGTH = 50;
  const FALLBACK_PAGE_LIMIT = 5;

  // Rebuilds one page's text from positioned items, inserting line breaks on
  // vertical jumps and spaces on horizontal gaps.
  const readPageText = async (page) => {
    const viewport = page.getViewport({ scale: 2.0 });
    const textContent = await page.getTextContent({
      normalizeWhitespace: false,
      disableCombineTextItems: false,
      includeMarkedContent: true
    });

    let pageText = '';
    let lastY = null;
    let lastX = null;
    for (const item of textContent.items) {
      // With includeMarkedContent the list also carries marker entries that
      // have no `str`/`transform`; they hold no text and must be skipped.
      if (typeof item.str !== 'string') continue;

      const tx = pdfjsLib.Util.transform(viewport.transform, item.transform);
      const x = tx[4];
      const y = tx[5];
      if (lastY !== null && Math.abs(y - lastY) > item.height * 0.8) {
        pageText += '\n';
      }
      if (lastX !== null && x - lastX > item.width * 0.3) {
        pageText += ' ';
      }
      pageText += item.str;
      lastY = y;
      lastX = x + item.width;
    }

    // Collapse horizontal whitespace only, so the line breaks added above
    // survive; then cap consecutive blank lines.
    return pageText
      .replace(/[^\S\n]+/g, ' ')
      .replace(/ ?\n ?/g, '\n')
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  };

  // Primary path: full extraction, OCR when the text layer is (nearly) empty.
  const extractPrimary = async (pdfBytes) => {
    const pdf = await pdfjsLib.getDocument({
      // Each consumer receives its own copy of the bytes.
      data: pdfBytes.slice(0),
      cMapUrl: CMAP_URL,
      cMapPacked: true,
      standardFontDataUrl: STANDARD_FONT_URL,
      useSystemFonts: true,
      useWorkerFetch: true,
      isEvalSupported: false,
      disableAutoFetch: false,
      disableStream: false
    }).promise;

    let fullText = '';
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const pageText = await readPageText(page);
      if (pageText) {
        fullText += pageText + '\n\n';
      }
    }

    fullText = decodeTurkishText(fullText);
    if (fullText.trim().length < MIN_USEFUL_LENGTH) {
      console.warn('Primary text extraction failed, attempting enhanced OCR...');
      fullText = await enhancedOCRFallback(pdfBytes.slice(0));
    }
    return improveTextQuality(fullText);
  };

  // Fallback path: default PDF.js options, items joined with spaces.
  const extractSimple = async (pdfBytes) => {
    const pdf = await pdfjsLib.getDocument({ data: pdfBytes.slice(0) }).promise;
    let text = '';
    const lastPage = Math.min(pdf.numPages, FALLBACK_PAGE_LIMIT);
    for (let pageNumber = 1; pageNumber <= lastPage; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const { items } = await page.getTextContent();
      const pageText = items
        .filter((item) => typeof item.str === 'string')
        .map((item) => item.str)
        .join(' ');
      text += pageText + '\n';
    }
    return text;
  };

  let pdfBytes;
  try {
    pdfBytes = await file.arrayBuffer();
  } catch (readError) {
    throw new Error('Failed to read PDF file', { cause: readError });
  }

  try {
    return await extractPrimary(pdfBytes);
  } catch (error) {
    console.error('PDF extraction error:', error);
    try {
      console.warn('Attempting simplified PDF extraction...');
      const simpleText = await extractSimple(pdfBytes);
      if (simpleText.trim()) {
        return decodeTurkishText(improveTextQuality(simpleText));
      }
    } catch (fallbackError) {
      console.error('Fallback extraction also failed:', fallbackError);
    }
    throw new Error('Failed to extract text from PDF: ' + error.message, { cause: error });
  }
}
/**
 * Repairs Turkish letters that were UTF-8 encoded and then mis-decoded as a
 * single-byte code page (e.g. "Ä°stanbul" -> "İstanbul"), then decodes HTML
 * entities when the `he` library is available on the page.
 *
 * Only the exact two-character mojibake sequences of the twelve Turkish
 * letters are rewritten; correctly encoded text, including its letter case,
 * is left untouched.
 *
 * @param {string} text - Text to repair; null/undefined is treated as ''.
 * @returns {string} Repaired text.
 */
function decodeTurkishText(text) {
  // Key: the UTF-8 byte pair of the letter as it appears when read as
  // Latin-1 (C1 control in second position) or as Windows-125x (printable
  // symbol in second position). Value: the intended letter.
  const MOJIBAKE_TO_LETTER = new Map([
    ['\u00C3\u00A7', 'ç'],
    ['\u00C3\u0087', 'Ç'], ['\u00C3\u2021', 'Ç'],
    ['\u00C4\u009F', 'ğ'], ['\u00C4\u0178', 'ğ'],
    ['\u00C4\u009E', 'Ğ'], ['\u00C4\u017E', 'Ğ'],
    ['\u00C3\u00BC', 'ü'],
    ['\u00C3\u009C', 'Ü'], ['\u00C3\u0153', 'Ü'],
    ['\u00C5\u009F', 'ş'], ['\u00C5\u0178', 'ş'],
    ['\u00C5\u009E', 'Ş'], ['\u00C5\u017E', 'Ş'],
    ['\u00C4\u00B1', 'ı'],
    ['\u00C4\u00B0', 'İ'],
    ['\u00C3\u00B6', 'ö'],
    ['\u00C3\u0096', 'Ö'], ['\u00C3\u2013', 'Ö']
  ]);
  // Lead byte (Ã, Ä or Å) followed by any character a continuation byte can
  // turn into under the code pages above.
  const MOJIBAKE_PAIR = /[\u00C3-\u00C5][\u0080-\u00BF\u0153\u0178\u017E\u2013\u2021]/g;

  let decodedText = String(text ?? '').replace(
    MOJIBAKE_PAIR,
    (pair) => MOJIBAKE_TO_LETTER.get(pair) ?? pair
  );

  // Apply HTML entity decoding if the optional `he` library is loaded.
  if (typeof he !== 'undefined' && typeof he.decode === 'function') {
    try {
      decodedText = he.decode(decodedText);
    } catch (e) {
      console.warn('HTML decoding failed:', e);
    }
  }
  return decodedText;
}
/**
 * OCR fallback for PDFs without a usable text layer.
 *
 * Renders the PDF to page images via convertPDFToImagesEnhanced(), runs
 * three Tesseract configurations per page concurrently, and keeps the
 * longest successful text of each page.
 *
 * @param {ArrayBuffer|Uint8Array} pdfData - Raw PDF bytes.
 * @returns {Promise<string>} Page texts joined by blank lines, or a
 *   placeholder sentence when no page produced text.
 * @throws {Error} When `pdfData` is neither an ArrayBuffer nor a Uint8Array.
 */
async function enhancedOCRFallback(pdfData) {
  let arrayBuffer;
  if (pdfData instanceof ArrayBuffer) {
    arrayBuffer = pdfData;
  } else if (pdfData instanceof Uint8Array) {
    // A view may cover only part of its buffer; copy exactly that window.
    const { buffer, byteOffset, byteLength } = pdfData;
    arrayBuffer = buffer.slice(byteOffset, byteOffset + byteLength);
  } else {
    throw new Error('Invalid PDF data format for OCR fallback');
  }

  const images = await convertPDFToImagesEnhanced(arrayBuffer);
  const pageTexts = [];

  for (const image of images) {
    // All three attempts receive the same page image object.
    const attempts = await Promise.allSettled([
      // Tesseract with Turkish and English
      extractTextWithTesseract(image, 'tur+eng'),
      // Tesseract with additional preprocessing
      extractTextWithTesseract(image, 'tur+eng', { preprocess: true }),
      // English only
      extractTextWithTesseract(image, 'eng')
    ]);

    // Longest fulfilled text wins; on equal length the earlier attempt is kept.
    let bestText = '';
    for (const attempt of attempts) {
      if (attempt.status === 'fulfilled' && attempt.value.length > bestText.length) {
        bestText = attempt.value;
      }
    }
    if (bestText) {
      pageTexts.push(bestText);
    }
  }

  return pageTexts.join('\n\n') || 'OCR processing completed but no text was extracted.';
}
/**
 * Runs Tesseract on one image and returns the recognised text.
 *
 * @param {*} image - Image accepted by `Tesseract.recognize`; when
 *   `options.preprocess` is set it is first passed through preprocessImage().
 * @param {string} [languages='tur+eng'] - Tesseract language string.
 * @param {{preprocess?: boolean}} [options={}] - Set `preprocess` to run the
 *   image through preprocessImage() before recognition.
 * @returns {Promise<string>} Recognised text.
 * @throws Re-throws any preprocessing or recognition error after logging it.
 */
async function extractTextWithTesseract(image, languages = 'tur+eng', options = {}) {
  const config = {
    // Progress is logged as a percentage; a missing value counts as 0.
    logger: (m) => console.log(`Tesseract: ${m.status} - ${Math.round((m.progress ?? 0) * 100)}%`),
    preserve_interword_spaces: '1',
    tessedit_pageseg_mode: '6',
    tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ğüşıöçĞÜŞİÖÇ@#$%^&*+=<>:;_ ',
    load_system_dawg: '1',
    load_freq_dawg: '1'
  };

  try {
    const input = options.preprocess ? await preprocessImage(image) : image;
    const result = await Tesseract.recognize(input, languages, config);
    return result.data.text;
  } catch (error) {
    console.error('Tesseract OCR error:', error);
    throw error;
  }
}
/**
 * Produces a black-and-white version of a canvas for OCR: every pixel is
 * converted to its luma value and then set to white when that value is
 * above a fixed threshold of 128, otherwise black. Alpha is left unchanged.
 *
 * The work is done on a copy, so the canvas passed in is not modified and
 * can safely be shared with other OCR attempts.
 *
 * @param {HTMLCanvasElement} canvas - Source image.
 * @returns {Promise<HTMLCanvasElement>} New canvas holding the binarised image.
 */
async function preprocessImage(canvas) {
  const THRESHOLD = 128;

  const binarized = document.createElement('canvas');
  binarized.width = canvas.width;
  binarized.height = canvas.height;
  const ctx = binarized.getContext('2d');
  ctx.drawImage(canvas, 0, 0);

  const imageData = ctx.getImageData(0, 0, binarized.width, binarized.height);
  const data = imageData.data;
  for (let i = 0; i < data.length; i += 4) {
    // Store the luma first so the comparison uses the same rounded 8-bit
    // value a separate grayscale pass would have produced.
    data[i] = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114;
    const value = data[i] > THRESHOLD ? 255 : 0;
    data[i] = value;
    data[i + 1] = value;
    data[i + 2] = value;
  }

  ctx.putImageData(imageData, 0, 0);
  return binarized;
}
/**
 * Renders the first pages of a PDF (at most 10) to canvases at 3x scale so
 * they can be fed to OCR.
 *
 * @param {ArrayBuffer|Uint8Array} pdfData - Raw PDF bytes; a private copy is
 *   made, so the caller's buffer is not handed to PDF.js.
 * @returns {Promise<HTMLCanvasElement[]>} One rendered canvas per page.
 * @throws {Error} When `pdfData` is neither an ArrayBuffer nor a Uint8Array.
 */
async function convertPDFToImagesEnhanced(pdfData) {
  const MAX_PAGES = 10;
  const RENDER_SCALE = 3.0;

  let bytes;
  if (pdfData instanceof ArrayBuffer) {
    bytes = new Uint8Array(pdfData.slice(0));
  } else if (pdfData instanceof Uint8Array) {
    // Copy only the window the view covers, not its whole backing buffer.
    bytes = pdfData.slice();
  } else {
    throw new Error('Invalid PDF data format for image conversion');
  }

  const loadingTask = pdfjsLib.getDocument({
    data: bytes,
    cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
    cMapPacked: true,
    standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
    useWorkerFetch: false,
    isEvalSupported: false,
    disableWorker: true
  });

  try {
    const pdf = await loadingTask.promise;
    const images = [];
    const lastPage = Math.min(pdf.numPages, MAX_PAGES);

    for (let pageNumber = 1; pageNumber <= lastPage; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const viewport = page.getViewport({ scale: RENDER_SCALE });

      const canvas = document.createElement('canvas');
      const context = canvas.getContext('2d');
      canvas.height = viewport.height;
      canvas.width = viewport.width;

      await page.render({
        canvasContext: context,
        viewport: viewport,
        renderInteractiveForms: true,
        intent: 'print'
      }).promise;
      images.push(canvas);
    }
    return images;
  } finally {
    // The canvases are already rendered; release the document's resources.
    await loadingTask.destroy();
  }
}
/**
 * Light clean-up of extracted text.
 *
 *  - A digit 0, 1 or 5 that is immediately followed by an uppercase letter
 *    is read as the look-alike letter O, I or S (e.g. "B0OK" -> "BOOK").
 *    All other digits are left alone.
 *  - Runs of spaces/tabs collapse to one space, spaces around line breaks
 *    are removed, and three or more line breaks collapse to one blank line.
 *
 * Letters are never substituted: whether "c" should have been "ç" cannot be
 * decided without a dictionary, and guessing corrupts correct text.
 *
 * @param {string} text - Text to clean; null/undefined is treated as ''.
 * @returns {string} Cleaned, trimmed text.
 */
function improveTextQuality(text) {
  const LOOKALIKE_LETTER = { 0: 'O', 1: 'I', 5: 'S' };

  return String(text ?? '')
    // Single pass with a lookahead, so one replacement cannot trigger the next.
    .replace(/[015](?=[A-ZÇĞİÖŞÜ])/g, (digit) => LOOKALIKE_LETTER[digit])
    // Clean up spacing while keeping line structure.
    .replace(/[^\S\n]+/g, ' ')
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
/**
 * Extracts text from a Word document with mammoth.
 *
 * The raw text is passed through decodeTurkishText() and
 * improveTextQuality(). If fewer than 50 characters remain, the document is
 * also converted to Markdown and that result is used when it is longer.
 *
 * @param {Blob} file - Word file; read with `file.arrayBuffer()`.
 * @returns {Promise<string>} Extracted text.
 * @throws Propagates read errors and mammoth errors.
 */
async function extractTextFromWord(file) {
  const MIN_USEFUL_LENGTH = 50;
  const clean = (raw) => improveTextQuality(decodeTurkishText(raw));

  const arrayBuffer = await file.arrayBuffer();

  // Style maps only affect HTML/Markdown conversion, so raw-text extraction
  // takes just the document bytes.
  const rawResult = await mammoth.extractRawText({ arrayBuffer });
  let text = clean(rawResult.value);

  if (text.trim().length < MIN_USEFUL_LENGTH) {
    console.warn('Primary Word extraction failed, trying alternative...');
    const altResult = await mammoth.convertToMarkdown({ arrayBuffer });
    if (altResult.value && altResult.value.trim().length > text.trim().length) {
      text = clean(altResult.value);
    }
  }
  return text;
}
/**
 * Reads every sheet of a workbook with SheetJS.
 *
 * @param {Blob} file - Spreadsheet file; read with `file.arrayBuffer()`.
 * @returns {Promise<Object<string, {data: Array<Array<*>>, csv: string,
 *   range: string, rowCount: number, colCount: number}>>} Per sheet name:
 *   the rows as arrays (string cells cleaned), a tab-separated dump, the
 *   sheet's `!ref` range, and its row/column counts.
 * @throws Propagates read errors and SheetJS errors.
 */
async function extractTextFromExcel(file) {
  const bytes = new Uint8Array(await file.arrayBuffer());
  const workbook = XLSX.read(bytes, {
    type: 'array',
    codepage: 1254, // Turkish codepage
    cellStyles: true,
    cellHTML: false
  });

  // Same cleaning order as the other extractors: fix encoding first, then tidy.
  const cleanCell = (cell) =>
    (typeof cell === 'string' ? improveTextQuality(decodeTurkishText(cell)) : cell);

  const result = {};
  for (const sheetName of workbook.SheetNames) {
    const worksheet = workbook.Sheets[sheetName];

    const rows = XLSX.utils.sheet_to_json(worksheet, {
      header: 1,
      raw: false,
      dateNF: 'dd/mm/yyyy',
      defval: ''
    });
    const csvData = XLSX.utils.sheet_to_csv(worksheet, {
      FS: '\t',
      RS: '\n',
      dateNF: 'dd/mm/yyyy'
    });

    result[sheetName] = {
      data: rows.map((row) => row.map(cleanCell)),
      csv: decodeTurkishText(csvData),
      range: worksheet['!ref'] || '',
      rowCount: rows.length,
      colCount: rows[0] ? rows[0].length : 0
    };
  }
  return result;
}
/**
 * Renders a PDF to page images; thin wrapper around
 * convertPDFToImagesEnhanced() that hands it a private copy of the bytes.
 *
 * @param {ArrayBuffer|Uint8Array|*} pdfData - Raw PDF bytes. Any other value
 *   is forwarded unchanged.
 * @returns {Promise<HTMLCanvasElement[]>} Whatever the enhanced converter returns.
 */
async function convertPDFToImages(pdfData) {
  if (pdfData instanceof ArrayBuffer) {
    return convertPDFToImagesEnhanced(pdfData.slice(0));
  }
  if (pdfData instanceof Uint8Array) {
    // Copy only the bytes the view covers, not its whole backing buffer.
    const { buffer, byteOffset, byteLength } = pdfData;
    return convertPDFToImagesEnhanced(buffer.slice(byteOffset, byteOffset + byteLength));
  }
  return convertPDFToImagesEnhanced(pdfData);
}
| async function extractTextFromImage(file) { | |
| return new Promise(async (resolve, reject) => { | |
| try { | |
| const imageElement = file instanceof HTMLCanvasElement ? file : file; | |
| // Apply advanced preprocessing | |
| const processedImages = await applyAdvancedPreprocessing(imageElement); | |
| // Multi-strategy OCR approach | |
| const ocrResults = []; | |
| for (const processedImage of processedImages) { | |
| const results = await Promise.allSettled([ | |
| // Strategy 1: Turkish with best settings | |
| performAdvancedOCR(processedImage, 'tur', { | |
| tessedit_pageseg_mode: '6', | |
| preserve_interword_spaces: '1', | |
| tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ', | |
| tessedit_ocr_engine_mode: '1', | |
| tessedit_do_ocr: '1', | |
| tessedit_load_image: '1' | |
| }), | |
| // Strategy 2: Turkish+English with auto segmentation | |
| performAdvancedOCR(processedImage, 'tur+eng', { | |
| tessedit_pageseg_mode: '1', | |
| preserve_interword_spaces: '1', | |
| tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ', | |
| tessedit_ocr_engine_mode: '1' | |
| }), | |
| // Strategy 3: Single column mode | |
| performAdvancedOCR(processedImage, 'tur', { | |
| tessedit_pageseg_mode: '3', | |
| preserve_interword_spaces: '1', | |
| tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ' | |
| }) | |
| ]); | |
| results.forEach(result => { | |
| if (result.status === 'fulfilled' && result.value.text.trim().length > 10) { | |
| ocrResults.push({ | |
| text: result.value.text, | |
| confidence: result.value.confidence || calculateConfidence(result.value.text), | |
| strategy: result.value.strategy | |
| }); | |
| } | |
| }); | |
| } | |
| // Select best result using advanced scoring | |
| const bestResult = selectBestResult(ocrResults); | |
| if (bestResult) { | |
| // Apply document structure analysis | |
| const structuredText = await analyzeDocumentStructure(bestResult.text); | |
| // Apply intelligent Turkish text corrections | |
| const correctedText = applyIntelligentTurkishCorrections(structuredText); | |
| resolve(correctedText); | |
| } else { | |
| resolve('No text could be extracted from the image.'); | |
| } | |
| } catch (error) { | |
| console.error('Enhanced image OCR error:', error); | |
| reject(error); | |
| } | |
| }); | |
/**
 * Builds the set of image variants that OCR is attempted on: the original
 * image plus contrast-enhanced, denoised, sharpened and binarised versions.
 *
 * Each enhancement receives its own copy of the original, so the variants
 * are distinct canvases and the original stays unmodified.
 *
 * @param {HTMLCanvasElement|Blob} imageElement - Source image; anything that
 *   is not a canvas is first converted with imageToCanvas().
 * @returns {Promise<HTMLCanvasElement[]>} Original first, then every variant
 *   for which applyImageEnhancement() did not return null.
 */
async function applyAdvancedPreprocessing(imageElement) {
  const ENHANCEMENTS = ['contrast', 'denoise', 'sharpen', 'binarize'];

  const cloneCanvas = (source) => {
    const copy = document.createElement('canvas');
    copy.width = source.width;
    copy.height = source.height;
    copy.getContext('2d').drawImage(source, 0, 0);
    return copy;
  };

  const original = imageElement instanceof HTMLCanvasElement
    ? imageElement
    : await imageToCanvas(imageElement);

  const processedImages = [original];
  // Sequential on purpose: each step works on full-size pixel data.
  for (const enhancement of ENHANCEMENTS) {
    const variant = await applyImageEnhancement(cloneCanvas(original), enhancement);
    if (variant !== null) {
      processedImages.push(variant);
    }
  }
  return processedImages;
}
/**
 * Decodes an image and draws it onto a new canvas of the same size.
 *
 * @param {HTMLCanvasElement|Blob} image - A canvas (re-encoded via
 *   `toDataURL()`) or a Blob/File (loaded through an object URL).
 * @returns {Promise<HTMLCanvasElement>} Canvas containing the image.
 * @throws {Error} When the browser cannot load the image.
 */
async function imageToCanvas(image) {
  const isCanvas = image instanceof HTMLCanvasElement;
  const source = isCanvas ? image.toDataURL() : URL.createObjectURL(image);

  try {
    const img = await new Promise((resolve, reject) => {
      const element = new Image();
      element.onload = () => resolve(element);
      // Without this handler an undecodable file would leave the promise
      // pending forever.
      element.onerror = () => reject(new Error('Failed to load image for OCR'));
      element.src = source;
    });

    const canvas = document.createElement('canvas');
    canvas.width = img.width;
    canvas.height = img.height;
    canvas.getContext('2d').drawImage(img, 0, 0);
    return canvas;
  } finally {
    // Object URLs keep the blob alive until revoked.
    if (!isCanvas) {
      URL.revokeObjectURL(source);
    }
  }
}
/**
 * Returns an enhanced copy of a canvas; the canvas passed in is not modified.
 *
 * Supported `type` values:
 *  - `'contrast'`: scales each colour channel's distance from 128 by 1.5.
 *  - `'denoise'`:  pulls a channel to the pixel's channel average when it
 *                  differs from that average by more than 30.
 *  - `'sharpen'`:  3x3 convolution with kernel [0,-1,0,-1,5,-1,0,-1,0];
 *                  neighbours outside the image are ignored; alpha is set to 255.
 *  - `'binarize'`: luma above 128 becomes white, otherwise black.
 * Any other value yields an unmodified copy.
 *
 * @param {HTMLCanvasElement} canvas - Source image.
 * @param {string} type - Enhancement to apply.
 * @returns {Promise<HTMLCanvasElement>} New canvas with the enhancement applied.
 */
async function applyImageEnhancement(canvas, type) {
  const { width, height } = canvas;
  const enhanced = document.createElement('canvas');
  enhanced.width = width;
  enhanced.height = height;
  const ctx = enhanced.getContext('2d');
  ctx.drawImage(canvas, 0, 0);

  const imageData = ctx.getImageData(0, 0, width, height);
  const data = imageData.data;

  switch (type) {
    case 'contrast': {
      const contrast = 1.5;
      for (let i = 0; i < data.length; i += 4) {
        // Out-of-range results are clamped to 0..255 by the pixel array.
        data[i] = ((data[i] - 128) * contrast) + 128;
        data[i + 1] = ((data[i + 1] - 128) * contrast) + 128;
        data[i + 2] = ((data[i + 2] - 128) * contrast) + 128;
      }
      break;
    }
    case 'denoise': {
      const threshold = 30;
      for (let i = 0; i < data.length; i += 4) {
        const avg = (data[i] + data[i + 1] + data[i + 2]) / 3;
        if (Math.abs(data[i] - avg) > threshold) data[i] = avg;
        if (Math.abs(data[i + 1] - avg) > threshold) data[i + 1] = avg;
        if (Math.abs(data[i + 2] - avg) > threshold) data[i + 2] = avg;
      }
      break;
    }
    case 'sharpen': {
      const weights = [0, -1, 0, -1, 5, -1, 0, -1, 0];
      const side = Math.round(Math.sqrt(weights.length));
      const halfSide = Math.floor(side / 2);
      // Written to a separate buffer so already-sharpened pixels are never
      // used as neighbours.
      const output = ctx.createImageData(width, height);
      const dst = output.data;

      for (let y = 0; y < height; y++) {
        for (let x = 0; x < width; x++) {
          let r = 0;
          let g = 0;
          let b = 0;
          for (let cy = 0; cy < side; cy++) {
            for (let cx = 0; cx < side; cx++) {
              const scy = y + cy - halfSide;
              const scx = x + cx - halfSide;
              if (scy < 0 || scy >= height || scx < 0 || scx >= width) continue;
              const srcOff = (scy * width + scx) * 4;
              const wt = weights[cy * side + cx];
              r += data[srcOff] * wt;
              g += data[srcOff + 1] * wt;
              b += data[srcOff + 2] * wt;
            }
          }
          const dstOff = (y * width + x) * 4;
          dst[dstOff] = r;
          dst[dstOff + 1] = g;
          dst[dstOff + 2] = b;
          dst[dstOff + 3] = 255;
        }
      }
      ctx.putImageData(output, 0, 0);
      return enhanced;
    }
    case 'binarize': {
      for (let i = 0; i < data.length; i += 4) {
        const gray = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114;
        const value = gray > 128 ? 255 : 0;
        data[i] = value;
        data[i + 1] = value;
        data[i + 2] = value;
      }
      break;
    }
    default:
      break;
  }

  ctx.putImageData(imageData, 0, 0);
  return enhanced;
}
/**
 * Runs one OCR strategy and reports its text and confidence. Never rejects:
 * a failed recognition is logged and reported as empty text.
 *
 * @param {*} image - Image accepted by `Tesseract.recognize`.
 * @param {string} languages - Tesseract language string, e.g. `'tur+eng'`.
 * @param {Object} [config={}] - Options merged over the default logger and
 *   passed to `Tesseract.recognize`; `tessedit_pageseg_mode` is also used in
 *   the strategy label.
 * @returns {Promise<{text: string, confidence: number, strategy?: string}>}
 *   `strategy` is only present on success.
 */
async function performAdvancedOCR(image, languages, config = {}) {
  const logProgress = (m) =>
    console.log(`OCR (${languages}): ${m.status} - ${Math.round(m.progress * 100)}%`);

  try {
    const { data } = await Tesseract.recognize(image, languages, {
      logger: logProgress,
      ...config
    });
    return {
      text: data.text,
      confidence: data.confidence ?? 0,
      strategy: `OCR_${languages}_${config.tessedit_pageseg_mode}`
    };
  } catch (error) {
    console.error(`OCR strategy failed:`, error);
    return { text: '', confidence: 0 };
  }
}
/**
 * Picks the OCR result with the highest calculateAdvancedScore().
 *
 * @param {Array<{text: string, confidence: number}>} results - Candidates.
 * @returns {?Object} The best candidate (the earliest one on a tie), or
 *   `null` when there are no candidates.
 */
function selectBestResult(results) {
  if (!Array.isArray(results) || results.length === 0) return null;

  let bestResult = null;
  let bestScore = -1;
  for (const candidate of results) {
    const score = calculateAdvancedScore(candidate.text, candidate.confidence);
    // Strict comparison keeps the first of several equally scored results.
    if (score > bestScore) {
      bestScore = score;
      bestResult = candidate;
    }
  }
  return bestResult;
}
/**
 * Heuristic quality score (0-100) for an OCR result, used to compare
 * candidates for the same image.
 *
 * Starting from the engine confidence it adds:
 *  - up to 40 points for the share of Turkish-specific letters among all
 *    non-whitespace characters,
 *  - up to 20 points for the share of words containing such a letter,
 *  - up to 20 points for average sentence length (10+ words = full marks).
 * Texts shorter than 20 characters have their score halved.
 *
 * @param {string} text - Recognised text.
 * @param {number} [baseConfidence] - Engine confidence; falsy counts as 0.
 * @returns {number} Score, capped at 100; 0 for empty text.
 */
function calculateAdvancedScore(text, baseConfidence) {
  if (!text || text.trim().length === 0) return 0;

  const TURKISH_LETTER = /[ğüşıöçĞÜŞİÖÇ]/;
  let score = baseConfidence || 0;

  // Turkish character density
  const turkishChars = (text.match(/[ğüşıöçĞÜŞİÖÇ]/g) || []).length;
  const totalChars = text.replace(/\s/g, '').length;
  score += (totalChars > 0 ? turkishChars / totalChars : 0) * 40;

  // Share of words containing a Turkish-specific letter. Words are matched
  // with Unicode classes: `\w` is ASCII-only and would split every word at
  // its Turkish letters.
  const words = text.match(/[\p{L}\p{N}_]+/gu) || [];
  const turkishWords = words.filter((word) => TURKISH_LETTER.test(word));
  score += (words.length > 0 ? turkishWords.length / words.length : 0) * 20;

  // Sentence structure
  const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 5);
  const totalSentenceWords = sentences.reduce((sum, s) => sum + s.split(/\s+/).length, 0);
  const avgSentenceLength = sentences.length > 0 ? totalSentenceWords / sentences.length : 0;
  score += Math.min(avgSentenceLength / 10, 1) * 20;

  // Very short texts are unreliable.
  if (text.trim().length < 20) score *= 0.5;

  return Math.min(score, 100);
}
/**
 * Groups the non-empty lines of a text into sections (heading, list, table
 * or paragraph) and returns them formatted by formatStructuredText().
 *
 * Every heading line becomes its own section with a detected level.
 * Consecutive list, table or paragraph lines are merged into one section.
 * A line is classified by the first matching test, in this order:
 * isHeading, isListItem, isTableRow, otherwise paragraph.
 *
 * @param {string} text - Plain text, lines separated by '\n'.
 * @returns {Promise<*>} Whatever formatStructuredText() returns for the sections.
 */
async function analyzeDocumentStructure(text) {
  const classify = (line) => {
    if (isHeading(line)) return 'heading';
    if (isListItem(line)) return 'list';
    if (isTableRow(line)) return 'table';
    return 'paragraph';
  };

  const structuredSections = [];
  let currentSection = { type: 'paragraph', content: [], level: 0 };
  const closeCurrentSection = () => {
    // Sections without lines (e.g. the initial one) are never emitted.
    if (currentSection.content.length > 0) {
      structuredSections.push(currentSection);
    }
  };

  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    if (line.length === 0) continue;

    const type = classify(line);
    if (type === 'heading') {
      closeCurrentSection();
      currentSection = { type, content: [line], level: detectHeadingLevel(line) };
      continue;
    }

    if (currentSection.type !== type) {
      closeCurrentSection();
      currentSection = { type, content: [], level: 0 };
    }
    currentSection.content.push(line);
  }
  closeCurrentSection();

  return formatStructuredText(structuredSections);
}
/**
 * Heuristically decides whether a single (already trimmed) line is a heading.
 *
 * A line counts as a heading when any of these holds:
 *  - it is short (< 50 chars, at most 8 words) and more than 60% of its words
 *    start with an uppercase Latin/Turkish letter (title case or all caps);
 *  - it ends with a colon;
 *  - it starts with a number (optionally followed by a dot), whitespace and
 *    an uppercase letter, e.g. "1. Giriş" or "2 Yöntem".
 *
 * The checks are independent: failing the capitalisation test on a short line
 * no longer prevents the colon and numbering tests from running.
 *
 * @param {string} line - One line of text.
 * @returns {boolean} True when the line looks like a heading.
 */
function isHeading(line) {
  const words = line.split(/\s+/);

  // Short, mostly capitalised lines are likely headings.
  if (line.length < 50 && words.length <= 8) {
    const capitalised = words.filter((word) => /^[A-ZÇĞİÖŞÜ]/.test(word));
    if (capitalised.length / words.length > 0.6) {
      return true;
    }
  }

  // Lines with a colon at the end are often headings.
  if (line.endsWith(':')) {
    return true;
  }

  // Numbered headings.
  return /^\d+\.?\s+[A-ZÇĞİÖŞÜ]/.test(line);
}
/**
 * Estimates the nesting level of a heading line.
 *
 * When the line starts with dotted numbering followed by whitespace
 * ("1 ", "1. ", "1.1 ", "1.2.3. "), the level is the number of numeric
 * components, capped at 6. Otherwise the level is guessed from the line
 * length: under 30 characters -> 1, under 40 -> 2, anything longer -> 3.
 *
 * @param {string} line - Heading text.
 * @returns {number} Heading level, 1 (top) to 6.
 */
function detectHeadingLevel(line) {
  // An optional trailing dot is accepted so "1. Title" is treated as numbered.
  const numbering = /^(\d+(?:\.\d+)*)\.?\s+/.exec(line);
  if (numbering) {
    return Math.min(numbering[1].split('.').length, 6);
  }

  if (line.length < 30) return 1;
  if (line.length < 40) return 2;
  return 3;
}
/**
 * Tells whether a line starts with a list marker followed by whitespace.
 *
 * Recognised markers: a bullet (-, * or •), a number with a dot ("1."),
 * or a single lowercase letter in parentheses ("(a)").
 *
 * @param {string} line - One line of text.
 * @returns {boolean} True when the line starts with a list marker.
 */
function isListItem(line) {
  const markerPatterns = [
    /^[-*•]\s+/,
    /^\d+\.\s+/,
    /^\([a-z]\)\s+/,
  ];
  return markerPatterns.some((pattern) => pattern.test(line));
}
/**
 * Tells whether a line looks like a table row: it contains at least two tab
 * characters or at least three pipe characters.
 *
 * @param {string} line - One line of text.
 * @returns {boolean} True when the line looks like a table row.
 */
function isTableRow(line) {
  const countOf = (separator) => line.split(separator).length - 1;

  if (countOf('\t') >= 2) {
    return true;
  }
  return countOf('|') >= 3;
}
/**
 * Renders the sections in the format currently selected in `outputFormat`.
 *
 * 'markdown', 'json' and 'formatted' are delegated to their dedicated
 * formatters; any other value yields plain text with each section's lines
 * joined by a space and sections separated by a blank line.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} The rendered text.
 */
function formatStructuredText(sections) {
  switch (outputFormat.value) {
    case 'markdown':
      return formatAsMarkdown(sections);
    case 'json':
      return formatAsJSON(sections);
    case 'formatted':
      return formatAsStructuredText(sections);
    default: {
      const flattened = sections.map((section) => section.content.join(' '));
      return flattened.join('\n\n');
    }
  }
}
/**
 * Renders sections as Markdown.
 *
 *  - heading:   "#" repeated `level` times (clamped to 1..6) plus the text.
 *  - list:      bullet markers (-, *, •) are normalised to "- "; numbered
 *               items ("1. x") are kept as written; anything else gets "- ".
 *               The source marker is never duplicated.
 *  - table:     cells are split on tabs or pipes and trimmed; the empty cells
 *               produced by outer pipes are dropped; a delimiter row is
 *               inserted after the first row so the table renders; delimiter
 *               rows already present in the input are discarded.
 *  - paragraph: lines joined by a single space.
 * Sections of any other type are ignored. Blocks are separated by one blank
 * line and the result is trimmed.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} Markdown text.
 */
function formatAsMarkdown(sections) {
  const splitCells = (row) => {
    const cells = row.split(/\t+|\|/).map((cell) => cell.trim());
    // "| a | b |" splits into ['', 'a', 'b', '']; the outer blanks are not cells.
    if (cells.length > 1 && cells[0] === '') cells.shift();
    if (cells.length > 1 && cells[cells.length - 1] === '') cells.pop();
    return cells;
  };
  const isDelimiterRow = (cells) => cells.every((cell) => /^:?-+:?$/.test(cell));
  const toRow = (cells) => `| ${cells.join(' | ')} |`;

  const renderListItem = (item) => {
    if (/^\d+\.\s+/.test(item)) {
      return item; // already a valid ordered-list item
    }
    return `- ${item.replace(/^[-*•]\s+/, '')}`;
  };

  const renderTable = (rows) => {
    const parsed = rows.map(splitCells).filter((cells) => !isDelimiterRow(cells));
    if (parsed.length === 0) {
      return '';
    }
    const [header, ...body] = parsed;
    const delimiter = header.map(() => '---');
    return [header, delimiter, ...body].map(toRow).join('\n');
  };

  const blocks = [];
  for (const section of sections) {
    switch (section.type) {
      case 'heading': {
        const level = Number.isInteger(section.level)
          ? Math.min(Math.max(section.level, 1), 6)
          : 1;
        blocks.push(`${'#'.repeat(level)} ${section.content[0]}`);
        break;
      }
      case 'list':
        blocks.push(section.content.map(renderListItem).join('\n'));
        break;
      case 'table':
        blocks.push(renderTable(section.content));
        break;
      case 'paragraph':
        blocks.push(section.content.join(' '));
        break;
      default:
        break;
    }
  }

  return blocks.join('\n\n').trim();
}
/**
 * Serialises sections as pretty-printed JSON (2-space indent), keeping only
 * the `type`, `level` and `content` fields of each section, in that order.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} JSON text.
 */
function formatAsJSON(sections) {
  const pickFields = ({ type, level, content }) => ({ type, level, content });
  return JSON.stringify(sections.map(pickFields), null, 2);
}
/**
 * Renders sections as plain text with light visual structure.
 *
 *  - heading:   upper-cased text underlined with "=" of the same length as
 *               the upper-cased text (upper-casing can change the length).
 *  - list:      bullet markers (-, *, •) are replaced by " • "; numbered and
 *               lettered items keep their own marker and are only indented;
 *               items without any marker get " • ".
 *  - table:     rows are emitted unchanged, one per line.
 *  - paragraph: lines joined by a single space.
 * Sections of any other type are ignored. Leading newlines and trailing
 * whitespace are stripped from the result; the indentation of a leading list
 * item is preserved.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} Plain-text rendering.
 */
function formatAsStructuredText(sections) {
  const bulletMarker = /^[-*•]\s+/;
  const orderedMarker = /^(?:\d+\.|\([a-z]\))\s+/;

  const renderListItem = (item) => {
    if (bulletMarker.test(item)) {
      return ` • ${item.replace(bulletMarker, '')}`;
    }
    if (orderedMarker.test(item)) {
      return ` ${item}`;
    }
    return ` • ${item}`;
  };

  let text = '';
  for (const section of sections) {
    switch (section.type) {
      case 'heading': {
        const title = section.content[0].toUpperCase();
        text += `\n${title}\n${'='.repeat(title.length)}\n\n`;
        break;
      }
      case 'list':
        for (const item of section.content) {
          text += `${renderListItem(item)}\n`;
        }
        text += '\n';
        break;
      case 'table':
        for (const row of section.content) {
          text += `${row}\n`;
        }
        text += '\n';
        break;
      case 'paragraph':
        text += `${section.content.join(' ')}\n\n`;
        break;
      default:
        break;
    }
  }

  // A plain trim() would also remove the indent of a leading list item.
  return text.replace(/^\n+|\s+$/g, '');
}
/**
 * Applies heuristic corrections for common OCR mistakes in Turkish text and
 * normalises punctuation spacing.
 *
 * Steps, in order:
 *  1. Standalone single letters c/g/i/o/s/u (either case) are replaced by
 *     their Turkish counterparts (ç/ğ/ı/ö/ş/ü, and I -> İ). "Standalone"
 *     means not adjacent to any letter, digit or underscore, with Turkish
 *     letters counted as letters, so characters inside Turkish words such as
 *     "için" or "şişe" are left alone.
 *     NOTE(review): standalone "o" is a valid Turkish word; this rule is
 *     kept from the original heuristics but may deserve removal.
 *  2. The look-alike digits 0, 1 and 5 are turned into O, İ and S only when
 *     they sit between two uppercase letters ("T0PLAM" -> "TOPLAM").
 *     Ordinary numbers are never touched.
 *  3. The words "icin" and "su" are replaced by "için" and "şu", preserving
 *     lower / Title / UPPER casing.
 *     NOTE(review): "su" is also a valid Turkish word; kept as-is from the
 *     original rule set.
 *  4. Whitespace before . , ! ? ; : is removed and a single space is ensured
 *     after them, except inside numbers ("3.14", "12:30") and inside
 *     punctuation runs ("...", "?!").
 *  5. All whitespace runs, newlines included, collapse to one space and the
 *     result is trimmed.
 *
 * @param {string} text - OCR output.
 * @returns {string} Corrected single-line text.
 */
function applyIntelligentTurkishCorrections(text) {
  // \b only knows ASCII word characters, so Turkish letters are listed here.
  const WORD_CHARS = 'A-Za-z0-9_çğıöşüâîûÇĞİÖŞÜÂÎÛ';
  const UPPER_CHARS = 'A-ZÇĞİÖŞÜÂÎÛ';
  const PUNCTUATION = '.,!?;:';

  const standaloneLetters = {
    c: 'ç', C: 'Ç',
    g: 'ğ', G: 'Ğ',
    i: 'ı', I: 'İ',
    o: 'ö', O: 'Ö',
    s: 'ş', S: 'Ş',
    u: 'ü', U: 'Ü',
  };
  const lookalikeDigits = { 0: 'O', 1: 'İ', 5: 'S' };
  // [lower, Title, UPPER] variants of each corrected word.
  const wordFixes = {
    icin: ['için', 'İçin', 'İÇİN'],
    su: ['şu', 'Şu', 'ŞU'],
  };

  // The preceding character is captured (and re-emitted) instead of using a
  // lookbehind, which keeps the patterns usable in older browsers.
  const standaloneLetterRe = new RegExp(
    `(^|[^${WORD_CHARS}])([cCgGiIoOsSuU])(?![${WORD_CHARS}])`,
    'g'
  );
  const digitInCapsRe = new RegExp(`([${UPPER_CHARS}])([015])(?=[${UPPER_CHARS}])`, 'g');
  const wordRe = new RegExp(`(^|[^${WORD_CHARS}])(icin|su)(?![${WORD_CHARS}])`, 'gi');

  const isDigit = (ch) => ch >= '0' && ch <= '9';

  const fixWord = (match, before, word) => {
    const [lower, title, upper] = wordFixes[word.toLowerCase()];
    if (word === word.toUpperCase()) return before + upper;
    if (word[0] === word[0].toUpperCase()) return before + title;
    return before + lower;
  };

  const spaceAfterPunctuation = (mark, offset, whole) => {
    const previous = whole[offset - 1];
    const next = whole[offset + 1];
    if (PUNCTUATION.includes(next)) return mark; // keep "..." and "?!" intact
    if (isDigit(previous) && isDigit(next)) return mark; // 3.14, 1.000, 12:30
    return `${mark} `;
  };

  return text
    .replace(standaloneLetterRe, (match, before, letter) => before + standaloneLetters[letter])
    .replace(digitInCapsRe, (match, before, digit) => before + lookalikeDigits[digit])
    .replace(wordRe, fixWord)
    .replace(/\s+([.,!?;:])/g, '$1')
    .replace(/[.,!?;:](?=\S)/g, spaceAfterPunctuation)
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Converts hOCR markup into annotated plain text and records uncertain words
 * in `window.ocrLearningDict`.
 *
 * Before parsing, every dictionary entry with a `confirmedCorrect` value that
 * differs from its key is applied to the raw hOCR string as a literal
 * (non-regex) substring replacement.
 * NOTE(review): the replacement runs on the markup itself, so a key that
 * happens to occur inside a tag or attribute is replaced as well.
 *
 * For each `.ocrx_word` inside `.ocr_line` inside `.ocr_par`, the `x_wconf`
 * value from the `title` attribute decides how the word is rendered:
 *  - below 50:  "[word]"
 *  - below 70:  "<span confidence-medium>word</span>"
 *  - otherwise, or when no confidence is available: "word"
 * Words on a line are separated by one space; each line is followed by a
 * blank line and each paragraph by one more newline.
 *
 * Non-blank words with a confidence below 85 get their `occurrences` counter
 * in `window.ocrLearningDict` incremented (the entry is created on first use).
 *
 * @param {string} hocr - hOCR markup.
 * @returns {string} Annotated text.
 */
function processFormattedOCR(hocr) {
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  // Confidence from the title attribute, or null when it is not present.
  const readConfidence = (wordEl) => {
    const found = /x_wconf (\d+)/.exec(wordEl.getAttribute('title') || '');
    return found ? Number.parseFloat(found[1]) : null;
  };

  const rememberUncertainWord = (wordText) => {
    // An empty key would later match every string during correction.
    if (wordText.trim() === '') {
      return;
    }
    if (!window.ocrLearningDict) {
      window.ocrLearningDict = {};
    }
    const dict = window.ocrLearningDict;
    // Own-property check so words like "constructor" get a real entry.
    if (!hasOwn(dict, wordText)) {
      dict[wordText] = {
        occurrences: 0,
        confirmedCorrect: null,
        suggestTime: null
      };
    }
    dict[wordText].occurrences++;
  };

  const renderWord = (wordText, confidence) => {
    if (confidence !== null && confidence < 50) {
      return `[${wordText}]`;
    }
    if (confidence !== null && confidence < 70) {
      return `<span confidence-medium>${wordText}</span>`;
    }
    return wordText;
  };

  // Apply learned corrections literally; OCR words may contain characters
  // that are special in regular expressions.
  let source = hocr;
  if (window.ocrLearningDict) {
    for (const [word, data] of Object.entries(window.ocrLearningDict)) {
      if (word && data.confirmedCorrect && data.confirmedCorrect !== word) {
        source = source.split(word).join(data.confirmedCorrect);
      }
    }
  }

  // Parse the hOCR output to preserve the paragraph / line layout.
  const doc = new DOMParser().parseFromString(source, 'text/html');
  let formattedText = '';

  doc.querySelectorAll('.ocr_par').forEach((par) => {
    par.querySelectorAll('.ocr_line').forEach((line) => {
      const rendered = [];
      line.querySelectorAll('.ocrx_word').forEach((wordEl) => {
        const wordText = wordEl.textContent || '';
        const confidence = readConfidence(wordEl);
        rendered.push(renderWord(wordText, confidence));
        if (confidence !== null && confidence < 85) {
          rememberUncertainWord(wordText);
        }
      });
      formattedText += `${rendered.join(' ').trim()}\n\n`;
    });
    formattedText += '\n';
  });

  return formattedText;
}
| } | |
/**
 * Asks the user about unconfirmed OCR words found in the result, then renders
 * the result as a card appended to `resultsContainer`.
 *
 * For every entry of `window.ocrLearningDict` whose `confirmedCorrect` is
 * still null and whose (non-empty) key occurs in `result.content`, the user
 * is asked via confirm()/prompt(). A correction entered by the user is stored
 * in the dictionary and replaces every literal occurrence of the word in
 * `result.content` (the result object is mutated).
 *
 * The card shows the file name, a download button wired to downloadResult,
 * an OCR warning when the content carries one of the OCR fallback markers,
 * and the content inside a <pre> (JSON is re-indented when it parses).
 *
 * @param {{fileName: string, content: string, format: string}} result
 * @returns {void}
 */
function displayResult(result) {
  // Check if this was an OCR fallback result.
  const isOCRResult = result.content.includes('OCR processing attempted') ||
    result.content.includes('Warning: No extractable text found');

  // Scan for potential errors and ask for user confirmation.
  if (window.ocrLearningDict) {
    for (const [word, data] of Object.entries(window.ocrLearningDict)) {
      // An empty key is contained in every string and would always prompt.
      if (!word || data.confirmedCorrect !== null || !result.content.includes(word)) {
        continue;
      }
      data.suggestTime = new Date().toISOString();
      if (confirm(`Is "${word}" correctly recognized? If not, please type the correct version.`)) {
        data.confirmedCorrect = word;
        continue;
      }
      const corrected = prompt(`Please enter correct version for "${word}":`, word);
      if (corrected) {
        data.confirmedCorrect = corrected;
        // Literal replacement: the word may contain regex metacharacters
        // ("C++", "(abc") that would throw or mismatch in new RegExp().
        result.content = result.content.split(word).join(corrected);
      }
    }
  }

  const resultCard = document.createElement('div');
  resultCard.className = 'bg-gray-50 rounded-lg p-4 shadow-sm';

  const header = document.createElement('div');
  header.className = 'flex justify-between items-center mb-3';

  const title = document.createElement('h3');
  title.className = 'font-semibold text-lg text-gray-800 truncate';
  title.textContent = result.fileName;

  const downloadBtn = document.createElement('button');
  downloadBtn.className = 'bg-indigo-600 hover:bg-indigo-700 text-white font-medium py-1 px-3 rounded transition duration-200 flex items-center text-sm';
  downloadBtn.innerHTML = '<i data-feather="download" class="mr-1"></i> Download';
  downloadBtn.addEventListener('click', () => downloadResult(result));

  header.appendChild(title);
  header.appendChild(downloadBtn);

  const content = document.createElement('div');
  if (isOCRResult) {
    const warning = document.createElement('div');
    warning.className = 'pdf-ocr-warning';
    warning.innerHTML = `
      <div class="flex items-start">
        <i data-feather="alert-triangle" class="mr-2"></i>
        <div>
          <strong>Note:</strong> This PDF was processed using OCR as no selectable text was found.
          Results may contain errors or inaccuracies.
        </div>
      </div>
    `;
    content.appendChild(warning);
  }

  // textContent (not innerHTML) keeps extracted text from being parsed as HTML.
  const pre = document.createElement('pre');
  pre.className = result.format === 'formatted' ? 'ocr-result turkish-text' : '';
  pre.style.cssText = 'font-family: monospace; white-space: pre-wrap; word-wrap: break-word; line-height: 1.6;';

  if (result.format === 'json') {
    try {
      const parsed = JSON.parse(result.content);
      pre.textContent = JSON.stringify(parsed, null, 2);
    } catch (e) {
      // Not valid JSON (e.g. after a manual correction): show it verbatim.
      pre.textContent = result.content;
    }
  } else {
    pre.textContent = result.content;
  }

  content.appendChild(pre);
  resultCard.appendChild(header);
  resultCard.appendChild(content);
  resultsContainer.appendChild(resultCard);

  // Turn the data-feather placeholders into icons.
  feather.replace();
}
/**
 * Triggers a browser download of a processed result.
 *
 * The file is named after the source file with only its last extension
 * replaced by the result format ("report.v2.pdf" + "json" ->
 * "report.v2.json"). The MIME type is application/json for 'json',
 * text/markdown for 'markdown' and text/plain otherwise, always UTF-8.
 *
 * A UTF-8 byte order mark is prepended for text and markdown output so that
 * applications which rely on it detect the encoding of Turkish characters.
 * JSON is written without a BOM: RFC 8259 forbids adding one and strict
 * parsers reject it.
 *
 * @param {{fileName: string, content: string, format: string}} result
 * @returns {void}
 */
function downloadResult(result) {
  const isJson = result.format === 'json';

  let mimeType = 'text/plain;charset=utf-8';
  if (isJson) {
    mimeType = 'application/json;charset=utf-8';
  } else if (result.format === 'markdown') {
    mimeType = 'text/markdown;charset=utf-8';
  }

  // Blob encodes string parts as UTF-8, so no manual byte copying is needed.
  const parts = isJson
    ? [result.content]
    : [new Uint8Array([0xEF, 0xBB, 0xBF]), result.content];
  const blob = new Blob(parts, { type: mimeType });

  // Strip only the final extension; a leading dot (".env") is not one.
  const lastDot = result.fileName.lastIndexOf('.');
  const baseName = lastDot > 0 ? result.fileName.slice(0, lastDot) : result.fileName;

  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = `${baseName}.${result.format}`;
  document.body.appendChild(a);
  try {
    a.click();
  } finally {
    // Always detach the helper link and release the object URL.
    document.body.removeChild(a);
    URL.revokeObjectURL(url);
  }
}
// Clicking "download all" saves every processed result as its own file.
const handleDownloadAllClick = () => {
  processedResults.forEach((entry) => {
    downloadResult(entry);
  });
};
downloadAllBtn.addEventListener('click', handleDownloadAllClick);
| }); |