File size: 24,379 Bytes
5dd5427
 
 
 
 
 
 
 
 
 
 
575efeb
5dd5427
 
 
 
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
 
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e0745a
5dd5427
 
 
575efeb
9e0745a
575efeb
5dd5427
 
 
 
 
c21250b
 
 
 
 
 
 
 
5dd5427
c21250b
 
9e0745a
c21250b
 
 
 
 
9e0745a
5dd5427
9e0745a
c21250b
 
5dd5427
 
 
 
 
 
9e0745a
 
 
5dd5427
9e0745a
 
9dafab7
 
5dd5427
 
 
 
 
9e0745a
 
5dd5427
9e0745a
 
5dd5427
 
 
9e0745a
 
 
 
 
 
bac89b2
9e0745a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5dd5427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bac89b2
5dd5427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e0745a
5dd5427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e70845
c21250b
5dd5427
 
 
 
9e70845
5dd5427
 
 
9e70845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5dd5427
c21250b
5dd5427
 
 
 
 
 
 
 
bac89b2
 
 
 
 
575efeb
bac89b2
 
 
 
 
 
575efeb
 
 
bac89b2
 
 
 
 
575efeb
5dd5427
 
bac89b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575efeb
 
 
 
 
 
 
 
5dd5427
 
 
 
 
 
9e0745a
 
 
 
 
 
 
 
5dd5427
 
 
 
9e0745a
5dd5427
 
 
 
 
 
9e0745a
 
5dd5427
 
 
 
 
 
 
 
 
9e0745a
 
 
5dd5427
 
9e0745a
5dd5427
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6839f0d
5dd5427
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
/**
 * Hugging Face Project Gutenberg dataset service.
 *
 * Supplies book passages from two sources:
 *  - rows fetched from the Hugging Face datasets-server API (`preloadedBooks`),
 *    which are cleaned and validated lazily, the first time they are selected;
 *  - a built-in list of classic opening passages (`getSampleBooks`) used
 *    whenever the API is unreachable or yields nothing usable.
 *
 * `usedBooks` remembers which books were already handed out so that
 * `getRandomBook` avoids repeats until every candidate has been used.
 */
class HuggingFaceDatasetService {
  constructor() {
    // Dataset identifier and API root used to build every request URL.
    this.datasetName = 'manu/project_gutenberg';
    this.apiBase = 'https://datasets-server.huggingface.co';
    // Local fallback books (filled by loadDataset when streaming is unusable).
    this.books = [];
    this.isLoaded = false;
    this.streamingEnabled = false;
    // Not read by any method of this class; kept so existing callers that
    // reach for `service.cache` keep working.
    this.cache = new Map();
    // Lazily processed books fetched from the API.
    this.preloadedBooks = [];
    // Ids (see getBookId) of the books already returned by getRandomBook.
    this.usedBooks = new Set();
  }

  /**
   * Local fallback books for when HF streaming is unavailable.
   *
   * @returns {Array<{id: number, title: string, author: string, year: number, text: string}>}
   *   A freshly built array on every call.
   */
  getSampleBooks() {
    return [
      {
        id: 1,
        title: "Pride and Prejudice",
        author: "Jane Austen",
        year: 1813,
        text: "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters. \"My dear Mr. Bennet,\" said his lady to him one day, \"have you heard that Netherfield Park is let at last?\" Mr. Bennet replied that he had not. \"But it is,\" returned she; \"for Mrs. Long has just been here, and she told me all about it.\" Mr. Bennet made no answer. \"Do you not want to know who has taken it?\" cried his wife impatiently. \"You want to tell me, and I have no objection to hearing it.\" This was invitation enough."
      },
      {
        id: 2,
        title: "The Adventures of Tom Sawyer",
        author: "Mark Twain",
        year: 1876,
        text: "\"Tom!\" No answer. \"Tom!\" No answer. \"What's gone with that boy, I wonder? You TOM!\" No answer. The old lady pulled her spectacles down and looked over them about the room; then she put them up and looked out under them. She seldom or never looked through them for so small a thing as a boy; they were her state pair, the pride of her heart, and were built for \"style,\" not service--she could have seen through a pair of stove-lids just as well. She looked perplexed for a moment, and then said, not fiercely, but still loud enough for the furniture to hear: \"Well, I lay if I get hold of you I'll--\""
      },
      {
        id: 3,
        title: "Great Expectations",
        author: "Charles Dickens",
        year: 1861,
        text: "My father's family name being Pirrip, and my Christian name Philip, my infant tongue could make of both names nothing longer or more explicit than Pip. So, I called myself Pip, and came to be called Pip. I give Pirrip as my father's family name, on the authority of his tombstone and my sister,--Mrs. Joe Gargery, who married the blacksmith. As I never saw my father or my mother, and never saw any likeness of them (for their days were long before the days of photographs), my first fancies regarding what they were like were unreasonably derived from their tombstones."
      },
      {
        id: 4,
        title: "Alice's Adventures in Wonderland",
        author: "Lewis Carroll",
        year: 1865,
        text: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?' So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her."
      },
      {
        id: 5,
        title: "The Picture of Dorian Gray",
        author: "Oscar Wilde",
        year: 1890,
        text: "The studio was filled with the rich odour of roses, and when the strong summer wind stirred, amidst the trees of the garden, there came through the open door the heavy scent of the lilac, or the more delicate perfume of the pink-flowering thorn. From the corner of the divan of Persian saddle-bags on which he was lying, smoking, as was his custom, innumerable cigarettes, Lord Henry Wotton could just catch the gleam of the honey-sweet and honey-coloured blossoms of a laburnum, whose tremulous branches seemed hardly able to bear the burden of a beauty so flamelike as theirs."
      },
      {
        id: 6,
        title: "Moby Dick",
        author: "Herman Melville",
        year: 1851,
        text: "Call me Ishmael. Some years ago—never mind how long precisely—having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I have of driving off the spleen and regulating the circulation. Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle to prevent me from deliberately stepping into the street, and methodically knocking people's hats off—then, I account it high time to get to sea as soon as possible."
      },
      {
        id: 7,
        title: "Jane Eyre",
        author: "Charlotte Bronte",
        year: 1847,
        text: "There was no possibility of taking a walk that day. We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further out-door exercise was now out of the question. I was glad of it: I never liked long walks, especially on chilly afternoons: dreadful to me was the coming home in the raw twilight, with nipped fingers and toes, and a heart saddened by the chidings of Bessie, the nurse, and humbled by the consciousness of my physical inferiority to Eliza, John, and Georgiana Reed."
      },
      {
        id: 8,
        title: "The Count of Monte Cristo",
        author: "Alexandre Dumas",
        year: 1844,
        text: "On the first Monday of February, 1815, the watchtower at Marseilles signaled the arrival of the three-master Pharaon from Smyrna, Trieste, and Naples. As was customary, the pilot immediately left the port and steered toward the château d'If to conduct the ship through the narrow passage that leads to the harbor. However, a young sailor of about nineteen or twenty years, standing on the ship's bow, had signaled the pilot even before he had time to ask the traditional questions that are exchanged between the pilot and the captain. The young man had already assumed command, being the ship's owner and captain."
      },
      {
        id: 9,
        title: "Wuthering Heights",
        author: "Emily Bronte",
        year: 1847,
        text: "I have just returned from a visit to my landlord—the solitary neighbour that I shall be troubled with. This is certainly a beautiful country! In all England, I do not believe that I could have fixed on a situation so completely removed from the stir of society. A perfect misanthropist's Heaven: and Mr. Heathcliff and I are such a suitable pair to divide the desolation between us. A capital fellow! He little imagined how my heart warmed towards him when I beheld his black eyes withdraw so suspiciously under their brows, as I rode up, and when his fingers sheltered themselves, with a jealous resolution, still further in his waistcoat, as I announced my name."
      },
      {
        id: 10,
        title: "Frankenstein",
        author: "Mary Shelley",
        year: 1818,
        text: "It was on a dreary night of November that I beheld the accomplishment of my toils. With an anxiety that almost amounted to agony, I collected the instruments of life around me, that I might infuse a spark of being into the lifeless thing that lay at my feet. It was already one in the morning; the rain pattered dismally against the panes, and my candle was nearly burnt out, when, by the glimmer of the half-extinguished light, I saw the dull yellow eye of the creature open; it breathed hard, and a convulsive motion agitated its limbs. How can I describe my emotions at this catastrophe, or how delineate the wretch whom with such infinite pains and care I had endeavoured to form?"
      }
    ];
  }

  /**
   * Build a datasets-server URL with properly encoded query parameters.
   *
   * @param {string} endpoint - Path segment appended to `apiBase` (e.g. 'rows').
   * @param {Object<string, string|number>} [params] - Query parameters.
   * @returns {string} The absolute request URL.
   */
  buildApiUrl(endpoint, params = {}) {
    const url = new URL(`${this.apiBase}/${endpoint}`);
    for (const [key, value] of Object.entries(params)) {
      url.searchParams.set(key, String(value));
    }
    return url.toString();
  }

  /**
   * Probe the API, preload a few books, and fall back to the local samples
   * when streaming is disabled or produced no books. Never rejects: any
   * unexpected error also ends in the local fallback.
   *
   * @returns {Promise<Array>} `this.books` (the local list; empty while
   *   preloaded remote books are available).
   */
  async loadDataset() {
    try {
      // Try to connect to HF Datasets API
      await this.initializeStreaming();

      if (this.streamingEnabled) {
        // Preload some books for immediate access
        await this.preloadBooks(5);
        console.log(`✅ HF Streaming enabled: ${this.preloadedBooks.length} books preloaded`);
      }

      // The API can answer the probe and still deliver no rows; without this
      // the service would report "loaded" while holding no books at all.
      if (!this.streamingEnabled || this.preloadedBooks.length === 0) {
        this.books = this.getSampleBooks();
        console.log(`⚠️ Using local samples: ${this.books.length} books available`);
      }

      this.isLoaded = true;
      return this.books;
    } catch (error) {
      console.error('Error loading dataset:', error);
      // Ensure we always have local fallback
      this.books = this.getSampleBooks();
      this.isLoaded = true;
      return this.books;
    }
  }

  /**
   * Ask the `/splits` endpoint whether the dataset is reachable and set
   * `streamingEnabled` accordingly. Network or parsing errors, as well as
   * non-OK responses, disable streaming instead of propagating.
   *
   * @returns {Promise<void>}
   */
  async initializeStreaming() {
    try {
      // Test HF Datasets API availability
      const testUrl = this.buildApiUrl('splits', { dataset: this.datasetName });
      const response = await fetch(testUrl);

      if (!response.ok) {
        // Do not keep a stale "enabled" flag from an earlier successful probe.
        this.streamingEnabled = false;
        console.warn(`HF Datasets API test failed: ${response.status} ${response.statusText}`);
        return;
      }

      const data = await response.json();
      const splits = Array.isArray(data.splits) ? data.splits : [];
      // Check if English split is available
      const hasEnglish = splits.some(
        (split) => split.split === 'en' && split.config === 'default'
      );

      // Any listed split counts as "available", English or not.
      this.streamingEnabled = hasEnglish || splits.length > 0;
      console.log(`🔗 HF Datasets API: ${this.streamingEnabled ? 'Available' : 'Unavailable'}`);
    } catch (error) {
      console.warn('HF Datasets API test failed:', error);
      this.streamingEnabled = false;
    }
  }

  /**
   * Fetch `count` consecutive rows starting at a random offset and store them,
   * lazily processed, in `preloadedBooks`. On any failure the previous
   * `preloadedBooks` content is left untouched.
   *
   * @param {number} [count=5] - Number of rows to request.
   * @returns {Promise<void>}
   */
  async preloadBooks(count = 5) {
    if (!this.streamingEnabled) return;

    try {
      // Use random offset to avoid always getting the same books
      const randomOffset = Math.floor(Math.random() * 1000);
      const url = this.buildApiUrl('rows', {
        dataset: this.datasetName,
        config: 'default',
        split: 'en',
        offset: randomOffset,
        length: count
      });
      const response = await fetch(url);

      if (!response.ok) {
        console.error(`HF API request failed: ${response.status} ${response.statusText}`);
        return;
      }

      const data = await response.json();

      // Check if data has expected structure
      if (!data.rows || !Array.isArray(data.rows)) {
        console.error('Unexpected HF API response structure:', data);
        return;
      }

      console.log(`📥 Received ${data.rows.length} books from HF API`);

      const lazyBooks = [];
      for (const entry of data.rows) {
        try {
          lazyBooks.push(this.processHFBookLazy(entry.row));
        } catch (e) {
          // One malformed row must not discard the rest of the batch.
          console.warn('Error processing book:', e);
        }
      }
      this.preloadedBooks = lazyBooks;

      console.log(`📚 Preloaded ${this.preloadedBooks.length} books (lazy validation)`);
    } catch (error) {
      console.warn('Failed to preload books:', error);
    }
  }

  /**
   * Wrap a raw API row in a book record without cleaning its text yet.
   * Title and author come from the Gutenberg header when it yields something
   * specific, then from the row's own fields, then from generic defaults.
   *
   * @param {Object} rowData - Row object; the fields read are `text`, `title`,
   *   `author`, `id` and `language`.
   * @returns {Object} Book record with `processed: false` and `text: null`.
   */
  processHFBookLazy(rowData) {
    const defaultTitle = 'Classic Literature';
    const defaultAuthor = 'Unknown Author';

    // Minimal processing - defer text cleaning and validation until book is selected
    const rawText = rowData.text || '';

    // extractMetadata always returns its defaults when nothing was found, so
    // a default value must count as "not found" or the row's own title and
    // author could never be used.
    const extracted = this.extractMetadata(rawText);
    const headerTitle = extracted.title !== defaultTitle ? extracted.title : '';
    const headerAuthor = extracted.author !== defaultAuthor ? extracted.author : '';

    return {
      // `??` keeps a legitimate id of 0.
      id: rowData.id ?? Math.random().toString(36),
      title: headerTitle || rowData.title || defaultTitle,
      author: headerAuthor || rowData.author || defaultAuthor,
      rawText: rawText,
      text: null, // Will clean when needed
      language: rowData.language || 'en',
      source: 'project_gutenberg',
      processed: false
    };
  }

  /**
   * Clean and validate a lazily loaded book the first time it is needed.
   * The outcome is remembered on the book (`processed`, `valid`), so a book
   * that failed validation keeps yielding `null` on later calls.
   *
   * @param {Object} book - Record produced by processHFBookLazy (mutated).
   * @returns {Promise<Object|null>} The processed book, or `null` if rejected.
   */
  async processBookOnDemand(book) {
    if (book.processed) {
      // Records without a `valid` flag (e.g. built by callers) are trusted.
      return book.valid === false ? null : book;
    }

    console.log(`🔄 Processing "${book.title}" on demand...`);
    const startTime = Date.now();

    // Clean text when actually needed
    book.text = this.cleanProjectGutenbergText(book.rawText);
    book.processed = true;

    // Validate after processing
    book.valid = this.isValidForCloze(book);
    if (!book.valid) {
      console.log(`❌ "${book.title}" failed validation after ${Date.now() - startTime}ms`);
      return null;
    }

    console.log(`✅ "${book.title}" processed in ${Date.now() - startTime}ms`);
    return book;
  }

  /**
   * Strip Project Gutenberg boilerplate from a raw book text: everything up
   * to the start marker, everything from the end marker on, chapter headers,
   * page numbers, bracketed/italic note lines, emphasis characters, and
   * leading title-page lines.
   *
   * @param {string} text - Raw book text.
   * @returns {string} Cleaned text ('' for empty input).
   */
  cleanProjectGutenbergText(text) {
    if (!text) return '';

    let cleaned = text;

    // Remove Project Gutenberg start markers and everything before
    const startPatterns = [
      /\*\*\* START OF .*? \*\*\*/i,
      /\*\*\*START OF .*?\*\*\*/i,
      /START OF THE PROJECT GUTENBERG/i,
      /GUTENBERG.*?EBOOK/i
    ];

    // Only the first pattern that matches is applied.
    for (const pattern of startPatterns) {
      const match = cleaned.match(pattern);
      if (match) {
        const startIndex = match.index + match[0].length;
        // Skip to next line
        const nextLine = cleaned.indexOf('\n', startIndex);
        if (nextLine !== -1) {
          cleaned = cleaned.substring(nextLine + 1);
        }
        break;
      }
    }

    // Remove Project Gutenberg end markers and everything after
    const endPatterns = [
      /\*\*\* END OF .*? \*\*\*/i,
      /\*\*\*END OF .*?\*\*\*/i,
      /END OF THE PROJECT GUTENBERG/i,
      /End of the Project Gutenberg/i
    ];

    for (const pattern of endPatterns) {
      const match = cleaned.match(pattern);
      if (match) {
        cleaned = cleaned.substring(0, match.index);
        break;
      }
    }

    // Remove common Project Gutenberg artifacts. Blank-line runs are collapsed
    // last, because deleting header/number/note lines creates new runs.
    cleaned = cleaned
      .replace(/\r\n/g, '\n')                           // Normalize line endings
      .replace(/produced from images generously.*?\n/gi, '') // Remove scanning notes
      .replace(/^\s*CHAPTER.*$/gm, '')                  // Remove chapter headers
      .replace(/^\s*Chapter.*$/gm, '')                  // Remove chapter headers
      .replace(/^\s*\d+\s*$/gm, '')                     // Remove page numbers
      .replace(/^\s*\[.*?\]\s*$/gm, '')                 // Remove bracketed notes
      .replace(/^\s*_.*_\s*$/gm, '')                    // Remove italic notes
      .replace(/[_*]/g, '')                             // Remove underscores and asterisks
      .replace(/\n\s*\n\s*\n+/g, '\n\n')               // Remove excessive line breaks
      .trim();

    // Find the actual start of narrative content
    const lines = cleaned.split('\n');
    const metadataMarkers = [
      'Title:',
      'Author:',
      'Release Date:',
      'Language:',
      'Character set',
      'www.gutenberg',
      'Project Gutenberg'
    ];
    let contentStart = 0;

    // Only the first 50 lines are inspected; scanning stops at the first line
    // that is long enough and carries no metadata marker.
    for (let i = 0; i < Math.min(50, lines.length); i++) {
      const line = lines[i].trim();
      const isFrontMatter =
        line.length < 20 || metadataMarkers.some((marker) => line.includes(marker));

      if (!isFrontMatter) break;
      contentStart = i + 1;
    }

    // If every line looked like front matter, keep the text as it is.
    if (contentStart > 0 && contentStart < lines.length) {
      cleaned = lines.slice(contentStart).join('\n').trim();
    }

    return cleaned;
  }

  /**
   * Extract title and author from a Gutenberg header.
   * First tries the one-line form "The Project Gutenberg EBook of TITLE, by
   * AUTHOR" on the first line; otherwise scans the first 50 lines for
   * `Title:` / `Author:` fields (the last occurrence of each wins).
   *
   * @param {string} text - Raw book text.
   * @returns {{title: string, author: string}} Fields not found keep the
   *   defaults 'Classic Literature' / 'Unknown Author'.
   */
  extractMetadata(text) {
    const metadata = { title: 'Classic Literature', author: 'Unknown Author' };

    if (!text) return metadata;

    // Look for the standard Project Gutenberg header format
    const firstLine = text.split('\n')[0].trim();

    // Parse the standard format: "The Project Gutenberg EBook of [TITLE], by [AUTHOR]"
    const pgMatch = firstLine.match(/^.*?The Project Gutenberg EBook of (.+?),\s*by\s+(.+?)$/i);
    if (pgMatch) {
      const title = pgMatch[1].trim();
      const author = pgMatch[2].trim();

      if (title && this.isValidTitle(title)) {
        metadata.title = this.cleanMetadataField(title);
      }
      if (author && this.isValidAuthor(author)) {
        metadata.author = this.cleanMetadataField(author);
      }

      return metadata;
    }

    // Fallback: Look for explicit Title: and Author: fields in first 50 lines
    const headerLines = text.split('\n').slice(0, 50);

    for (const rawLine of headerLines) {
      const line = rawLine.trim();

      if (line.startsWith('Title:')) {
        const title = line.replace('Title:', '').trim();
        if (title && title.length > 1) {
          metadata.title = this.cleanMetadataField(title);
        }
      } else if (line.startsWith('Author:')) {
        const author = line.replace('Author:', '').trim();
        if (author && author.length > 1) {
          metadata.author = this.cleanMetadataField(author);
        }
      }
    }

    return metadata;
  }

  /**
   * Normalize a metadata value: drop bracketed remarks and collapse whitespace.
   *
   * @param {string} field - Raw title or author string.
   * @returns {string} Cleaned value.
   */
  cleanMetadataField(field) {
    return field
      .replace(/\[.*?\]/g, '') // Remove bracketed info
      .replace(/\s+/g, ' ')     // Normalize whitespace
      .trim();
  }

  /**
   * Plausibility check for an extracted title (3-100 characters, no
   * boilerplate fragments).
   *
   * @param {string} title
   * @returns {boolean}
   */
  isValidTitle(title) {
    if (!title || title.length < 3 || title.length > 100) return false;
    // Avoid fragments that are clearly not titles
    const boilerplate = ['Project Gutenberg', 'www.', 'produced from', 'images generously'];
    return !boilerplate.some((fragment) => title.includes(fragment));
  }

  /**
   * Plausibility check for an extracted author (3-50 characters, no
   * boilerplate fragments).
   *
   * @param {string} author
   * @returns {boolean}
   */
  isValidAuthor(author) {
    if (!author || author.length < 3 || author.length > 50) return false;
    // Basic validation - should look like a name
    const boilerplate = ['Project Gutenberg', 'www.', 'produced from'];
    return !boilerplate.some((fragment) => author.includes(fragment));
  }

  /**
   * Decide whether a processed book is usable: long enough, not too long,
   * not fragmented, containing sentences, and not an index/TOC/catalog.
   *
   * @param {{text: string, title?: string}} book
   * @returns {boolean}
   */
  isValidForCloze(book) {
    if (!book.text) return false;

    const textLength = book.text.length;

    // Basic length criteria
    if (textLength < 2000) return false;        // Minimum readable length
    if (textLength > 500000) return false;      // Too long for performance

    // Check for excessive formatting (likely reference material)
    const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
    if (lineBreakRatio > 0.05) return false;    // Fragmentation threshold

    // Ensure it has actual narrative content
    const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
    if (sentenceCount < 10) return false;       // Sentence requirement

    // Sample text for quality check (first 5000 chars should be representative)
    const sampleText = book.text.substring(0, 5000);

    // Check for index/TOC patterns
    const indexPatterns = [
      'CONTENTS', 'INDEX', 'CHAPTER', 'Volume', 'Vol.',
      'Part I', 'Part II', 'BOOK I', 'APPENDIX'
    ];
    // The markers are literal strings: escape regex metacharacters so that
    // 'Vol.' does not match ordinary words such as "revolt" or "involve".
    const escapeRegExp = (literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const indexCount = indexPatterns.reduce(
      (count, pattern) =>
        count + (sampleText.match(new RegExp(escapeRegExp(pattern), 'gi')) || []).length,
      0
    );
    const indexRatio = indexCount / (sampleText.split(/\s+/).length || 1);

    if (indexRatio > 0.05) {
      console.log(`❌ Book rejected - appears to be index/TOC: "${book.title}" (index ratio: ${Math.round(indexRatio * 100)}%)`);
      return false;
    }

    // Check for catalog/bibliography patterns
    if (book.title) {
      const lowerTitle = book.title.toLowerCase();
      const catalogWords = ['index', 'catalog', 'bibliography', 'contents'];
      if (catalogWords.some((word) => lowerTitle.includes(word))) {
        console.log(`❌ Book rejected - title suggests index/catalog: "${book.title}"`);
        return false;
      }
    }

    console.log(`📖 Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
    return true;
  }

  /**
   * Return a book for the next round, preferring unused remote books, then
   * any streamed book, then an unused local sample. When every local sample
   * has been used, the used-book memory is cleared and selection restarts.
   *
   * @returns {Promise<Object>} A book with `title`, `author` and `text`.
   * @throws {Error} If `loadDataset` has not completed yet.
   */
  async getRandomBook() {
    if (!this.isLoaded) {
      throw new Error('Dataset not loaded');
    }

    // First, try to find a successfully processed HF book
    if (this.streamingEnabled && this.preloadedBooks.length > 0) {
      const availableHFBooks = this.preloadedBooks.filter(
        (book) => !this.usedBooks.has(this.getBookId(book))
      );

      for (const book of availableHFBooks) {
        const processedBook = await this.processBookOnDemand(book);
        if (processedBook) {
          this.usedBooks.add(this.getBookId(processedBook));
          console.log(`📚 Using HF book: "${processedBook.title}"`);
          return processedBook;
        }
      }

      // If no HF books worked, try streaming
      const streamedBook = await this.getStreamingBook();
      if (streamedBook) {
        this.usedBooks.add(this.getBookId(streamedBook));
        return streamedBook;
      }
    }

    // Fallback to local samples
    const fallbackBooks = this.books.length > 0 ? this.books : this.getSampleBooks();
    const availableBooks = fallbackBooks.filter(
      (book) => !this.usedBooks.has(this.getBookId(book))
    );

    if (availableBooks.length > 0) {
      const randomIndex = Math.floor(Math.random() * availableBooks.length);
      const book = availableBooks[randomIndex];
      this.usedBooks.add(this.getBookId(book));
      console.log(`📚 Using local book: "${book.title}"`);
      return book;
    }

    // If all books used, clear cache and start over
    this.usedBooks.clear();
    console.log('All books used, cleared used book cache');
    return this.getRandomBook();
  }

  /**
   * Key used to track used books: lower-cased "title_author" with whitespace
   * runs replaced by underscores.
   *
   * @param {{title: string, author: string}} book
   * @returns {string}
   */
  getBookId(book) {
    // Create unique ID from title and author to track duplicates
    return `${book.title}_${book.author}`.replace(/\s+/g, '_').toLowerCase();
  }

  /**
   * Return a random preloaded book that has not been rejected, processing it
   * if necessary; when no such book exists, fetch a single row directly.
   *
   * @returns {Promise<Object|null>} A processed book, or `null` when nothing
   *   usable could be obtained.
   */
  async getStreamingBook() {
    // Use preloaded books for immediate access, skipping known rejects
    const candidates = this.preloadedBooks.filter(
      (book) => !(book.processed && book.valid === false)
    );

    if (candidates.length > 0) {
      const randomIndex = Math.floor(Math.random() * candidates.length);
      // Process on demand if needed (a no-op for already processed books)
      return this.processBookOnDemand(candidates[randomIndex]);
    }

    // If no preloaded books, try to fetch directly
    try {
      const offset = Math.floor(Math.random() * 1000);
      const url = this.buildApiUrl('rows', {
        dataset: this.datasetName,
        config: 'default',
        split: 'en',
        offset: offset,
        length: 1
      });
      const response = await fetch(url);

      if (response.ok) {
        const data = await response.json();
        if (data.rows && data.rows.length > 0) {
          const book = this.processHFBookLazy(data.rows[0].row);
          return await this.processBookOnDemand(book);
        }
      }
    } catch (error) {
      console.warn('Direct streaming failed:', error);
    }

    return null;
  }

  /**
   * Return a book for the given level. The level is currently ignored and a
   * random book is returned.
   *
   * @param {*} level - Unused.
   * @returns {Promise<Object>}
   */
  async getBookByLevelCriteria(level) {
    return await this.getRandomBook();
  }

  /**
   * Find a book by id among the preloaded and local books. Ids are compared
   * as strings, so 3 and "3" refer to the same book.
   *
   * @param {string|number} id
   * @returns {Object|undefined}
   */
  getBookById(id) {
    // Search in both preloaded and local books
    const wanted = String(id);
    const allBooks = [...this.preloadedBooks, ...this.books];
    return allBooks.find((book) => String(book.id) === wanted);
  }

  /**
   * Case-insensitive substring search over title and author.
   *
   * @param {string} query - Empty/missing query returns every book.
   * @returns {Array<Object>}
   */
  searchBooks(query) {
    const allBooks = [...this.preloadedBooks, ...this.books];
    if (!query) return allBooks;

    const lowerQuery = query.toLowerCase();
    return allBooks.filter(
      (book) =>
        book.title.toLowerCase().includes(lowerQuery) ||
        book.author.toLowerCase().includes(lowerQuery)
    );
  }

  /**
   * Health check for streaming status.
   *
   * @returns {{streamingEnabled: boolean, preloadedBooks: number, localBooks: number, totalAvailable: number, source: string}}
   */
  getStatus() {
    const preloadedCount = this.preloadedBooks.length;
    const localCount = this.books.length;

    return {
      streamingEnabled: this.streamingEnabled,
      preloadedBooks: preloadedCount,
      localBooks: localCount,
      totalAvailable: preloadedCount + localCount,
      source: this.streamingEnabled ? 'HuggingFace Datasets' : 'Local Samples'
    };
  }

  /**
   * Refresh preloaded books cache with a batch of 20 rows. Does nothing when
   * streaming is disabled.
   *
   * @returns {Promise<void>}
   */
  async refreshCache() {
    if (!this.streamingEnabled) return;

    await this.preloadBooks(20);
    console.log(`🔄 Cache refreshed: ${this.preloadedBooks.length} books`);
  }
}

// Export one shared instance, so every importer of this module gets the same
// service object (and therefore the same loaded/used-book state).
export default new HuggingFaceDatasetService();