Spaces:
Sleeping
Sleeping
File size: 24,379 Bytes
5dd5427 575efeb 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 575efeb 9e0745a 575efeb 5dd5427 c21250b 5dd5427 c21250b 9e0745a c21250b 9e0745a 5dd5427 9e0745a c21250b 5dd5427 9e0745a 5dd5427 9e0745a 9dafab7 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a bac89b2 9e0745a 5dd5427 bac89b2 5dd5427 9e0745a 5dd5427 9e70845 c21250b 5dd5427 9e70845 5dd5427 9e70845 5dd5427 c21250b 5dd5427 bac89b2 575efeb bac89b2 575efeb bac89b2 575efeb 5dd5427 bac89b2 575efeb 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 9e0745a 5dd5427 6839f0d 5dd5427 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 |
// Hugging Face Project Gutenberg Dataset Service
/**
 * Supplies public-domain book passages, preferring rows fetched from the
 * Hugging Face datasets-server for `manu/project_gutenberg` and falling back
 * to a small built-in set of classic-novel openings when the API cannot be
 * used.
 *
 * Remote books are stored "lazily": the raw text is kept as received and is
 * only cleaned and validated when a book is actually selected
 * (see processBookOnDemand).
 */
class HuggingFaceDatasetService {
  /**
   * Creates an empty, not-yet-loaded service. No network request is made
   * until loadDataset() is called.
   */
  constructor() {
    // Use Hugging Face Datasets API for streaming
    this.datasetName = 'manu/project_gutenberg';
    this.apiBase = 'https://datasets-server.huggingface.co';
    this.books = [];
    this.isLoaded = false;
    this.streamingEnabled = false;
    this.cache = new Map();
    this.preloadedBooks = [];
    this.usedBooks = new Set(); // Track books used this session
  }

  /**
   * Local fallback books for when HF streaming is unavailable.
   *
   * @returns {Array<{id: number, title: string, author: string, year: number, text: string}>}
   *   A freshly built array on every call, so callers may mutate it freely.
   */
  getSampleBooks() {
    return [
      {
        id: 1,
        title: "Pride and Prejudice",
        author: "Jane Austen",
        year: 1813,
        text: "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters. \"My dear Mr. Bennet,\" said his lady to him one day, \"have you heard that Netherfield Park is let at last?\" Mr. Bennet replied that he had not. \"But it is,\" returned she; \"for Mrs. Long has just been here, and she told me all about it.\" Mr. Bennet made no answer. \"Do you not want to know who has taken it?\" cried his wife impatiently. \"You want to tell me, and I have no objection to hearing it.\" This was invitation enough."
      },
      {
        id: 2,
        title: "The Adventures of Tom Sawyer",
        author: "Mark Twain",
        year: 1876,
        text: "\"Tom!\" No answer. \"Tom!\" No answer. \"What's gone with that boy, I wonder? You TOM!\" No answer. The old lady pulled her spectacles down and looked over them about the room; then she put them up and looked out under them. She seldom or never looked through them for so small a thing as a boy; they were her state pair, the pride of her heart, and were built for \"style,\" not service--she could have seen through a pair of stove-lids just as well. She looked perplexed for a moment, and then said, not fiercely, but still loud enough for the furniture to hear: \"Well, I lay if I get hold of you I'll--\""
      },
      {
        id: 3,
        title: "Great Expectations",
        author: "Charles Dickens",
        year: 1861,
        text: "My father's family name being Pirrip, and my Christian name Philip, my infant tongue could make of both names nothing longer or more explicit than Pip. So, I called myself Pip, and came to be called Pip. I give Pirrip as my father's family name, on the authority of his tombstone and my sister,--Mrs. Joe Gargery, who married the blacksmith. As I never saw my father or my mother, and never saw any likeness of them (for their days were long before the days of photographs), my first fancies regarding what they were like were unreasonably derived from their tombstones."
      },
      {
        id: 4,
        title: "Alice's Adventures in Wonderland",
        author: "Lewis Carroll",
        year: 1865,
        text: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?' So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her."
      },
      {
        id: 5,
        title: "The Picture of Dorian Gray",
        author: "Oscar Wilde",
        year: 1890,
        text: "The studio was filled with the rich odour of roses, and when the strong summer wind stirred, amidst the trees of the garden, there came through the open door the heavy scent of the lilac, or the more delicate perfume of the pink-flowering thorn. From the corner of the divan of Persian saddle-bags on which he was lying, smoking, as was his custom, innumerable cigarettes, Lord Henry Wotton could just catch the gleam of the honey-sweet and honey-coloured blossoms of a laburnum, whose tremulous branches seemed hardly able to bear the burden of a beauty so flamelike as theirs."
      },
      {
        id: 6,
        title: "Moby Dick",
        author: "Herman Melville",
        year: 1851,
        text: "Call me Ishmael. Some years ago—never mind how long precisely—having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I have of driving off the spleen and regulating the circulation. Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle to prevent me from deliberately stepping into the street, and methodically knocking people's hats off—then, I account it high time to get to sea as soon as possible."
      },
      {
        id: 7,
        title: "Jane Eyre",
        author: "Charlotte Bronte",
        year: 1847,
        text: "There was no possibility of taking a walk that day. We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further out-door exercise was now out of the question. I was glad of it: I never liked long walks, especially on chilly afternoons: dreadful to me was the coming home in the raw twilight, with nipped fingers and toes, and a heart saddened by the chidings of Bessie, the nurse, and humbled by the consciousness of my physical inferiority to Eliza, John, and Georgiana Reed."
      },
      {
        id: 8,
        title: "The Count of Monte Cristo",
        author: "Alexandre Dumas",
        year: 1844,
        text: "On the first Monday of February, 1815, the watchtower at Marseilles signaled the arrival of the three-master Pharaon from Smyrna, Trieste, and Naples. As was customary, the pilot immediately left the port and steered toward the château d'If to conduct the ship through the narrow passage that leads to the harbor. However, a young sailor of about nineteen or twenty years, standing on the ship's bow, had signaled the pilot even before he had time to ask the traditional questions that are exchanged between the pilot and the captain. The young man had already assumed command, being the ship's owner and captain."
      },
      {
        id: 9,
        title: "Wuthering Heights",
        author: "Emily Bronte",
        year: 1847,
        text: "I have just returned from a visit to my landlord—the solitary neighbour that I shall be troubled with. This is certainly a beautiful country! In all England, I do not believe that I could have fixed on a situation so completely removed from the stir of society. A perfect misanthropist's Heaven: and Mr. Heathcliff and I are such a suitable pair to divide the desolation between us. A capital fellow! He little imagined how my heart warmed towards him when I beheld his black eyes withdraw so suspiciously under their brows, as I rode up, and when his fingers sheltered themselves, with a jealous resolution, still further in his waistcoat, as I announced my name."
      },
      {
        id: 10,
        title: "Frankenstein",
        author: "Mary Shelley",
        year: 1818,
        text: "It was on a dreary night of November that I beheld the accomplishment of my toils. With an anxiety that almost amounted to agony, I collected the instruments of life around me, that I might infuse a spark of being into the lifeless thing that lay at my feet. It was already one in the morning; the rain pattered dismally against the panes, and my candle was nearly burnt out, when, by the glimmer of the half-extinguished light, I saw the dull yellow eye of the creature open; it breathed hard, and a convulsive motion agitated its limbs. How can I describe my emotions at this catastrophe, or how delineate the wretch whom with such infinite pains and care I had endeavoured to form?"
      }
    ];
  }

  /**
   * Probes the HF API, preloads a few remote books when it is reachable and
   * otherwise installs the local sample books. Never rejects: any failure
   * ends in the local fallback.
   *
   * @returns {Promise<Array<object>>} `this.books` - the local sample books
   *   when the fallback is active, or an empty array when remote books were
   *   preloaded (those live in `this.preloadedBooks`).
   */
  async loadDataset() {
    try {
      // Try to connect to HF Datasets API
      await this.initializeStreaming();
      if (this.streamingEnabled) {
        // Preload some books for immediate access
        await this.preloadBooks(5);
      }
      if (this.streamingEnabled && this.preloadedBooks.length > 0) {
        console.log(`✅ HF Streaming enabled: ${this.preloadedBooks.length} books preloaded`);
      } else {
        // Fall back to local samples. This also covers the case where the
        // API probe succeeded but preloading produced no usable rows, so the
        // caller is never handed an empty library.
        this.books = this.getSampleBooks();
        console.log(`⚠️ Using local samples: ${this.books.length} books available`);
      }
      this.isLoaded = true;
      return this.books;
    } catch (error) {
      console.error('Error loading dataset:', error);
      // Ensure we always have local fallback
      this.books = this.getSampleBooks();
      this.isLoaded = true;
      return this.books;
    }
  }

  /**
   * Tests whether the HF datasets-server answers for this dataset and records
   * the outcome in `this.streamingEnabled`. Never rejects.
   *
   * @returns {Promise<void>}
   */
  async initializeStreaming() {
    try {
      // Test HF Datasets API availability
      const testUrl = `${this.apiBase}/splits?dataset=${this.datasetName}`;
      const response = await fetch(testUrl);
      if (!response.ok) {
        // A failed probe must not leave a stale "enabled" flag behind.
        this.streamingEnabled = false;
        console.warn(`HF Datasets API test failed: ${response.status} ${response.statusText}`);
        return;
      }
      const data = await response.json();
      const splits = Array.isArray(data?.splits) ? data.splits : [];
      // Check if English split is available
      const hasEnglish = splits.some(
        (split) => split.split === 'en' && split.config === 'default'
      );
      if (!hasEnglish && splits.length > 0) {
        // Row requests below always ask for config=default / split=en.
        console.warn('HF Datasets API: no default/en split listed; row requests may fail');
      }
      this.streamingEnabled = hasEnglish || splits.length > 0;
      console.log(`HF Datasets API: ${this.streamingEnabled ? 'Available' : 'Unavailable'}`);
    } catch (error) {
      console.warn('HF Datasets API test failed:', error);
      this.streamingEnabled = false;
    }
  }

  /**
   * Fetches `count` rows at a random offset and replaces
   * `this.preloadedBooks` with their lazily processed form. On any failure
   * the previous preloaded list is left as it was. Never rejects.
   *
   * @param {number} [count=5] Number of rows to request.
   * @returns {Promise<void>}
   */
  async preloadBooks(count = 5) {
    if (!this.streamingEnabled) return;
    try {
      // Use random offset to avoid always getting the same books
      const randomOffset = Math.floor(Math.random() * 1000);
      const url = `${this.apiBase}/rows?dataset=${this.datasetName}&config=default&split=en&offset=${randomOffset}&length=${count}`;
      const response = await fetch(url);
      if (!response.ok) {
        console.error(`HF API request failed: ${response.status} ${response.statusText}`);
        return;
      }
      const data = await response.json();
      // Check if data has expected structure
      if (!data.rows || !Array.isArray(data.rows)) {
        console.error('Unexpected HF API response structure:', data);
        return;
      }
      console.log(`Received ${data.rows.length} books from HF API`);
      this.preloadedBooks = data.rows
        .map((row) => {
          try {
            return this.processHFBookLazy(row.row);
          } catch (e) {
            // One malformed row should not discard the whole batch.
            console.warn('Error processing book:', e);
            return null;
          }
        })
        .filter((book) => book !== null);
      console.log(`Preloaded ${this.preloadedBooks.length} books (lazy validation)`);
    } catch (error) {
      console.warn('Failed to preload books:', error);
    }
  }

  /**
   * Wraps a raw dataset row in a book record without cleaning its text.
   *
   * Title and author come from the Project Gutenberg header when one can be
   * parsed, then from the row's own `title` / `author` fields, and only then
   * from the generic placeholders.
   *
   * @param {object} rowData Row payload; `text`, `title`, `author`, `id` and
   *   `language` are read when present.
   * @returns {object} Book record with `processed: false` and `text: null`.
   */
  processHFBookLazy(rowData) {
    // Minimal processing - defer text cleaning and validation until book is selected
    const rawText = rowData.text || '';
    // Do basic metadata extraction to get proper title/author
    const extractedMetadata = this.extractMetadata(rawText);
    // extractMetadata() answers with these placeholders when it finds
    // nothing, so a placeholder must not outrank the row's own metadata.
    const placeholderTitle = 'Classic Literature';
    const placeholderAuthor = 'Unknown Author';
    const title = extractedMetadata.title !== placeholderTitle
      ? extractedMetadata.title
      : rowData.title || placeholderTitle;
    const author = extractedMetadata.author !== placeholderAuthor
      ? extractedMetadata.author
      : rowData.author || placeholderAuthor;
    return {
      // `??` keeps a legitimate id of 0 instead of replacing it.
      id: rowData.id ?? Math.random().toString(36),
      title: title,
      author: author,
      rawText: rawText,
      text: null, // Will clean when needed
      language: rowData.language || 'en',
      source: 'project_gutenberg',
      processed: false
    };
  }

  /**
   * Cleans and validates a lazily stored book the first time it is needed.
   * The verdict is remembered on the book (`book.valid`), so repeated calls
   * are cheap and give the same answer.
   *
   * @param {object} book Record produced by processHFBookLazy(); mutated in
   *   place (`text`, `processed`, `valid`).
   * @returns {Promise<object|null>} The same book when it is usable for a
   *   cloze exercise, otherwise `null`.
   */
  async processBookOnDemand(book) {
    if (book.processed) {
      // A book that already failed validation must keep failing; only an
      // explicit `valid === false` written below marks such a book.
      return book.valid === false ? null : book;
    }
    console.log(`Processing "${book.title}" on demand...`);
    const startTime = Date.now();
    // Clean text when actually needed
    book.text = this.cleanProjectGutenbergText(book.rawText);
    book.processed = true;
    // Validate after processing
    book.valid = this.isValidForCloze(book);
    if (!book.valid) {
      console.log(`❌ "${book.title}" failed validation after ${Date.now() - startTime}ms`);
      return null;
    }
    console.log(`✅ "${book.title}" processed in ${Date.now() - startTime}ms`);
    return book;
  }

  /**
   * Strips Project Gutenberg boilerplate (licence header/footer, chapter
   * headings, page numbers, bracketed notes, emphasis markers) and leading
   * title-page lines from a raw book text.
   *
   * @param {string} text Raw text as delivered by the dataset.
   * @returns {string} Cleaned text, or `''` for empty input.
   */
  cleanProjectGutenbergText(text) {
    if (!text) return '';
    let cleaned = text;
    // Remove Project Gutenberg start markers and everything before
    const startPatterns = [
      /\*\*\* START OF .*? \*\*\*/i,
      /\*\*\*START OF .*?\*\*\*/i,
      /START OF THE PROJECT GUTENBERG/i,
      /GUTENBERG.*?EBOOK/i
    ];
    for (const pattern of startPatterns) {
      const match = cleaned.match(pattern);
      if (match) {
        const startIndex = match.index + match[0].length;
        // Skip to next line
        const nextLine = cleaned.indexOf('\n', startIndex);
        if (nextLine !== -1) {
          cleaned = cleaned.substring(nextLine + 1);
        }
        break; // Only the first pattern that matches is applied
      }
    }
    // Remove Project Gutenberg end markers and everything after
    const endPatterns = [
      /\*\*\* END OF .*? \*\*\*/i,
      /\*\*\*END OF .*?\*\*\*/i,
      /END OF THE PROJECT GUTENBERG/i
    ];
    for (const pattern of endPatterns) {
      const match = cleaned.match(pattern);
      if (match) {
        cleaned = cleaned.substring(0, match.index);
        break;
      }
    }
    // Remove common Project Gutenberg artifacts
    cleaned = cleaned
      .replace(/\r\n/g, '\n') // Normalize line endings
      .replace(/produced from images generously.*?\n/gi, '') // Remove scanning notes
      .replace(/\n\s*\n\s*\n+/g, '\n\n') // Remove excessive line breaks
      .replace(/^\s*CHAPTER.*$/gm, '') // Remove chapter headers
      .replace(/^\s*Chapter.*$/gm, '') // Remove chapter headers
      .replace(/^\s*\d+\s*$/gm, '') // Remove page numbers
      .replace(/^\s*\[.*?\]\s*$/gm, '') // Remove bracketed notes
      .replace(/^\s*_.*_\s*$/gm, '') // Remove italic notes
      .replace(/[_*]/g, '') // Remove underscores and asterisks
      .trim();
    // Find the actual start of narrative content
    const lines = cleaned.split('\n');
    let contentStart = 0;
    for (let i = 0; i < Math.min(50, lines.length); i++) {
      const line = lines[i].trim();
      // Skip empty lines, title pages, and metadata
      if (!line ||
          line.includes('Title:') ||
          line.includes('Author:') ||
          line.includes('Release Date:') ||
          line.includes('Language:') ||
          line.includes('Character set') ||
          line.includes('www.gutenberg') ||
          line.includes('Project Gutenberg') ||
          line.length < 20) {
        contentStart = i + 1;
        continue;
      }
      // Found actual content
      break;
    }
    // If every inspected line was skipped there is nothing left to slice to,
    // so the text is kept as it is.
    if (contentStart > 0 && contentStart < lines.length) {
      cleaned = lines.slice(contentStart).join('\n').trim();
    }
    return cleaned;
  }

  /**
   * Extracts title and author from a Project Gutenberg header.
   *
   * @param {string} text Raw book text.
   * @returns {{title: string, author: string}} Extracted values; fields that
   *   could not be found hold the placeholders `'Classic Literature'` and
   *   `'Unknown Author'`.
   */
  extractMetadata(text) {
    const metadata = { title: 'Classic Literature', author: 'Unknown Author' };
    if (!text) return metadata;
    // Look for the standard Project Gutenberg header format
    const firstLine = text.split('\n')[0].trim();
    // Parse the standard format: "The Project Gutenberg EBook of [TITLE], by [AUTHOR]"
    const pgMatch = firstLine.match(/^.*?The Project Gutenberg EBook of (.+?),\s*by\s+(.+?)$/i);
    if (pgMatch) {
      const title = pgMatch[1].trim();
      const author = pgMatch[2].trim();
      if (title && this.isValidTitle(title)) {
        metadata.title = this.cleanMetadataField(title);
      }
      if (author && this.isValidAuthor(author)) {
        metadata.author = this.cleanMetadataField(author);
      }
      return metadata;
    }
    // Fallback: Look for explicit Title: and Author: fields in first 50 lines
    const lines = text.split('\n').slice(0, 50);
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i].trim();
      if (line.startsWith('Title:')) {
        const title = line.replace('Title:', '').trim();
        if (title && title.length > 1) {
          metadata.title = this.cleanMetadataField(title);
        }
      } else if (line.startsWith('Author:')) {
        const author = line.replace('Author:', '').trim();
        if (author && author.length > 1) {
          metadata.author = this.cleanMetadataField(author);
        }
      }
    }
    return metadata;
  }

  /**
   * Removes bracketed annotations and collapses whitespace in a metadata
   * value.
   *
   * @param {string} field Raw title or author.
   * @returns {string} Normalized value.
   */
  cleanMetadataField(field) {
    return field
      .replace(/\[.*?\]/g, '') // Remove bracketed info
      .replace(/\s+/g, ' ') // Normalize whitespace
      .trim();
  }

  /**
   * @param {string} title Candidate title.
   * @returns {boolean} `true` when the value is 3-100 characters long and
   *   does not contain obvious boilerplate fragments.
   */
  isValidTitle(title) {
    if (!title || title.length < 3 || title.length > 100) return false;
    // Avoid fragments that are clearly not titles
    if (title.includes('Project Gutenberg') ||
        title.includes('www.') ||
        title.includes('produced from') ||
        title.includes('images generously')) return false;
    return true;
  }

  /**
   * @param {string} author Candidate author name.
   * @returns {boolean} `true` when the value is 3-50 characters long and
   *   does not contain obvious boilerplate fragments.
   */
  isValidAuthor(author) {
    if (!author || author.length < 3 || author.length > 50) return false;
    // Basic validation - should look like a name
    if (author.includes('Project Gutenberg') ||
        author.includes('www.') ||
        author.includes('produced from')) return false;
    return true;
  }

  /**
   * Decides whether a cleaned book looks like running prose suitable for a
   * cloze exercise (as opposed to an index, catalogue or fragment).
   *
   * @param {{title?: string, text?: string}} book Book with cleaned `text`.
   * @returns {boolean} `true` when every heuristic passes.
   */
  isValidForCloze(book) {
    if (!book.text) return false;
    const textLength = book.text.length;
    // Basic length criteria
    if (textLength < 2000) return false; // Minimum readable length
    if (textLength > 500000) return false; // Too long for performance
    // Check for excessive formatting (likely reference material)
    const lineBreakRatio = (book.text.match(/\n\n/g) || []).length / textLength;
    if (lineBreakRatio > 0.05) return false; // Fragmentation threshold
    // Ensure it has actual narrative content
    const sentenceCount = (book.text.match(/[.!?]+/g) || []).length;
    if (sentenceCount < 10) return false; // Sentence requirement
    // Sample text for quality check (first 5000 chars should be representative)
    const sampleText = book.text.substring(0, 5000);
    // Check for index/TOC patterns
    const indexPatterns = [
      'CONTENTS', 'INDEX', 'CHAPTER', 'Volume', 'Vol.',
      'Part I', 'Part II', 'BOOK I', 'APPENDIX'
    ];
    // The markers are literal strings, so regex metacharacters are escaped;
    // otherwise the dot in 'Vol.' would match any character ("volt", "volu").
    const escapeRegExp = (literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const indexCount = indexPatterns.reduce((count, pattern) =>
      count + (sampleText.match(new RegExp(escapeRegExp(pattern), 'gi')) || []).length, 0
    );
    const indexRatio = indexCount / (sampleText.split(/\s+/).length || 1);
    if (indexRatio > 0.05) {
      console.log(`❌ Book rejected - appears to be index/TOC: "${book.title}" (index ratio: ${Math.round(indexRatio * 100)}%)`);
      return false;
    }
    // Check for catalog/bibliography patterns
    if (book.title) {
      const lowerTitle = book.title.toLowerCase();
      const suspiciousWords = ['index', 'catalog', 'bibliography', 'contents'];
      if (suspiciousWords.some((word) => lowerTitle.includes(word))) {
        console.log(`❌ Book rejected - title suggests index/catalog: "${book.title}"`);
        return false;
      }
    }
    console.log(`Book validated: "${book.title}" (${textLength} chars, ${sentenceCount} sentences)`);
    return true;
  }

  /**
   * Picks a book that has not been served yet in this session.
   *
   * Order of preference: an unused preloaded HF book that passes validation,
   * then whatever getStreamingBook() yields, then an unused local book. Once
   * every local book has been used, the used-book record is cleared and the
   * selection starts over.
   *
   * @returns {Promise<object>} The selected book.
   * @throws {Error} When loadDataset() has not completed yet.
   */
  async getRandomBook() {
    if (!this.isLoaded) {
      throw new Error('Dataset not loaded');
    }
    // First, try to find a successfully processed HF book
    if (this.streamingEnabled && this.preloadedBooks.length > 0) {
      const availableHFBooks = this.preloadedBooks.filter(
        (book) => !this.usedBooks.has(this.getBookId(book))
      );
      for (const book of availableHFBooks) {
        const processedBook = await this.processBookOnDemand(book);
        if (processedBook) {
          this.usedBooks.add(this.getBookId(processedBook));
          console.log(`Using HF book: "${processedBook.title}"`);
          return processedBook;
        }
      }
      // If no HF books worked, try streaming
      const streamedBook = await this.getStreamingBook();
      if (streamedBook) {
        this.usedBooks.add(this.getBookId(streamedBook));
        return streamedBook;
      }
    }
    // Fallback to local samples
    const fallbackBooks = this.books.length > 0 ? this.books : this.getSampleBooks();
    const availableBooks = fallbackBooks.filter(
      (book) => !this.usedBooks.has(this.getBookId(book))
    );
    if (availableBooks.length > 0) {
      const randomIndex = Math.floor(Math.random() * availableBooks.length);
      const book = availableBooks[randomIndex];
      this.usedBooks.add(this.getBookId(book));
      console.log(`Using local book: "${book.title}"`);
      return book;
    }
    // If all books used, clear cache and start over. The retry terminates
    // because the fallback list is never empty once nothing counts as used.
    this.usedBooks.clear();
    console.log('All books used, cleared used book cache');
    return this.getRandomBook();
  }

  /**
   * @param {{title: string, author: string}} book
   * @returns {string} Lower-cased `title_author` key used to track which
   *   books were already served.
   */
  getBookId(book) {
    // Create unique ID from title and author to track duplicates
    return `${book.title}_${book.author}`.replace(/\s+/g, '_').toLowerCase();
  }

  /**
   * Returns one processed remote book: a random preloaded one when any are
   * available, otherwise a single row fetched directly from the API.
   *
   * @returns {Promise<object|null>} A validated book, or `null` when the
   *   chosen book fails validation or the request fails.
   */
  async getStreamingBook() {
    // Use preloaded books for immediate access
    if (this.preloadedBooks.length > 0) {
      const randomIndex = Math.floor(Math.random() * this.preloadedBooks.length);
      // Always go through processBookOnDemand(): it is a no-op for books that
      // are already processed and filters out those that failed validation.
      return await this.processBookOnDemand(this.preloadedBooks[randomIndex]);
    }
    // If no preloaded books, try to fetch directly
    try {
      const offset = Math.floor(Math.random() * 1000);
      const url = `${this.apiBase}/rows?dataset=${this.datasetName}&config=default&split=en&offset=${offset}&length=1`;
      const response = await fetch(url);
      if (response.ok) {
        const data = await response.json();
        if (data.rows && data.rows.length > 0) {
          const book = this.processHFBookLazy(data.rows[0].row);
          return await this.processBookOnDemand(book);
        }
      }
    } catch (error) {
      console.warn('Direct streaming failed:', error);
    }
    return null;
  }

  /**
   * @param {*} level Currently ignored; every level receives a random book.
   * @returns {Promise<object>} Same result as getRandomBook().
   */
  async getBookByLevelCriteria(level) {
    return await this.getRandomBook();
  }

  /**
   * @param {number|string} id Identifier to look up (strict equality).
   * @returns {object|undefined} First preloaded or local book with that id.
   */
  getBookById(id) {
    // Search in both preloaded and local books
    const allBooks = [...this.preloadedBooks, ...this.books];
    return allBooks.find((book) => book.id === id);
  }

  /**
   * Case-insensitive substring search over title and author.
   *
   * @param {string} query Search text; a falsy value returns every book.
   * @returns {Array<object>} Matching preloaded and local books.
   */
  searchBooks(query) {
    const allBooks = [...this.preloadedBooks, ...this.books];
    if (!query) return allBooks;
    const lowerQuery = query.toLowerCase();
    return allBooks.filter((book) =>
      book.title.toLowerCase().includes(lowerQuery) ||
      book.author.toLowerCase().includes(lowerQuery)
    );
  }

  /**
   * Health check for streaming status.
   *
   * @returns {{streamingEnabled: boolean, preloadedBooks: number, localBooks: number, totalAvailable: number, source: string}}
   */
  getStatus() {
    return {
      streamingEnabled: this.streamingEnabled,
      preloadedBooks: this.preloadedBooks.length,
      localBooks: this.books.length,
      totalAvailable: this.preloadedBooks.length + this.books.length,
      source: this.streamingEnabled ? 'HuggingFace Datasets' : 'Local Samples'
    };
  }

  /**
   * Refresh preloaded books cache with a new batch of 20 rows. Does nothing
   * while streaming is disabled.
   *
   * @returns {Promise<void>}
   */
  async refreshCache() {
    if (this.streamingEnabled) {
      await this.preloadBooks(20);
      console.log(`Cache refreshed: ${this.preloadedBooks.length} books`);
    }
  }
}
// Module default export: a single instance of the service, created at import time.
export default new HuggingFaceDatasetService();