shwethd committed on
Commit
e1bed2c
·
verified ·
1 Parent(s): 2a077dd

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -33
app.py CHANGED
@@ -321,40 +321,62 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
321
 
322
  # Fix 0: Remove the prompt from the beginning if it appears as a speaker name
323
  # This handles cases where user enters "First Citizen:" and model repeats it
324
- prompt_stripped = prompt.strip().replace(':', '').strip()
 
 
 
325
  lines = generated_text.split('\n')
 
 
326
 
327
- if lines:
328
- first_line = lines[0].strip()
329
- # Normalize both prompt and first line for comparison (remove colons, case-insensitive)
330
- first_line_normalized = first_line.replace(':', '').strip().upper()
331
- prompt_normalized = prompt_stripped.upper()
332
 
333
- # If first line matches the prompt (case-insensitive, allowing for colon)
334
- if first_line_normalized == prompt_normalized:
335
- # Remove the first line (it's the prompt, not generated content)
336
- generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
337
-
338
- # Also check if the next line is also the same speaker (duplicate)
339
- if generated_text.strip():
340
- lines = generated_text.split('\n')
341
- next_line = lines[0].strip() if lines else ''
342
- if next_line:
343
- next_line_normalized = next_line.replace(':', '').strip().upper()
344
- # If next line is also the same speaker, remove it too
345
- if next_line_normalized == prompt_normalized and re.match(r'^([A-Z][A-Z\s]+?):\s*$', next_line):
346
- generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
347
-
348
- # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
349
- if generated_text.strip():
350
- lines = generated_text.split('\n')
351
- first_line = lines[0].strip() if lines else ''
352
- # Check if first line is orphaned dialogue (starts with capital, has punctuation, but no speaker)
353
- if first_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
354
- # Check if it's dialogue-like (starts with capital, has punctuation)
355
- if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
356
- # Just remove the orphaned first line, don't add a speaker
357
- generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
360
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
@@ -385,7 +407,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
385
  'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
386
  'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
387
  'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
388
- 'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little'
 
 
 
389
  ]
390
  for word in common_words_fix:
391
  word_lower = word.lower()
@@ -435,7 +460,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
435
 
436
  # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
437
  # Handle cases where a word got split into multiple parts
438
- multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little']
439
  for word in multi_split_words:
440
  word_lower = word.lower()
441
  # Create pattern for word split into individual letters with spaces
@@ -650,6 +675,37 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
650
 
651
  generated_text = '\n'.join(normalized_lines)
652
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
653
  # Fix 3c: Fix dialogue that was incorrectly formatted as speaker names
654
  # Pattern: All caps lines ending with colon that are actually dialogue (not speakers)
655
  # Examples: "HENCE ARE YOUR HONOUR TO ENTER:" -> "HENCE ARE YOUR HONOUR TO ENTER."
 
321
 
322
  # Fix 0: Remove the prompt from the beginning if it appears as a speaker name
323
  # This handles cases where user enters "First Citizen:" and model repeats it
324
+ # Normalize prompt: remove colon, strip, convert to uppercase for comparison
325
+ prompt_normalized = prompt.strip().replace(':', '').strip().upper()
326
+
327
+ # Process all lines to find and remove prompt matches
328
  lines = generated_text.split('\n')
329
+ cleaned_lines = []
330
+ prompt_removed = False
331
 
332
+ for i, line in enumerate(lines):
333
+ line_stripped = line.strip()
 
 
 
334
 
335
+ # Skip empty lines at the start (but only if we haven't added any content yet)
336
+ if not line_stripped:
337
+ if not cleaned_lines:
338
+ continue # Skip leading empty lines
339
+ else:
340
+ cleaned_lines.append(line) # Keep empty lines after content starts
341
+ continue
342
+
343
+ # Normalize line for comparison (remove colon, case-insensitive)
344
+ line_normalized = line_stripped.replace(':', '').strip().upper()
345
+
346
+ # Check if this line matches the prompt (case-insensitive, allowing for colon)
347
+ # Check if it's a speaker name format (all caps OR title case OR mixed case)
348
+ is_speaker_line = (re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped) or # All caps: "FIRST CITIZEN:"
349
+ re.match(r'^([A-Z][a-z]+(?:\s+[a-zA-Z]+)+):\s*$', line_stripped) or # Title case: "First Citizen:"
350
+ re.match(r'^([A-Z][A-Za-z\s]+?):\s*$', line_stripped)) # Mixed case: "First Citizen:" or "FIRST Citizen:"
351
+
352
+ # If this line matches the prompt (case-insensitive), remove it
353
+ # Be more aggressive: a trailing colon alone is enough to drop a prompt-matching line, even if no speaker regex matches
354
+ if line_normalized == prompt_normalized and not prompt_removed:
355
+ # Additional check: if it ends with colon, it's likely a speaker name
356
+ if line_stripped.endswith(':'):
357
+ # This is the prompt appearing as a speaker - skip it
358
+ prompt_removed = True
359
+ continue
360
+ # Also remove if it's a speaker line pattern
361
+ elif is_speaker_line:
362
+ prompt_removed = True
363
+ continue
364
+
365
+ # Keep every line that was not skipped above (regardless of whether the prompt has been removed yet)
366
+ cleaned_lines.append(line)
367
+
368
+ generated_text = '\n'.join(cleaned_lines)
369
+
370
+ # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
371
+ if generated_text.strip():
372
+ lines = generated_text.split('\n')
373
+ first_line = lines[0].strip() if lines else ''
374
+ # Check if first line is orphaned dialogue (starts with capital, has punctuation, but no speaker)
375
+ if first_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
376
+ # Check if it's dialogue-like (starts with capital, has punctuation)
377
+ if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
378
+ # Just remove the orphaned first line, don't add a speaker
379
+ generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
380
 
381
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
382
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
 
407
  'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
408
  'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
409
  'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
410
+ 'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great',
411
+ 'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'worthy', 'call', 'rod',
412
+ 'respect', 'drunk', 'there', 'signior', 'gremio', 'compound', 'soft', 'unvish',
413
+ 'know', 'edward'
414
  ]
415
  for word in common_words_fix:
416
  word_lower = word.lower()
 
460
 
461
  # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
462
  # Handle cases where a word got split into multiple parts
463
+ multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great', 'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'call', 'rod', 'respect', 'drunk', 'signior', 'gremio', 'compound', 'soft', 'unvish', 'know', 'edward', 'man', 'men']
464
  for word in multi_split_words:
465
  word_lower = word.lower()
466
  # Create pattern for word split into individual letters with spaces
 
675
 
676
  generated_text = '\n'.join(normalized_lines)
677
 
678
+ # Fix 0b: Remove prompt again after normalization (in case it was normalized to all caps)
679
+ # This handles cases where "First Citizen:" was normalized to "FIRST CITIZEN:"
680
+ prompt_normalized = prompt.strip().replace(':', '').strip().upper()
681
+ lines = generated_text.split('\n')
682
+ cleaned_lines_after_norm = []
683
+ prompt_removed_after_norm = False
684
+
685
+ for i, line in enumerate(lines):
686
+ line_stripped = line.strip()
687
+
688
+ # Skip empty lines at the start
689
+ if not line_stripped and not cleaned_lines_after_norm:
690
+ continue
691
+
692
+ # Normalize line for comparison (remove colon, case-insensitive)
693
+ line_normalized = line_stripped.replace(':', '').strip().upper()
694
+
695
+ # Check if this line matches the prompt (case-insensitive, allowing for colon)
696
+ # Also check if it's a speaker name format (all caps after normalization)
697
+ is_speaker_line = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
698
+
699
+ if is_speaker_line and line_normalized == prompt_normalized and not prompt_removed_after_norm:
700
+ # This is the prompt appearing as a speaker - skip it
701
+ prompt_removed_after_norm = True
702
+ continue
703
+
704
+ # Keep every line that was not skipped above (regardless of whether the prompt has been removed yet)
705
+ cleaned_lines_after_norm.append(line)
706
+
707
+ generated_text = '\n'.join(cleaned_lines_after_norm)
708
+
709
  # Fix 3c: Fix dialogue that was incorrectly formatted as speaker names
710
  # Pattern: All caps lines ending with colon that are actually dialogue (not speakers)
711
  # Examples: "HENCE ARE YOUR HONOUR TO ENTER:" -> "HENCE ARE YOUR HONOUR TO ENTER."