shwethd committed on
Commit
78c94b5
·
verified ·
1 Parent(s): a845bcb

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -11
app.py CHANGED
@@ -370,7 +370,8 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
370
  'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
371
  'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
372
  'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
373
- 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art'
 
374
  ]
375
  for word in common_words_fix:
376
  word_lower = word.lower()
@@ -420,7 +421,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
420
 
421
  # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
422
  # Handle cases where a word got split into multiple parts
423
- multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art']
424
  for word in multi_split_words:
425
  word_lower = word.lower()
426
  # Create pattern for word split into individual letters with spaces
@@ -485,6 +486,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
485
  # Fix "As s he" -> "As she"
486
  (r'\bAs\s+s\s+he\b', 'As she'),
487
  (r'\bas\s+s\s+he\b', 'as she'),
 
 
 
 
 
 
488
  ]
489
  for pattern, replacement in merged_fixes:
490
  generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
@@ -618,8 +625,8 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
618
 
619
  generated_text = '\n'.join(cleaned_lines)
620
 
621
- # Fix 5: Remove speaker names with no dialogue (e.g., "KING:\nEDWARD IV:" -> "EDWARD IV:")
622
- # A speaker name should be followed by actual dialogue, not immediately by another speaker
623
  lines = generated_text.split('\n')
624
  final_lines = []
625
 
@@ -628,9 +635,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
628
  speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
629
 
630
  if speaker_match:
631
- # Check if next non-empty line is another speaker (meaning this speaker has no dialogue)
632
  has_dialogue = False
633
- for j in range(i + 1, min(i + 3, len(lines))): # Check next 3 lines (more aggressive)
 
634
  next_line = lines[j].strip()
635
  if not next_line: # Skip empty lines
636
  continue
@@ -639,8 +647,9 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
639
  has_dialogue = True
640
  break
641
  # If next non-empty line IS a speaker, this speaker has no dialogue
642
- else:
643
  # This speaker has no dialogue - skip it
 
644
  break
645
 
646
  if not has_dialogue:
@@ -707,11 +716,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
707
 
708
  # Final check: if text doesn't end with punctuation and is not a speaker,
709
  # try to find the last complete sentence
 
710
  if generated_text.strip():
711
  # Find the last complete sentence (ends with . ! ?)
712
  # Split by sentences
713
  sentences = re.split(r'([.!?]+)', generated_text)
714
- if len(sentences) > 1:
715
  # Reconstruct, keeping only complete sentences
716
  complete_text = ''
717
  for i in range(0, len(sentences) - 1, 2):
@@ -719,9 +729,15 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
719
  complete_text += sentences[i] + sentences[i + 1]
720
  # If we have complete sentences, use them; otherwise keep original
721
  if complete_text.strip():
722
- # But check if we removed too much (more than 50% of text)
723
- if len(complete_text.strip()) > len(generated_text.strip()) * 0.3:
724
- generated_text = complete_text.strip()
 
 
 
 
 
 
725
 
726
  return generated_text
727
  except Exception as e:
 
370
  'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
371
  'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
372
  'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
373
+ 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
374
+ 'again', 'government', 'honour', 'light', 'stands', 'fly'
375
  ]
376
  for word in common_words_fix:
377
  word_lower = word.lower()
 
421
 
422
  # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
423
  # Handle cases where a word got split into multiple parts
424
+ multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly']
425
  for word in multi_split_words:
426
  word_lower = word.lower()
427
  # Create pattern for word split into individual letters with spaces
 
486
  # Fix "As s he" -> "As she"
487
  (r'\bAs\s+s\s+he\b', 'As she'),
488
  (r'\bas\s+s\s+he\b', 'as she'),
489
+ # Fix "ag a in" -> "again" (multiple splits)
490
+ (r'\bag\s+a\s+in\b', 'again'),
491
+ (r'\bAg\s+a\s+in\b', 'Again'),
492
+ # Fix "ag a in" -> "again" (NOTE: these two patterns duplicate the pair above; they do not cover a two-part split such as "ag ain")
493
+ (r'\bag\s+a\s+in\b', 'again'),
494
+ (r'\bAg\s+a\s+in\b', 'Again'),
495
  ]
496
  for pattern, replacement in merged_fixes:
497
  generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
 
625
 
626
  generated_text = '\n'.join(cleaned_lines)
627
 
628
+ # Fix 5: Remove speaker names with no dialogue (e.g., "KING:\nEDWARD IV:" -> "EDWARD IV:", "First Citizen:\n\nCLARENCE:" -> "CLARENCE:")
629
+ # A speaker name should be followed by actual dialogue, not immediately by another speaker or empty lines
630
  lines = generated_text.split('\n')
631
  final_lines = []
632
 
 
635
  speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
636
 
637
  if speaker_match:
638
+ # Check if next non-empty line is another speaker or if there's no dialogue at all
639
  has_dialogue = False
640
+ # Check up to 5 lines ahead (more generous to catch dialogue)
641
+ for j in range(i + 1, min(i + 6, len(lines))):
642
  next_line = lines[j].strip()
643
  if not next_line: # Skip empty lines
644
  continue
 
647
  has_dialogue = True
648
  break
649
  # If next non-empty line IS a speaker, this speaker has no dialogue
650
+ elif re.match(r'^([A-Z][A-Z\s]+?):\s*$', next_line):
651
  # This speaker has no dialogue - skip it
652
+ has_dialogue = False
653
  break
654
 
655
  if not has_dialogue:
 
716
 
717
  # Final check: if text doesn't end with punctuation and is not a speaker,
718
  # try to find the last complete sentence
719
+ # BUT: Be less aggressive - only remove if we have multiple sentences and last one is clearly incomplete
720
  if generated_text.strip():
721
  # Find the last complete sentence (ends with . ! ?)
722
  # Split by sentences
723
  sentences = re.split(r'([.!?]+)', generated_text)
724
+ if len(sentences) > 3: # Only if we have at least 2 complete sentences
725
  # Reconstruct, keeping only complete sentences
726
  complete_text = ''
727
  for i in range(0, len(sentences) - 1, 2):
 
729
  complete_text += sentences[i] + sentences[i + 1]
730
  # If we have complete sentences, use them; otherwise keep original
731
  if complete_text.strip():
732
+ # But check if we removed too much (more than 30% of text must remain)
733
+ # AND the last sentence must be very short (likely incomplete)
734
+ original_len = len(generated_text.strip())
735
+ complete_len = len(complete_text.strip())
736
+ if complete_len > original_len * 0.3:
737
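+ # Check if last sentence in original is very short (likely incomplete) - NOTE(review): with the capturing split above, sentences[-2] is the final punctuation run, not the trailing text (that is sentences[-1]), so this length check is almost always true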
738
+ last_sentence = sentences[-2] if len(sentences) >= 2 else ''
739
+ if len(last_sentence.strip()) < 15: # Very short last sentence
740
+ generated_text = complete_text.strip()
741
 
742
  return generated_text
743
  except Exception as e: