Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -370,7 +370,8 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 370 |
'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
|
| 371 |
'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
|
| 372 |
'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
|
| 373 |
-
'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art'
|
|
|
|
| 374 |
]
|
| 375 |
for word in common_words_fix:
|
| 376 |
word_lower = word.lower()
|
|
@@ -420,7 +421,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 420 |
|
| 421 |
# Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
|
| 422 |
# Handle cases where a word got split into multiple parts
|
| 423 |
-
multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art']
|
| 424 |
for word in multi_split_words:
|
| 425 |
word_lower = word.lower()
|
| 426 |
# Create pattern for word split into individual letters with spaces
|
|
@@ -485,6 +486,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 485 |
# Fix "As s he" -> "As she"
|
| 486 |
(r'\bAs\s+s\s+he\b', 'As she'),
|
| 487 |
(r'\bas\s+s\s+he\b', 'as she'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
]
|
| 489 |
for pattern, replacement in merged_fixes:
|
| 490 |
generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
|
|
@@ -618,8 +625,8 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 618 |
|
| 619 |
generated_text = '\n'.join(cleaned_lines)
|
| 620 |
|
| 621 |
-
# Fix 5: Remove speaker names with no dialogue (e.g., "KING:\nEDWARD IV:" -> "EDWARD IV:")
|
| 622 |
-
# A speaker name should be followed by actual dialogue, not immediately by another speaker
|
| 623 |
lines = generated_text.split('\n')
|
| 624 |
final_lines = []
|
| 625 |
|
|
@@ -628,9 +635,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 628 |
speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
|
| 629 |
|
| 630 |
if speaker_match:
|
| 631 |
-
# Check if next non-empty line is another speaker
|
| 632 |
has_dialogue = False
|
| 633 |
-
|
|
|
|
| 634 |
next_line = lines[j].strip()
|
| 635 |
if not next_line: # Skip empty lines
|
| 636 |
continue
|
|
@@ -639,8 +647,9 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 639 |
has_dialogue = True
|
| 640 |
break
|
| 641 |
# If next non-empty line IS a speaker, this speaker has no dialogue
|
| 642 |
-
|
| 643 |
# This speaker has no dialogue - skip it
|
|
|
|
| 644 |
break
|
| 645 |
|
| 646 |
if not has_dialogue:
|
|
@@ -707,11 +716,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 707 |
|
| 708 |
# Final check: if text doesn't end with punctuation and is not a speaker,
|
| 709 |
# try to find the last complete sentence
|
|
|
|
| 710 |
if generated_text.strip():
|
| 711 |
# Find the last complete sentence (ends with . ! ?)
|
| 712 |
# Split by sentences
|
| 713 |
sentences = re.split(r'([.!?]+)', generated_text)
|
| 714 |
-
if len(sentences) >
|
| 715 |
# Reconstruct, keeping only complete sentences
|
| 716 |
complete_text = ''
|
| 717 |
for i in range(0, len(sentences) - 1, 2):
|
|
@@ -719,9 +729,15 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 719 |
complete_text += sentences[i] + sentences[i + 1]
|
| 720 |
# If we have complete sentences, use them; otherwise keep original
|
| 721 |
if complete_text.strip():
|
| 722 |
-
# But check if we removed too much (more than
|
| 723 |
-
|
| 724 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
|
| 726 |
return generated_text
|
| 727 |
except Exception as e:
|
|
|
|
| 370 |
'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
|
| 371 |
'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
|
| 372 |
'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
|
| 373 |
+
'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
|
| 374 |
+
'again', 'government', 'honour', 'light', 'stands', 'fly'
|
| 375 |
]
|
| 376 |
for word in common_words_fix:
|
| 377 |
word_lower = word.lower()
|
|
|
|
| 421 |
|
| 422 |
# Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
|
| 423 |
# Handle cases where a word got split into multiple parts
|
| 424 |
+
multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly']
|
| 425 |
for word in multi_split_words:
|
| 426 |
word_lower = word.lower()
|
| 427 |
# Create pattern for word split into individual letters with spaces
|
|
|
|
| 486 |
# Fix "As s he" -> "As she"
|
| 487 |
(r'\bAs\s+s\s+he\b', 'As she'),
|
| 488 |
(r'\bas\s+s\s+he\b', 'as she'),
|
| 489 |
+
# Fix "ag a in" -> "again" (multiple splits)
|
| 490 |
+
(r'\bag\s+a\s+in\b', 'again'),
|
| 491 |
+
(r'\bAg\s+a\s+in\b', 'Again'),
|
| 492 |
+
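# Fix "ag a in" -> "again" (labelled two-part split; NOTE(review): these patterns repeat the three-part ones above, so a two-part split such as "ag ain" is not matched by them)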
|
| 493 |
+
(r'\bag\s+a\s+in\b', 'again'),
|
| 494 |
+
(r'\bAg\s+a\s+in\b', 'Again'),
|
| 495 |
]
|
| 496 |
for pattern, replacement in merged_fixes:
|
| 497 |
generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
|
|
|
|
| 625 |
|
| 626 |
generated_text = '\n'.join(cleaned_lines)
|
| 627 |
|
| 628 |
+
# Fix 5: Remove speaker names with no dialogue (e.g., "KING:\nEDWARD IV:" -> "EDWARD IV:", "First Citizen:\n\nCLARENCE:" -> "CLARENCE:")
|
| 629 |
+
# A speaker name should be followed by actual dialogue, not immediately by another speaker or empty lines
|
| 630 |
lines = generated_text.split('\n')
|
| 631 |
final_lines = []
|
| 632 |
|
|
|
|
| 635 |
speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
|
| 636 |
|
| 637 |
if speaker_match:
|
| 638 |
+
# Check if next non-empty line is another speaker or if there's no dialogue at all
|
| 639 |
has_dialogue = False
|
| 640 |
+
# Check up to 5 lines ahead (more generous to catch dialogue)
|
| 641 |
+
for j in range(i + 1, min(i + 6, len(lines))):
|
| 642 |
next_line = lines[j].strip()
|
| 643 |
if not next_line: # Skip empty lines
|
| 644 |
continue
|
|
|
|
| 647 |
has_dialogue = True
|
| 648 |
break
|
| 649 |
# If next non-empty line IS a speaker, this speaker has no dialogue
|
| 650 |
+
elif re.match(r'^([A-Z][A-Z\s]+?):\s*$', next_line):
|
| 651 |
# This speaker has no dialogue - skip it
|
| 652 |
+
has_dialogue = False
|
| 653 |
break
|
| 654 |
|
| 655 |
if not has_dialogue:
|
|
|
|
| 716 |
|
| 717 |
# Final check: if text doesn't end with punctuation and is not a speaker,
|
| 718 |
# try to find the last complete sentence
|
| 719 |
+
# BUT: Be less aggressive - only remove if we have multiple sentences and last one is clearly incomplete
|
| 720 |
if generated_text.strip():
|
| 721 |
# Find the last complete sentence (ends with . ! ?)
|
| 722 |
# Split by sentences
|
| 723 |
sentences = re.split(r'([.!?]+)', generated_text)
|
| 724 |
+
if len(sentences) > 3: # Only if we have at least 2 complete sentences
|
| 725 |
# Reconstruct, keeping only complete sentences
|
| 726 |
complete_text = ''
|
| 727 |
for i in range(0, len(sentences) - 1, 2):
|
|
|
|
| 729 |
complete_text += sentences[i] + sentences[i + 1]
|
| 730 |
# If we have complete sentences, use them; otherwise keep original
|
| 731 |
if complete_text.strip():
|
| 732 |
+
# Guard against removing too much: the kept text must be longer than 30% of the original text
|
| 733 |
+
# AND the last sentence must be very short (likely incomplete)
|
| 734 |
+
original_len = len(generated_text.strip())
|
| 735 |
+
complete_len = len(complete_text.strip())
|
| 736 |
+
if complete_len > original_len * 0.3:
|
| 737 |
+
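# Check if the last sentence in the original is very short (likely incomplete). NOTE(review): re.split with a capturing group alternates text and punctuation, so sentences[-2] looks like the final punctuation run rather than the trailing text (sentences[-1]) - confirm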
|
| 738 |
+
last_sentence = sentences[-2] if len(sentences) >= 2 else ''
|
| 739 |
+
if len(last_sentence.strip()) < 15: # Very short last sentence
|
| 740 |
+
generated_text = complete_text.strip()
|
| 741 |
|
| 742 |
return generated_text
|
| 743 |
except Exception as e:
|