Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14, 2025

Commit

78c94b5

verified ·

1 Parent(s): a845bcb

Upload app.py

Browse files

Files changed (1) hide show

app.py +27 -11

app.py CHANGED Viewed

@@ -370,7 +370,8 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
             'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
             'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
-            'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art'
         ]
         for word in common_words_fix:
             word_lower = word.lower()
@@ -420,7 +421,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
         # Handle cases where a word got split into multiple parts
-        multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art']
         for word in multi_split_words:
             word_lower = word.lower()
             # Create pattern for word split into individual letters with spaces
@@ -485,6 +486,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             # Fix "As s he" -> "As she"
             (r'\bAs\s+s\s+he\b', 'As she'),
             (r'\bas\s+s\s+he\b', 'as she'),
         ]
         for pattern, replacement in merged_fixes:
             generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
@@ -618,8 +625,8 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         generated_text = '\n'.join(cleaned_lines)
-        # Fix 5: Remove speaker names with no dialogue (e.g., "KING:\nEDWARD IV:" -> "EDWARD IV:")
-        # A speaker name should be followed by actual dialogue, not immediately by another speaker
         lines = generated_text.split('\n')
         final_lines = []
@@ -628,9 +635,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
             if speaker_match:
-                # Check if next non-empty line is another speaker (meaning this speaker has no dialogue)
                 has_dialogue = False
-                for j in range(i + 1, min(i + 3, len(lines))):  # Check next 3 lines (more aggressive)
                     next_line = lines[j].strip()
                     if not next_line:  # Skip empty lines
                         continue
@@ -639,8 +647,9 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
                         has_dialogue = True
                         break
                     # If next non-empty line IS a speaker, this speaker has no dialogue
-                    else:
                         # This speaker has no dialogue - skip it
                         break
                 if not has_dialogue:
@@ -707,11 +716,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
                 # Final check: if text doesn't end with punctuation and is not a speaker,
                 # try to find the last complete sentence
                 if generated_text.strip():
                     # Find the last complete sentence (ends with . ! ?)
                     # Split by sentences
                     sentences = re.split(r'([.!?]+)', generated_text)
-                    if len(sentences) > 1:
                         # Reconstruct, keeping only complete sentences
                         complete_text = ''
                         for i in range(0, len(sentences) - 1, 2):
@@ -719,9 +729,15 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
                                 complete_text += sentences[i] + sentences[i + 1]
                         # If we have complete sentences, use them; otherwise keep original
                         if complete_text.strip():
-                            # But check if we removed too much (more than 50% of text)
-                            if len(complete_text.strip()) > len(generated_text.strip()) * 0.3:
-                                generated_text = complete_text.strip()
         return generated_text
     except Exception as e:

             'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
             'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
             'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
+            'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
+            'again', 'government', 'honour', 'light', 'stands', 'fly'
         ]
         for word in common_words_fix:
             word_lower = word.lower()
         # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
         # Handle cases where a word got split into multiple parts
+        multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly']
         for word in multi_split_words:
             word_lower = word.lower()
             # Create pattern for word split into individual letters with spaces
             # Fix "As s he" -> "As she"
             (r'\bAs\s+s\s+he\b', 'As she'),
             (r'\bas\s+s\s+he\b', 'as she'),
+            # Fix "ag a in" -> "again" (multiple splits)
+            (r'\bag\s+a\s+in\b', 'again'),
+            (r'\bAg\s+a\s+in\b', 'Again'),
+            # Fix "ag a in" -> "again" (two-part split)
+            (r'\bag\s+a\s+in\b', 'again'),
+            (r'\bAg\s+a\s+in\b', 'Again'),
         ]
         for pattern, replacement in merged_fixes:
             generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
         generated_text = '\n'.join(cleaned_lines)
+        # Fix 5: Remove speaker names with no dialogue (e.g., "KING:\nEDWARD IV:" -> "EDWARD IV:", "First Citizen:\n\nCLARENCE:" -> "CLARENCE:")
+        # A speaker name should be followed by actual dialogue, not immediately by another speaker or empty lines
         lines = generated_text.split('\n')
         final_lines = []
             speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
             if speaker_match:
+                # Check if next non-empty line is another speaker or if there's no dialogue at all
                 has_dialogue = False
+                # Check up to 5 lines ahead (more generous to catch dialogue)
+                for j in range(i + 1, min(i + 6, len(lines))):
                     next_line = lines[j].strip()
                     if not next_line:  # Skip empty lines
                         continue
                         has_dialogue = True
                         break
                     # If next non-empty line IS a speaker, this speaker has no dialogue
+                    elif re.match(r'^([A-Z][A-Z\s]+?):\s*$', next_line):
                         # This speaker has no dialogue - skip it
+                        has_dialogue = False
                         break
                 if not has_dialogue:
                 # Final check: if text doesn't end with punctuation and is not a speaker,
                 # try to find the last complete sentence
+                # BUT: Be less aggressive - only remove if we have multiple sentences and last one is clearly incomplete
                 if generated_text.strip():
                     # Find the last complete sentence (ends with . ! ?)
                     # Split by sentences
                     sentences = re.split(r'([.!?]+)', generated_text)
+                    if len(sentences) > 3:  # Only if we have at least 2 complete sentences
                         # Reconstruct, keeping only complete sentences
                         complete_text = ''
                         for i in range(0, len(sentences) - 1, 2):
                                 complete_text += sentences[i] + sentences[i + 1]
                         # If we have complete sentences, use them; otherwise keep original
                         if complete_text.strip():
+                            # But check if we removed too much (more than 30% of text must remain)
+                            # AND the last sentence must be very short (likely incomplete)
+                            original_len = len(generated_text.strip())
+                            complete_len = len(complete_text.strip())
+                            if complete_len > original_len * 0.3:
+                                # Check if last sentence in original is very short (likely incomplete)
+                                last_sentence = sentences[-2] if len(sentences) >= 2 else ''
+                                if len(last_sentence.strip()) < 15:  # Very short last sentence
+                                    generated_text = complete_text.strip()
         return generated_text
     except Exception as e: