Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14

Commit

e1bed2c

verified ·

1 Parent(s): 2a077dd

Upload app.py

Browse files

Files changed (1) hide show

app.py +89 -33

app.py CHANGED Viewed

@@ -321,40 +321,62 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 0: Remove the prompt from the beginning if it appears as a speaker name
         # This handles cases where user enters "First Citizen:" and model repeats it
-        prompt_stripped = prompt.strip().replace(':', '').strip()
         lines = generated_text.split('\n')
-        if lines:
-            first_line = lines[0].strip()
-            # Normalize both prompt and first line for comparison (remove colons, case-insensitive)
-            first_line_normalized = first_line.replace(':', '').strip().upper()
-            prompt_normalized = prompt_stripped.upper()
-            # If first line matches the prompt (case-insensitive, allowing for colon)
-            if first_line_normalized == prompt_normalized:
-                # Remove the first line (it's the prompt, not generated content)
-                generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
-                # Also check if the next line is also the same speaker (duplicate)
-                if generated_text.strip():
-                    lines = generated_text.split('\n')
-                    next_line = lines[0].strip() if lines else ''
-                    if next_line:
-                        next_line_normalized = next_line.replace(':', '').strip().upper()
-                        # If next line is also the same speaker, remove it too
-                        if next_line_normalized == prompt_normalized and re.match(r'^([A-Z][A-Z\s]+?):\s*$', next_line):
-                            generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
-                # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
-                if generated_text.strip():
-                    lines = generated_text.split('\n')
-                    first_line = lines[0].strip() if lines else ''
-                    # Check if first line is orphaned dialogue (starts with capital, has punctuation, but no speaker)
-                    if first_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
-                        # Check if it's dialogue-like (starts with capital, has punctuation)
-                        if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
-                            # Just remove the orphaned first line, don't add a speaker
-                            generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
@@ -385,7 +407,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
             'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
             'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
-            'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little'
         ]
         for word in common_words_fix:
             word_lower = word.lower()
@@ -435,7 +460,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
         # Handle cases where a word got split into multiple parts
-        multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little']
         for word in multi_split_words:
             word_lower = word.lower()
             # Create pattern for word split into individual letters with spaces
@@ -650,6 +675,37 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         generated_text = '\n'.join(normalized_lines)
         # Fix 3c: Fix dialogue that was incorrectly formatted as speaker names
         # Pattern: All caps lines ending with colon that are actually dialogue (not speakers)
         # Examples: "HENCE ARE YOUR HONOUR TO ENTER:" -> "HENCE ARE YOUR HONOUR TO ENTER."

         # Fix 0: Remove the prompt from the beginning if it appears as a speaker name
         # This handles cases where user enters "First Citizen:" and model repeats it
+        # Normalize prompt: remove colon, strip, convert to uppercase for comparison
+        prompt_normalized = prompt.strip().replace(':', '').strip().upper()
+        # Process all lines to find and remove prompt matches
         lines = generated_text.split('\n')
+        cleaned_lines = []
+        prompt_removed = False
+        for i, line in enumerate(lines):
+            line_stripped = line.strip()
+            # Skip empty lines at the start (but only if we haven't added any content yet)
+            if not line_stripped:
+                if not cleaned_lines:
+                    continue  # Skip leading empty lines
+                else:
+                    cleaned_lines.append(line)  # Keep empty lines after content starts
+                    continue
+            # Normalize line for comparison (remove colon, case-insensitive)
+            line_normalized = line_stripped.replace(':', '').strip().upper()
+            # Check if this line matches the prompt (case-insensitive, allowing for colon)
+            # Check if it's a speaker name format (all caps OR title case OR mixed case)
+            is_speaker_line = (re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped) or  # All caps: "FIRST CITIZEN:"
+                              re.match(r'^([A-Z][a-z]+(?:\s+[a-zA-Z]+)+):\s*$', line_stripped) or  # Title case: "First Citizen:"
+                              re.match(r'^([A-Z][A-Za-z\s]+?):\s*$', line_stripped))  # Mixed case: "First Citizen:" or "FIRST Citizen:"
+            # If this line matches the prompt (case-insensitive), remove it
+            # Be more aggressive: if it matches the prompt, remove it even if pattern doesn't match exactly
+            if line_normalized == prompt_normalized and not prompt_removed:
+                # Additional check: if it ends with colon, it's likely a speaker name
+                if line_stripped.endswith(':'):
+                    # This is the prompt appearing as a speaker - skip it
+                    prompt_removed = True
+                    continue
+                # Also remove if it's a speaker line pattern
+                elif is_speaker_line:
+                    prompt_removed = True
+                    continue
+            # Keep this line: it was not skipped above as the echoed prompt
+            cleaned_lines.append(line)
+        generated_text = '\n'.join(cleaned_lines)
+        # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
+        if generated_text.strip():
+            lines = generated_text.split('\n')
+            first_line = lines[0].strip() if lines else ''
+            # Check if first line is orphaned dialogue (starts with capital, has punctuation, but no speaker)
+            if first_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
+                # Check if it's dialogue-like (starts with capital, has punctuation)
+                if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
+                    # Just remove the orphaned first line, don't add a speaker
+                    generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
             'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
             'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
             'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
+            'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great',
+            'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'worthy', 'call', 'rod',
+            'respect', 'drunk', 'there', 'signior', 'gremio', 'compound', 'soft', 'unvish',
+            'know', 'edward'
         ]
         for word in common_words_fix:
             word_lower = word.lower()
         # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
         # Handle cases where a word got split into multiple parts
+        multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great', 'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'call', 'rod', 'respect', 'drunk', 'signior', 'gremio', 'compound', 'soft', 'unvish', 'know', 'edward', 'man', 'men']
         for word in multi_split_words:
             word_lower = word.lower()
             # Create pattern for word split into individual letters with spaces
         generated_text = '\n'.join(normalized_lines)
+        # Fix 0b: Remove prompt again after normalization (in case it was normalized to all caps)
+        # This handles cases where "First Citizen:" was normalized to "FIRST CITIZEN:"
+        prompt_normalized = prompt.strip().replace(':', '').strip().upper()
+        lines = generated_text.split('\n')
+        cleaned_lines_after_norm = []
+        prompt_removed_after_norm = False
+        for i, line in enumerate(lines):
+            line_stripped = line.strip()
+            # Skip empty lines at the start
+            if not line_stripped and not cleaned_lines_after_norm:
+                continue
+            # Normalize line for comparison (remove colon, case-insensitive)
+            line_normalized = line_stripped.replace(':', '').strip().upper()
+            # Check if this line matches the prompt (case-insensitive, allowing for colon)
+            # Also check if it's a speaker name format (all caps after normalization)
+            is_speaker_line = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
+            if is_speaker_line and line_normalized == prompt_normalized and not prompt_removed_after_norm:
+                # This is the prompt appearing as a speaker - skip it
+                prompt_removed_after_norm = True
+                continue
+            # Keep this line: it was not skipped above as a leading blank line or the echoed prompt
+            cleaned_lines_after_norm.append(line)
+        generated_text = '\n'.join(cleaned_lines_after_norm)
         # Fix 3c: Fix dialogue that was incorrectly formatted as speaker names
         # Pattern: All caps lines ending with colon that are actually dialogue (not speakers)
         # Examples: "HENCE ARE YOUR HONOUR TO ENTER:" -> "HENCE ARE YOUR HONOUR TO ENTER."