Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14

Commit

fe180fa

verified ·

1 Parent(s): 7360b49

Upload app.py

Browse files

Files changed (1) hide show

app.py +31 -18

app.py CHANGED Viewed

@@ -292,32 +292,39 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
         # Fix 3: Add space before character names (all caps words)
         generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
-        # Fix 4: Remove duplicate speaker names (e.g., "Shepherd:\n\nShepherd:" -> "Shepherd:")
-        # Pattern: Character name followed by colon, then newline(s), then same character name and colon
         lines = generated_text.split('\n')
         cleaned_lines = []
-        prev_speaker = None
-        prev_was_speaker = False
-        for line in lines:
             line_stripped = line.strip()
-            # Check if this line is a speaker name (various formats: "SHEPHERD:", "First Citizen:", "LADY MACBETH:")
-            # Pattern: Starts with capital letter(s), may have spaces, ends with colon, optionally followed by whitespace
             speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
             if speaker_match:
                 speaker = speaker_match.group(1).strip()
-                # If it's the same speaker as previous AND previous line was also a speaker, skip this duplicate
-                if speaker == prev_speaker and prev_was_speaker:
-                    continue  # Skip duplicate
-                prev_speaker = speaker
-                prev_was_speaker = True
                 cleaned_lines.append(line)
             else:
-                # Reset speaker tracking when we see actual dialogue (non-empty line that's not a speaker)
-                if line_stripped:  # Non-empty line that's not a speaker name
-                    prev_speaker = None
-                    prev_was_speaker = False
                 cleaned_lines.append(line)
         generated_text = '\n'.join(cleaned_lines)
@@ -325,8 +332,14 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
         # Fix 5: Remove multiple empty lines between speaker and dialogue
         generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
-        # Fix 6: Remove triple+ consecutive speaker names (edge case)
-        generated_text = re.sub(r'^([A-Z][A-Z\s]+?):\s*\n\1:\s*\n\1:\s*\n', r'\1:\n', generated_text, flags=re.MULTILINE)
         return generated_text
     except Exception as e:

         # Fix 3: Add space before character names (all caps words)
         generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
+        # Fix 4: Remove duplicate speaker names (e.g., "LEONTES:\n...\nLEONTES:" -> keep only first)
+        # More aggressive: drop a speaker line if the same name is among the last 5 recorded speakers (regardless of line distance)
         lines = generated_text.split('\n')
         cleaned_lines = []
+        speaker_history = []  # Track recent speakers with their line numbers
+        for i, line in enumerate(lines):
             line_stripped = line.strip()
+            # Check if this line is a speaker name
             speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
             if speaker_match:
                 speaker = speaker_match.group(1).strip()
+                # Check if this speaker is among the last 5 recorded speakers (history entries, not lines)
+                recent_speaker = False
+                for hist_speaker, hist_line_num in speaker_history[-5:]:
+                    if speaker == hist_speaker:
+                        recent_speaker = True
+                        break
+                if recent_speaker:
+                    # Skip this duplicate speaker
+                    continue
+                # Add to history
+                speaker_history.append((speaker, i))
+                # Keep only last 10 speakers in history
+                if len(speaker_history) > 10:
+                    speaker_history.pop(0)
                 cleaned_lines.append(line)
             else:
                 cleaned_lines.append(line)
         generated_text = '\n'.join(cleaned_lines)
         # Fix 5: Remove multiple empty lines between speaker and dialogue
         generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
+        # Fix 6: Remove any remaining consecutive duplicate speakers (final cleanup)
+        # Pattern: Same speaker name appearing on consecutive lines (with optional whitespace)
+        generated_text = re.sub(
+            r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
+            r'\1:\n',
+            generated_text,
+            flags=re.MULTILINE
+        )
         return generated_text
     except Exception as e: