shwethd committed on
Commit
fe180fa
·
verified ·
1 Parent(s): 7360b49

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -18
app.py CHANGED
@@ -292,32 +292,39 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
292
  # Fix 3: Add space before character names (all caps words)
293
  generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
294
 
295
- # Fix 4: Remove duplicate speaker names (e.g., "Shepherd:\n\nShepherd:" -> "Shepherd:")
296
- # Pattern: Character name followed by colon, then newline(s), then same character name and colon
297
  lines = generated_text.split('\n')
298
  cleaned_lines = []
299
- prev_speaker = None
300
- prev_was_speaker = False
301
 
302
- for line in lines:
303
  line_stripped = line.strip()
304
- # Check if this line is a speaker name (various formats: "SHEPHERD:", "First Citizen:", "LADY MACBETH:")
305
- # Pattern: Starts with capital letter(s), may have spaces, ends with colon, optionally followed by whitespace
306
  speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
307
 
308
  if speaker_match:
309
  speaker = speaker_match.group(1).strip()
310
- # If it's the same speaker as previous AND previous line was also a speaker, skip this duplicate
311
- if speaker == prev_speaker and prev_was_speaker:
312
- continue # Skip duplicate
313
- prev_speaker = speaker
314
- prev_was_speaker = True
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  cleaned_lines.append(line)
316
  else:
317
- # Reset speaker tracking when we see actual dialogue (non-empty line that's not a speaker)
318
- if line_stripped: # Non-empty line that's not a speaker name
319
- prev_speaker = None
320
- prev_was_speaker = False
321
  cleaned_lines.append(line)
322
 
323
  generated_text = '\n'.join(cleaned_lines)
@@ -325,8 +332,14 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
325
  # Fix 5: Remove multiple empty lines between speaker and dialogue
326
  generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
327
 
328
- # Fix 6: Remove triple+ consecutive speaker names (edge case)
329
- generated_text = re.sub(r'^([A-Z][A-Z\s]+?):\s*\n\1:\s*\n\1:\s*\n', r'\1:\n', generated_text, flags=re.MULTILINE)
 
 
 
 
 
 
330
 
331
  return generated_text
332
  except Exception as e:
 
292
  # Fix 3: Add space before character names (all caps words)
293
  generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
294
 
295
+ # Fix 4: Remove duplicate speaker names (e.g., "LEONTES:\n...\nLEONTES:" -> keep only first)
296
+ # More aggressive: skip a speaker line if the same name is among the last 5 speakers recorded in history (line distance is not checked)
297
  lines = generated_text.split('\n')
298
  cleaned_lines = []
299
+ speaker_history = [] # Track recent speakers with their line numbers
 
300
 
301
+ for i, line in enumerate(lines):
302
  line_stripped = line.strip()
303
+ # Check if this line is a speaker name
 
304
  speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
305
 
306
  if speaker_match:
307
  speaker = speaker_match.group(1).strip()
308
+
309
+ # Check if this speaker appeared recently (among the last 5 recorded speakers)
310
+ recent_speaker = False
311
+ for hist_speaker, hist_line_num in speaker_history[-5:]:
312
+ if speaker == hist_speaker:
313
+ recent_speaker = True
314
+ break
315
+
316
+ if recent_speaker:
317
+ # Skip this duplicate speaker
318
+ continue
319
+
320
+ # Add to history
321
+ speaker_history.append((speaker, i))
322
+ # Keep only last 10 speakers in history
323
+ if len(speaker_history) > 10:
324
+ speaker_history.pop(0)
325
+
326
  cleaned_lines.append(line)
327
  else:
 
 
 
 
328
  cleaned_lines.append(line)
329
 
330
  generated_text = '\n'.join(cleaned_lines)
 
332
  # Fix 5: Remove multiple empty lines between speaker and dialogue
333
  generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
334
 
335
+ # Fix 6: Remove any remaining consecutive duplicate speakers (final cleanup)
336
+ # Pattern: Same speaker name appearing on consecutive lines (with optional whitespace)
337
+ generated_text = re.sub(
338
+ r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
339
+ r'\1:\n',
340
+ generated_text,
341
+ flags=re.MULTILINE
342
+ )
343
 
344
  return generated_text
345
  except Exception as e: