coroianpetruta committed on
Commit
2f1571b
1 Parent(s): c9cd7db

Improved random sentence generator

Browse files
Files changed (1) hide show
  1. app.py +25 -19
app.py CHANGED
@@ -44,28 +44,34 @@ dataset = load_dataset("daily_dialog", trust_remote_code=True)
44
  import re
45
 
46
  dialogs = dataset["train"]
47
- # Function to clean extra spaces around punctuation marks
48
def clean_sentence(sentence):
    """Tidy the whitespace around punctuation marks in *sentence*.

    Whitespace directly before one of ``? . ! , " ' -`` is dropped, any run
    of whitespace directly after one of them is collapsed to a single space,
    and the result is stripped of leading/trailing whitespace.
    """
    # Pull each punctuation mark up against the preceding word.
    tightened = re.sub(r'\s+([?.!,"\'-])', r'\1', sentence)
    # Leave exactly one space after a punctuation mark that had whitespace.
    spaced = re.sub(r'([?.!,"\'-])\s+', r'\1 ', tightened)
    return spaced.strip()
55
-
56
- # Assuming dialogues is a list of lists, where each inner list contains sentences of a dialogue
57
- # Example: dialogues = [["Hello, how are you?", "I'm fine, thank you!"], ["What's your name?", "My name is John."]]
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  # Function to randomly select one sentence from the dataset
60
  def get_random_sentence():
61
- # Select a random dialogue
62
- random_dialogue = random.choice(dialogs['dialog'])
63
- # Select a random sentence from the chosen dialogue
64
- random_sentence = random.choice(random_dialogue)
65
-
66
- # Clean the sentence
67
- clean_random_sentence = clean_sentence(random_sentence)
68
- return clean_random_sentence
69
 
70
 
71
 
 
44
  import re
45
 
46
  dialogs = dataset["train"]
47
+
48
def flatten(xss):
    """Return one list holding the items of every inner iterable, in order."""
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat
50
+
51
def split_keep_delimiters(s):
    """Normalise spacing in *s* and split it into punctuated sentences.

    Spaces before ``.``, ``,``, ``!`` and ``?`` are removed, and any
    whitespace around an apostrophe (straight or curly) is dropped, with the
    apostrophe itself normalised to ``'``. The text is then cut into
    sentences that each end in ``.``, ``!`` or ``?``; trailing text without
    a terminator is discarded.

    Args:
        s: Raw utterance text.

    Returns:
        A list of sentences, each with its first character upper-cased and
        its terminating punctuation kept.
    """
    # Remove spaces before dots, exclamation points, commas, and question marks
    s = re.sub(r'\s+([.,!?])', r'\1', s)

    # Remove spaces before and after apostrophes
    s = re.sub(r"\s*['’]\s*", r"'", s)

    # Use re.findall to split by the delimiters while keeping them
    parts = re.findall(r'[^.!?\s][^.!?]*[.!?]', s)

    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest of the sentence, mangling "I" and proper nouns.
    # Every match has at least two characters, so indexing is safe.
    return [part[0].upper() + part[1:] for part in parts]
61
+
62
+
63
# Build the pool of individual sentences once, at import time: every
# utterance of every dialogue is normalised and split into sentences.
random_sentences = flatten(dialogs["dialog"])
random_sentences_stripped = []
for utterance in random_sentences:
    random_sentences_stripped.extend(split_keep_delimiters(utterance))
69
+
70
+
71
 
72
  # Function to randomly select one sentence from the dataset
73
  def get_random_sentence():
74
+ return random.choice(random_sentences_stripped)
 
 
 
 
 
 
 
75
 
76
 
77