Spaces:

coroianpetruta
/

enro-rlhf

Sleeping

coroianpetruta committed on Jul 17

Commit

2f1571b

•

1 Parent(s): c9cd7db

Improved random sentence generator

Files changed (1) hide show

app.py CHANGED Viewed

@@ -44,28 +44,34 @@ dataset = load_dataset("daily_dialog", trust_remote_code=True)
 import re
 dialogs = dataset["train"]
-# Function to clean extra spaces around punctuation marks
-def clean_sentence(sentence):
-    # Remove space before punctuation
-    sentence = re.sub(r'\s+([?.!,"\'-])', r'\1', sentence)
-    # Remove space after punctuation
-    sentence = re.sub(r'([?.!,"\'-])\s+', r'\1 ', sentence)
-    sentence = sentence.strip()
-    return sentence
-# Assuming dialogues is a list of lists, where each inner list contains sentences of a dialogue
-# Example: dialogues = [["Hello, how are you?", "I'm fine, thank you!"], ["What's your name?", "My name is John."]]
 # Function to randomly select one sentence from the dataset
 def get_random_sentence():
-    # Select a random dialogue
-    random_dialogue = random.choice(dialogs['dialog'])
-    # Select a random sentence from the chosen dialogue
-    random_sentence = random.choice(random_dialogue)
-    # Clean the sentence
-    clean_random_sentence = clean_sentence(random_sentence)
-    return clean_random_sentence

 import re
 dialogs = dataset["train"]
+def flatten(xss):
+    return [x for xs in xss for x in xs]
+def split_keep_delimiters(s):
+    # Remove spaces before dots, exclamation points, commas, and question marks
+    s = re.sub(r'\s+([.,!?])', r'\1', s)
+    # Remove spaces before and after apostrophes
+    s = re.sub(r"\s*['’]\s*", r"'", s)
+    # Use re.findall to split by the delimiters while keeping them
+    parts = re.findall(r'[^.!?\s][^.!?]*[.!?]', s)
+    parts = [part.capitalize() for part in parts]
+    return parts
+random_sentences = flatten(dialogs["dialog"])
+random_sentences_stripped = []
+for line in random_sentences:
+    sentences = split_keep_delimiters(line)
+    for sentence in sentences:
+        random_sentences_stripped.append(sentence)
 # Function to randomly select one sentence from the dataset
 def get_random_sentence():
+    return random.choice(random_sentences_stripped)