Spaces:
Sleeping
Sleeping
File size: 3,939 Bytes
1153ecb 4e3915c f108df6 4e3915c f108df6 4e3915c f108df6 4e3915c f108df6 4e3915c 1153ecb f108df6 fd63293 f108df6 fd63293 3cf0646 f108df6 fd63293 f108df6 fd63293 f108df6 fd63293 f108df6 fd63293 f108df6 1153ecb 4e3915c f108df6 4e3915c f108df6 fd63293 f108df6 1153ecb 4e3915c 1153ecb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import gradio as gr
from datasets import load_dataset
import tempfile
import re
# Honorific abbreviations that end with a period but do NOT terminate a
# sentence; process_text uses this set to avoid inserting a line break
# after them.
TITLES = {"Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Sr.", "Jr."}
def is_latin(text):
    """Return True when *text* contains only ASCII characters.

    Despite the name, the check is for code points <= U+007F: any character
    outside the 7-bit ASCII range makes the function return False.
    """
    return re.search(r'[^\x00-\x7F]', text) is None
def clean_text(text):
    """Strip '**' markers and drop every sentence containing non-ASCII text.

    The input is split on whitespace that follows '.', '!' or '?'; each
    resulting sentence is kept only if it is pure ASCII. Kept sentences are
    re-joined with single spaces.
    """
    # Remove all occurrences of the literal marker '**'.
    without_markers = text.replace('**', '')
    # Sentence boundary: whitespace preceded by terminal punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', without_markers)
    kept = [s for s in sentences if re.search(r'[^\x00-\x7F]', s) is None]
    return ' '.join(kept)
def process_text(text, titles=None):
    """Reflow *text*: insert a newline after sentence-ending periods and
    replace each '### Simplified Version' marker with 'Chapter N'.

    Parameters:
        text: input string; tokenized on whitespace.
        titles: optional set of period-terminated abbreviations (e.g. 'Mr.')
            that must NOT trigger a line break. Defaults to the module-level
            TITLES set.

    Returns:
        The reflowed string with leading/trailing whitespace stripped.
        Chapter numbering starts at 3 (kept from the original code) and
        increments per marker.
    """
    if titles is None:
        titles = TITLES
    words = text.split()
    n = len(words)
    parts = []
    chapter_counter = 3  # first emitted chapter number (preserved behavior)
    i = 0
    while i < n:
        word = words[i]
        # '### Simplified Version' -> 'Chapter N'; consume all three tokens.
        # (The original blanked words[i+1] and words[i+2] but still visited
        # them, appending "" + " " and injecting stray spaces — fixed here.)
        if (word == "###" and i + 2 < n
                and words[i + 1] == "Simplified" and words[i + 2] == "Version"):
            parts.append(f"Chapter {chapter_counter} ")
            chapter_counter += 1
            i += 3
            continue
        if word in titles:
            # Abbreviations like 'Mr.' end with '.' but don't end a sentence.
            parts.append(word + " ")
        elif word.endswith('.'):
            # Sentence-ending period: break the line here. (The original also
            # tested `not word.endswith('."')`, but a token ending in '."'
            # can never end with '.', so that check was dead code.)
            parts.append(word + "\n")
        else:
            parts.append(word + " ")
        i += 1
    return "".join(parts).strip()
def combine_dataset_texts(dataset_name, split, text_column):
    """Load a Hugging Face dataset split, concatenate one text column,
    clean and reflow the result, and write it to a temporary .txt file.

    Parameters:
        dataset_name: Hub dataset identifier ('username/dataset-name').
        split: dataset split to load (e.g. 'train').
        text_column: name of the column containing the text.

    Returns:
        Path to the temporary file holding the processed text.

    Raises:
        gr.Error: if the column is missing or any processing step fails.
    """
    try:
        dataset = load_dataset(dataset_name, split=split)
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")
        # Join all rows into one string; datapoints are not separated.
        combined_text = " ".join(example[text_column] for example in dataset)
        # Drop non-ASCII sentences and '**' markers, then reflow.
        cleaned_text = clean_text(combined_text)
        processed_text = process_text(cleaned_text)
        # delete=False so the file survives for Gradio to serve after close.
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
            f.write(processed_text)
            return f.name
    except gr.Error:
        # Re-raise our own error unchanged; without this clause the generic
        # handler below caught it and double-wrapped the message.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing dataset: {str(e)}") from e
# Build the Gradio UI: dataset/split/column inputs, a submit button,
# and a downloadable output file.
with gr.Blocks() as demo:
    gr.Markdown("## Hugging Face Dataset Text Combiner")
    gr.Markdown("Combine all text files from a Hugging Face dataset into a single file")

    with gr.Row():
        dataset_input = gr.Textbox(label="Dataset Name",
                                   placeholder="username/dataset-name")
        split_input = gr.Textbox(label="Split", value="train")
        column_input = gr.Textbox(label="Text Column", value="text")

    submit_btn = gr.Button("Combine Texts")

    with gr.Row():
        output_file = gr.File(label="Combined Text File")
        # NOTE(review): declared but never written to by any event handler;
        # errors surface via gr.Error instead.
        error_out = gr.Textbox(label="Error Output", visible=False)

    # Wire the button to the processing function; also exposed via the API.
    submit_btn.click(
        fn=combine_dataset_texts,
        inputs=[dataset_input, split_input, column_input],
        outputs=output_file,
        api_name="combine_texts",
    )

if __name__ == "__main__":
    demo.launch()