File size: 3,939 Bytes
1153ecb
 
 
4e3915c
 
f108df6
 
 
 
 
 
 
4e3915c
 
f108df6
4e3915c
 
 
f108df6
4e3915c
f108df6
4e3915c
 
1153ecb
f108df6
fd63293
 
f108df6
 
fd63293
3cf0646
f108df6
 
 
 
fd63293
f108df6
 
fd63293
 
 
 
 
 
 
f108df6
fd63293
f108df6
fd63293
 
f108df6
1153ecb
 
 
 
 
 
 
 
 
4e3915c
 
 
f108df6
4e3915c
 
f108df6
fd63293
f108df6
1153ecb
 
 
4e3915c
1153ecb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import gradio as gr
from datasets import load_dataset
import tempfile
import re

# Honorific abbreviations that end with a period — consumed by process_text()
# so no sentence-ending newline is inserted after e.g. "Mr." or "Dr.".
TITLES = {"Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Sr.", "Jr."}

def is_latin(text):
    """Return True if *text* is pure ASCII.

    Note: despite the name, "Latin" here means ASCII (code points 0-127);
    accented Latin letters such as "é" are rejected too.
    """
    return text.isascii()

def clean_text(text):
    """Strip '**' markers and drop any sentence containing non-ASCII text."""
    # Remove every literal "**" (markdown bold markers).
    without_markers = text.replace('**', '')

    # Split on whitespace that follows sentence-ending punctuation,
    # then keep only the ASCII-pure sentences.
    parts = re.split(r'(?<=[.!?])\s+', without_markers)
    return ' '.join(part for part in parts if is_latin(part))

def process_text(text, titles=frozenset({"Mr.", "Mrs.", "Ms.", "Dr.",
                                         "Prof.", "Rev.", "Sr.", "Jr."})):
    """Reflow *text*: newline after sentence-ending periods; rename chapter markers.

    A newline is inserted after every word that ends in '.', except
    honorific titles (e.g. "Mr.") and words ending in '."' (a period
    inside a closing quote). Each '### Simplified Version' marker is
    replaced with 'Chapter N', where N increments per marker.

    Args:
        text: Input text; words are treated as whitespace-separated tokens.
        titles: Abbreviations after which no newline is inserted
            (defaults to the module's common honorifics).

    Returns:
        The processed text with leading/trailing whitespace stripped.
    """
    words = text.split()
    pieces = []
    # NOTE(review): counter starts at 3, not 1 — presumably the source
    # material already contains chapters 1-2; confirm against the data.
    chapter_counter = 3

    i = 0
    while i < len(words):
        word = words[i]
        # '### Simplified Version' -> 'Chapter N'. Consume all three marker
        # words by advancing the index. (The previous implementation blanked
        # words[i+1]/words[i+2] instead, and the blanked empty strings then
        # fell through to the default branch, appending stray spaces.)
        if (word == "###" and i + 2 < len(words)
                and words[i + 1] == "Simplified" and words[i + 2] == "Version"):
            pieces.append(f"Chapter {chapter_counter} ")
            chapter_counter += 1
            i += 3
            continue
        if word in titles:
            # Title abbreviation: keep the sentence flowing.
            pieces.append(word + " ")
        elif word.endswith('.'):
            # Sentence end: break the line here. A word ending in '."' has
            # '"' as its last character, so it never reaches this branch and
            # naturally keeps flowing — no explicit '."' check is needed.
            pieces.append(word + "\n")
        else:
            pieces.append(word + " ")
        i += 1

    # Drop the trailing space/newline left by the final word.
    return "".join(pieces).strip()

def combine_dataset_texts(dataset_name, split, text_column):
    """Load a HF dataset split, clean/reflow its text column, write a temp file.

    Args:
        dataset_name: Hub dataset id, e.g. "username/dataset-name".
        split: Split name to load (e.g. "train").
        text_column: Name of the column holding the text.

    Returns:
        Path of a temporary .txt file containing the processed text.

    Raises:
        gr.Error: If the column is missing or any processing step fails.
    """
    try:
        # Load the dataset from the Hugging Face Hub.
        dataset = load_dataset(dataset_name, split=split)

        # Verify the text column exists before iterating.
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")

        # Combine all datapoints into a single string.
        combined_text = " ".join(example[text_column] for example in dataset)

        # Remove non-ASCII sentences and '**' markers.
        cleaned_text = clean_text(combined_text)

        # Insert newlines after sentences and rename chapter markers.
        processed_text = process_text(cleaned_text)

        # delete=False so Gradio can serve the file after this scope exits.
        with tempfile.NamedTemporaryFile(mode="w", delete=False,
                                         suffix=".txt", encoding="utf-8") as f:
            f.write(processed_text)
            return f.name

    except gr.Error:
        # Bug fix: don't re-wrap our own gr.Error (e.g. the missing-column
        # message) inside a second "Error processing dataset" message.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing dataset: {str(e)}") from e

# Create Gradio interface: three text inputs -> button -> downloadable file.
with gr.Blocks() as demo:
    gr.Markdown("## Hugging Face Dataset Text Combiner")
    gr.Markdown("Combine all text files from a Hugging Face dataset into a single file")
    
    with gr.Row():
        # Dataset coordinates: hub id, split name, and the column to read.
        dataset_input = gr.Textbox(label="Dataset Name", 
                                 placeholder="username/dataset-name")
        split_input = gr.Textbox(label="Split", value="train")
        column_input = gr.Textbox(label="Text Column", value="text")
    
    submit_btn = gr.Button("Combine Texts")
    
    with gr.Row():
        output_file = gr.File(label="Combined Text File")
        # NOTE(review): error_out is never wired to any event handler —
        # errors surface via gr.Error popups from combine_dataset_texts
        # instead. Confirm whether this hidden textbox is still needed.
        error_out = gr.Textbox(label="Error Output", visible=False)

    # Run the combine pipeline on click; also exposed as API "combine_texts".
    submit_btn.click(
        fn=combine_dataset_texts,
        inputs=[dataset_input, split_input, column_input],
        outputs=output_file,
        api_name="combine_texts"
    )

if __name__ == "__main__":
    # Launch the Gradio app only when run as a script (not when imported).
    demo.launch()