# docs2datasets
#
# Sleeping
#
# File size: 5,899 Bytes

import logging
from pathlib import Path

import gradio as gr
from datasets import Dataset
from gradio_log import Log
from huggingface_hub import DatasetCard
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode
from tqdm.auto import tqdm

# Path of the log file. The same path is handed to the ``Log`` UI component
# further down, so the file is created up front.
log_file = "logs.txt"
Path(log_file).touch(exist_ok=True)

# Send INFO-and-above records from the root logger to the log file.
# ``basicConfig(filename=...)`` installs a FileHandler itself when the root
# logger has no handlers yet, so adding a second FileHandler unconditionally
# made every record appear twice in the file.
logging.basicConfig(filename=log_file, level=logging.INFO)
_root_logger = logging.getLogger()
_log_path = Path(log_file).absolute()
if not any(
    isinstance(_handler, logging.FileHandler)
    and Path(_handler.baseFilename) == _log_path
    for _handler in _root_logger.handlers
):
    # basicConfig is a no-op when the root logger was already configured
    # (e.g. by an imported library); keep writing to the file in that case.
    _root_logger.addHandler(logging.FileHandler(log_file))


def load_corpus(files, chunk_size=256, chunk_overlap=0, verbose=True):
    """Read the given files and split their text into chunks.

    Args:
        files: Input file paths, passed to ``SimpleDirectoryReader`` as
            ``input_files``.
        chunk_size: Chunk size handed to ``SentenceSplitter``.
        chunk_overlap: Chunk overlap handed to ``SentenceSplitter``.
        verbose: When true, show a Gradio info toast, print progress
            messages and display progress bars; when false, stay quiet.

    Returns:
        A dict mapping each node id to its text content (metadata
        excluded). Nodes whose content is empty are left out.
    """
    if verbose:
        gr.Info("Loading files...")
    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    parser = SentenceSplitter.from_defaults(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    nodes = parser.get_nodes_from_documents(documents, show_progress=verbose)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    # The progress bar follows ``verbose`` like every other progress output
    # in this function (it used to be shown unconditionally).
    corpus = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in tqdm(nodes, disable=not verbose)
    }
    # remove empty docs
    return {node_id: text for node_id, text in corpus.items() if text}


def upload_file(
    files,
    chunk_size: int = 256,
    chunk_overlap: int = 0,
    hub_id: str = None,
    private: bool = False,
    oauth_token: gr.OAuthToken = None,
):
    """Chunk the uploaded files into a dataset and optionally push it to the Hub.

    Args:
        files: Uploaded files; each item is either an object exposing the
            file path as ``.name`` or a plain path string.
        chunk_size: Chunk size forwarded to ``load_corpus``.
        chunk_overlap: Chunk overlap forwarded to ``load_corpus``.
        hub_id: Repository id to push the dataset to. When empty, nothing
            is uploaded.
        private: Whether the pushed dataset repo should be private.
        oauth_token: Token object of the logged-in user; required when
            ``hub_id`` is given.

    Returns:
        A tuple ``(dataframe, message)``: the dataset as a pandas DataFrame
        and a Markdown summary of what was created/uploaded.

    Raises:
        gr.Error: If ``hub_id`` is given but the user is not logged in.
    """
    # Fail fast: check the login before doing any parsing work, instead of
    # processing the whole corpus and only then rejecting the request.
    if hub_id and oauth_token is None:
        raise gr.Error("Please login to Hugging Face Hub to push to hub")

    # The number inputs may deliver floats; the splitter settings are integers.
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)

    print("loading files")
    # Accept both file wrappers (path in ``.name``) and plain path strings.
    file_paths = [getattr(file, "name", file) for file in files]
    print("parsing into sentences")
    corpus = load_corpus(file_paths, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print("Creating dataset")
    # Materialise the dict views into lists so the columns are real sequences.
    dataset = Dataset.from_dict(
        {"ids": list(corpus.keys()), "texts": list(corpus.values())}
    )
    message = f"Dataset created has: \n - {len(dataset)} rows"
    if hub_id:
        gr.Info("Uploading to Hugging Face Hub")
        dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
        update_dataset_card(hub_id, oauth_token.token, chunk_size, chunk_overlap)
        # Dataset repos live under /datasets/ on the Hub; the link previously
        # lacked that prefix and its closing parenthesis.
        message += (
            f"\n\nUploaded to [{hub_id}](https://huggingface.co/datasets/{hub_id})"
        )

    return dataset.to_pandas(), message


def update_dataset_card(
    hub_id,
    token,
    chunk_size,
    chunk_overlap,
):
    """Fill in the dataset card of ``hub_id`` if it has no text yet.

    Loads the card from the Hub and, only when its text is empty, appends a
    short description mentioning the chunk settings, adds the
    ``corpus-creator`` tag and pushes the card back. A card that already has
    text is left untouched.

    Args:
        hub_id: Repository id of the dataset whose card is updated.
        token: Token used for loading and pushing the card.
        chunk_size: Chunk size reported in the description.
        chunk_overlap: Chunk overlap reported in the description.
    """
    card = DatasetCard.load(hub_id, token=token)
    if card.text:
        # Existing card text is never overwritten.
        return

    # Template description; assembled from adjacent literals, the resulting
    # text (including the line break and indentation) is unchanged.
    template_text = (
        "This dataset was created using [Corpus Creator]"
        "(https://huggingface.co/spaces/davanstrien/corpus-creator). "
        "This dataset was created by parsing a corpus of text files into "
        "chunks of sentences using Llama Index.\n"
        "        This processing was done with a chunk size of "
        f"{chunk_size} and a chunk overlap of {chunk_overlap}."
    )
    card.text += template_text

    current_tags = card.data.get("tags", [])
    current_tags.append("corpus-creator")
    card.data["tags"] = current_tags

    card.push_to_hub(hub_id, token=token)


# Markdown introduction rendered in the UI via ``gr.Markdown(description)``.
# The example-dataset link uses an absolute Hub URL: a bare repo id is treated
# as a relative link by the Markdown renderer and does not reach the dataset.
description = """
Corpus Creator is a tool designed to help you easily convert a collection of text files into a dataset suitable for various natural language processing (NLP) tasks.
In particular the app is focused on splitting texts into chunks of a specified size and overlap. This can be useful for preparing data for synthetic data generation, pipelines or annotation tasks.

See an [example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created using this tool starting from a collection of plain text files. 

The resulting text chunks are stored in a dataset that can be previewed and uploaded to the Hugging Face Hub for easy sharing and access by the community.
The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencesplitter#sentencesplitter) classes. 

### Usage:
- Login: Start by logging in to your Hugging Face account using the provided login button.
- Set Parameters: Customize the chunk size and overlap according to your requirements.
- Upload Files: Use the upload button to load file(s) for processing. 
- Preview Dataset: View the created dataset in a dataframe format before uploading it to the Hugging Face Hub.
- Upload to Hub: Optionally, specify the Hub ID and choose whether to make the dataset private before pushing it to the Hugging Face Hub."""

# Build the UI. Components are laid out in the order they are created inside
# the Blocks context, so statement order here is the page layout.
with gr.Blocks() as demo:
    # Page title and tagline.
    gr.HTML(
        """<h1 style='text-align: center;'> Corpus Creator</h1>
        <center><i> &#128193; From random files to a Hugging Face dataset in a single step &#128193; </i></center>"""
    )
    # Introductory text and usage instructions.
    gr.Markdown(description)
    # Row 1: login button next to the optional Hub repository id.
    with gr.Row():
        gr.LoginButton()
        with gr.Column():
            gr.Markdown(
                "To upload to the Hub, add an ID for where you want to push the dataset"
            )
            hub_id = gr.Textbox(value=None, label="Hub ID")
    # Row 2: chunking parameters and repo visibility.
    with gr.Row():
        chunk_size = gr.Number(
            256,
            label="Chunk size (size to split text into)",
            minimum=10,
            maximum=4096,
            step=1,
        )
        chunk_overlap = gr.Number(
            0,
            label="Chunk overlap (overlap size between chunks)",
            minimum=0,
            maximum=4096,
            step=1,
        )
        private = gr.Checkbox(False, label="Upload dataset to a private repo?")
    # Uploading files is what triggers processing (see the event wiring below).
    upload_button = gr.UploadButton(
        "Load files to corpus",
        file_types=[
            "text",
        ],
        file_count="multiple",
    )
    # Receives the Markdown message returned by upload_file.
    summary = gr.Markdown()

    # Collapsed panel displaying the contents of the log file.
    with gr.Accordion("detailed logs", open=False):
        Log(log_file, dark=True, xterm_font_size=12)
    # Receives the DataFrame returned by upload_file.
    corpus_preview_df = gr.DataFrame()
    # Run upload_file when files are uploaded. The inputs map positionally to
    # its first five parameters; oauth_token is not listed here.
    # NOTE(review): presumably Gradio injects it based on the gr.OAuthToken
    # annotation on upload_file - confirm against the Gradio OAuth docs.
    upload_button.upload(
        upload_file,
        inputs=[upload_button, chunk_size, chunk_overlap, hub_id, private],
        outputs=[corpus_preview_df, summary],
    )
# Start the app at import/run time with debug mode enabled.
demo.launch(debug=True)