import logging
from pathlib import Path

import gradio as gr
from datasets import Dataset
from gradio_log import Log
from huggingface_hub import DatasetCard
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode
from tqdm.auto import tqdm

# Write logs to a file so the gradio_log Log component can tail them in the UI.
log_file = "logs.txt"
Path(log_file).touch(exist_ok=True)
logging.basicConfig(filename=log_file, level=logging.INFO)

def load_corpus(
    files, chunk_size=256, chunk_overlap=0, verbose=True, split_sentences=True
):
    """Load text files and return a mapping of document/chunk id to text."""
    if verbose:
        gr.Info("Loading files...")
    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if not split_sentences:
        gr.Info(
            "Skipping sentence splitting. Each file will be a single row in the dataset."
        )
        return {doc.id_: doc.text for doc in docs}
    return split_corpus(verbose, docs, chunk_size, chunk_overlap)

def split_corpus(verbose, docs, chunk_size, chunk_overlap):
    """Split documents into sentence-aware chunks and return a mapping of node id to text."""
    if verbose:
        print(f"Loaded {len(docs)} docs")
    parser = SentenceSplitter.from_defaults(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)
    if verbose:
        print(f"Parsed {len(nodes)} nodes")
    docs = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in tqdm(nodes)
    }
    # Remove empty chunks
    docs = {k: v for k, v in docs.items() if v}
    return docs
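
# Illustrative only (hypothetical file paths, not executed by the Space): chunk two
# local text files into pieces of up to ~256 tokens and inspect the result.
#
#   corpus = load_corpus(["notes/a.txt", "notes/b.txt"], chunk_size=256, chunk_overlap=16)
#   print(f"{len(corpus)} chunks")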

def upload_file(
    files,
    chunk_size: int = 256,
    chunk_overlap: int = 0,
    hub_id: str = None,
    private: bool = False,
    split_sentences: bool = True,
    oauth_token: gr.OAuthToken = None,
):
    print("loading files")
    file_paths = [file.name for file in files]
    print("parsing into sentences")
    corpus = load_corpus(
        file_paths,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        split_sentences=split_sentences,
    )
    print("Creating dataset")
    # Materialize keys/values as lists so Arrow can build the columns
    dataset = Dataset.from_dict(
        {"ids": list(corpus.keys()), "texts": list(corpus.values())}
    )
    message = f"Dataset created with:\n - {len(dataset)} rows"
    if hub_id:
        if oauth_token is not None:
            gr.Info("Uploading to Hugging Face Hub")
            dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
            update_dataset_card(hub_id, oauth_token.token, chunk_size, chunk_overlap)
            message += (
                f"\n\nUploaded to [{hub_id}](https://huggingface.co/datasets/{hub_id})"
            )
        else:
            raise gr.Error("Please login to Hugging Face Hub to push to hub")
    return dataset.to_pandas(), message
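
# Illustrative only (hypothetical, not executed): upload_file expects objects with a
# `.name` attribute (as provided by gr.File) and returns (pandas.DataFrame, markdown str).
#
#   from types import SimpleNamespace
#   fake_files = [SimpleNamespace(name="notes/a.txt")]
#   df, msg = upload_file(fake_files, chunk_size=256, chunk_overlap=0)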

def update_dataset_card(
    hub_id,
    token,
    chunk_size,
    chunk_overlap,
):
    card = DatasetCard.load(hub_id, token=token)
    if not card.text:
        # Add a template description to an otherwise empty card
        card.text += f"""This dataset was created with [Corpus Creator](https://huggingface.co/spaces/davanstrien/corpus-creator) by parsing a corpus of text files into chunks of sentences using Llama Index.
This processing was done with a chunk size of {chunk_size} and a chunk overlap of {chunk_overlap}."""
    tags = card.data.get("tags", [])
    tags.append("corpus-creator")
    card.data["tags"] = tags
    card.push_to_hub(hub_id, token=token)
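
# The pushed dataset card then carries YAML front matter roughly like (illustrative):
#
#   tags:
#   - corpus-creator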
description = """ | |
Corpus Creator is a tool designed to help you easily convert a collection of text files into a dataset suitable for various natural language processing (NLP) tasks. | |
In particular the app is focused on splitting texts into chunks of a specified size and overlap. This can be useful for preparing data for synthetic data generation, pipelines or annotation tasks. | |
See an [example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created using this tool starting from a collection of plain text files. | |
The resulting text chunks are stored in a dataset that can be previewed and uploaded to the Hugging Face Hub for easy sharing and access by the community. | |
The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencesplitter#sentencesplitter) classes. | |
### Usage: | |
- Login: Start by logging in to your Hugging Face account using the provided login button. | |
- Set Parameters: Customize the chunk size and overlap according to your requirements. If you want to split the text into chunks, check the 'Split sentences' box (on by default). | |
- Upload Files: Use the upload button to load file(s) for processing. | |
- Preview Dataset: View the created dataset in a dataframe format before uploading it to the Hugging Face Hub. | |
- Upload to Hub: Optionally, specify the Hub ID and choose whether to make the dataset private before pushing it to the Hugging Face Hub.""" | |

with gr.Blocks() as demo:
    gr.HTML(
        """<h1 style='text-align: center;'> Corpus Creator</h1>
<center><i> 📁 From random files to a Hugging Face dataset in a single step 📁 </i></center>"""
    )
    gr.Markdown(description)
    with gr.Row():
        gr.LoginButton()
    with gr.Column():
        gr.Markdown(
            "To upload to the Hub, add an ID for where you want to push the dataset"
        )
        hub_id = gr.Textbox(value=None, label="Hub ID")
    with gr.Row():
        split_sentences = gr.Checkbox(True, label="Split sentences?")
        chunk_size = gr.Number(
            256,
            label="Chunk size (size to split text into)",
            minimum=10,
            maximum=4096,
            step=1,
        )
        chunk_overlap = gr.Number(
            0,
            label="Chunk overlap (overlap size between chunks)",
            minimum=0,
            maximum=4096,
            step=1,
        )
    private = gr.Checkbox(False, label="Upload dataset to a private repo?")
    upload_button = gr.File(
        file_types=["text"], file_count="multiple", height=50, interactive=True
    )
    summary = gr.Markdown()
    with gr.Accordion("detailed logs", open=False):
        Log(log_file, dark=True, xterm_font_size=12)
    corpus_preview_df = gr.DataFrame()
    upload_button.upload(
        upload_file,
        inputs=[
            upload_button,
            chunk_size,
            chunk_overlap,
            hub_id,
            private,
            split_sentences,
        ],
        outputs=[corpus_preview_df, summary],
    )

demo.launch(debug=True)
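
# To consume a dataset pushed from this Space elsewhere (hypothetical repo id shown),
# the columns are "ids" and "texts", matching Dataset.from_dict above:
#
#   from datasets import load_dataset
#   ds = load_dataset("your-username/your-corpus", split="train")
#   print(ds[0]["ids"], ds[0]["texts"][:80])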