Spaces:

davanstrien
/

corpus-creator

Running

App Files Files Community

corpus-creator / app.py

davanstrien HF staff

make sentence splitting optional

eb008d8 5 months ago

raw

history blame

6.68 kB

	import logging
	from pathlib import Path

	import gradio as gr
	from datasets import Dataset
	from gradio_log import Log
	from huggingface_hub import DatasetCard
	from llama_index.core import SimpleDirectoryReader
	from llama_index.core.node_parser import SentenceSplitter
	from llama_index.core.schema import MetadataMode
	from tqdm.auto import tqdm

	# Path of the log file; the same name is handed to the log viewer in the UI.
	log_file = "logs.txt"
	# Create the file up front so it exists before anything tries to read it.
	Path(log_file).touch(exist_ok=True)

	# basicConfig() installs a FileHandler on the root logger only when the root
	# logger has no handlers yet. Adding a second FileHandler unconditionally
	# would then write every record to the file twice, so the explicit handler
	# is attached only if no handler targeting this file is present.
	logging.basicConfig(filename=log_file, level=logging.INFO)
	_root_logger = logging.getLogger()
	_log_path = Path(log_file).absolute()
	if not any(
	    isinstance(_handler, logging.FileHandler)
	    and Path(_handler.baseFilename) == _log_path
	    for _handler in _root_logger.handlers
	):
	    _root_logger.addHandler(logging.FileHandler(log_file))


	def load_corpus(
	    files, chunk_size=256, chunk_overlap=0, verbose=True, split_sentences=True
	):
	    """Load ``files`` and return a mapping of id -> text.

	    Args:
	        files: Paths passed to ``SimpleDirectoryReader(input_files=...)``.
	        chunk_size: Chunk size forwarded to ``split_corpus``.
	        chunk_overlap: Chunk overlap forwarded to ``split_corpus``.
	        verbose: When true, emit a ``gr.Info`` notification before loading
	            and forward the flag to ``split_corpus``.
	        split_sentences: When truthy, the loaded documents are chunked by
	            ``split_corpus``. When falsy, each loaded document is returned
	            unsplit, keyed by its ``id_``.

	    Returns:
	        dict mapping a document/node id to its text.
	    """
	    if verbose:
	        gr.Info("Loading files...")
	    docs = SimpleDirectoryReader(input_files=files).load_data()

	    # A single truthiness test covers every value: the previous
	    # `is False` / `if split_sentences` pair fell through and returned None
	    # for falsy values other than the literal False (e.g. None or 0).
	    if not split_sentences:
	        gr.Info(
	            "Skipping sentence splitting. Each file will be a single row in the dataset."
	        )
	        return {doc.id_: doc.text for doc in docs}
	    return split_corpus(verbose, docs, chunk_size, chunk_overlap)


	def split_corpus(verbose, docs, chunk_size, chunk_overlap):
	    """Chunk ``docs`` with a ``SentenceSplitter`` and return id -> text.

	    Args:
	        verbose: When true, print document/node counts and ask the splitter
	            to show its progress.
	        docs: Documents handed to ``get_nodes_from_documents``.
	        chunk_size: ``chunk_size`` given to ``SentenceSplitter.from_defaults``.
	        chunk_overlap: ``chunk_overlap`` given to
	            ``SentenceSplitter.from_defaults``.

	    Returns:
	        dict mapping each node's ``node_id`` to its content (requested with
	        ``MetadataMode.NONE``); entries with empty content are dropped.
	    """
	    if verbose:
	        print(f"Loaded {len(docs)} docs")

	    splitter = SentenceSplitter.from_defaults(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )
	    nodes = splitter.get_nodes_from_documents(docs, show_progress=verbose)

	    if verbose:
	        print(f"Parsed {len(nodes)} nodes")

	    # First collect every node's text, then filter, so a later node with the
	    # same id overrides an earlier one before the emptiness check is applied.
	    text_by_id = {}
	    for node in tqdm(nodes):
	        text_by_id[node.node_id] = node.get_content(
	            metadata_mode=MetadataMode.NONE
	        )

	    non_empty = {}
	    for node_id, text in text_by_id.items():
	        if text:
	            non_empty[node_id] = text
	    return non_empty


	def upload_file(
	    files,
	    chunk_size: int = 256,
	    chunk_overlap: int = 0,
	    hub_id: str = None,
	    private: bool = False,
	    split_sentences: bool = True,
	    oauth_token: gr.OAuthToken = None,
	):
	    """Build a dataset from uploaded files and optionally push it to the Hub.

	    Args:
	        files: Uploaded file objects; each one's ``.name`` is used as a path.
	        chunk_size: Chunk size forwarded to ``load_corpus``.
	        chunk_overlap: Chunk overlap forwarded to ``load_corpus``.
	        hub_id: Target dataset repo id. When empty/None nothing is uploaded.
	        private: Passed as ``private=`` to ``push_to_hub``.
	        split_sentences: Forwarded to ``load_corpus``.
	        oauth_token: Login token; its ``.token`` is used for the upload.
	            NOTE(review): the UI wiring does not list this among its inputs,
	            so it is presumably injected by gradio from the ``gr.OAuthToken``
	            annotation -- keep the annotation as is.

	    Returns:
	        Tuple of (dataset as a pandas DataFrame, markdown summary message).

	    Raises:
	        gr.Error: If ``hub_id`` is given but no login token is available.
	    """
	    # Fail fast: check the login before the (potentially slow) parsing so the
	    # user is not made to wait for work that cannot be uploaded anyway.
	    if hub_id and oauth_token is None:
	        raise gr.Error("Please login to Hugging Face Hub to push to hub")

	    print("loading files")
	    file_paths = [file.name for file in files]
	    print("parsing into sentences")
	    corpus = load_corpus(
	        file_paths,
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        split_sentences=split_sentences,
	    )
	    print("Creating dataset")
	    # Materialise the dict views so from_dict receives plain lists.
	    dataset = Dataset.from_dict(
	        {"ids": list(corpus.keys()), "texts": list(corpus.values())}
	    )
	    message = f"Dataset created has: \n - {len(dataset)} rows"

	    if hub_id:
	        gr.Info("Uploading to Hugging Face Hub")
	        dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
	        update_dataset_card(hub_id, oauth_token.token, chunk_size, chunk_overlap)
	        message += (
	            f"\n\nUploaded to [{hub_id}](https://huggingface.co/datasets/{hub_id})"
	        )

	    return dataset.to_pandas(), message


	def update_dataset_card(
	    hub_id,
	    token,
	    chunk_size,
	    chunk_overlap,
	):
	    """Add a default description and the ``corpus-creator`` tag to a dataset card.

	    Args:
	        hub_id: Dataset repo id whose card is loaded and pushed back.
	        token: Token used for both loading and pushing the card.
	        chunk_size: Chunk size reported in the template description.
	        chunk_overlap: Chunk overlap reported in the template description.

	    The template description is written only when the card has no text yet.
	    The tag is added only if it is not already present, so repeated uploads
	    to the same repo do not accumulate duplicate tags.
	    """
	    card = DatasetCard.load(hub_id, token=token)
	    if not card.text:
	        # add template description to card text
	        card.text = (
	            "This dataset was created using [Corpus Creator]"
	            "(https://huggingface.co/spaces/davanstrien/corpus-creator). "
	            "This dataset was created by parsing a corpus of text files into "
	            "chunks of sentences using Llama Index.\n"
	            f"This processing was done with a chunk size of {chunk_size} "
	            f"and a chunk overlap of {chunk_overlap}."
	        )
	    # `or []` also covers a card whose tags entry is present but empty/None.
	    tags = card.data.get("tags", []) or []
	    if "corpus-creator" not in tags:
	        tags.append("corpus-creator")
	    card.data["tags"] = tags
	    card.push_to_hub(hub_id, token=token)


	# Markdown introduction and usage notes for the app; passed to gr.Markdown
	# when the UI is built below.
	description = """
	Corpus Creator is a tool designed to help you easily convert a collection of text files into a dataset suitable for various natural language processing (NLP) tasks.
	In particular the app is focused on splitting texts into chunks of a specified size and overlap. This can be useful for preparing data for synthetic data generation, pipelines or annotation tasks.

	See an [example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created using this tool starting from a collection of plain text files.

	The resulting text chunks are stored in a dataset that can be previewed and uploaded to the Hugging Face Hub for easy sharing and access by the community.
	The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencesplitter#sentencesplitter) classes.

	### Usage:
	- Login: Start by logging in to your Hugging Face account using the provided login button.
	- Set Parameters: Customize the chunk size and overlap according to your requirements. If you want to split the text into chunks, check the 'Split sentences' box (on by default).
	- Upload Files: Use the upload button to load file(s) for processing.
	- Preview Dataset: View the created dataset in a dataframe format before uploading it to the Hugging Face Hub.
	- Upload to Hub: Optionally, specify the Hub ID and choose whether to make the dataset private before pushing it to the Hugging Face Hub."""

	# Build the Gradio UI: header, description, login, parameters, file upload,
	# summary, log viewer and dataset preview, then start the app.
	# NOTE(review): the exact nesting of the layout containers (Row / Column /
	# Accordion) follows the file's indentation -- confirm before restructuring.
	with gr.Blocks() as demo:
	gr.HTML(
	"""<h1 style='text-align: center;'> Corpus Creator</h1>
	<center><i> 📁 From random files to a Hugging Face dataset in a single step 📁 </i></center>"""
	)
	gr.Markdown(description)
	# Login button; upload_file takes a gr.OAuthToken parameter that is not
	# listed in `inputs` below -- presumably gradio supplies it after login
	# (TODO confirm against the gradio OAuth docs).
	with gr.Row():
	gr.LoginButton()
	with gr.Column():
	gr.Markdown(
	"To upload to the Hub, add an ID for where you want to push the dataset"
	)
	# Optional target repo id; upload_file skips the upload when this is empty.
	hub_id = gr.Textbox(value=None, label="Hub ID")
	# Chunking parameters forwarded to upload_file.
	with gr.Row():
	split_sentences = gr.Checkbox(True, label="Split sentences?")
	chunk_size = gr.Number(
	256,
	label="Chunk size (size to split text into)",
	minimum=10,
	maximum=4096,
	step=1,
	)
	chunk_overlap = gr.Number(
	0,
	label="Chunk overlap (overlap size between chunks)",
	minimum=0,
	maximum=4096,
	step=1,
	)
	private = gr.Checkbox(False, label="Upload dataset to a private repo?")
	# File picker; its `upload` event (wired below) triggers the processing.
	upload_button = gr.File(
	file_types=["text"], file_count="multiple", height=50, interactive=True
	)
	# Receives the markdown message returned by upload_file.
	summary = gr.Markdown()

	# Collapsed-by-default section showing the contents of the log file.
	with gr.Accordion("detailed logs", open=False):
	Log(log_file, dark=True, xterm_font_size=12)
	# Receives the DataFrame returned by upload_file.
	corpus_preview_df = gr.DataFrame()
	# The order of `inputs` matches upload_file's positional parameters
	# (files, chunk_size, chunk_overlap, hub_id, private, split_sentences).
	upload_button.upload(
	upload_file,
	inputs=[
	upload_button,
	chunk_size,
	chunk_overlap,
	hub_id,
	private,
	split_sentences,
	],
	outputs=[corpus_preview_df, summary],
	)
	# Runs at import time; there is no `if __name__ == "__main__"` guard.
	demo.launch(debug=True)