docs2datasets

Sleeping

App Files Files Community

davanstrien HF Staff committed on Jun 17, 2024

Commit

739cf2e

verified ·

0 Parent(s):

clean

Browse files

Files changed (5) hide show

.gitattributes +35 -0
README.md +17 -0
app.py +149 -0
requirements.in +4 -0
requirements.txt +438 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,17 @@

+---
+title: Corpus Creator
+emoji: 🦀
+colorFrom: pink
+colorTo: gray
+sdk: gradio
+sdk_version: 4.36.1
+app_file: app.py
+pinned: false
+hf_oauth_scopes:
+  - read-repos
+  - write-repos
+  - manage-repos
+hf_oauth: true
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import logging
+from functools import lru_cache
+from pathlib import Path
+import gradio as gr
+from datasets import Dataset
+from gradio_log import Log
+from huggingface_hub import DatasetCard
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core.schema import MetadataMode
+from tqdm.auto import tqdm
+log_file = "logs.txt"
+Path(log_file).touch(exist_ok=True)
+logging.basicConfig(filename="logs.txt", level=logging.INFO)
+logging.getLogger().addHandler(logging.FileHandler(log_file))
+def load_corpus(files, chunk_size=256, chunk_overlap=0, verbose=True):
+    if verbose:
+        gr.Info("Loading files...")
+    reader = SimpleDirectoryReader(input_files=files)
+    docs = reader.load_data()
+    if verbose:
+        print(f"Loaded {len(docs)} docs")
+    parser = SentenceSplitter.from_defaults(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)
+    if verbose:
+        print(f"Parsed {len(nodes)} nodes")
+    docs = {
+        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
+        for node in tqdm(nodes)
+    }
+    # remove empty docs
+    docs = {k: v for k, v in docs.items() if v}
+    return docs
+def upload_file(
+    files,
+    chunk_size: int = 256,
+    chunk_overlap: int = 0,
+    hub_id: str = None,
+    private: bool = False,
+    oauth_token: gr.OAuthToken = None,
+):
+    print("loading files")
+    file_paths = [file.name for file in files]
+    print("parsing into sentences")
+    corpus = load_corpus(file_paths, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    print("Creating dataset")
+    dataset = Dataset.from_dict({"ids": corpus.keys(), "texts": corpus.values()})
+    message = f"Dataset created has: \n - {len(dataset)} rows"
+    if hub_id:
+        if oauth_token is not None:
+            gr.Info("Uploading to Hugging Face Hub")
+            dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
+            update_dataset_card(hub_id, oauth_token.token, chunk_size, chunk_overlap)
+            message += f"\n\nUploaded to [{hub_id}](https://huggingface.co/{hub_id}"
+        else:
+            raise gr.Error("Please login to Hugging Face Hub to push to hub")
+    return dataset.to_pandas(), message
+def update_dataset_card(
+    hub_id,
+    token,
+    chunk_size,
+    chunk_overlap,
+):
+    card = DatasetCard.load(hub_id, token=token)
+    if not card.text:
+        # add template description to card text
+        card.text += f"""This dataset was created using [Corpus Creator](https://huggingface.co/spaces/davanstrien/corpus-creator). This dataset was created by parsing a corpus of text files into chunks of sentences using Llama Index.
+        This processing was done with a chunk size of {chunk_size} and a chunk overlap of {chunk_overlap}."""
+        tags = card.data.get("tags", [])
+        tags.append("corpus-creator")
+        card.data["tags"] = tags
+        card.push_to_hub(hub_id, token=token)
+# Markdown shown at the top of the app, explaining what it does and how to use it.
+description = """
+Corpus Creator is a tool designed to help you easily convert a collection of text files into a dataset suitable for various natural language processing (NLP) tasks.
+In particular the app is focused on splitting texts into chunks of a specified size and overlap. This can be useful for preparing data for synthetic data generation, pipelines or annotation tasks.
+The resulting text chunks are stored in a dataset that can be previewed and uploaded to the Hugging Face Hub for easy sharing and access by the community.
+The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencesplitter#sentencesplitter) classes.
+### Usage:
+- Login: Start by logging in to your Hugging Face account using the provided login button.
+- Set Parameters: Customize the chunk size and overlap according to your requirements.
+- Upload Files: Use the upload button to load file(s) for processing.
+- Preview Dataset: View the created dataset in a dataframe format before uploading it to the Hugging Face Hub.
+- Upload to Hub: Optionally, specify the Hub ID and choose whether to make the dataset private before pushing it to the Hugging Face Hub."""
+# Build the UI: header, description, login + Hub ID row, chunking parameters,
+# upload button, summary text, log viewer and dataset preview.
+with gr.Blocks() as demo:
+    gr.HTML(
+        """<h1 style='text-align: center;'> Corpus Creator</h1>
+        <center><i> &#128193; From random files to a Hugging Face dataset in a single step &#128193; </i></center>"""
+    )
+    gr.Markdown(description)
+    # Login button next to the (optional) target repo id.
+    with gr.Row():
+        gr.LoginButton()
+        with gr.Column():
+            gr.Markdown(
+                "To upload to the Hub, add an ID for where you want to push the dataset"
+            )
+            hub_id = gr.Textbox(value=None, label="Hub ID")
+    # Chunking parameters and repo visibility; defaults match upload_file's defaults.
+    with gr.Row():
+        chunk_size = gr.Number(
+            256,
+            label="Chunk size (size to split text into)",
+            minimum=10,
+            maximum=4096,
+            step=1,
+        )
+        chunk_overlap = gr.Number(
+            0,
+            label="Chunk overlap (overlap size between chunks)",
+            minimum=0,
+            maximum=4096,
+            step=1,
+        )
+        private = gr.Checkbox(False, label="Upload dataset to a private repo?")
+    upload_button = gr.UploadButton(
+        "Load files to corpus",
+        file_types=[
+            "text",
+        ],
+        file_count="multiple",
+    )
+    # Receives the Markdown message returned by upload_file.
+    summary = gr.Markdown()
+    # Collapsed panel displaying the contents of log_file.
+    with gr.Accordion("detailed logs", open=False):
+        Log(log_file, dark=True, xterm_font_size=12)
+    # Receives the DataFrame returned by upload_file.
+    corpus_preview_df = gr.DataFrame()
+    # Run upload_file when files are uploaded. oauth_token is not listed in
+    # `inputs`; presumably Gradio supplies it from the gr.OAuthToken type hint
+    # on upload_file — confirm against the Gradio OAuth docs.
+    upload_button.upload(
+        upload_file,
+        inputs=[upload_button, chunk_size, chunk_overlap, hub_id, private],
+        outputs=[corpus_preview_df, summary],
+    )
+# Start the app at import/run time with debug mode enabled.
+demo.launch(debug=True)

requirements.in ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio[oauth]
+llama_index
+gradio_log
+datasets

requirements.txt ADDED Viewed

	@@ -0,0 +1,438 @@

+# This file was autogenerated by uv via the following command:
+#    uv pip compile requirements.in -o requirements.txt
+aiofiles==23.2.1
+    # via gradio
+aiohttp==3.9.5
+    # via
+    #   datasets
+    #   fsspec
+    #   llama-index-core
+    #   llama-index-legacy
+aiosignal==1.3.1
+    # via aiohttp
+altair==5.3.0
+    # via gradio
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.4.0
+    # via
+    #   httpx
+    #   openai
+    #   starlette
+    #   watchfiles
+attrs==23.2.0
+    # via
+    #   aiohttp
+    #   jsonschema
+    #   referencing
+authlib==1.3.1
+    # via gradio
+beautifulsoup4==4.12.3
+    # via llama-index-readers-file
+certifi==2024.6.2
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==1.16.0
+    # via cryptography
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via
+    #   nltk
+    #   typer
+    #   uvicorn
+contourpy==1.2.1
+    # via matplotlib
+cryptography==42.0.8
+    # via authlib
+cycler==0.12.1
+    # via matplotlib
+dataclasses-json==0.6.7
+    # via
+    #   llama-index-core
+    #   llama-index-legacy
+datasets==2.20.0
+    # via -r requirements.in
+deprecated==1.2.14
+    # via
+    #   llama-index-core
+    #   llama-index-legacy
+dill==0.3.8
+    # via
+    #   datasets
+    #   multiprocess
+dirtyjson==1.0.8
+    # via
+    #   llama-index-core
+    #   llama-index-legacy
+distro==1.9.0
+    # via openai
+dnspython==2.6.1
+    # via email-validator
+email-validator==2.1.2
+    # via fastapi
+fastapi==0.111.0
+    # via gradio
+fastapi-cli==0.0.4
+    # via fastapi
+ffmpy==0.3.2
+    # via gradio
+filelock==3.15.1
+    # via
+    #   datasets
+    #   huggingface-hub
+fonttools==4.53.0
+    # via matplotlib
+frozenlist==1.4.1
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2024.5.0
+    # via
+    #   datasets
+    #   gradio-client
+    #   huggingface-hub
+    #   llama-index-core
+    #   llama-index-legacy
+gradio==4.36.1
+    # via
+    #   -r requirements.in
+    #   gradio-log
+gradio-client==1.0.1
+    # via gradio
+gradio-log==0.0.4
+    # via -r requirements.in
+greenlet==3.0.3
+    # via sqlalchemy
+h11==0.14.0
+    # via
+    #   httpcore
+    #   uvicorn
+httpcore==1.0.5
+    # via httpx
+httptools==0.6.1
+    # via uvicorn
+httpx==0.27.0
+    # via
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   llama-index-core
+    #   llama-index-legacy
+    #   llamaindex-py-client
+    #   openai
+huggingface-hub==0.23.4
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+idna==3.7
+    # via
+    #   anyio
+    #   email-validator
+    #   httpx
+    #   requests
+    #   yarl
+importlib-resources==6.4.0
+    # via gradio
+itsdangerous==2.2.0
+    # via gradio
+jinja2==3.1.4
+    # via
+    #   altair
+    #   fastapi
+    #   gradio
+joblib==1.4.2
+    # via nltk
+jsonschema==4.22.0
+    # via altair
+jsonschema-specifications==2023.12.1
+    # via jsonschema
+kiwisolver==1.4.5
+    # via matplotlib
+llama-index==0.10.45
+    # via -r requirements.in
+llama-index-agent-openai==0.2.7
+    # via
+    #   llama-index
+    #   llama-index-program-openai
+llama-index-cli==0.1.12
+    # via llama-index
+llama-index-core==0.10.44
+    # via
+    #   llama-index
+    #   llama-index-agent-openai
+    #   llama-index-cli
+    #   llama-index-embeddings-openai
+    #   llama-index-indices-managed-llama-cloud
+    #   llama-index-llms-openai
+    #   llama-index-multi-modal-llms-openai
+    #   llama-index-program-openai
+    #   llama-index-question-gen-openai
+    #   llama-index-readers-file
+    #   llama-index-readers-llama-parse
+    #   llama-parse
+llama-index-embeddings-openai==0.1.10
+    # via
+    #   llama-index
+    #   llama-index-cli
+llama-index-indices-managed-llama-cloud==0.1.6
+    # via llama-index
+llama-index-legacy==0.9.48
+    # via llama-index
+llama-index-llms-openai==0.1.22
+    # via
+    #   llama-index
+    #   llama-index-agent-openai
+    #   llama-index-cli
+    #   llama-index-multi-modal-llms-openai
+    #   llama-index-program-openai
+    #   llama-index-question-gen-openai
+llama-index-multi-modal-llms-openai==0.1.6
+    # via llama-index
+llama-index-program-openai==0.1.6
+    # via
+    #   llama-index
+    #   llama-index-question-gen-openai
+llama-index-question-gen-openai==0.1.3
+    # via llama-index
+llama-index-readers-file==0.1.25
+    # via llama-index
+llama-index-readers-llama-parse==0.1.4
+    # via llama-index
+llama-parse==0.4.4
+    # via llama-index-readers-llama-parse
+llamaindex-py-client==0.1.19
+    # via
+    #   llama-index-core
+    #   llama-index-indices-managed-llama-cloud
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via
+    #   gradio
+    #   jinja2
+marshmallow==3.21.3
+    # via dataclasses-json
+matplotlib==3.9.0
+    # via gradio
+mdurl==0.1.2
+    # via markdown-it-py
+multidict==6.0.5
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.16
+    # via datasets
+mypy-extensions==1.0.0
+    # via typing-inspect
+nest-asyncio==1.6.0
+    # via
+    #   llama-index-core
+    #   llama-index-legacy
+networkx==3.3
+    # via
+    #   llama-index-core
+    #   llama-index-legacy
+nltk==3.8.1
+    # via
+    #   llama-index-core
+    #   llama-index-legacy
+numpy==2.0.0
+    # via
+    #   altair
+    #   contourpy
+    #   datasets
+    #   gradio
+    #   llama-index-core
+    #   llama-index-legacy
+    #   matplotlib
+    #   pandas
+    #   pyarrow
+openai==1.34.0
+    # via
+    #   llama-index-agent-openai
+    #   llama-index-core
+    #   llama-index-legacy
+orjson==3.10.5
+    # via
+    #   fastapi
+    #   gradio
+packaging==24.1
+    # via
+    #   altair
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   marshmallow
+    #   matplotlib
+pandas==2.2.2
+    # via
+    #   altair
+    #   datasets
+    #   gradio
+    #   llama-index-core
+    #   llama-index-legacy
+pillow==10.3.0
+    # via
+    #   gradio
+    #   llama-index-core
+    #   matplotlib
+pyarrow==16.1.0
+    # via datasets
+pyarrow-hotfix==0.6
+    # via datasets
+pycparser==2.22
+    # via cffi
+pydantic==2.7.4
+    # via
+    #   fastapi
+    #   gradio
+    #   llamaindex-py-client
+    #   openai
+pydantic-core==2.18.4
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pygments==2.18.0
+    # via rich
+pyparsing==3.1.2
+    # via matplotlib
+pypdf==4.2.0
+    # via llama-index-readers-file
+python-dateutil==2.9.0.post0
+    # via
+    #   matplotlib
+    #   pandas
+python-dotenv==1.0.1
+    # via uvicorn
+python-multipart==0.0.9
+    # via
+    #   fastapi
+    #   gradio
+pytz==2024.1
+    # via pandas
+pyyaml==6.0.1
+    # via
+    #   datasets
+    #   gradio
+    #   huggingface-hub
+    #   llama-index-core
+    #   uvicorn
+referencing==0.35.1
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+regex==2024.5.15
+    # via
+    #   nltk
+    #   tiktoken
+requests==2.32.3
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   llama-index-core
+    #   llama-index-legacy
+    #   tiktoken
+rich==13.7.1
+    # via typer
+rpds-py==0.18.1
+    # via
+    #   jsonschema
+    #   referencing
+ruff==0.4.9
+    # via gradio
+semantic-version==2.10.0
+    # via gradio
+shellingham==1.5.4
+    # via typer
+six==1.16.0
+    # via python-dateutil
+sniffio==1.3.1
+    # via
+    #   anyio
+    #   httpx
+    #   openai
+soupsieve==2.5
+    # via beautifulsoup4
+sqlalchemy==2.0.30
+    # via
+    #   llama-index-core
+    #   llama-index-legacy
+starlette==0.37.2
+    # via fastapi
+striprtf==0.0.26
+    # via llama-index-readers-file
+tenacity==8.4.1
+    # via
+    #   llama-index-core
+    #   llama-index-legacy
+tiktoken==0.7.0
+    # via
+    #   llama-index-core
+    #   llama-index-legacy
+tomlkit==0.12.0
+    # via gradio
+toolz==0.12.1
+    # via altair
+tqdm==4.66.4
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   llama-index-core
+    #   nltk
+    #   openai
+typer==0.12.3
+    # via
+    #   fastapi-cli
+    #   gradio
+typing-extensions==4.12.2
+    # via
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   llama-index-core
+    #   llama-index-legacy
+    #   openai
+    #   pydantic
+    #   pydantic-core
+    #   sqlalchemy
+    #   typer
+    #   typing-inspect
+typing-inspect==0.9.0
+    # via
+    #   dataclasses-json
+    #   llama-index-core
+    #   llama-index-legacy
+tzdata==2024.1
+    # via pandas
+ujson==5.10.0
+    # via fastapi
+urllib3==2.2.2
+    # via
+    #   gradio
+    #   requests
+uvicorn==0.30.1
+    # via
+    #   fastapi
+    #   gradio
+uvloop==0.19.0
+    # via uvicorn
+watchfiles==0.22.0
+    # via uvicorn
+websockets==11.0.3
+    # via
+    #   gradio-client
+    #   uvicorn
+wrapt==1.16.0
+    # via
+    #   deprecated
+    #   llama-index-core
+xxhash==3.4.1
+    # via datasets
+yarl==1.9.4
+    # via aiohttp