davanstrien committed
Commit eb008d8
Parent(s): c167550

make sentence splitting optional

Files changed (1): app.py (+30 -4)

app.py CHANGED
```diff
@@ -17,11 +17,23 @@ logging.basicConfig(filename="logs.txt", level=logging.INFO)
 logging.getLogger().addHandler(logging.FileHandler(log_file))
 
 
-def load_corpus(files, chunk_size=256, chunk_overlap=0, verbose=True):
+def load_corpus(
+    files, chunk_size=256, chunk_overlap=0, verbose=True, split_sentences=True
+):
     if verbose:
         gr.Info("Loading files...")
     reader = SimpleDirectoryReader(input_files=files)
     docs = reader.load_data()
+    if split_sentences is False:
+        gr.Info(
+            "Skipping sentence splitting. Each file will be a single row in the dataset."
+        )
+        return {doc.id_: doc.text for doc in docs}
+    if split_sentences:
+        return split_corpus(verbose, docs, chunk_size, chunk_overlap)
+
+
+def split_corpus(verbose, docs, chunk_size, chunk_overlap):
     if verbose:
         print(f"Loaded {len(docs)} docs")
 
```
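This hunk separates loading from chunking: when `split_sentences` is `False`, `load_corpus` returns each document whole as an `{id: text}` dict; otherwise it delegates to the new `split_corpus` helper. Only the helper's signature and opening lines appear in the diff. Going by the app's docstring (chunking is done with `Llama-index`'s `SentenceSplitter`), its body plausibly looks like the sketch below; the import path and exact calls are assumptions, not part of this commit.

```python
# Hedged sketch of split_corpus, inferred from the app's docstring; the
# real body is outside this hunk. The import path varies by llama-index
# version (older releases: `from llama_index.node_parser import SentenceSplitter`).
from llama_index.core.node_parser import SentenceSplitter


def split_corpus(verbose, docs, chunk_size, chunk_overlap):
    if verbose:
        print(f"Loaded {len(docs)} docs")
    parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)
    if verbose:
        print(f"Parsed {len(nodes)} nodes")
    # Same {id: text} shape that load_corpus returns when splitting is skipped
    return {node.id_: node.get_content() for node in nodes}
```

Either way the caller gets a dict keyed by document or node id, so downstream code never needs to know whether splitting happened. One small style note: since the `False` branch returns early, the second `if split_sentences:` guard could be a plain `else`, though the behavior is identical for boolean inputs.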
 
```diff
@@ -48,12 +60,18 @@ def upload_file(
     chunk_overlap: int = 0,
     hub_id: str = None,
     private: bool = False,
+    split_sentences: bool = True,
     oauth_token: gr.OAuthToken = None,
 ):
     print("loading files")
     file_paths = [file.name for file in files]
     print("parsing into sentences")
-    corpus = load_corpus(file_paths, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    corpus = load_corpus(
+        file_paths,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        split_sentences=split_sentences,
+    )
     print("Creating dataset")
     dataset = Dataset.from_dict({"ids": corpus.keys(), "texts": corpus.values()})
     message = f"Dataset created has: \n - {len(dataset)} rows"
```
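`upload_file` gains a matching `split_sentences` parameter and simply threads it through to `load_corpus`; the `Dataset.from_dict` call after it is untouched because both branches return the same dict shape. A self-contained toy run of that downstream step (sample data invented for illustration; `list(...)` is used here for clarity, while the app passes the dict views directly):

```python
# Toy example of how the {id: text} corpus becomes a Hub-ready dataset.
# The two-column shape matches the app's Dataset.from_dict call above.
from datasets import Dataset

corpus = {
    "node-1": "First chunk of text.",
    "node-2": "Second chunk of text.",
}
dataset = Dataset.from_dict(
    {"ids": list(corpus.keys()), "texts": list(corpus.values())}
)
print(dataset.num_rows)  # 2
```

With splitting on, the row count is the number of chunks; with it off, it is simply the number of uploaded files, as the `gr.Info` message in the previous hunk says.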
```diff
@@ -99,7 +117,7 @@ The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.lla
 
 ### Usage:
 - Login: Start by logging in to your Hugging Face account using the provided login button.
-- Set Parameters: Customize the chunk size and overlap according to your requirements.
+- Set Parameters: Customize the chunk size and overlap according to your requirements. If you want to split the text into chunks, check the 'Split sentences' box (on by default).
 - Upload Files: Use the upload button to load file(s) for processing.
 - Preview Dataset: View the created dataset in a dataframe format before uploading it to the Hugging Face Hub.
 - Upload to Hub: Optionally, specify the Hub ID and choose whether to make the dataset private before pushing it to the Hugging Face Hub."""
```
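The usage notes end with the push step. That code is outside this diff, but given the `hub_id`, `private`, and `oauth_token` parameters in `upload_file`, the upload presumably boils down to something like this hypothetical reconstruction, not the app's verbatim code:

```python
# Hypothetical sketch of the Hub push implied by hub_id/private/oauth_token;
# Dataset.push_to_hub is the standard `datasets` API for this step.
from datasets import Dataset


def push_corpus(dataset: Dataset, hub_id: str, private: bool, token: str):
    # Creates (or updates) the repo and uploads the data to the Hugging Face Hub
    dataset.push_to_hub(hub_id, private=private, token=token)
```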
```diff
@@ -118,6 +136,7 @@ with gr.Blocks() as demo:
     )
     hub_id = gr.Textbox(value=None, label="Hub ID")
     with gr.Row():
+        split_sentences = gr.Checkbox(True, label="Split sentences?")
         chunk_size = gr.Number(
             256,
             label="Chunk size (size to split text into)",
```
```diff
@@ -143,7 +162,14 @@
     corpus_preview_df = gr.DataFrame()
     upload_button.upload(
         upload_file,
-        inputs=[upload_button, chunk_size, chunk_overlap, hub_id, private],
+        inputs=[
+            upload_button,
+            chunk_size,
+            chunk_overlap,
+            hub_id,
+            private,
+            split_sentences,
+        ],
         outputs=[corpus_preview_df, summary],
     )
 demo.launch(debug=True)
```
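The last two hunks do the UI plumbing: a `Split sentences?` checkbox (checked by default, matching the function defaults) and an extra entry in `inputs`. Gradio passes component values to the handler positionally, so the `inputs` list must stay in the same order as `upload_file`'s parameters, with `oauth_token` supplied separately by the login flow. A stripped-down sketch of the pattern; the labels and toy handler here are illustrative, not the app's code:

```python
# Minimal wiring demo: the Checkbox value arrives as the handler's
# split_sentences argument because the inputs order mirrors the
# handler's parameter order.
import gradio as gr


def handler(files, chunk_size, chunk_overlap, hub_id, private, split_sentences):
    return f"{len(files)} file(s), split_sentences={split_sentences}"


with gr.Blocks() as demo:
    hub_id = gr.Textbox(value=None, label="Hub ID")
    with gr.Row():
        split_sentences = gr.Checkbox(True, label="Split sentences?")
        chunk_size = gr.Number(256, label="Chunk size")
        chunk_overlap = gr.Number(0, label="Chunk overlap")
        private = gr.Checkbox(False, label="Private dataset?")
    upload_button = gr.UploadButton("Upload files", file_count="multiple")
    summary = gr.Textbox(label="Summary")
    upload_button.upload(
        handler,
        inputs=[upload_button, chunk_size, chunk_overlap, hub_id, private, split_sentences],
        outputs=summary,
    )

demo.launch()
```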
 