davanstrien committed
Commit eb008d8
Parent(s): c167550

make sentence splitting optional

Files changed (1): app.py (+30 -4)

app.py CHANGED
```diff
@@ -17,11 +17,23 @@ logging.basicConfig(filename="logs.txt", level=logging.INFO)
 logging.getLogger().addHandler(logging.FileHandler(log_file))
 
 
-def load_corpus(files, chunk_size=256, chunk_overlap=0, verbose=True):
+def load_corpus(
+    files, chunk_size=256, chunk_overlap=0, verbose=True, split_sentences=True
+):
     if verbose:
         gr.Info("Loading files...")
     reader = SimpleDirectoryReader(input_files=files)
     docs = reader.load_data()
+    if split_sentences is False:
+        gr.Info(
+            "Skipping sentence splitting. Each file will be a single row in the dataset."
+        )
+        return {doc.id_: doc.text for doc in docs}
+    if split_sentences:
+        return split_corpus(verbose, docs, chunk_size, chunk_overlap)
+
+
+def split_corpus(verbose, docs, chunk_size, chunk_overlap):
     if verbose:
         print(f"Loaded {len(docs)} docs")
 
```
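This hunk separates loading from chunking: when `split_sentences` is `False`, `load_corpus` returns each document whole as an `{id: text}` dict; otherwise it delegates to the new `split_corpus` helper. Only the helper's signature and opening lines appear in the diff. Going by the app's docstring (chunking is done with `Llama-index`'s `SentenceSplitter`), its body plausibly looks like the sketch below; the import path and exact calls are assumptions, not part of this commit.

```python
# Hedged sketch of split_corpus, inferred from the app's docstring; the
# real body is outside this hunk. The import path varies by llama-index
# version (older releases: `from llama_index.node_parser import SentenceSplitter`).
from llama_index.core.node_parser import SentenceSplitter


def split_corpus(verbose, docs, chunk_size, chunk_overlap):
    if verbose:
        print(f"Loaded {len(docs)} docs")
    parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)
    if verbose:
        print(f"Parsed {len(nodes)} nodes")
    # Same {id: text} shape that load_corpus returns when splitting is skipped
    return {node.id_: node.get_content() for node in nodes}
```

Either way the caller gets a dict keyed by document or node id, so downstream code never needs to know whether splitting happened. One small style note: since the `False` branch returns early, the second `if split_sentences:` guard could be a plain `else`, though the behavior is identical for boolean inputs.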
 
```diff
@@ -48,12 +60,18 @@ def upload_file(
     chunk_overlap: int = 0,
     hub_id: str = None,
     private: bool = False,
+    split_sentences: bool = True,
     oauth_token: gr.OAuthToken = None,
 ):
     print("loading files")
     file_paths = [file.name for file in files]
     print("parsing into sentences")
-    corpus = load_corpus(file_paths, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    corpus = load_corpus(
+        file_paths,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        split_sentences=split_sentences,
+    )
     print("Creating dataset")
     dataset = Dataset.from_dict({"ids": corpus.keys(), "texts": corpus.values()})
     message = f"Dataset created has: \n - {len(dataset)} rows"
```
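`upload_file` gains a matching `split_sentences` parameter and simply threads it through to `load_corpus`; the `Dataset.from_dict` call after it is untouched because both branches return the same dict shape. A self-contained toy run of that downstream step (sample data invented for illustration; `list(...)` is used here for clarity, while the app passes the dict views directly):

```python
# Toy example of how the {id: text} corpus becomes a Hub-ready dataset.
# The two-column shape matches the app's Dataset.from_dict call above.
from datasets import Dataset

corpus = {
    "node-1": "First chunk of text.",
    "node-2": "Second chunk of text.",
}
dataset = Dataset.from_dict(
    {"ids": list(corpus.keys()), "texts": list(corpus.values())}
)
print(dataset.num_rows)  # 2
```

With splitting on, the row count is the number of chunks; with it off, it is simply the number of uploaded files, as the `gr.Info` message in the previous hunk says.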
```diff
@@ -99,7 +117,7 @@ The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.lla
 
 ### Usage:
 - Login: Start by logging in to your Hugging Face account using the provided login button.
-- Set Parameters: Customize the chunk size and overlap according to your requirements.
+- Set Parameters: Customize the chunk size and overlap according to your requirements. If you want to split the text into chunks, check the 'Split sentences' box (on by default).
 - Upload Files: Use the upload button to load file(s) for processing.
 - Preview Dataset: View the created dataset in a dataframe format before uploading it to the Hugging Face Hub.
 - Upload to Hub: Optionally, specify the Hub ID and choose whether to make the dataset private before pushing it to the Hugging Face Hub."""
```
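The usage notes end with the push step. That code is outside this diff, but given the `hub_id`, `private`, and `oauth_token` parameters in `upload_file`, the upload presumably boils down to something like this hypothetical reconstruction, not the app's verbatim code:

```python
# Hypothetical sketch of the Hub push implied by hub_id/private/oauth_token;
# Dataset.push_to_hub is the standard `datasets` API for this step.
from datasets import Dataset


def push_corpus(dataset: Dataset, hub_id: str, private: bool, token: str):
    # Creates (or updates) the repo and uploads the data to the Hugging Face Hub
    dataset.push_to_hub(hub_id, private=private, token=token)
```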
```diff
@@ -118,6 +136,7 @@ with gr.Blocks() as demo:
     )
     hub_id = gr.Textbox(value=None, label="Hub ID")
     with gr.Row():
+        split_sentences = gr.Checkbox(True, label="Split sentences?")
         chunk_size = gr.Number(
             256,
             label="Chunk size (size to split text into)",
```
```diff
@@ -143,7 +162,14 @@
     corpus_preview_df = gr.DataFrame()
     upload_button.upload(
         upload_file,
-        inputs=[upload_button, chunk_size, chunk_overlap, hub_id, private],
+        inputs=[
+            upload_button,
+            chunk_size,
+            chunk_overlap,
+            hub_id,
+            private,
+            split_sentences,
+        ],
         outputs=[corpus_preview_df, summary],
     )
 demo.launch(debug=True)
```
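The last two hunks do the UI plumbing: a `Split sentences?` checkbox (checked by default, matching the function defaults) and an extra entry in `inputs`. Gradio passes component values to the handler positionally, so the `inputs` list must stay in the same order as `upload_file`'s parameters, with `oauth_token` supplied separately by the login flow. A stripped-down sketch of the pattern; the labels and toy handler here are illustrative, not the app's code:

```python
# Minimal wiring demo: the Checkbox value arrives as the handler's
# split_sentences argument because the inputs order mirrors the
# handler's parameter order.
import gradio as gr


def handler(files, chunk_size, chunk_overlap, hub_id, private, split_sentences):
    return f"{len(files)} file(s), split_sentences={split_sentences}"


with gr.Blocks() as demo:
    hub_id = gr.Textbox(value=None, label="Hub ID")
    with gr.Row():
        split_sentences = gr.Checkbox(True, label="Split sentences?")
        chunk_size = gr.Number(256, label="Chunk size")
        chunk_overlap = gr.Number(0, label="Chunk overlap")
        private = gr.Checkbox(False, label="Private dataset?")
    upload_button = gr.UploadButton("Upload files", file_count="multiple")
    summary = gr.Textbox(label="Summary")
    upload_button.upload(
        handler,
        inputs=[upload_button, chunk_size, chunk_overlap, hub_id, private, split_sentences],
        outputs=summary,
    )

demo.launch()
```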
 