Quentin Gallouédec committed on
Commit
ccba23d
·
1 Parent(s): fc133fb
Files changed (1) hide show
  1. app.py +16 -16
app.py CHANGED
@@ -7,8 +7,10 @@ import pandas as pd
7
  from datasets import Dataset
8
  from pypdf import PdfReader
9
  from huggingface_hub import HfApi
 
10
  # import template
11
  from string import Template
 
12
  to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
13
  to_be_replaced = {
14
  "½": "1/2",
@@ -37,8 +39,6 @@ to_be_replaced = {
37
  }
38
 
39
 
40
-
41
-
42
  def clean(text):
43
  # Remove all the unwanted characters
44
  for char in to_be_removed:
@@ -113,32 +113,32 @@ caution_text = """⚠️ Caution:
113
  - Anyone (including you) will be able to delete the dataset once it is uploaded.
114
  """
115
 
116
- instructions_template = Template("""
117
- Your dataset is now available on Hugging Face Datasets at [pdf2dataset/$dataset_name](https://huggingface.co/datasets/pdf2dataset/$dataset_name).
118
-
119
- You can load the dataset using the following code:
120
 
121
  ```python
122
  from datasets import load_dataset
123
 
124
  dataset = load_dataset("pdf2dataset/$dataset_name")
125
  ```
126
- """)
 
127
 
128
  with gr.Blocks() as demo:
129
- # Convert a PDF to a dataset
130
- gr.Markdown("## Convert a PDF to a dataset")
131
  file = gr.File(file_types=["pdf"], height=50)
132
  gr.Markdown(caution_text)
133
- convert_button = gr.Button("🔄 Convert and upload")
134
- instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
135
  preview = gr.Dataframe(label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True, height=200)
136
-
137
- # Delete a dataset
138
- gr.Markdown("### Delete a dataset")
139
- dataset_name_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete")
140
  delete_button = gr.Button("🗑️ Delete dataset")
141
-
142
  # Define the actions
143
  convert_button.click(pdf2dataset, inputs=[file], outputs=[instructions, preview, dataset_name_to_delete])
144
  delete_button.click(delete_dataset, inputs=[dataset_name_to_delete], outputs=[delete_button])
 
7
  from datasets import Dataset
8
  from pypdf import PdfReader
9
  from huggingface_hub import HfApi
10
+
11
  # import template
12
  from string import Template
13
+
14
  to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
15
  to_be_replaced = {
16
  "½": "1/2",
 
39
  }
40
 
41
 
 
 
42
  def clean(text):
43
  # Remove all the unwanted characters
44
  for char in to_be_removed:
 
113
  - Anyone (including you) will be able to delete the dataset once it is uploaded.
114
  """
115
 
116
+ instructions_template = Template(
117
+ """
118
+ 🔗: https://huggingface.co/datasets/pdf2dataset/$dataset_name.
 
119
 
120
  ```python
121
  from datasets import load_dataset
122
 
123
  dataset = load_dataset("pdf2dataset/$dataset_name")
124
  ```
125
+ """
126
+ )
127
 
128
  with gr.Blocks() as demo:
129
+ gr.Markdown("# PDF to 🤗 Dataset")
130
+ gr.Markdown("## 1️⃣ Upload a PDF")
131
  file = gr.File(file_types=["pdf"], height=50)
132
  gr.Markdown(caution_text)
133
+ gr.Markdown("## 2️⃣ Convert the PDF and upload")
134
+ convert_button = gr.Button("🔄 Convert and upload")
135
  preview = gr.Dataframe(label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True, height=200)
136
+ gr.Markdown("## 3️⃣ Use the dataset in your code")
137
+ instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
138
+ gr.Markdown("## 4️⃣ Delete the (optional)")
139
+ dataset_name_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete", label="Dataset to delete")
140
  delete_button = gr.Button("🗑️ Delete dataset")
141
+
142
  # Define the actions
143
  convert_button.click(pdf2dataset, inputs=[file], outputs=[instructions, preview, dataset_name_to_delete])
144
  delete_button.click(delete_dataset, inputs=[dataset_name_to_delete], outputs=[delete_button])