Spaces:
Sleeping
Sleeping
Quentin Gallouédec
committed on
Commit
·
ccba23d
1
Parent(s):
fc133fb
app.py
Browse files
app.py
CHANGED
@@ -7,8 +7,10 @@ import pandas as pd
|
|
7 |
from datasets import Dataset
|
8 |
from pypdf import PdfReader
|
9 |
from huggingface_hub import HfApi
|
|
|
10 |
# import template
|
11 |
from string import Template
|
|
|
12 |
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
|
13 |
to_be_replaced = {
|
14 |
"½": "1/2",
|
@@ -37,8 +39,6 @@ to_be_replaced = {
|
|
37 |
}
|
38 |
|
39 |
|
40 |
-
|
41 |
-
|
42 |
def clean(text):
|
43 |
# Remove all the unwanted characters
|
44 |
for char in to_be_removed:
|
@@ -113,32 +113,32 @@ caution_text = """⚠️ Caution:
|
|
113 |
- Anyone (including you) will be able to delete the dataset once it is uploaded.
|
114 |
"""
|
115 |
|
116 |
-
instructions_template = Template(
|
117 |
-
|
118 |
-
|
119 |
-
You can load the dataset using the following code:
|
120 |
|
121 |
```python
|
122 |
from datasets import load_dataset
|
123 |
|
124 |
dataset = load_dataset("pdf2dataset/$dataset_name")
|
125 |
```
|
126 |
-
"""
|
|
|
127 |
|
128 |
with gr.Blocks() as demo:
|
129 |
-
#
|
130 |
-
gr.Markdown("##
|
131 |
file = gr.File(file_types=["pdf"], height=50)
|
132 |
gr.Markdown(caution_text)
|
133 |
-
|
134 |
-
|
135 |
preview = gr.Dataframe(label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True, height=200)
|
136 |
-
|
137 |
-
|
138 |
-
gr.Markdown("
|
139 |
-
dataset_name_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete")
|
140 |
delete_button = gr.Button("🗑️ Delete dataset")
|
141 |
-
|
142 |
# Define the actions
|
143 |
convert_button.click(pdf2dataset, inputs=[file], outputs=[instructions, preview, dataset_name_to_delete])
|
144 |
delete_button.click(delete_dataset, inputs=[dataset_name_to_delete], outputs=[delete_button])
|
|
|
7 |
from datasets import Dataset
|
8 |
from pypdf import PdfReader
|
9 |
from huggingface_hub import HfApi
|
10 |
+
|
11 |
# import template
|
12 |
from string import Template
|
13 |
+
|
14 |
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
|
15 |
to_be_replaced = {
|
16 |
"½": "1/2",
|
|
|
39 |
}
|
40 |
|
41 |
|
|
|
|
|
42 |
def clean(text):
|
43 |
# Remove all the unwanted characters
|
44 |
for char in to_be_removed:
|
|
|
113 |
- Anyone (including you) will be able to delete the dataset once it is uploaded.
|
114 |
"""
|
115 |
|
116 |
+
instructions_template = Template(
|
117 |
+
"""
|
118 |
+
🔗: https://huggingface.co/datasets/pdf2dataset/$dataset_name.
|
|
|
119 |
|
120 |
```python
|
121 |
from datasets import load_dataset
|
122 |
|
123 |
dataset = load_dataset("pdf2dataset/$dataset_name")
|
124 |
```
|
125 |
+
"""
|
126 |
+
)
|
127 |
|
128 |
with gr.Blocks() as demo:
|
129 |
+
gr.Markdown("# PDF to 🤗 Dataset")
|
130 |
+
gr.Markdown("## 1️⃣ Upload a PDF")
|
131 |
file = gr.File(file_types=["pdf"], height=50)
|
132 |
gr.Markdown(caution_text)
|
133 |
+
gr.Markdown("## 2️⃣ Convert the PDF and upload")
|
134 |
+
convert_button = gr.Button("🔄 Convert and upload")
|
135 |
preview = gr.Dataframe(label="Preview (first 10 rows)", headers=["text"], datatype=["str"], row_count=10, wrap=True, height=200)
|
136 |
+
gr.Markdown("## 3️⃣ Use the dataset in your code")
|
137 |
+
instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))
|
138 |
+
gr.Markdown("## 4️⃣ Delete the (optional)")
|
139 |
+
dataset_name_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete", label="Dataset to delete")
|
140 |
delete_button = gr.Button("🗑️ Delete dataset")
|
141 |
+
|
142 |
# Define the actions
|
143 |
convert_button.click(pdf2dataset, inputs=[file], outputs=[instructions, preview, dataset_name_to_delete])
|
144 |
delete_button.click(delete_dataset, inputs=[dataset_name_to_delete], outputs=[delete_button])
|