Quentin Gallouédec committed on
Commit
58e4b18
1 Parent(s): 0b649de
Files changed (1) hide show
  1. app.py +115 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+ import re
3
+ import random
4
+ import gradio as gr
5
+ from datasets import Dataset, DatasetDict
6
+ import os
7
+ import pandas as pd
8
+
9
# Glyphs that `clean` deletes outright (bullets, arrows, the Unicode
# replacement character, ...).
# NOTE(review): the trailing empty strings look like glyphs that were lost in
# a copy/paste; replacing "" with "" is a no-op -- confirm which characters
# were intended.
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]

# Typographic characters mapped to a plain-text equivalent. The last entry
# flattens newlines, so a cleaned page ends up on a single line.
to_be_replaced = {
    "½": "1/2",
    "–": "-",
    "‘": "'",
    "’": "'",
    "…": "...",
    "₋": "-",
    "−": "-",
    "⓫": "11.",
    "⓬": "12.",
    "⓭": "13.",
    "⓮": "14.",
    "◦": "°",
    "❶": "1.",
    "❷": "2.",
    "❸": "3.",
    "❹": "4.",
    "❺": "5.",
    "❻": "6.",
    "❼": "7.",
    "❽": "8.",
    "❾": "9.",
    "❿": "10.",
    "\n": " ",
}

# A period immediately followed by a non-space character, except when the
# period sits between two digits (so decimals such as "3.14" stay intact).
_PERIOD_WITHOUT_SPACE = re.compile(r"(?<!\d)\.(?=[^ ])|\.(?=[^ \d])")
# A lowercase letter glued to an uppercase one (accented letters included).
_LOWER_THEN_UPPER = re.compile(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])")
# Two or more consecutive spaces.
_SPACE_RUN = re.compile(r" {2,}")


def clean(text):
    """Normalize text extracted from a PDF page.

    Steps, in order:
      1. delete every entry of ``to_be_removed``;
      2. apply the ``to_be_replaced`` substitutions (newlines become spaces);
      3. insert a space after a period that is glued to the next character,
         leaving decimal numbers untouched;
      4. insert a space between a lowercase letter and a following uppercase
         letter ("aA" -> "a A");
      5. drop the space before "," and ".", and any space around "-";
      6. collapse runs of spaces into a single space.

    Args:
        text: The raw text to normalize.

    Returns:
        The normalized text. Leading/trailing spaces are not stripped.
    """
    for char in to_be_removed:
        text = text.replace(char, "")

    for char, replacement in to_be_replaced.items():
        text = text.replace(char, replacement)

    text = _PERIOD_WITHOUT_SPACE.sub(". ", text)
    text = _LOWER_THEN_UPPER.sub(r"\1 \2", text)

    # No space before a comma or a period, and none around a hyphen (this
    # also re-joins words hyphenated across a line break).
    text = text.replace(" ,", ",")
    text = text.replace(" .", ".")
    text = text.replace(" -", "-")
    text = text.replace("- ", "-")

    # Collapse runs of spaces in one pass. The loop this replaces compared and
    # replaced a single space with itself, which never terminates once the
    # text contains a space.
    text = _SPACE_RUN.sub(" ", text)

    return text
65
+
66
+
67
def pdf2dataset(file, _, progress=gr.Progress()):
    """Extract the text of each page of a PDF and push it to the Hub as a dataset.

    Every page is read with ``pypdf``, normalized with ``clean`` and stored as
    one row of a single ``"text"`` column. The dataset is pushed to
    ``pdf2dataset/<random hex name>`` using the token found in the ``TOKEN``
    environment variable.

    Args:
        file: The value delivered by the file input; it is handed to
            ``PdfReader`` unchanged.
        _: Unused. The interface lists a Markdown notice among its inputs, so
            a second positional value is passed in.
        progress: Gradio progress tracker used to report conversion and upload
            status.

    Returns:
        A ``(instructions, preview)`` tuple: a Markdown string explaining where
        the dataset lives and how to load it, and a one-column ``DataFrame``
        holding the cleaned text of the first 10 pages.

    Raises:
        gr.Error: If no file was provided.
    """
    if file is None:
        # Fail with a readable message instead of an obscure PdfReader error.
        raise gr.Error("Please upload a PDF file.")

    progress(0, desc="Starting...")
    reader = PdfReader(file)
    num_pages = len(reader.pages)
    dataset_name = f"{random.getrandbits(128):x}"
    repo_id = f"pdf2dataset/{dataset_name}"

    page_texts = [
        clean(page.extract_text())
        for page in progress.tqdm(reader.pages, total=num_pages, desc="Converting pages")
    ]

    progress(0, desc="Uploading to Hugging Face...")
    dataset = Dataset.from_dict({"text": page_texts})
    dataset.push_to_hub(repo_id, token=os.getenv("TOKEN"))
    progress(1, desc="Done!")

    # Built line by line so every Markdown line starts at column 0; the fenced
    # code block would not render if the lines inherited the function's indent.
    instructions = (
        "\n"
        f"Your dataset is now available on Hugging Face Datasets at [{repo_id}](https://huggingface.co/datasets/{repo_id}).\n"
        "\n"
        "You can load the dataset using the following code:\n"
        "\n"
        "```python\n"
        "from datasets import load_dataset\n"
        "\n"
        f'dataset = load_dataset("{repo_id}")\n'
        "```\n"
    )

    # The rows are already in memory; no need to read the column back from the
    # dataset just to preview it.
    preview = pd.DataFrame(page_texts[:10], columns=["text"])
    return instructions, preview
98
+
99
+
100
# Gradio UI: one PDF upload in, a Markdown message and a preview table out.
demo = gr.Interface(
    title="PDF to 🤗 Dataset",
    fn=pdf2dataset,
    inputs=[
        # Extensions must carry the leading dot; a dotless entry is treated as
        # a file-type category (like "image"), not as an extension.
        gr.File(file_types=[".pdf"]),
        # Listed as an input only to display the notice next to the upload
        # box; its value reaches `pdf2dataset` as the ignored second argument.
        gr.Markdown(
            "⚠️ Caution: This process will upload your data to a public Hugging Face repository. Do not upload sensitive information."
        ),
    ],
    outputs=[
        gr.Markdown(),
        gr.Dataframe(
            label="Preview (first 10 rows)",
            headers=["text"],
            datatype=["str"],
            row_count=10,
            wrap=True,
        ),
    ],
    submit_btn="Convert to dataset",
    allow_flagging="never",
)


demo.launch()