File size: 3,690 Bytes
2a831b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# clean_app.py
# -----------------------------------
# Copyright CC - developed by HNM
# -----------------------------------
import re
import gradio as gr
import tempfile
# STEP A
# Replacement marks. The control-prefixed keys and the ASCII keys below
# both map onto these, position by position.
_REPLACEMENT_MARKS = ("ٲ", "ُ", "ٚ", "ٕ", "ٛ", "ٔ", "ں")

# Substitution table; consumers iterate it in insertion order, so the
# "\x04 + mark" combos must come before the bare "\x04" entry.
mappings = {"\x04" + mark: mark for mark in _REPLACEMENT_MARKS}
mappings["\x04"] = ""  # a stray \x04 on its own is dropped
# ASCII symbol mappings: each symbol stands for the mark at the same index.
mappings.update(zip("><;=:.,", _REPLACEMENT_MARKS))
mappings["/"] = ""  # remove slash
def basic_replacements(text: str) -> str:
    """
    Run every substitution listed in ``mappings`` over *text*.

    Entries are applied one after another, in the table's own order,
    each as a plain ``str.replace`` (the U+0004 combos as well as the
    ASCII symbols). The fully substituted string is returned.
    """
    converted = text
    for needle in mappings:
        converted = converted.replace(needle, mappings[needle])
    return converted
# STEP B
def fix_alif_combo(text: str) -> str:
    """Collapse every 'اٲ' pair found in *text* down to a lone 'ٲ'."""
    # Cutting the text at each pair and re-joining with the single
    # letter rewrites all occurrences in one left-to-right pass.
    pieces = text.split("اٲ")
    return "ٲ".join(pieces)
# STEP C
# A '?' with a (non-newline) character on both sides. The preceding
# character is only inspected through a look-behind, never consumed, so
# the character that ends one match can still precede the next '?'.
_QUESTION_MARK_RE = re.compile(r"(?<=.)\?(.)")


def fix_question_mark(text: str) -> str:
    """
    For each occurrence of (.)?(.) => remove '?', add "یٕ" after the 2nd letter.
    E.g. "س?ت" => "ستیٕ".

    Back-to-back occurrences that share a letter are all rewritten:
    "a?b?c" becomes "ab" + "یٕ" + "c" + "یٕ". (Consuming the leading
    character in the match would leave the second '?' untouched.)

    A '?' at the very start or end of the text, or next to a newline,
    is left as it is.

    Args:
        text: The string to rewrite.

    Returns:
        The text with every qualifying '?' removed and "یٕ" appended
        after the character that followed it.
    """
    return _QUESTION_MARK_RE.sub(lambda m: m.group(1) + "یٕ", text)
# STEP D
def clean_line(line: str) -> str:
    """
    Run a single line through the cleaning pipeline, in this order:
    1) basic_replacements (U+0004 combos, ASCII symbols),
    2) fix_alif_combo (اٲ -> ٲ),
    3) fix_question_mark (س?ت -> ستیٕ)
    Each stage receives the output of the one before it.
    """
    for stage in (basic_replacements, fix_alif_combo, fix_question_mark):
        line = stage(line)
    return line
# Helper: cleans the entire string (multiple lines).
def clean_text(input_text: str) -> str:
    """
    Clean a whole (possibly multi-line) string.

    The text is broken up with ``str.splitlines()``, every line goes
    through ``clean_line``, and the results are glued back together
    with a single newline character between them.
    """
    return "\n".join(map(clean_line, input_text.splitlines()))
# -----------------------------------
# Gradio Interface
# -----------------------------------
def process_text(raw_text):
    """
    This function is called by Gradio when the user clicks the button.

    Args:
        raw_text: The uncleaned text from the input textbox. A missing
            value (``None``) is treated as empty text.

    Returns:
        A 2-tuple:
        1) The cleaned text (for display)
        2) A temporary file path with the cleaned text (for download)
    """
    # Fall back to "" so a missing textbox value does not crash clean_text.
    cleaned = clean_text(raw_text or "")
    # Write the cleaned text to a temporary file for download.
    # delete=False keeps the file on disk after it is closed so it can be
    # served afterwards; the ``with`` block guarantees the handle is
    # flushed and closed even if the write fails.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write(cleaned)
    return cleaned, tmp.name
# Build the interface
# The whole UI is declared inside one Blocks context bound to `demo`;
# components appear in the order they are created below.
# NOTE(review): nesting below (button and file output outside the Row) is
# inferred from the comments - confirm against the intended layout.
with gr.Blocks() as demo:
    # Title and short usage instructions.
    gr.Markdown("## Clean Text Tool")
    gr.Markdown(
        "Paste your raw/unprocessed text below, then click 'Clean Text' to get the cleaned result."
    )
    # One row with two columns: input textbox and output textbox.
    with gr.Row():
        with gr.Column():
            # Input box: its value is what process_text receives.
            raw_text = gr.Textbox(
                label="Input (Paste uncleaned text)",
                lines=15,
                placeholder="Paste any length of text here...",
            )
        with gr.Column():
            # Output box: read-only, filled with the cleaned text.
            cleaned_output = gr.Textbox(
                label="Output (Cleaned text)",
                lines=15,
                interactive=False
            )
    # Button to trigger cleaning
    button = gr.Button("Clean Text")
    # We'll show the file download output in a second row
    download_file = gr.File(label="Download Cleaned .txt File")
    # Connect the function to the button: process_text returns
    # (cleaned text, temp file path), matching the two outputs in order.
    button.click(
        fn=process_text,
        inputs=raw_text,
        outputs=[cleaned_output, download_file]
    )
# Run the app
# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|