import gradio as gr

import re
import os
import py_vncorenlp
from pyvi import ViTokenizer, ViPosTagger

def preprocess_text(text):
    """Strip URLs, slash-prefixed path segments and punctuation from *text*.

    The passes run from the most specific pattern to the most general one.
    The path pattern relies on the ``/`` character, so it has to run before
    the punctuation pass deletes every ``/``; otherwise it can never match
    and path text leaks into the output as glued-together words.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so a removed token
        can leave consecutive spaces behind.
    """
    # Remove URLs: "http" plus everything up to the next whitespace.
    text = re.sub(r'http\S+', '', text)

    # Remove file-path segments: a slash followed by word characters.
    text = re.sub(r'\/\w+', '', text)

    # Remove special characters and punctuation last. Word characters
    # (letters, digits, underscore) and whitespace are kept.
    text = re.sub(r'[^\w\s]', '', text)

    return text

def remove_escape_sequences(text):
    """Delete newline, tab, carriage-return and backslash characters.

    Args:
        text: The string to clean.

    Returns:
        *text* with every occurrence of those four characters removed.
        Nothing is inserted in their place, so words separated only by a
        newline or tab end up joined.
    """
    dropped = ('\n', '\t', '\r', '\\')
    # Each entry is a single character, so filtering character by character
    # gives the same result as replacing each entry with ''.
    return ''.join(char for char in text if char not in dropped)

def remove_html_tags(text):
    """Remove every ``<...>`` span from *text*.

    A span starts at ``<`` and runs to the next ``>``. A ``<`` with no
    ``>`` after it is left in place.

    Args:
        text: The string to clean.

    Returns:
        *text* with the matched spans deleted; the text between them is
        kept unchanged.
    """
    tag_pattern = re.compile(r'<[^>]*>')
    return tag_pattern.sub('', text)

def vi_word_segment(text):
    """Run pyvi's ``ViTokenizer.tokenize`` on *text* and return the result.

    Args:
        text: The text to segment.

    Returns:
        Whatever ``ViTokenizer.tokenize`` returns for *text*.
        NOTE(review): presumably a string in which the syllables of each
        multi-syllable word are joined by underscores; confirm against the
        pyvi documentation.
    """
    return ViTokenizer.tokenize(text)

def process_text(text):
    """Word-segment *text*; used as the Gradio interface callback.

    Only word segmentation is applied here. None of the cleaning helpers
    in this module are called.

    Args:
        text: The text submitted by the user.

    Returns:
        The result of ``vi_word_segment(text)``.
    """
    return vi_word_segment(text)

if __name__ == '__main__':
    # Wrap process_text in a text-in / text-out Gradio interface and start
    # it. share=True is passed through to launch().
    # NOTE(review): this presumably asks Gradio for a publicly reachable
    # share link; confirm that is wanted before deploying.
    gr.Interface(
        fn=process_text,
        inputs="text",
        outputs="text",
    ).launch(share=True)