Spaces:

cuongnguyen910
/

topic-clustering-global-dashboard

Build error

File size: 3,208 Bytes
import editdistance
import requests
import numpy as np
import re
from .clean_text import normalize_text
# Default endpoint used by get_sbert_embedding when no URL is passed in.
URL_SBERT = "http://10.9.3.240:6789/sbert/encode_list"
# NOTE(review): the hard-coded address above presumably replaces the config
# lookup kept below for reference — confirm before re-enabling it.
# app_config.parse_url_api('api_sbert')

def get_sbert_embedding(lst_sentence, url=URL_SBERT, timeout=None):
    """Request embeddings for a list of sentences from the SBERT service.

    Args:
        lst_sentence: Sentences to encode; sent as the ``"sentences"`` field
            of the JSON request body.
        url: Endpoint to POST to. Defaults to ``URL_SBERT``.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        ``numpy.ndarray`` built from the decoded JSON response body.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    payload = {"sentences": lst_sentence}
    response = requests.post(url, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently wrapping an error body
    # into an array that callers would mistake for embeddings.
    response.raise_for_status()
    return np.array(response.json())

def is_number(word):
    """Tell whether a token is numeric once currency/percent marks are removed.

    The token is lower-cased, then every occurrence of '$', '%', 'vnđ', '.'
    and ',' is deleted (in that order). The result is True only if what is
    left consists solely of digit characters; an empty remainder is False.
    """
    stripped = word.lower()
    for marker in ('$', '%', 'vnđ', '.', ','):
        stripped = stripped.replace(marker, '')
    return stripped.isdigit()


def get_number(text):
    """Return the first space-separated token of ``text`` that is a number.

    Tokens are produced by splitting on single spaces and checked with
    ``is_number``. The matching token is returned unchanged (including any
    currency or percent marks); an empty string is returned if none match.
    """
    return next((token for token in text.split(' ') if is_number(token)), '')


def check_editdistance(ww1, ww2):
    """Return a case-insensitive similarity score between two strings.

    The score is ``1 - distance / max(len(ww1), len(ww2))`` where ``distance``
    is the edit distance between the lower-cased strings, so identical strings
    score 1 and completely different ones score 0.

    Args:
        ww1: First string.
        ww2: Second string.

    Returns:
        0 if either string is empty, otherwise the similarity as a float.
    """
    # The original guard tested ww1 twice and never looked at ww2; check both
    # so an empty operand short-circuits before any distance computation.
    if not ww1 or not ww2:
        return 0

    distance = editdistance.eval(ww1.lower(), ww2.lower())
    return 1 - distance / max(len(ww1), len(ww2))


def remove_image_keyword(text_input):
    """Blank out image-credit, source and "read more" markers in a text.

    Every occurrence of each keyword below is replaced by a single space,
    one keyword after another in the listed order, and the final result is
    stripped of leading/trailing whitespace. Matching is case-sensitive,
    which is why the common capitalisations are listed explicitly.
    """
    keywords = (
        "ảnh:", "ảnh :", "Ảnh:", "Ảnh :",
        "Ảnh minh họa:", "Ảnh minh họa :", "ảnh minh họa:", "ảnh minh họa :",
        "Nguồn:", "nguồn:", "Nguồn :", "nguồn :",
        "Source:", "Source :", "source:", "source :",
        "Src:", "Src :", "src:", "src :",
        "Image:", "Image :", "img:", "img :",
        "image:", "image :", "Img:", "Img :",
        "xem tiếp", "xem thêm", "Xem tiếp", "Xem thêm",
    )
    cleaned = text_input
    for keyword in keywords:
        cleaned = cleaned.replace(keyword, " ")
    return cleaned.strip()

# (URL scheme, regex marking where the removed span ends), applied in this
# order. `\.html?` covers both ".htm" and ".html": matching ".htm" alone
# first used to leave a stray "l" behind for every ".html" link.
_URL_SPAN_ENDS = (
    ('http://', r'\.html?'),
    ('https://', r'//'),
    ('https://', r'\.html?'),
    ('https://', r'\.vn'),
    ('https://', r'\.net'),
    ('https://', r'\.vgp'),
    ('http://', r'\.vgp'),
)

# Spans introduced by the word "Nguồn" are removed first (so the label goes
# away together with its link), then bare URL spans.
_URL_SPAN_PATTERNS = tuple(
    re.compile(prefix + scheme + r'.*?' + end)
    for prefix in (r'Nguồn\s*?', '')
    for scheme, end in _URL_SPAN_ENDS
)


def clean_text(text_in, normalize=True):
    """Strip markup, script remnants, links and control whitespace from text.

    Steps, in order:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Replace everything from the first ``function`` to the last ``}`` on
         a line with a space.
      3. Replace URL spans with a space. A span runs from ``http://`` or
         ``https://`` (optionally preceded by "Nguồn") to the nearest
         recognised ending (``.htm``/``.html``, ``.vn``, ``.net``, ``.vgp``,
         or a following ``//`` for https). Text after that ending, such as
         a trailing path, is left in place.
      4. Replace newline, tab and carriage-return characters with spaces.
      5. If ``normalize`` is true, pass the result through ``normalize_text``.

    Args:
        text_in: Raw input text.
        normalize: Whether to apply ``normalize_text`` to the cleaned text.

    Returns:
        The cleaned string.
    """
    # Raw strings throughout: the previous non-raw patterns relied on invalid
    # escape sequences such as '\s', which newer Python versions warn about.
    doc = re.sub(r'<.*?>', '', text_in)
    doc = re.sub(r'(function).*}', ' ', doc)

    for pattern in _URL_SPAN_PATTERNS:
        doc = pattern.sub(' ', doc)

    # Line breaks must go last: '.' in the patterns above does not cross them.
    doc = re.sub(r'[\n\t\r]', ' ', doc)

    if normalize:
        doc = normalize_text(doc)
    return doc

if __name__ == '__main__':
    # Quick manual check: print the similarity score of two sample strings.
    sample_score = check_editdistance('tttt', 'tt')
    print(sample_score)