Spaces:

openthaigpt
/

data-decontamination-demo

Sleeping

App Files Files Community

sadanalog commited on Mar 16

Commit

82191a4

•

1 Parent(s): 9cef4aa

add codes

Browse files

Files changed (6) hide show

.gitignore +2 -0
app.py +102 -0
requirements.txt +0 -0
step1.py +917 -0
step2.py +183 -0
step2_perplexity.py +125 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__
2	+ core

app.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import ast
+from step1 import clean_oscar_text, clean_mc4_text
+from step2_perplexity import sample_text_back, classify_spam, load_model
+# NOTE(review): load_model comes from step2_perplexity, next to classify_spam;
+# presumably it loads the model classify_spam scores with - confirm there.
+# It is called at module level, before any widget is created.
+load_model()
+# Height value passed to every st.text_area in this script.
+TEXT_AREA_HEIGHT = 520
+def filter_pattern(x):
+    """Return ``x`` unchanged; identity stand-in applied by demo steps 3 to 6."""
+    return x
+st.title('System DEMO')
+sample_text = """[
+'แจกโปรโมชั่นพิเศษ! เพียงสมัครสมาชิกใหม่และฝากเงินเข้ามาเริ่มต้นที่ 500 บาท คุณก็จะได้รับโบนัสเครดิตฟรีเพิ่มอีก 200 บาทจากเราทันที ไม่ต้องรอนาน พร้อมรับสิทธิพิเศษลุ้นรับของรางวัลพรีเมี่ยมมากมาย อาทิ รถยนต์ คอนโดฯ หรือทริปท่องเที่ยวสุดหรู เพียงโทร 088-345-7890 (คุณแพร จิตร์มณี) กดปุ่มรับสิทธิ์วันนี้',
+'นี่คือภาพสุดเร้าร้อนจากการถ่ายแบบนิตยสาร Maxim ฉบับล่าสุด ของนางเอกสาว "มิลาวดี หวานนาง" สวย เป๊ะ ปัง ร้อนแรงแซ่บจับใจทุกสัดส่วน อย่าพลาดกับเบอร์ติดต่อสั่งซื้อ 098-765-4321 (คุณน้องนุช มารศรี) ให้ส่งรูปไฟลำบากตามาด้วยนะจ๊ะ',
+'ขายนมผึ้งป่าดิบจากเมืองลำปางแท้ 100% ไม่มีผสมน้ำตาลหรือสารปนเปื้อนแม้แต่น้อย สดใหม่จากรังผึ้งดอยสูง อร่อยถึงรสถึงกลิ่น บำรุงร่างกายสุขภาพดีเยี่ยม สนใจสั่งซื้อติดต่อคุณชายเฉลิมชัย โทร. 089-222-4567 หรือทางไลน์ iD: chaleamchaihoney ราคาพิเศษ 690 บาท/กระปุก',
+'สาวๆสายแก้มมุ้งมิ้ง เตรียมพร้อมสำหรับเทศกาลผิวกระจ่างใสที่กำลังจะมาถึงนี้! โปรเด็ดสุดคุ้มจากร้าน Glowlicious Skincare แนะนำผลิตภัณฑ์ทำความสะอาดผิวหน้าขั้นเทพ ล้างสิ่งสกปรกและซิลิโคนได้อย่างหมดจด เนื้อแนบเนียนนุ่มลื่น ปราศจากน้ำมันส่วนเกิน มีวิตามินเอสูง เหมาะสำหรับผิวมันและผิวผสม ราคาเพียง 499 บาท จากปกติ 799 บาท เพื่อนๆสนใจสอบถามรายละเอียดเพิ่มเติมได้ที่ ไลน์ glowliciousskin หรือดูจากแคตตาล็อกที่แนบมา',
+'โปรดระวัง! ข้อความและลิงก์ที่คุณได้รับนั้นเป็นการหลอกลวงและมิจฉาชีพ บริษัทมากระดกรายได้ไม่ได้จัดโปรโมชั่นดังกล่าวแต่อย่างใด ขอให้อภัยในความไม่สะดวกครับ ทางเราจะไม่ยอมรับข้อเสนอจากหมายเลขที่ไม่ระบุชื่อผู้ติดต่อที่แน่ชัดและไม่ได้เป็นพนักงานของทางบริษัท โปรดงดโอนเงินหรือทำธุรกรรมใดๆ จากที่ไม่เชื่อถือได้ และติดต่อเจ้าหน้าที่ของเราเพื่อรับข้อมูลที่ถูกต้องต่อไป',
+'ถ้าอยากรวยเร็ว ลองมาเล่นพนันกับ ขุนพลอดุลย์คาสิโน สนุกครบรสได้ทั้งเงินและบริการนวดฟรีจากสาวสวย โทร. 099-888-7777',
+'สวัสดีจ๊ะ! วันนี้ขายส้มโอนะคะ ส้มโอสดป้ายแดง อร่อยมากค่า ดูมั่งมี้ทั้งภาพและวิดีโอลามกห้ามพลาดเด็ดขาดจร้า',
+'สื่อลามกออนไลน์คุณภาพสูงสุด XXX69 ถ่ายทำเองโดยนางเอกจริงๆ พร้อมเบอร์ติดต่อสั่งซื้อ 088-123-4567 (นางสาวก๊กเอ้ย)',
+'พลาดไม่ได้ ผลิตภัณฑ์อาหารเสริมดัชชี่วิตตาไก่ขาย 390 บาท ผสมน้ำเปล่าสำหรับผู้หญิง ไลน์ไอดีduchayfc สนใจสั่งซื้อด่วน',
+'หนีความจริงได้ที่นี่ สิทธิพิเศษพนันบอลฟรีทั้งวัน เราให้เครดิตแรกเข้า 1,000 คะแนน 0-222-33444 (คุณก้อยนวล) กดวันนี้ด่วน',
+'ในยามเช้าที่สดใส พร้อมแสงอรุณอันงดงามของดวงอาทิตย์ ฉันรู้สึกได้ถึงพลังสดชื่นและความหวังใหม่ที่จะเติมเต็มวันนี้ด้วยความสำเร็จและความสุขที่ยั่งยืน ช่างเป็นภาพที่สวยงามและทรงพลังจริงๆ ที่ได้ต้อนรับวันใหม่อันน่าตื่นเต้นนี้',
+'หนังสือเล่มนี้ได้พรรณนาเรื่องราวชีวิตของนักเดินทางผู้กล้าหาญที่ได้ผจญภัยข้ามพรมแดนไปยังดินแดนห่างไกล เขาได้สัมผัสกับวัฒนธรรมและประเพณีที่แตกต่าง ได้เห็นธรรมชาติที่งดงามและสมบูรณ์แบบ รวมถึงได้พบกับความท้าทายและอุปสรรคนานัปการ แต่ด้วยความมุ่งมั่นและพลังใจที่เข้มแข็ง เขาสามารถฟันฝ่าอุปสรรคเหล่านั้นไปได้',
+'ในสวนผลไม้แห่งนี้ มีต้นไม้นานาพันธุ์ที่ออกผลสุกงอมในทุกฤดูกาล เราสามารถเก็บเกี่ยวผลไม้สดใหม่จากธรรมชาติได้ตลอดทั้งปี ไม่ว่าจะเป็นส้ม กล้วย ชมพู่ มะม่วง หรือทุเรียนอร่อยนัว เราจะได้ลิ้มรสความหวานกรุ่นและรสชาติดั้งเดิมที่แสนจะน่าปลื้ม',
+'เมืองนี้มีประวัติศาสตร์ความเป็นมาที่ยาวนานและน่าสนใจ ตั้งแต่สมัยโบราณกาลที่เคยเป็นอาณาจักรใหญ่ มีอารยธรรมรุ่งเรืองสมบูรณ์พูนสุข จนกระทั่งถึงยุคปัจจุบันที่ก้าวสู่ความทันสมัยอย่างมั่นคง เรายังสามารถชมร่องรอยแห่งอดีตที่ปรากฏในสถาปัตยกรรมและศิลปวัฒนธรรมดั้งเดิม ซึ่งหลอมรวมกับเทคโนโลยีสมัยใหม่อย่างลงตัว',
+]"""
+input_texts = st.text_area("input a list of texts",
+                   value=sample_text,
+                   key="input",
+                   height=TEXT_AREA_HEIGHT)
+st.write(f'You wrote {len(input_texts)} characters.')
+input_texts = ast.literal_eval(input_texts)
+with st.expander("See Cleansing Steps"):
+    st.header('STEP 1: Pattern Filtering')
+    texts = [clean_oscar_text(clean_mc4_text(text_)) for text_ in input_texts]
+    st.text_area("after process",
+                 value=texts,
+                 key="step1",
+                 disabled=True,
+                 height=TEXT_AREA_HEIGHT)
+    st.header('STEP 2: Perplexity Filtering')
+    log_prob = [classify_spam(text_)[1] for text_ in texts]
+    step2_sample = sample_text_back(texts, log_prob)
+    st.text_area("after process",
+                 value=texts,
+                 key="step2",
+                 disabled=True,
+                 height=TEXT_AREA_HEIGHT)
+    st.header('STEP 3: Deduplicated by Similarity')
+    texts = [filter_pattern(t) for t in input_texts]
+    st.text_area("after process",
+                 value=texts,
+                 key="step3",
+                 disabled=True,
+                 height=TEXT_AREA_HEIGHT)
+    st.header('STEP 4: Deduplicated by Exact Matching')
+    texts = [filter_pattern(t) for t in input_texts]
+    st.text_area("after process",
+                 value=texts,
+                 key="step4",
+                 disabled=True,
+                 height=TEXT_AREA_HEIGHT)
+    st.header('STEP 5: Decontamination')
+    texts = [filter_pattern(t) for t in input_texts]
+    st.text_area("after process",
+                 value=texts,
+                 key="step5",
+                 disabled=True,
+                 height=TEXT_AREA_HEIGHT)
+    st.header('STEP 6: Anonymization')
+    texts = [filter_pattern(t) for t in input_texts]
+    st.text_area("after process",
+                 value=texts,
+                 key="step6",
+                 disabled=True,
+                 height=TEXT_AREA_HEIGHT)
+st.header('Output')
+st.text_area("output after cleansing",
+                value=sample_text,
+                key="output",
+                height=TEXT_AREA_HEIGHT)

requirements.txt ADDED Viewed

Binary file (2.41 kB). View file

step1.py ADDED Viewed

	@@ -0,0 +1,917 @@

+import re
+#### Gamble Clean Words
+GAMBLE_WORDS = [
+    "พนัน",
+    "แทงบอล",
+    "แทง",
+    "บาคารา",
+    "บา คา รา",
+    "เกมพนัน",
+    "คาสิโน",
+    "คา สิ โน",
+    "หวย",
+    "สล็อต",
+    "กาสิโน",
+    "casino",
+    "slot",
+    "เลขเด็ด",
+    "สูตรหวย",
+    "a s i n o",
+    "sbobet",
+    "fun88",
+    "ufabet",
+    "บาคาร่า",
+    "บา คา ร่า",
+    "รูเล็ต",
+    "ทำนายฝัน",
+    "เลขเด่น",
+    "สรุปผลบอล",
+    "ไฮไลท์ฟุตบอล",
+    "วิเคราะห์บอล",
+    "ดูบอลสด",
+    "พรีเมียร์ลีก",
+    "บอลประจำวัน",
+    "บอลเต็ง",
+    "บอลเด็ด",
+    "องค์ลงรวย",
+    "สูตรปลาตะเพียน",
+    "สามตัวตรง",
+    "วิเคราะห์ข้อมูลล่าง",
+    "ต่อ ครึ่งลูก",
+    "ครึ่งลูกลบ",
+    "เสมอควบครึ่ง",
+    "ครึ่งควบลูก",
+]
+#### Sale Clean Words
+SALE_SKIP_WORDS = [
+    "สอบราคา",
+    "จัดซื้อจัดจ้าง",
+    "ชมรม",
+    "สมาคม",
+    "นักลงทุน",
+    "นักการตลาด",
+    "ของกลาง",
+    "การลงทุน",
+    "นักวิเคราะห์",
+    "ขายให้แก่ประชาชน",
+    "การลดต้นทุน",
+    "การเสนอราคา",
+    "กระทรวง",
+    "ตลาดหลักทรัพย์",
+    "ยอดขายไม่ดี",
+    "ยอดขายไม่ค่อยดี",
+    "ผู้ประกอบการธุรกิจ",
+    "ออกใบอนุญาต",
+    "ผู้ประกอบกิจการ",
+]
+SALE_URL_WORDS = [
+    "alibaba.com",
+    "shopee.co.th",
+    "lazada.com",
+    "DocPlayer.net",
+    "Alibaba",
+    "AliExpress",
+    "Aliexpress",
+    "TripAdvisor",
+    "jobbkk.com",
+]
+SALE_WORDS = [
+    "ขาย",
+    "ซ่อม",
+    "ราคา",
+    "มือสอง",
+    "เช่า",
+    "ครีม",
+    "ฝ้ากระ",
+    "จุดด่างดำ",
+    "รับส่วนลด",
+    "โปรโมชั่น",
+    "กวดวิชา",
+    "ติวเตอร์",
+    "SEO",
+    "คอร์สเรียน SEO",
+    "จำหน่าย",
+    "ทัวร์",
+    "สินค้ามาใหม่",
+    "สินค้าทั้งหมด",
+    "รีวิวสินค้า",
+    "เคสกันกระแทก",
+    "ประกาศ",
+    "ลงขายของ",
+    "เลือกขนาด",
+    "บริการจัดส่ง",
+    "จัดอันดับ",
+    "คาราโอเกะ",
+    "จำหน่าย",
+    "หาเงินออนไลน์",
+    "สั่งซื้อ",
+    "ลดกระหนำ่",
+    "รหัส",
+    "ลงประกาศฟรี",
+    "หยิบใส่ตะกร้า",
+    "สนใจ",
+    "ซื้อ",
+    "สินค้า",
+    "ผลิตภัณฑ์",
+]
+#### Rent Clean Words
+RENT_SKIP_WORDS = [
+    "สอบราคา",
+    "จัดซื้อจัดจ้าง",
+    "ชมรม",
+    "สมาคม",
+    "นักลงทุน",
+    "นักการตลาด",
+    "ของกลาง",
+    "การลงทุน",
+    "นักวิเคราะห์",
+    "ขายให้แก่ประชาชน",
+    "การลดต้นทุน",
+    "การเสนอราคา",
+    "กระทรวง",
+    "ตลาดหลักทรัพย์",
+]
+RENT_WORDS = [
+    "บ้านมือสอง",
+    "ให้เช่า",
+    "เช่า",
+    "บ้านเดี่ยว",
+    "อพาร์ทเม้นท์",
+    "อสังหาริมทรัพย์",
+    "เพนท์เฮ้าส์",
+    "ทาวน์เฮ้าส์",
+]
+#### Script Clean Words
+SCRIPT_WORDS = [
+    "function",
+    "var",
+    "click",
+    "margin",
+    "width",
+    "height",
+    "return",
+    "else",
+    "alert",
+    "<br>",
+    "href",
+]
+#### Garbage Clean Words
+GARBAGE_WORDS = [
+    "โหงวเฮ้ง",
+    "ครีมฟอกสี",
+    "ครีมผิวขาว",
+    "ฟอกสี",
+    "ไวท์เทนนิ่งครีม",
+    "ครีมไวท์เทนนิ่ง",
+    "ครีมลบฝ้ากระ",
+    "รับสร้างบ้าน",
+    "ครีมโรคสะเก็ดเงิน",
+    "บริการจองตั๋ว",
+    "บริการรีดผ้า",
+    "อาหารเสริมลดน้ำหนัก",
+    "ยาลดน้ำหนัก",
+    "ลดไขมัน",
+    "ผิงโซดา",
+    "สร้างบ้าน",
+    "ช่างกุญแจ",
+    "ช่างโลหะ",
+    "ช่างโยธา",
+    "ช่างเครื่องยนต์",
+    "ช่างไม้",
+    "ช่างกลโรงงาน",
+    "ช่างไฟฟ้า",
+    "ปรสิต",
+    "หนอน",
+    "เวิร์ม",
+]
+#### Football teams
+FOOTBALL_TEAMS = [
+    "ยูเวนตุส",
+    "อินเตอร์ มิลาน",
+    "นาโปลี",
+    "เอซี มิลาน",
+    "ลาซิโอ",
+    "โรม่า",
+    "กัลโซ่",
+    "เซเรีย",
+    "ปาร์ม่า",
+    "เอฟเวอร์ตัน",
+    "ซันเดอร์แลนด์",
+    "ลิเวอร์พูล",
+    "แมนเชสเตอร์",
+    "นิวคาสเซิล",
+    "เชลซี",
+    "อาร์เซนอล",
+    "คลิสตัลพาเลช",
+    "เซาแทมป์ตัน",
+    "เซาแธมป์ตัน",
+    "เชฟฟิลด์",
+    "ฟอเรสต์",
+    "เบอร์ตัน",
+    "เบรนท์ฟอร์ด",
+    "ฟูแล่ม",
+    "ไฮไลท์ฟุตบอล",
+    "เลบันเต้",
+    "บาร์เซโลน่า",
+    "เรอัล มาดริด",
+    "เอสปันญ่อล",
+]
+#### Hotels Advertising
+HOTEL_AD = [
+    "โรงแรมอันดับ",
+    "ที่พักแบบพิเศษอันดับ",
+    "สถานที่พักอันดับ",
+    "สถานที่พักคุ้มค่าอันดับ",
+    "โรงแรมใกล้กับ",
+    "โรงแรมที่ใกล้",
+    "โรงแรม 4 ดาว",
+    "โรงแรม 3 ดาว",
+    "ที่พักพร้อมอาหารเช้า",
+    "โรงแรมราคาถูก",
+    "โรงแรมหรู",
+]
+#########
+# Pre-compile every regex into a pattern object to speed up processing.
+#########
+# -----------------------------------------------------
+# Remove useless rows first, because they only add overhead to the regex processing below.
+# Unusual row: line size too large,
+# i.e. a run of 1500 characters on a single line (3 large lines of 500 characters each).
+TOOLARGE_LINE_PATTERN = ".{1500}"
+TOOLARGE_RE = re.compile(TOOLARGE_LINE_PATTERN, re.MULTILINE)
+NONECHAR_PATTERN = "๮|๞|๨|๡|๷|๻|๫|͹"
+NONECHAR_RE = re.compile(NONECHAR_PATTERN, re.MULTILINE)
+NONE_TONE_MARK_PATTERN = "ก าหนด|เป าหมาย|พ ฒนา|ค ณภาพ|ว จ ย|ค ณล กษณะ|ต างๆ|เป น |ให |บร หาร|ปร บปร ง|ใหม|อย าง|เง น"
+NONE_TONE_MARK_RE = re.compile(NONE_TONE_MARK_PATTERN, re.MULTILINE)
+# -----------------------------------------------------
+GAMBLE_PATTERN = "|".join(GAMBLE_WORDS)
+GAMBLE_RE = re.compile(GAMBLE_PATTERN, re.MULTILINE)
+FOOTBALL_PATTERN = "|".join(FOOTBALL_TEAMS)
+FOOTBALL_RE = re.compile(FOOTBALL_PATTERN, re.MULTILINE)
+HOTEL_AD_PATTERN = "|".join(HOTEL_AD)
+HOTEL_AD_RE = re.compile(HOTEL_AD_PATTERN, re.MULTILINE)
+SALE_URL_PATTERN = "|".join(SALE_URL_WORDS)
+SALE_URL_RE = re.compile(SALE_URL_PATTERN, re.MULTILINE)
+SALE_SKIP_PATTERN = "|".join(SALE_SKIP_WORDS)
+SALE_SKIP_RE = re.compile(SALE_SKIP_PATTERN, re.MULTILINE)
+SALE_PATTERN = "|".join(SALE_WORDS)
+SALE_RE = re.compile(SALE_PATTERN, re.MULTILINE)
+RENT_SKIP_PATTERN = "|".join(RENT_SKIP_WORDS)
+RENT_SKIP_RE = re.compile(RENT_SKIP_PATTERN, re.MULTILINE)
+RENT_PATTERN = "|".join(RENT_WORDS)
+RENT_RE = re.compile(RENT_PATTERN, re.MULTILINE)
+JSON_PATTERN = r"\s*\"(?:\w)*\"\s*:"
+JSON_RE = re.compile(JSON_PATTERN, re.MULTILINE)
+SCRIPT_PATTERN = r"\b" + "|".join(SCRIPT_WORDS) + r"\b"
+SCRIPT_RE = re.compile(SCRIPT_PATTERN, re.MULTILINE)
+GARBAGE_PATTERN = "|".join(GARBAGE_WORDS)
+GARBAGE_RE = re.compile(GARBAGE_PATTERN, re.MULTILINE)
+GHOST_PATTERN = "เธฃเน|เธเธญ|เธเน|เธฐเธ|เธฅเธฐ|เธซเธฒ|เธญเธฒ|เธดเธ|เธตเธข|เธญเน|เธญเธ|เธดเน|เธฑเธ|เธกเน|เธฒเธ|เธชเน|เน€เธ"
+GHOST_RE = re.compile(GHOST_PATTERN, re.MULTILINE)
+HEX_PATTERN = "(?<![^ ])(?:[0-9A-Fa-f]{2})(?![^ ])"
+HEX_RE = re.compile(HEX_PATTERN, re.MULTILINE)
+PAGE_PATTERN = "(?:<<[ ])?(?:ก่อนหน้า|ย้อนกลับ)[ ]{0,2}(?:\[[ ]?\d{0,6}[ ]?\]|[ ]?\d{0,6}[ ]?)*(?:ต่อไป|หน้าถัดไป|ถัดไป)?(?:[ ]?>>)?|<<(?:[ ]\d{0,6}[ ]\-[ ]\d{0,6})+[ ].{0,100}"
+PAGE_RE = re.compile(PAGE_PATTERN, re.MULTILINE)
+EMBEDDED_SERVER_PATTERN = "<%[ ]*[^%]*%>|<%.*"
+EMBEDDED_SERVER_RE = re.compile(EMBEDDED_SERVER_PATTERN, re.MULTILINE)
+U_PATTERN = "\uFEFF|\u00AD|[\u200A-\u200F]|\uFFFD|[\uE000-\uF8FF]|[\u202A-\u202C]|\u0092|[\u0091-\u0096]|\u2028|\u2066|\u2069|\u008d|\u0081|\u008E|<U\+[0-9A-Fa-f]{4}>"
+U_RE = re.compile(U_PATTERN, re.MULTILINE)
+BLOCK_PATTERN = "(?:\[[^\]]*\])|(?:«[^»]*»)|(?:<<([^>]*)>>)"
+BLOCK_RE = re.compile(BLOCK_PATTERN, re.MULTILINE)
+EMAIL_PATTERN = "(?:(?:([Ee]?mail|อีเมล์)[ ]{0,2}:?[ ]{0,5})?)[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
+EMAIL_RE = re.compile(EMAIL_PATTERN, re.MULTILINE)
+URL_PATTERN = r"\b(?:(?:https?|ftp)://[^\s/$\.\?#].[^\s]*)\b|\b(?:www\.?)?(?:(?:[\w-]*)\.)*(?:com|net|org|info|biz|me|io|co|asia|xyz|th|cn|in|uk|jp|ru)\b"
+URL_RE = re.compile(URL_PATTERN, re.MULTILINE)
+MENU1_PATTERN = "\|(?:[^\|\n]*\|)+.*"
+MENU1_RE = re.compile(MENU1_PATTERN, re.MULTILINE)
+MENU2_PATTERN = "\|(?:[^\|\n]*\|)+"
+MENU2_RE = re.compile(MENU2_PATTERN, re.MULTILINE)
+MENU3_PATTERN = "(?:(?:[^/\n]*/){4,}.*)"
+MENU3_RE = re.compile(MENU3_PATTERN, re.MULTILINE)
+MENU4_PATTERN = "^[^\n]{0,20}[ ]{0,2}[>»\\\\].*"
+MENU4_RE = re.compile(MENU4_PATTERN, re.MULTILINE)
+HASHTAG_PATTERN = "#\d*[ ].{0,300}|#(?:(?:[^ \n]*)[ ]?)+|Tag Archives[ ]{0,2}:.{0,300}|Posts Tagged[ ]{0,2}:.{0,300}|HASTAG[ ]{0,2}:.{0,300}|Tag[s]?[ ]{0,2}:.{0,300}|Tagged[ ].{0,300}"
+HASHTAG_RE = re.compile(HASHTAG_PATTERN, re.MULTILINE)
+SIDEBAR_PATTERN = ".{0,40}(?:(?:\[|\()\d{0,9}(?:\]|\))(?:[ ]{0,2})?,?)"
+SIDEBAR_RE = re.compile(SIDEBAR_PATTERN, re.MULTILINE)
+MARKUP_PATTERN = "\{\{[^\}]*\}\}|\{\{.*"
+MARKUP_RE = re.compile(MARKUP_PATTERN, re.MULTILINE)
+IFRAME_PATTERN = "<iframe.*?<\/iframe>\s*|<iframe.*"
+IFRAME_RE = re.compile(IFRAME_PATTERN, re.MULTILINE)
+IP_PATTERN = "\((?:(?:X{1,3}|\d{1,3})\.){3}(?:X{1,3}|\d{1,3})\)|\(?IP:?[ ]?(?:(?:X{1,3}|\d{1,3})\.){3}(?:X{1,3}|\d{1,3})\)?"
+IP_RE = re.compile(IP_PATTERN, re.MULTILINE)
+TEL_PATTERN = "(?:(?:[Pp]hone|[Mm]obile|มือถือ|Tel|TEL|Fax|FAX|เบอร์โทรศัพท์|เลขโทรศัพท์|เบอร์ติดต่อ|โทรศัพท์|โทรสาร[ ]{0,2}:|เบอร์โทร|โทร[ ]{0,2}:|โทร\.|โทร[ ]|ติดต่อที่[ ]{0,2}:?|ติดต่อ[ ]{0,2}:?)[ ]{0,2}):?(?:(?:[ ]{0,2})?(?:(?:\d{3}-\d{7})|(?:\d{4}-\d{6})|(?:\d{3}-\d{3}-\d{4}|(?:\d{3}-\d{3}-\d{3})|(?:\d{1}-\d{4}-\d{4})|(?:\d{2}-\d{3}-\d{4})|(?:\d{2}\s\d{3}\s\d{4})|(?:\d{2}-\d{7})|(?:\d{3}\s\d{3}\s\d{4})|(?:\d{3}\s\d{3}\s\d{3})|(?:\d{10})))[ ]{0,2},?)+|02\d{7}|0[3-7][2-9]\d{6}|0[6-9][0-9]\d{7}"
+TEL_RE = re.compile(TEL_PATTERN, re.MULTILINE)
+DATE1_PATTERN = "(?:(?:การปรับปรุงปัจจุบัน|ตั้งแต่|ลงประกาศเมื่อ|อัพเดทล่าสุด|แก้ไขครั้งสุดท้าย|แก้ไขครั้งล่าสุด|เผยแพร่เมื่อ|เผยแพร่|เขียนเมื่อ|ตอบเมื่อ|เมื่อ|เขียนวันที่|วันที่|วัน)?(?:[ ]{0,2}:[ ]{0,2})?(?:จันทร์|อังคาร|พุธ|พฤหัสบดี|พฤหัสฯ?\.?|ศุกร์|เสาร์|อาทิตย์|จ\.|อ\.|พ\.|พฤ\.|ศ\.|ส\.|อา\.?)?(?:[ ]{0,2}ที่)?(?:[ ]{0,2}[\w\u0E01-\u0E5B]*[ ]{0,2}(?:,|(?:-|\u2013)))?(?:\d{1,4}[ ]{0,2}-)?[ ]{0,2}\d{0,4}(?:-|[ ]{0,2})(?:เดือน[ ]{0,2})?(?:มกราคม|กุมภาพันธ์|มีนาคม|เมษายน|พฤษภาคม|มิถุนายน|กรกฎาคม|สิงหาคม|กันยายน|ตุลาคม|พฤศจิกายน|ธันวาคม| ม\.?ค\.? | ก\.?พ\.? | มี\.?ค\.? | เม\.?ย\.? | พ\.?ค\.? | มิ\.?ย\.? | ก\.?ค\.? | ส\.?ค\.? | ก\.?ย\.? | ต\.?ค\.? | พ\.?ย\.? | ธ\.?ค\.? |January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec|/)(?:-|[ ]|,]){0,2}(?:[ ]{0,2})?(?:\d{4}|\d{2})?,?(?:(?:[ ]{0,2}\d{4}|\d{2}:\d{2}:\d{2})|(?:[ ]{0,2}\d{1,2}:\d{2}(?::\d{2})?[ ]*(?:(?:(?:p|P|a|A)(?:m|M)))?)|[ ]{0,2}เวลา[ ]{0,2}\d{1,2}:\d{2}:\d{2})?(?:[ ]{0,2}น\.)?(?:[ ]{0,2}:[ ]{0,2}\d{1,2}(?:\.|:)\d{2}[ ]{0,2}น\.)?(?:[ ]{0,2}เวลา[ ]{0,2}\d{1,2}:\d{2}[ ]{0,2}[pPaA][mM][ ]{0,2}(?:PDT)?)?(?:[ ]{0,2}เวลา[ ]{0,2}\d{1,2}(?:\.|:)\d{2}[ ]{0,2}(?:น\.)?)?(?:[ ]*\d*[ ]{0,9}ผู้ชม[ ]{0,2}\d)?(?:[ ]{0,2}เวลา[ ]{0,2}:[ ]{0,2}\d{1,2}:\d{2}:\d{2}[ ](?:น\.)?)?(?:[ ]{0,2}เข้าชม[ ]{0,2}\d*[ ]{0,8}ครั้ง)?(?:[ ]{0,2}ที่[ ]{0,2}\d{1,2}:\d{2}[ ]{0,2}(?:[pPaA][mM])?)?(?:[ ]{0,2}Views:[ ]{0,2}(?:\d{0,3},?){0,3})?(?:[ ]{0,2}\(\d{1,2}:\d{2}[ ](?:น\.)?[ ]{0,2}\)(?:[ ]{0,2}ความคิดเห็น[ ]{0,2}\d)?)?(?:[ ]{0,2}-[ ]{0,2}\d{1,2}:\d{2}[ ]{0,2}(?:น\.)?)?(?:[ ]{0,2}จำนวนเข้าชม:(?:[ ]{0,2}\d{0,9},?)*)?(?:[ ]*พ\.ศ\.[ ]{0,2}\d{4}[ ]{0,2}(?:\d{1,2}:\d{2}[ ]{0,2}(?:น\.)?)?)?(?:(?:\d{0,3},?){0,3}[ ]{0,2}ครั้ง)?(?:[ ]{0,2}-\d{1,2}(?:\.|:)?\d{2}[ ]{0,2}(?:น\.)?)?(?:เวลา[ ]{0,2}\d{1,2}:\d{2}:\d{2})?(?:[ ]{0,2}(?:ถึง|จนถึง))?)(?:,[ ]{0,2})?(?:[ ]{0,2}(?:[pPaA][mM])?)?|(?:(?:[ ]{0,2}(?:\d{4}|\d{2})-\d{1,2}-(?:\d{4}|\d{1,2}))(?:[ ]{0,2},[ 
]{0,2})?(?:\d{1,2}(?:\.|:)\d{2}[ ]{0,2}(?:#\d*)?(?:น\.)?)?(?:[ ]{0,2}\d{1,2}:\d{2}:\d{2})?)|(?:(?:เปิดบริการ[ ]{0,2})?(?:เวลา[ ]{0,2}\d{1,2}:\d{2}-\d{1,2}:\d{2}[ ]{0,2}(?:น\.)?))|(?:(?:Time[ ]Online[ ]:[ ]{0,2})?(?:\d{1,2}:\d{2}:\d{2})(?:[ ]{0,2}น\.)?(?:[ ]{0,2}[pPaA][mM])?)|(?:\(\d{1,2}:\d{2}[ ]{0,2}-[ ]{0,2}\d{1,2}:\d{2}(?:[ ]*น\.)?(?:[ ]{0,2}[pPaA][mM])?\))|(?:นี้[ ]{0,2}เวลา[ ]{0,2}\d{1,2}(?:\.|:)\d{2}[ ]{0,2}(?:น\.)?)|(?:\d{1,2}[ ]{0,2}(?:มกราคม|กุมภาพันธ์|มีนาคม|เมษายน|พฤษภาคม|มิถุนายน|กรกฎาคม|สิงหาคม|กันยายน|ตุลาคม|พฤศจิกายน|ธันวาคม))"
+DATE1_RE = re.compile(DATE1_PATTERN, re.MULTILINE)
+DATE2_PATTERN = "[พค]\.?ศ\.?[ ]{0,2}\d{4}|\d{4}[ ]{0,2}เวลา[ ]{0,2}\d{2}:?\.?\d{2}(?:[ ][Pp][Mm])|[พค]\.?ศ\."
+DATE2_RE = re.compile(DATE2_PATTERN, re.MULTILINE)
+HTML_PATTERN = (
+    "<br>?|&nbsp|{\s*document\..*|SELECT.*FROM.*WHERE.*|<a\s*href=.*|<img\s*src=.*"
+)
+HTML_RE = re.compile(HTML_PATTERN, re.MULTILINE)
+REFINE1_PATTERN = "^[ ]?ตอนที่[ ]*\d{0,3}(?:[-–]?\d{0,3})?[ ]{0,2}.{0,100}|^สั่ง.{0,50}บาท|^[ ]?เลขจดแจ้ง[ ]{0,2}.{0,13}|^.{0,100}\.jpg[ ]{0,2}\(.{0,50}|^.{0,20}รายการ|^[ ]?สนใจ[ ]{0,2}.{0,15}โทร[ ]{0,2}.{0,12}|^[ ]?ผู้แสดงความคิดเห็น.{0,60}|^\(.{0,40}[ ]{0,2}\d{0,5}[ ]{0,2}.{0,10}\).{0,200}|^[ ]?ผู้เข้าชมทั้งหมด.{0,30}|^[ ]?ฉบับที่[ ]{0,2}\d{0,7}[^-–]{0,30}-?–?[ ]|^[ ]?โพสต์ที่แชร์โดย.{0,200}|^[ ]?Copyright.{0,200}|กำลังแสดงหน้าที.{0,200}|[ ]{0,2}รีวิว.{0,100}|^[ ]?ข้อที่ \d{0,4}|^เข้าชม/ผู้ติดตาม.{0,13}"
+REFINE1_RE = re.compile(REFINE1_PATTERN, re.MULTILINE)
+REFINE2_PATTERN = "Submitted[ ]by.{0,100}|^เขียนโดย.{0,100}|^Poste?d?[ ]{0,2}(?:by|on){0,2}.{0,100}|^เมื่อวาน[ ]{0,2}\d.{0,100}|^อาทิตย์นี้[ ]{0,2}\d.{0,100}|^อาทิตย์ที่แล้ว[ ]{0,2}\d.{0,100}|^เดือนนี้[ ]{0,2}\d.{0,100}|^เดือนที่แล้ว[ ]{0,2}\d.{0,100}|^รวมผู้เยี่ยมชม[ ]{0,2}\d.{0,100}|^จำนวนผู้ชมโดยประมาณ[ ]{0,2}\d.{0,100}|^รหัสสินค้า[ ]{0,2}\d.{0,100}|^บาร์โค้ด[ ]{0,2}\d.{0,100}|^[ ]โดย[ ]{0,2}.{0,100}|^เข้าชม[ ]{0,2}\d.{0,100}|^โหวต[ ]{0,2}\d.{0,100}|^มุมมอง[ ]{0,2}\d.{0,100}"
+REFINE2_RE = re.compile(REFINE2_PATTERN, re.MULTILINE)
+REFINE3_PATTERN = "^[^@\n]{0,30}@\d{0,10}.{0,30}|.{0,100}[-]$|\d*[ ]*x[ ]\d*[^ ][ ]?|^ดูหนัง[ ]?(?:ออนไลน์)?[ ].{0,60}|^คุ้มค่าที่สุดอันดับ[ ]{0,2}\d{0,2}.{0,80}|^เปิด[^\d\n]+.{0,10}"
+REFINE3_RE = re.compile(REFINE3_PATTERN, re.MULTILINE)
+REFINE4_PATTERN = "^[^\n]{0,50}คลิก\)|[Ff]acebook[ ]{0,2}(?:\d{0,3},?\d{0,3})[ ]{0,2}เข้าชม|^[ ]{0,2}[^ ]{0,20}[ ]{0,2}\d{0,9}[ ]{0,2}ความเห็น|\[url=.{0,100}|^ผู้ชม[ ]{0,2}(?:\d{0,3},?)+|\([ ]?\)"
+REFINE4_RE = re.compile(REFINE4_PATTERN, re.MULTILINE)
+REFINE5_PATTERN = "^[^\d]{0,30}\d{0,10}[ ]{0,2}views.{0,100}|^Prev.{0,100}Next|^สินค้าติดต่อที่.{0,100}|^อ่านต่อคลิก.{0,100}|^สินค้าโปรโมชั่น.{0,200}|^US[ ]?\$\d{0,3},?\d{0,3}.?\d{0,3}.{0,50}"
+REFINE5_RE = re.compile(REFINE5_PATTERN, re.MULTILINE)
+REFINE6_PATTERN = "^เจ้าหน้าที่ฝ่ายขาย:\n.{0,80}|^(?:\*+[ ]{0,2}[^\*\n]{0,50}[ ]{0,2}\*+)[ ]{0,2}[\+]?(?:(?:\d{0,3},?)+)?|[\*\+]{2,5}|^(?:[^:\n]{0,30}:).{0,200}"
+REFINE6_RE = re.compile(REFINE6_PATTERN, re.MULTILINE)
+REFINE7_PATTERN = "\(?อ่าน[ ]{0,2}\d{0,3},?\d{0,3}[ ]{0,2}(?:ครั้ง[ ]{0,2})?\)?|โพสต์[ ].{0,100}|Read[ ]{0,2}\d{0,9}[ ]{0,2}times|[^ \n]{0,20}[ ]{0,2}pantip|^Previous (?:Post|article).{0,150}|^Next (?:Post|article).{0,150}|^ตอบกลับ[ ]{0,2}.{0,200}"
+REFINE7_RE = re.compile(REFINE7_PATTERN, re.MULTILINE)
+REFINE8_PATTERN = "^[ ]?(?:[Pp]ostby|[Pp]osted[ ](?:by|on)).*|^[ ]?เข้าชม/ผู้ติดตาม.*|^[ ]?จำนวนผู้ชมโดยประมาณ[ :]?.*|^[ ]?ลงประกาศฟรี[ ].*|^\|[ ]|^[ ]?จาก[ ].*|^[ ]?By.*|^[ ]{0,2}?โดย[ ]{0,2}?.*"
+REFINE8_RE = re.compile(REFINE8_PATTERN, re.MULTILINE)
+REFINE9_PATTERN = "^[^\n\.]{0,60}\.{3}$|^[^\n]{0,30}ฉบับที่[ ].*|^Home[ ]/[ ].{100}|^[^\n\|]{0,60}\|.{0,60}"
+REFINE9_RE = re.compile(REFINE9_PATTERN, re.MULTILINE)
+REFINE10_PATTERN = "^[ ]?(?:\)|↑|►|←|«)[ ]?|^[-_]+"
+REFINE10_RE = re.compile(REFINE10_PATTERN, re.MULTILINE)
+REFINE11_PATTERN = "^สถิติ(?:วันนี้|สัปดาห์นี้|เมื่อวาน(?:นี้)?|เดือนนี้)[ ]{0,2}.{0,50}|Online[ ]สถิติ.{0,50}"
+REFINE11_RE = re.compile(REFINE11_PATTERN, re.MULTILINE)
+REFINE12_PATTERN = (
+    "^[^\n\(]{0,80}\(รายละเอียด\)[ ]\(แจ้งลิงก์เสีย\)|^ด[\. ][ชญ][\. ].*|\.{5,}"
+)
+REFINE12_RE = re.compile(REFINE12_PATTERN, re.MULTILINE)
+REFINE13_PATTERN = "^[ ]?(?:เรื่องย่อ[ ].{0,100}|คุ้มค่าที่สุดอันดับ.{0,100}|คุ้[ ]่าที่สุดอันดับ.{0,100}|\(?ลงโฆษณาฟรี[ ].{0,200}|\(free[ ]online[ ].{0,100}|\(คลิกเพื่อดูต้นฉบับ\)[ ].{0,100}|แก้ไขครั้งสุดท้ายโดย[ ].{0,100})|^[^\d\n]{0,30}[ ]\d{0,3},?\d{0,3}[ ]ครั้ง.{0,50}"
+REFINE13_RE = re.compile(REFINE13_PATTERN, re.MULTILINE)
+REFINE14_PATTERN = "^(?:[฿$]?\d{0,9}\.?,?\d{0,9}-?–?:?/?(?:[ ]{0,2}x[ ]{0,2}\d{0,8})?(?:\\bกม\\b\.?)?(?:\\bน\\b\.)?(?:ล้าน|แสน|หมื่น|พัน|ร้อย|สิบ|บาท|[ ])?){0,5}"
+REFINE14_RE = re.compile(REFINE14_PATTERN, re.MULTILINE)
+from datetime import datetime
+from typing import List, Dict
+import re
+def clean_with_remove_document(text: str) -> bool:
+    # ---- Clean too large unused lines
+    # Limit matches list to 2 items only, enough
+    matches = TOOLARGE_RE.findall(text)[:2]
+    # Classify as toolarge row if number of matches = 2
+    if len(matches) == 2:
+        return True
+    # ---- Clean none characters row
+    # Limit matches list to 25 items
+    matches = NONECHAR_RE.findall(text)[:25]
+    # Classify as none character row if number of matches = 25
+    if len(matches) == 25:
+        return True
+    # ---- Clean none tone mark row
+    # Limit matches list to 25 items
+    matches = NONE_TONE_MARK_RE.findall(text)[:25]
+    # Classify as none tone mark row if number of matches = 25
+    if len(matches) == 25:
+        return True
+    # ---- Clean Gamble ~ 9.2% of mC4 data
+    # if found gamble word 2 times in a row, classify as gamble row
+    # remove the row
+    # Limit matches list to 2 items only, enough
+    matches = GAMBLE_RE.findall(text)[:2]
+    # Classify as gamble if number of matches = 2
+    if len(matches) == 2:
+        return True
+    # ---- Clean Football data
+    # if found gamble word 4 times in a row, classify as football data
+    # remove the row
+    # Limit matches list to 4 items only
+    matches = FOOTBALL_RE.findall(text)[:4]
+    if len(matches) == 4:
+        return True
+    # ---- Clean Hotel Advertising
+    # if found hotel word 4 times in a row, classify as Hotel Ad. data
+    # remove the row
+    # Limit matches list to 4 items only, enough
+    matches = HOTEL_AD_RE.findall(text)[:4]
+    if len(matches) == 4:
+        return True
+    # ----  Clean Sale ~26% of mC4 data
+    # Sale row data is diverse,
+    # so the regex is not used in this case.
+    # Rules:
+    # 1. Remove row if it contains common specific Sale's URL
+    # 2. Skip to next clean rule if it contains specific keywords, eg. "สอบราคา", "จัดซื้อจัดจ้าง, etc."
+    # 3. If not found keywords in (2) then scan the row with sale keywords, if there are at leat 3 sale kewords found then remove the row.
+    if SALE_URL_RE.search(text):
+        return True
+    if not SALE_SKIP_RE.search(text):
+        # Classify as Sale data ( 3 matches, can be adjusted)
+        matches = SALE_RE.findall(text)[:3]
+        if len(matches) == 3:
+            return True
+    # ---- Clean Rent (พวกเช่า ~2% of mC4 data)
+    # Rent use another rules
+    # 1. find skip words in the row. If found, skip to next rule (not remove)
+    # 2. if found rent word 2 times in a row, classify as rent row
+    #    remove the row
+    if not RENT_SKIP_RE.search(text):
+        # Limit matches list to 2 items only, enough
+        matches = RENT_RE.findall(text)[:2]
+        if len(matches) == 2:
+            return True
+    # ---- Clean pattern (json like -> "abc": ~.5-1% )
+    # 99% can classify as gabage: so remove them
+    # match n items to make sure they are garbages n=20, can change
+    matches = JSON_RE.findall(text)[:20]
+    # if match only 20+, classify as garbage
+    if len(matches) == 20:
+        return True
+    # ---- Clean script (Javascript, etc. ~.5% )
+    # 99% can classify as gabage: so remove them
+    matches = SCRIPT_RE.findall(text)[:10]
+    # Classify as script if number of matches = 10
+    if len(matches) == 10:
+        return True
+    # ---- Clean garbage (useless or not necessary ~.45%)
+    # classify as gabage: so remove them
+    matches = GARBAGE_RE.findall(text)[:4]
+    # Classify as garbage if number of matches = 4
+    if len(matches) == 4:
+        return True
+    # ---- Clean ghost language (~0.008% can cancel this clean)
+    # classify as ghost : so remove them
+    matches = GHOST_RE.findall(text)[:4]
+    # Classify as ghost if number of matches = 4
+    if len(matches) == 4:
+        return True
+    # ---- Clean HEX code
+    # classify as HEX : so remove them
+    matches = HEX_RE.findall(text)[:25]
+    # Classify as HEX if number of matches = 25
+    if len(matches) == 25:
+        return True
+    return False
+def clean_mc4_text(text: str) -> str:
+    text = PAGE_RE.sub(" ", text)
+    text = EMBEDDED_SERVER_RE.sub(" ", text)
+    text = U_RE.sub(" ", text)
+    text = EMAIL_RE.sub(" ", text)
+    text = URL_RE.sub(" ", text)
+    text = MENU1_RE.sub(" ", text)
+    text = MENU2_RE.sub(" ", text)
+    text = MENU3_RE.sub(" ", text)
+    text = MENU4_RE.sub(" ", text)
+    text = SIDEBAR_RE.sub(" ", text)
+    text = BLOCK_RE.sub(" ", text)
+    text = HASHTAG_RE.sub(" ", text)
+    text = MARKUP_RE.sub(" ", text)
+    text = IFRAME_RE.sub(" ", text)
+    text = IP_RE.sub(" ", text)
+    text = TEL_RE.sub(" ", text)
+    text = DATE1_RE.sub(" ", text)
+    text = DATE2_RE.sub(" ", text)
+    text = HTML_RE.sub(" ", text)
+    # --- Refinements (in sequence)
+    text = REFINE1_RE.sub(" ", text)
+    text = REFINE2_RE.sub(" ", text)
+    text = REFINE3_RE.sub(" ", text)
+    text = REFINE4_RE.sub(" ", text)
+    text = REFINE5_RE.sub(" ", text)
+    text = REFINE6_RE.sub(" ", text)
+    text = REFINE7_RE.sub(" ", text)
+    text = REFINE8_RE.sub(" ", text)
+    text = REFINE9_RE.sub(" ", text)
+    text = REFINE10_RE.sub(" ", text)
+    text = REFINE11_RE.sub(" ", text)
+    text = REFINE12_RE.sub(" ", text)
+    text = REFINE13_RE.sub(" ", text)
+    text = REFINE14_RE.sub(" ", text)
+    # Split the text into lines and remove any empty lines
+    lines = [line for line in text.split("\n") if line]
+    # Initialize the list with the first line
+    deduplicated_list = [lines[0]]
+    # Iterate over the rest of the lines
+    for i in range(1, len(lines)):
+        # Find the common prefix between this line and the previous line
+        common_prefix = ""
+        for char1, char2 in zip(lines[i], lines[i - 1]):
+            if char1 == char2:
+                common_prefix += char1
+            else:
+                break
+        # Remove the common prefix from this line and add it to the list
+        deduplicated_list.append(lines[i][len(common_prefix) :])
+    text = "\n".join(deduplicated_list)
+    # Clean short lines
+    # ( len(line) <= 30 characters , cut this line off)
+    text = "\n".join(line for line in text.split("\n") if len(line) > 30)
+    # ---- The scan row that passes all filter is written to disk
+    # before write to disk, get rid of spaces by change them to single space (' ').
+    text = re.sub("[ ]+", " ", text, 0, re.MULTILINE)
+    text = re.sub("^[ ]", "", text, 0, re.MULTILINE)
+    text = re.sub(r"\n\s*", "\n", text, 0, re.MULTILINE)
+    return text
+def clean_dataset(dataset: List[Dict[str, str]]) -> List[Dict[str, str]]:
+    """
+    Description : Call function clean_text to process the whole dataset.
+    Input text : An input dataset having each element as a document in the dataset.
+    Output : A clean dataset.
+    """
+    for i, data_point in enumerate(dataset):
+        cleaned_text = clean_text(data_point["text"])
+        if cleaned_text != dataset[i]["text"]:
+            dataset[i]["text"] = cleaned_text
+            dataset[i]["updated_date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    return [data_point for data_point in dataset if data_point["text"] != ""]
+def sorted_by_len(keyword_list):
+    len_keywords = [[len(kw), kw] for kw in keyword_list]
+    return [kw for _, kw in sorted(len_keywords)][::-1]
+PORN_KEYWORDS = [
+    "คลิปหลุด",
+    "กระเจี๊ยว",
+    "คลิปโป๊",
+    "หนังโป๊",
+    "หนังโป้",
+    "หนังโป็",
+    "เรื่องเสียว",
+    "ซอยหี",
+    "ชักว่าว",
+    "ท่าหมา",
+    "ขย่มควย",
+    "เล่นเสียว",
+    "ควยใหญ่",
+    "หนังเอ็กซ์",
+    "แหกหี",
+    "เย็ด",
+]
+GAMBLE_KEYWORDS = [
+    "ufabet",
+    "UFABET",
+    "ล้มโต๊ะ",
+    "เซียนสเต็ป",
+    "บอลเต็ง",
+    "แทงบอล",
+    "คาสิโน",
+    "บาคาร่า",
+    "เว็บสล็อต",
+    "เกมสล็อต",
+    "สล็อตออนไลน์",
+    "คาสิโนออนไลน์",
+    "หวยมาเลย์",
+    "หวยฮานอย",
+    "น้ำเต้าปูปลา",
+    "หวยออนไลน์",
+    "แทงหวย",
+    "หวยหุ้น",
+    "ยิงปลาออนไลน์",
+    "ได้เงินจริง",
+    "บา คา ร่า",
+]
+SPAM_MOVIE_KEYWORDS = [
+    "ดูหนังออนไลน์",
+    "หนังออนไลน์",
+    "เว็บดูหนัง",
+    "หนังชนโรง",
+    "หนังใหม่ชนโรง",
+    "เสียงไทย",
+    "เสียงญี่ปุ่น",
+    "เสียงอังกฤษ",
+]
+SPAM_LIKE_KEYWORDS = [
+    "ปั้มไลค์",
+    "รับจ้างกดไลค์",
+    "จ้างไลค์",
+    "ปั๊มไลค์",
+    "ปั่นไลค์",
+    "เพิ่มไลค์",
+    "ซื้อไลค์",
+]
+CODE_KEYWORDS = [
+    "padding:",
+    "display:",
+    "S3=n8",
+    "phpBB Debug",
+    "getElementById",
+    "innerHTML",
+    "parseInt",
+    "addEventListener",
+    "console\.log",
+    "checkCookieForTarget",
+    "setAttribute",
+    "getItem",
+    "if \(",
+    "else {",
+    "JSON\.stringify",
+    "onclick",
+]
+WEBBOARD_KEYWORDS = [
+    "คุณกำลังใช้งานแบบปิดการใช้ Javascript",
+    "Longdo Dictionary",
+    "นโยบายการคุ้มครองข้อมูลส่วนบุคคล",
+    "เงื่อนไขการให้บริการเว็บไซต์",
+    "นโยบายความปลอดภัย",
+    "นโยบายเว็บไซต์และการปฏิเสธความรับผิด",
+    "คุณอาจจะยังไม่ได้เข้าสู่ระบบหรือยังไม่ได้ลงทะเบียน",
+    "คุณไม่ได้เข้าสู่ระบบหรือคุณไม่มีสิทธิ์เข้าหน้านี้",
+]
+PORN_KEYWORDS += [" ".join(list(kw)) for kw in PORN_KEYWORDS]
+GAMBLE_KEYWORDS += [" ".join(list(kw)) for kw in GAMBLE_KEYWORDS]
+SPAM_MOVIE_KEYWORDS += [" ".join(list(kw)) for kw in SPAM_MOVIE_KEYWORDS]
+DOCUMENT_REMOVAL_KEYWORDS = (
+    PORN_KEYWORDS
+    + GAMBLE_KEYWORDS
+    + SPAM_MOVIE_KEYWORDS
+    + SPAM_LIKE_KEYWORDS
+    + CODE_KEYWORDS
+    + WEBBOARD_KEYWORDS
+)
# Regex fragments for boilerplate (navigation, cookie banners, forum chrome,
# hashtags, ...) that is cut out of a document while the rest is kept.
# Entries are joined with "|" into one pattern, so every entry must be a valid
# regex. Entries containing regex escapes are raw strings (or use an explicit
# "\\") so the values do not depend on Python's deprecated invalid-escape
# handling; entries that need a real tab/newline stay as normal strings.
PARTIAL_REMOVAL_KEYWORDS = [
    "Posted on",
    "Posted by",
    "Posted by:",
    "Posted By:",
    "สมาชิกหมายเลข [0-9,]+",
    "อ่าน [0-9,]+ ครั้ง",
    "เปิดดู [0-9,]+ ครั้ง",
    "ดู [0-9,]+ ครั้ง",
    "คะแนนสะสม: [0-9,]+ แต้ม",
    "ความคิดเห็น: [0-9,]+",
    "[0-9,]+ บุคคลทั่วไป กำลังดูบอร์ดนี้",
    "หน้าที่แล้ว ต่อไป",
    "ความคิดเห็นที่ [0-9,]+",
    "[0-9,]+ สมาชิก และ [0-9,]+ บุคคลทั่วไป",
    "กำลังดูหัวข้อนี้",
    "เข้าสู่ระบบด้วยชื่อผู้ใช้",
    "แสดงกระทู้จาก:",
    "กระทู้: [0-9,]+",
    "เว็บไซต์เรามีการใช้คุกกี้และเก็บข้อมูลผู้ใช้งาน โปรดศึกษาและยอมรับ นโยบายคุ้มครองข้อมูลส่วนบุคคล ก่อนใช้งาน",
    r"Privacy & Cookies: This site uses cookies. By continuing to use this website, you agree to their use\.",
    "Previous\t\nNext\nLeave a Reply Cancel reply\nYou must be logged in to post a comment.\nSearch for:\nFeatured Post\n",
    "Click to read more\nYou must be logged in to view or write comments\\.",
    "[0-9,]+ Views",
    "Skip to content",
    "Last Modified Posts",
    "Last Updated:",
    r"\(อ่าน [0-9,]+ ครั้ง\)",
    "Recent Comments",
    "«.*?»",
    "< --แสดงทั้งหมด-- >",
    "นโยบายความเป็นส่วนตัว",
    "เงื่อนไขการใช้เว็บไซต์",
    "ตั้งค่าคุกกี้",
    "ท่านยอมรับให้เว็บไซต์นี้จัดเก็บคุกกี้เพื่อประสบการณ์การใช้งานเว็บไซต์ที่ดียิ่งขึ้น",
    "รวมถึงช่วยให้ท่านมีโอกาสได้รับข้อเสนอหรือเนื้อหาที่ตรงตามความสนใจของท่าน",
    "ท่านสามารถดู Privacy Notice ของเว็บไซต์เรา ได้ที่นี่",
    "You may be trying to access this site from a secured browser on the server. Please enable scripts and reload this page.",
    r"เผยแพร่: \d\d [ก-๙]+ \d\d\d\d \d\d:\d\d น\.",
    r"Last updated: \d\d [ก-๙]+\.[ก-๙]+\. \d\d\d\d \d\d:\d\d น\.",
    r"Lorem ipsum dolor sit amet, consectetur adipiscing elit\.",
    "Search for:",
    "Save my name, email, and website in this browser for the next time I comment",
    "Your email address will not be published. Required fields are marked",
    "Leave a Reply Cancel reply",
    r"((?:หน้าหลัก|เข้าสู่ระบบ|หน้าแรก) \|(?: [^\s]+(?:(?: \|)|$|\s))+)",
    "กลับหน้าแรก",
    "ติดต่อเรา",
    "Contact Us",
    r"#\w+",
    "ติดต่อผู้ดูแลเว็บไซต์",
    "หากท่านพบว่ามีข้อมูลใดๆที่ละเมิดทรัพย์สินทางปัญญาปรากฏอยู่ในเว็บไซต์โปรดแจ้งให้ทราบ",
    "No related posts",
    "Posted in",
    "((?:Tags:|Tagged|Tag) (?:.{1,40}(?:,|\n|$))+)",
    "ตอบ:",
    "Sort by:",
    "All rights reserved",
    "ความยาวอย่างน้อย",
    "ระบบได้ดำเนินการส่ง OTP",
    r"เป็นสมาชิกอยู่แล้ว\?",
    "We use cookies",
    "Cookie Settings",
    "Homeหน้าหลัก",
    "Home หน้าหลัก",
    "ข่าวสารล่าสุด",
    "ปัญหา การใช้งาน",
    # These two were fused into one keyword by a missing comma (implicit
    # string concatenation), so neither phrase was ever removed on its own.
    "ปัญหาการใช้งาน",
    "ผู้เขียน",
    "หัวข้อ:",
    r"\*\* พร้อมส่ง \*\*",
]
# Thai month names followed by their dotted abbreviations, written as regex
# fragments. The abbreviations are raw strings so the escaped dots reach the
# regex engine without relying on Python's deprecated invalid-escape handling.
TH_MONTHS = [
    "มกราคม",
    "กุมภาพันธ์",
    "มีนาคม",
    "เมษายน",
    "พฤษภาคม",
    "มิถุนายน",
    "กรกฎาคม",
    "สิงหาคม",
    "กันยายน",
    "ตุลาคม",
    "พฤศจิกายน",
    "ธันวาคม",
    r"ม\.ค\.",
    r"ก\.พ\.",
    r"มี\.ค\.",
    r"เม\.ย\.",
    r"พ\.ค\.",
    r"มิ\.ย\.",
    r"ก\.ค\.",
    r"ส\.ค\.",
    r"ก\.ย\.",
    r"ต\.ค\.",
    r"พ\.ย\.",
    r"ธ\.ค\.",
]
# Characters whose density hints at source code; regex fragments, written as
# raw strings so "\{", "\+" and "\}" are not invalid Python escape sequences.
CODE_SPECIAL_CHARACTERS = [r"\{", r"\+", r"\}", "/", ":"]
# Re-order the partial-removal patterns with sorted_by_len (defined elsewhere
# in this file). NOTE(review): presumably longest-first, so that longer
# alternatives win inside the "|"-joined regex -- confirm against the helper.
PARTIAL_REMOVAL_KEYWORDS = sorted_by_len(PARTIAL_REMOVAL_KEYWORDS)
import re
from datetime import datetime
from typing import Callable, Dict, List, Optional

from pythainlp.util import countthai
def contains_document_removal_keywords(text: str) -> bool:
    """
    Description : Tell whether a document matches any document removal keyword.
                  All entries of DOCUMENT_REMOVAL_KEYWORDS are joined into a
                  single regex alternation and searched in one pass.
    Input text : An input document.
    Output : True if at least one keyword is found. Otherwise, False
    """
    return re.search("|".join(DOCUMENT_REMOVAL_KEYWORDS), text) is not None
def check_ratio_bad_substring(text: str) -> bool:
    """
    Description : Check whether the ratio between the number of keyword
                  occurrences and the length of the document exceeds the
                  threshold of any of these groups.
                  Group #1 : Names of months in Thai, including abbreviations.
                  Group #2 : Special characters usually found in code.
                  Group #3 : Spaces.
                  Group #4 : Commas.
                  Note : Thresholds of each group are from the experiment on oscar.
    Input text : An input document.
    Output : True if the ratio of at least one group is above its threshold.
             Otherwise, False. An empty document has no ratio to exceed and
             returns False (it used to raise ZeroDivisionError).
    """
    n = len(text)
    if n == 0:
        return False
    if len(re.findall("|".join(TH_MONTHS), text)) / n > 0.015:
        return True
    if len(re.findall("|".join(CODE_SPECIAL_CHARACTERS), text)) / n > 0.075:
        return True
    # Single literal characters: str.count gives the same number as findall.
    if text.count(" ") / n > 0.13:
        return True
    if text.count(",") / n > 0.05:
        return True
    return False
def remove_partial_keywords(text: str) -> str:
    """
    Description : Delete every match of the partial removal keywords from the
                  document; the rest of the text is left in place.
    Input text : An input document.
    Output : The document with all matches removed.
    """
    boilerplate = re.compile("|".join(PARTIAL_REMOVAL_KEYWORDS))
    return boilerplate.sub("", text)
def clean_oscar_text(text: str) -> str:
    """
    Description : Clean an input document in two stages.
                  1. Drop the whole document when it
                    1.1. is empty,
                    1.2. contains any document removal keyword (ex. porn, gamble),
                    1.3. has too many Thai month names, code characters, spaces
                         or commas,
                    1.4. gets a countthai() score below 50.
                  2. Otherwise strip the partial removal keywords.
    Input text : An input document.
    Output : A clean document ("" if the whole document should be removed).
    """
    # Checks run in the same order as before; each one short-circuits the rest.
    if not text:
        return ""
    if contains_document_removal_keywords(text):
        return ""
    if check_ratio_bad_substring(text):
        return ""
    if countthai(text) < 50:
        return ""
    return remove_partial_keywords(text).strip()
+def clean_dataset(dataset: List[Dict[str, str]]) -> List[Dict[str, str]]:
+    """
+    Description : Call function clean_text to process the whole dataset.
+    Input text : An input dataset having each element as a document in the dataset.
+    Output : A clean dataset.
+    """
+    for i, data_point in enumerate(dataset):
+        cleaned_text = clean_text(data_point["text"])
+        if cleaned_text != dataset[i]["text"]:
+            dataset[i]["text"] = cleaned_text
+            dataset[i]["updated_date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    return [data_point for data_point in dataset if data_point["text"] != ""]

step2.py ADDED Viewed

	@@ -0,0 +1,183 @@

+import re
+import unicodedata
# Maps a non-ASCII punctuation character to its ASCII replacement string.
UNICODE_PUNCT = {
    "，": ",",
    "。": ".",
    "、": ",",
    "„": '"',
    "”": '"',
    "“": '"',
    "«": '"',
    "»": '"',
    "１": '"',
    "」": '"',
    "「": '"',
    "《": '"',
    "》": '"',
    "´": "'",
    "∶": ":",
    "：": ":",
    "？": "?",
    "！": "!",
    "（": "(",
    "）": ")",
    "；": ";",
    "–": "-",
    "—": " - ",
    "．": ". ",
    "～": "~",
    "’": "'",
    "…": "...",
    "━": "-",
    "〈": "<",
    "〉": ">",
    "【": "[",
    "】": "]",
    "％": "%",
    "►": "-",
}
# Character class that matches any single key of UNICODE_PUNCT.
UNICODE_PUNCT_RE = re.compile("[" + "".join(UNICODE_PUNCT) + "]")
def replace_unicode_punct(text: str) -> str:
    """Replace each character listed in UNICODE_PUNCT by its mapped string."""
    pieces = [UNICODE_PUNCT.get(char, char) for char in text]
    return "".join(pieces)
def remove_unicode_punct(text: str) -> str:
    """Delete every UNICODE_PUNCT character instead of mapping it.

    More aggressive than replace_unicode_punct, but also faster.
    """
    return re.sub(UNICODE_PUNCT_RE, "", text)
def strip_accents(line: str) -> str:
    """Strip accents from a piece of text.

    The text is decomposed with Unicode NFD and every character of category
    "Mn" (nonspacing mark) is dropped. The result is returned in decomposed
    form, which is also what the previous code always returned: its shortcut
    ``len(output) == line`` compared an int with a str and could never be
    taken, so it has been removed.

    Note that Thai tone marks and above/below vowels are "Mn" characters too,
    so they are removed as well.
    """
    nfd = unicodedata.normalize("NFD", line)
    return "".join(c for c in nfd if unicodedata.category(c) != "Mn")
# Character class covering the control characters: code points 0-31 and 127-159.
NON_PRINTING_CHARS_RE = re.compile(
    "[" + "".join(chr(code) for code in (*range(0, 32), *range(127, 160))) + "]"
)
# Matches one digit at a time.
DIGIT_RE = re.compile(r"\d")
# One character class matching both sets: the two "[...]" patterns are put
# side by side and the "][" seam between them is removed.
PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(
    f"{UNICODE_PUNCT_RE.pattern}{NON_PRINTING_CHARS_RE.pattern}".replace("][", "")
)
def remove_non_printing_char(text: str) -> str:
    """Delete every character matched by NON_PRINTING_CHARS_RE."""
    return re.sub(NON_PRINTING_CHARS_RE, "", text)
def normalize_spacing_for_tok(text: str, language: str = "en") -> str:
    """Normalize spacing and punctuation before tokenization.

    Args:
        text: the text to normalize.
        language: language code. "en" moves a comma/period that follows a
            double quote in front of it, "cs"/"cz" leave quotes untouched and
            every other language moves them behind it. It also selects the
            separator written between two space-separated digits.

    Returns:
        The normalized text.

    Fixes over the previous version:
        * " +" and "([a-z])‘([a-z])" were handed to ``str.replace``, which
          looks for that literal text, so runs of spaces were never collapsed
          and apostrophes between letters were turned into double quotes.
          They are regex substitutions now (and the stray "/" in the
          apostrophe replacement is gone).
        * The ``re.sub`` templates ``\\)``, ``\\%`` and ``\\"`` are not escapes
          in a replacement string, so a literal backslash was written to the
          output.
        * The "pseudo-space" step contained replacements whose two arguments
          read identically (" cm" -> " cm"); it now targets the no-break
          space U+00A0 explicitly, while the plain-space variants that
          did have an effect are kept.
    """
    res = text.replace("\r", "")
    # remove extra spaces
    res = res.replace("(", " (").replace(")", ") ")
    res = re.sub(" +", " ", res)
    res = re.sub(r"\) ([.!:?;,])", r")\1", res)
    res = res.replace("( ", "(").replace(" )", ")")
    res = re.sub(r"(\d) %", r"\1%", res)
    res = res.replace(" :", ":").replace(" ;", ";")
    res = res.replace("`", "'").replace("''", ' " ')
    for old, new in (
        ("\u201e", '"'),  # „
        ("\u201c", '"'),  # “
        ("\u201d", '"'),  # ”
        ("\u2013", "-"),  # –
        ("\u2014", " - "),  # —
    ):
        res = res.replace(old, new)
    res = re.sub(" +", " ", res)
    res = res.replace("\u00b4", "'")  # ´
    # A curly quote between two lowercase letters is an apostrophe.
    res = re.sub("([a-z])\u2018([a-z])", r"\1'\2", res)  # ‘
    res = re.sub("([a-z])\u2019([a-z])", r"\1'\2", res)  # ’
    for old, new in (
        ("\u2018", '"'),  # ‘
        ("\u201a", '"'),  # ‚ (single low-9 quotation mark, not a comma)
        ("\u2019", '"'),  # ’
        ("''", '"'),
        ("\u00b4\u00b4", '"'),  # ´´
        ("\u2026", "..."),  # …
        # French quotes
        (" \u00ab ", ' "'),
        ("\u00ab ", '"'),
        ("\u00ab", '"'),
        (" \u00bb ", '" '),
        (" \u00bb", '"'),
        ("\u00bb", '"'),
        # handle pseudo-spaces (plain space as before, plus U+00A0)
        (" %", "%"),
        ("\u00a0%", "%"),
        ("n\u00ba\u00a0", "n\u00ba "),
        (" :", ":"),
        ("\u00a0:", ":"),
        ("\u00a0\u00baC", " \u00baC"),
        ("\u00a0cm", " cm"),
        (" ?", "?"),
        ("\u00a0?", "?"),
        (" !", "!"),
        ("\u00a0!", "!"),
        (" ;", ";"),
        ("\u00a0;", ";"),
        (",\u00a0", ", "),
    ):
        res = res.replace(old, new)
    res = re.sub(" +", " ", res)
    res = res.replace("\uff0e", ". ")  # ．
    # English "quotation," followed by comma, style
    if language == "en":
        res = re.sub(r'"([,.]+)', r'\1"', res)
    # Czech is confused
    elif language in ("cs", "cz"):
        pass
    # German/Spanish/French "quotation", followed by comma, style
    else:
        res = res.replace(',"', '",')
        # don't fix period at end of sentence
        res = re.sub(r'(\.+)"(\s*[^<])', r'"\1\2', res)
    # NOTE(review): this joins ANY two digits separated by a plain space
    # ("1 2" -> "1.2"); the separator may have been meant to be a no-break
    # space only -- confirm before relying on it.
    if language in ("de", "es", "cz", "cs", "fr"):
        res = re.sub(r"(\d) (\d)", r"\1,\2", res)
    else:
        res = re.sub(r"(\d) (\d)", r"\1.\2", res)
    return res
def normalize(line: str, accent=True, case=True, numbers=True, punct=1) -> str:
    """Normalize one line of text.

    Args:
        line: the text; surrounding whitespace is stripped first and an empty
            result is returned right away.
        accent: pass the text through strip_accents.
        case: lowercase the text.
        numbers: replace every digit with "0".
        punct: 1 maps unicode punctuation with replace_unicode_punct, 2
            deletes it with remove_unicode_punct, anything else leaves it.

    Returns:
        The normalized line, with non-printing characters removed.
    """
    cleaned = line.strip()
    if not cleaned:
        return cleaned
    if case:
        cleaned = cleaned.lower()
    if accent:
        cleaned = strip_accents(cleaned)
    if numbers:
        cleaned = DIGIT_RE.sub("0", cleaned)
    if punct == 1:
        cleaned = replace_unicode_punct(cleaned)
    elif punct == 2:
        cleaned = remove_unicode_punct(cleaned)
    return remove_non_printing_char(cleaned)
def slow_normalize_for_dedup(line: str) -> str:
    """Normalize a line through normalize(): accents kept, text lowercased,
    digits mapped to "0" and unicode punctuation deleted (punct=2)."""
    options = {"accent": False, "case": True, "numbers": True, "punct": 2}
    return normalize(line, **options)
def normalize_for_dedup(line: str) -> str:
    """Normalize a line in a single pass over the precompiled regexes.

    Strips surrounding whitespace, lowercases, maps every digit to "0" and
    deletes all unicode punctuation and non-printing characters.
    """
    stripped = line.strip()
    if not stripped:
        return stripped
    lowered = stripped.lower()
    zeroed = DIGIT_RE.sub("0", lowered)
    return PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", zeroed)

step2_perplexity.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import kenlm
+import math
+import numpy as np
+import pandas as pd
+import pickle
+import scipy
+import sentencepiece  # type: ignore
+from typing import List
+import warnings
+from step2 import normalize
+import os
+warnings.simplefilter(action="ignore", category=FutureWarning)
def load_model():
    """Download the model bundle with gdown and unpack it in the current directory.

    The archive is saved as ``core.zip`` in the working directory and extracted
    there, which is where the rest of this module looks for ``core/...``. The
    previous version read the archive from the hard-coded ``/content`` folder
    and tried to delete it with ``!rm``, which is notebook syntax and not a
    shell command, so the archive was never removed.

    Failures stay non-fatal, as before: a warning is emitted and the function
    returns, so an already unpacked ``core`` directory keeps working.
    """
    import subprocess  # local import: only this function needs it

    archive = "core.zip"
    commands = (
        ["gdown", "1OBbo21v_-esL31rxtNtsMHrA8T1JYqAd", "-O", archive],
        # -o: overwrite existing files instead of prompting on a re-run
        ["unzip", "-o", archive],
    )
    for command in commands:
        try:
            completed = subprocess.run(command, check=False)
        except OSError as err:
            warnings.warn(f"load_model: could not run {command[0]!r}: {err}")
            break
        if completed.returncode != 0:
            warnings.warn(
                f"load_model: {command[0]!r} exited with status {completed.returncode}"
            )
            break
    if os.path.exists(archive):
        os.remove(archive)
class SentencesLM:
    """Perplexity scorer built on a KenLM model and a SentencePiece tokenizer.

    Both model files are read from the ``core`` directory, relative to the
    current working directory.
    """

    def __init__(self):
        config = kenlm.Config()
        config.load_method = 2
        model_path = os.path.join("core", "th.arpa.bin")
        self.lm = kenlm.Model(str(model_path), config)
        tokenizer = sentencepiece.SentencePieceProcessor()
        tokenizer.load(os.path.join("core", "th.sp.model"))
        self.sp = tokenizer

    def pp(self, log_score: float, length: int) -> float:
        """Turn a summed log10 score into a perplexity; the exponent is capped at 30."""
        return 10.0 ** min(30, -log_score / length)

    def do(self, document: List[str]) -> float:  # type: ignore
        """Return the perplexity of the whole document, rounded to 1 decimal.

        Every line is normalized (accents kept), split into SentencePiece
        pieces and scored; log scores and lengths are summed over all lines
        before the perplexity is computed.
        """
        log_score_sum = 0
        length_sum = 0
        for raw_line in document:
            normalized = normalize(raw_line, accent=False)
            pieces = self.sp.encode_as_pieces(normalized)
            log_score_sum += self.lm.score(" ".join(pieces))
            # NOTE(review): the length counts whitespace-separated words of the
            # normalized line, not SentencePiece pieces. Confirm this is
            # intended; the pickled classifier was presumably fitted on scores
            # computed exactly this way, so do not change it in isolation.
            length_sum += len(normalized.split()) + 1
        return round(self.pp(log_score_sum, length_sum), 1)
# Module-level singletons, created at import time: the pickled classifier used
# by classify_spam and the language-model scorer.
classifier_filename = os.path.join("core", "decision_tree.sav")
# NOTE(security): unpickling executes arbitrary code from the file. This file
# comes from a downloaded archive -- only load bundles from a trusted source.
# The context manager closes the handle that the bare open() call left open.
with open(classifier_filename, "rb") as classifier_file:
    classifier = pickle.load(classifier_file)
lm = SentencesLM()
def classify_spam(text: str):
    """Classify a text as spam from its perplexity.

    The text is split on newlines, scored with the module-level language
    model, and the natural log of that perplexity is passed to the
    module-level classifier as the single feature "log_score".

    Input : text -> a text to classify.
    Output : prediction -> whatever classifier.predict returns for the one-row
                    frame; 1 represents spam and 0 represents non-spam.
    Output : log_pp_score -> log of perplexity score.
    """
    lines = text.split("\n")
    log_pp_score = math.log(lm.do(lines))
    features = pd.DataFrame({"log_score": [log_pp_score]})
    return classifier.predict(features), log_pp_score
def sample_score(log_scores, n, percentage=0.1) -> np.ndarray:
    """Draw ``int(percentage * n)`` scores from a normal distribution fitted to
    ``log_scores`` and truncated to their [min, max] range.

    The global NumPy seed is reset to 0 on every call, so results are
    reproducible (and other users of the global NumPy RNG are affected too).

    Args:
        log_scores: observed log scores.
        n: size of the population the percentage refers to.
        percentage: fraction of ``n`` to sample.

    Returns:
        Array of sampled scores. It is empty when ``log_scores`` is empty, and
        filled with the single observed value when all scores are equal: a
        standard deviation of zero would otherwise give NaN truncation bounds.
    """
    from scipy import stats  # explicit: `import scipy` alone may not load scipy.stats

    np.random.seed(0)
    size = int(percentage * n)
    if len(log_scores) == 0:
        return np.empty(0)
    lower_bound, upper_bound = min(log_scores), max(log_scores)
    mean, std = np.mean(log_scores), np.std(log_scores)
    if std == 0:
        return np.full(size, float(mean))
    return stats.truncnorm.rvs(
        (lower_bound - mean) / std,
        (upper_bound - mean) / std,
        loc=mean,
        scale=std,
        size=size,
    )
def sample_text_back(texts, log_scores, percentage=0.5, replace=True) -> List[str]:
    """Sample some spam text back in the dataset
    using log score distribution of language model.

    ``int(percentage * len(texts))`` target scores are drawn with sample_score;
    for each of them the text whose log score is closest is selected (the
    first one on ties).

    Args:
        texts: candidate texts.
        log_scores: log score of each text, aligned with ``texts``.
        percentage: fraction of the texts to sample back.
        replace: when True the same text may be selected several times; when
            False each text is selected at most once.

    Returns:
        The selected texts. With ``replace=False`` the result stops growing
        once every text has been used; previously the last text was appended
        again and again in that case.
    """
    sampled_scores = sample_score(log_scores, len(texts), percentage)
    sampled_texts = []
    candidates = list(range(len(log_scores)))
    for samp_score in sampled_scores:
        if not candidates:
            break
        # min() keeps the first of several equally close candidates.
        nearest = min(candidates, key=lambda idx: (samp_score - log_scores[idx]) ** 2)
        sampled_texts.append(texts[nearest])
        if not replace:
            candidates.remove(nearest)
    return sampled_texts