Update src/streamlit_app.py
src/streamlit_app.py  (+93, -160)  CHANGED
@@ -20,7 +20,6 @@ from dotenv import load_dotenv
 # The /tmp folder may exist but can have permission problems, so base paths on the current working directory instead
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
 DATA_DIR = os.path.join(CURRENT_DIR, "data")
-NLTK_DATA_DIR = os.path.join(DATA_DIR, "nltk_data")
 SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
 SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
 
@@ -35,60 +34,34 @@ def ensure_directory(directory):
 
 # Create every directory the app needs
 ensure_directory(DATA_DIR)
-ensure_directory(NLTK_DATA_DIR)
 ensure_directory(SCHEDULED_NEWS_DIR)
 
-#
-import nltk
-nltk.data.path.append(NLTK_DATA_DIR)
-
-# Download the required NLTK data (works around permission issues)
 try:
-    try:
-        nltk.data.find('corpora/stopwords')
-    except LookupError:
-        nltk.download('stopwords', download_dir=NLTK_DATA_DIR)
-except Exception as e:
-    st.warning(f"Error while downloading NLTK data: {str(e)}. Falling back to basic tokenization.")
 
-# Korean
 def tokenize_korean(text):
     try:
-        # 1. Try transformers
-        try:
-            from transformers import AutoTokenizer
-            tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
-            return tokenizer.tokenize(text)
-        except (ImportError, Exception) as e:
-            st.debug(f"Failed to load the transformers tokenizer: {str(e)}")
-
-        # 2. Try soynlp
-        try:
-            from soynlp.tokenizer import LTokenizer
-            tokenizer = LTokenizer()
-            return tokenizer.tokenize(text)
-        except (ImportError, Exception) as e:
-            st.debug(f"Failed to load the soynlp tokenizer: {str(e)}")
-
-        # 3. Try kss
-        try:
-            import kss
             tokens = []
             for sentence in kss.split_sentences(text):
             return tokens
-        except (ImportError, Exception) as e:
-            st.debug(f"Failed to load the kss tokenizer: {str(e)}")
     except Exception as e:
-        st.debug(f"
 
-    #
     return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)
 
 # Word cloud (optional)
@@ -252,31 +225,43 @@ def get_article_content(url):
     except Exception as e:
         return f"Error occurred: {str(e)}"
 
-#
 def analyze_keywords(text, top_n=10):
-    # Korean stopword list
     korean_stopwords = [
         '이', '그', '저', '것', '및', '등', '를', '을', '에', '에서', '의', '으로', '로',
-        '에게', '뿐', '다', '는', '가', '이다', '에게서', '께', '께서', '부터', '까지'
     ]
 
     # Language detection (simply check whether the text contains Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
     if is_korean:
-        # Korean text
         tokens = tokenize_korean(text)
     else:
-        #
-            from nltk.tokenize import word_tokenize
-            tokens = word_tokenize(text)
-        except Exception:
-            # Fall back to a simple tokenizer when NLTK fails
-            tokens = re.findall(r'\b\w+\b', text.lower())
 
-    # Stopword filtering
 
     # Frequency counting
     from collections import Counter
@@ -294,38 +279,22 @@ def extract_keywords_for_wordcloud(text, top_n=50):
     # Language detection (simply check whether the text contains Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
-        tokens = tokenize_korean(text.lower())
-    else:
-        # For English or other languages, try NLTK
-        try:
-            from nltk.tokenize import word_tokenize
-            tokens = word_tokenize(text.lower())
-        except Exception:
-            # Fall back to simple tokenization
-            tokens = text.lower().split()
 
     # Stopword setup
-        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
-        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
-        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
-        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
-        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
-        'will', 'shall', 'can', 'may', 'must', 'ought'
-    }
 
     # Korean stopwords
         '및', '등', '를', '이', '을', '가', '에', '는', '으로', '에서', '그', '도', '또는', '하는', '할', '하고',
         '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
         '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '네', '은', '는', '것', '들', '수', '등',
@@ -336,7 +305,9 @@ def extract_keywords_for_wordcloud(text, top_n=50):
         '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
         '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
     }
 
     # Keep only tokens that are longer than one character and not stopwords
     filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
@@ -718,7 +689,18 @@ elif menu == "기사 분석하기":
     # Compute text statistics
     word_count = len(re.findall(r'\b\w+\b', content))
     char_count = len(content)
 
     avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
     avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
 
@@ -750,81 +732,32 @@ elif menu == "기사 분석하기":
     is_korean = bool(re.search(r'[가-힣]', content))
 
     try:
-        #
         if is_korean:
-            # Korean text
-            pos_counts
-                elif token.endswith("게") or token.endswith("히"):
-                    pos_counts['부사'] += 1
-                elif token.endswith("은") or token.endswith("는") or token.endswith("이") or token.endswith("가"):
-                    pos_counts['명사'] += 1
-                else:
-                    if len(token) > 1:
-                        pos_counts['명사'] += 1
-                    else:
-                        pos_counts['기타'] += 1
-
-            except Exception:
-                # Fall back to simple tokenization on failure
-                tokens = tokenize_korean(content[:5000])
-                pos_counts = {
-                    '명사류': len([t for t in tokens if len(t) > 1 and not any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])]),
-                    '기타': len([t for t in tokens if len(t) <= 1 or any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])])
-                }
-        except Exception as e:
-            st.error(f"Korean POS analysis failed: {str(e)}")
-            pos_counts = {'데이터': len(content) // 10, '분석': len(content) // 15, '오류': len(content) // 20}
     else:
-        # English document
-                nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR)
-            except Exception:
-                pass
-
-            tokens = word_tokenize(content[:5000])
-            tagged = pos_tag(tokens)
-
-            # English POS tag mapping
-            pos_dict = {
-                'NN': '명사', 'NNS': '명사', 'NNP': '고유명사', 'NNPS': '고유명사',
-                'VB': '동사', 'VBD': '동사', 'VBG': '동사', 'VBN': '동사', 'VBP': '동사', 'VBZ': '동사',
-                'JJ': '형용사', 'JJR': '형용사', 'JJS': '형용사',
-                'RB': '부사', 'RBR': '부사', 'RBS': '부사'
-            }
-
-            pos_counts = {'명사': 0, '동사': 0, '형용사': 0, '부사': 0, '기타': 0}
-
-            for _, pos in tagged:
-                if pos in pos_dict:
-                    pos_counts[pos_dict[pos]] += 1
-                else:
-                    pos_counts['기타'] += 1
-        except Exception:
-            # Fall back to simple rules for a rough POS estimate
-            tokens = re.findall(r'\b\w+\b', content.lower())
-            pos_counts = {
-                '명사': len([t for t in tokens if not t.endswith(('ly', 'ing', 'ed'))]),
-                '동사': len([t for t in tokens if t.endswith(('ing', 'ed', 's'))]),
-                '부사': len([t for t in tokens if t.endswith('ly')]),
-                '기타': len([t for t in tokens if len(t) <= 2])
-            }
 
     # Visualize the results
     pos_df = pd.DataFrame({

Updated file (new side of the diff):

Lines 20-25:
 # The /tmp folder may exist but can have permission problems, so base paths on the current working directory instead
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
 DATA_DIR = os.path.join(CURRENT_DIR, "data")
 SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
 SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
 
Lines 34-67:
 
 # Create every directory the app needs
 ensure_directory(DATA_DIR)
 ensure_directory(SCHEDULED_NEWS_DIR)
 
+# KSS setup for Korean tokenization
 try:
+    import kss
+    kss_available = True
+except ImportError:
+    st.warning("The KSS library is not installed. Install it with 'pip install kss'.")
+    kss_available = False
 
+# Korean tokenization function (uses KSS)
 def tokenize_korean(text):
     try:
+        if kss_available:
             tokens = []
+            # Split into sentences, then pull words out of each sentence
             for sentence in kss.split_sentences(text):
+                # Refine the basic whitespace split with a regex pass
+                raw_tokens = sentence.split()
+                for token in raw_tokens:
+                    # Separate particles, punctuation, and other attached characters
+                    sub_tokens = re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', token)
+                    tokens.extend(sub_tokens)
             return tokens
     except Exception as e:
+        st.debug(f"KSS tokenization failed: {str(e)}")
 
+    # If KSS is unavailable or fails, fall back to the basic regex tokenizer
    return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)
 
 # Word cloud (optional)
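
For reference, a minimal standalone sketch of the same two-stage idea used above (KSS sentence split, then a regex pass per whitespace token). It assumes kss is installed and leaves out the Streamlit warning and fallback wiring of the actual function; the helper name split_korean_tokens is illustrative, not part of the app:

import re
import kss

def split_korean_tokens(text):
    # Mirror of the approach in tokenize_korean: split sentences, then split each token
    tokens = []
    for sentence in kss.split_sentences(text):
        for token in sentence.split():
            # Separate Hangul runs, Latin runs, digit runs, and punctuation
            tokens.extend(re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', token))
    return tokens

print(split_korean_tokens("오늘 서울의 날씨는 맑습니다. 기온은 23도입니다."))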
Lines 225-267:
     except Exception as e:
         return f"Error occurred: {str(e)}"
 
+# Keyword analysis using KSS
 def analyze_keywords(text, top_n=10):
+    # Korean stopword list (extended)
     korean_stopwords = [
         '이', '그', '저', '것', '및', '등', '를', '을', '에', '에서', '의', '으로', '로',
+        '에게', '뿐', '다', '는', '가', '이다', '에게서', '께', '께서', '부터', '까지',
+        '이런', '저런', '그런', '어떤', '무슨', '이것', '저것', '그것', '이번', '저번', '그번',
+        '이거', '저거', '그거', '하다', '되다', '있다', '없다', '같다', '보다', '이렇다', '그렇다',
+        '하는', '되는', '있는', '없는', '같은', '보는', '이런', '그런', '저런', '한다', '된다',
+        '있었다', '없었다', '같았다', '봤다', '더', '또한', '그리고', '하지만', '그러나', '그래서',
+        '때문에', '따라서', '하며', '되며', '없으며', '있으며', '같으며', '보며', '하고', '되고',
+        '있고', '없고', '같고', '보고', '통해', '위해', '뒤', '중', '후'
+    ]
+
+    # English stopword list
+    english_stopwords = [
+        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
+        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
+        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
+        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
+        'will', 'shall', 'can', 'may', 'must', 'ought'
     ]
 
     # Language detection (simply check whether the text contains Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
     if is_korean:
+        # For Korean text, use the KSS-based tokenizer
         tokens = tokenize_korean(text)
     else:
+        # For English or other languages, a simple regex tokenization
+        tokens = re.findall(r'\b\w+\b', text.lower())
 
+    # Stopword filtering (apply the stopword list that matches the language)
+    stopwords = korean_stopwords if is_korean else english_stopwords
+    tokens = [word for word in tokens if len(word) > 1 and word.lower() not in stopwords]
 
     # Frequency counting
     from collections import Counter
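
The rest of analyze_keywords (not fully shown in this hunk) counts token frequencies and keeps the top_n entries. A minimal sketch of that step, using an assumed list of already-filtered tokens:

from collections import Counter

tokens = ["경제", "성장", "경제", "물가", "금리", "금리", "경제"]  # example filtered tokens
top_n = 3
keywords = Counter(tokens).most_common(top_n)
print(keywords)  # [('경제', 3), ('금리', 2), ('성장', 1)]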
Lines 279-313:
     # Language detection (simply check whether the text contains Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
+    # Tokenization (via KSS)
+    tokens = tokenize_korean(text.lower())
 
     # Stopword setup
+    # English stopword list
+    english_stopwords = {
+        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
+        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
+        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
+        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
+        'will', 'shall', 'can', 'may', 'must', 'ought'
+    }
 
     # Korean stopwords
+    korean_stopwords = {
         '및', '등', '를', '이', '을', '가', '에', '는', '으로', '에서', '그', '도', '또는', '하는', '할', '하고',
         '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
         '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '네', '은', '는', '것', '들', '수', '등',
         '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
         '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
     }
+
+    # Pick the stopword set that matches the language
+    stop_words = korean_stopwords if is_korean else english_stopwords
 
     # Keep only tokens that are longer than one character and not stopwords
     filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
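
Elsewhere in the file (not shown in this diff) the filtered tokens feed the word cloud. A hedged sketch of how such frequencies are typically rendered with the wordcloud package; the font path is an assumption and must point to a Korean-capable font file on the host:

from collections import Counter
from wordcloud import WordCloud

filtered_tokens = ["경제", "성장", "경제", "물가", "금리"]  # example output of the filtering step
freqs = dict(Counter(filtered_tokens))
wc = WordCloud(font_path="NanumGothic.ttf",  # assumed Korean font; adjust to the environment
               width=800, height=400, background_color="white")
wc.generate_from_frequencies(freqs)
wc.to_file("wordcloud.png")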
Lines 689-706:
     # Compute text statistics
     word_count = len(re.findall(r'\b\w+\b', content))
     char_count = len(content)
+
+    # Use KSS to split the text into sentences
+    if kss_available:
+        try:
+            sentences = kss.split_sentences(content)
+            sentence_count = len(sentences)
+        except Exception:
+            # Fall back to simple sentence splitting if KSS fails
+            sentence_count = len(re.split(r'[.!?]+', content))
+    else:
+        sentence_count = len(re.split(r'[.!?]+', content))
+
     avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
     avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
 
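
A small self-contained sketch of the same statistics on a sample English string, using the regex fallback for sentence counting (the kss path behaves the same way when the library is available); the sample text and the filtering of empty trailing splits are illustrative choices, not app code:

import re

content = "The market rallied today. Analysts expect further gains! Will it last?"
words = re.findall(r'\b\w+\b', content)
word_count = len(words)
char_count = len(content)
sentence_count = len([s for s in re.split(r'[.!?]+', content) if s.strip()])
avg_word_length = sum(len(w) for w in words) / word_count if word_count else 0
avg_sentence_length = word_count / sentence_count if sentence_count else 0
print(word_count, char_count, sentence_count, round(avg_word_length, 2), round(avg_sentence_length, 2))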
Lines 732-763:
     is_korean = bool(re.search(r'[가-힣]', content))
 
     try:
+        # A simple part-of-speech-like analysis based on KSS tokens
+        tokens = tokenize_korean(content[:5000])  # truncate very long texts before analysis
+
         if is_korean:
+            # For Korean, estimate parts of speech with simple ending patterns
+            pos_counts = {'명사/대명사': 0, '동사/형용사': 0, '부사/조사': 0, '기타': 0}
+
+            for token in tokens:
+                if token.endswith(("다", "요", "까", "죠", "네", "군", "니다", "세요")):
+                    pos_counts['동사/형용사'] += 1
+                elif token.endswith(("게", "히", "이", "지")):
+                    pos_counts['부사/조사'] += 1
+                elif token.endswith(("은", "는", "이", "가", "을", "를", "에", "의")):
+                    pos_counts['부사/조사'] += 1
+                elif len(token) > 1:
+                    pos_counts['명사/대명사'] += 1
+                else:
+                    pos_counts['기타'] += 1
         else:
+            # For English documents, a simple pattern match
+            pos_counts = {
+                '명사/대명사': len([t for t in tokens if not t.lower().endswith(('ly', 'ing', 'ed'))]),
+                '동사': len([t for t in tokens if t.lower().endswith(('ing', 'ed', 's'))]),
+                '부사/형용사': len([t for t in tokens if t.lower().endswith('ly')]),
+                '기타': len([t for t in tokens if len(t) <= 2])
+            }
 
     # Visualize the results
     pos_df = pd.DataFrame({