import argparse
import subprocess
import sys
import time
from pathlib import Path

import yaml
|
"""
total.py — script that runs the entire pipeline in one go
(text cleaning → sentence splitting → sentiment classification → keyword classification > save final results).

• Role: text cleaning → sentence splitting → sentiment classification → keyword classification

• Usage:

    python total.py --config C:/Users/parkm/NLP/config/default.yaml

• The config holds: input/output paths, model path, batch size

• Intermediate results are saved as step1~step3.csv in the configured intermediate_dir.
"""
|
|
|
|
|
|
|
def run_step(script_path, args):
    """Run one pipeline script as a subprocess and return its elapsed time.

    The script's captured stdout is echoed after it finishes. If the script
    exits with a non-zero status, its captured output is reported and the
    whole process is terminated.

    Args:
        script_path: ``pathlib.Path`` of the Python script to execute.
        args: Iterable of command-line arguments for the script; each item
            is converted with ``str`` before being passed on.

    Returns:
        Wall-clock seconds (float) the subprocess took, measured with
        ``time.perf_counter``.

    Raises:
        SystemExit: With exit code 1 when the script returns non-zero.
    """
    # Run the child with the interpreter that is running this script; a bare
    # 'python' is resolved through PATH and may be a different installation.
    cmd = [sys.executable, str(script_path), *(str(a) for a in args)]
    print(f"\n--- Running: {' '.join(cmd)}")

    start = time.perf_counter()
    # errors='replace' keeps the orchestrator alive when the child prints
    # bytes that are not valid in the locale encoding.
    result = subprocess.run(
        cmd, capture_output=True, text=True, errors='replace'
    )
    elapsed = time.perf_counter() - start

    # NOTE(review): the Korean message text below was restored from a
    # mis-encoded (mojibake) rendering; confirm the exact wording.
    if result.returncode != 0:
        # Show whatever the script printed before failing; it is often the
        # only hint about which input row triggered the error.
        if result.stdout:
            print(result.stdout)
        print(
            f"[Error] {script_path.name} 실행 중 오류 발생:\n{result.stderr}",
            file=sys.stderr,
        )
        # sys.exit instead of the site-provided exit() builtin, which is not
        # guaranteed to exist (e.g. ``python -S`` or frozen applications).
        sys.exit(1)

    print(result.stdout)
    print(f"[Info] {script_path.name} 완료: {elapsed:.2f}s 소요")
    return elapsed
|
|
|
def main(config_path: Path):
    """Run the four-stage pipeline described by a YAML config file.

    Steps:
        1) Load the config YAML file.
        2) Create the intermediate directory that receives the per-step CSVs.
        3) Run the four stage scripts in order: text cleaning, sentence
           splitting, sentiment classification, keyword classification.
        4) Print per-step and total elapsed time plus a completion message.

    Args:
        config_path: Path to a YAML file with the sections ``data``
            (``input_csv``, ``intermediate_dir``, ``output_csv`` and the
            optional ``id_col`` / ``time_col``), ``paths`` (``scripts_dir``,
            ``model_dir``) and ``sentiment`` (``max_length``,
            ``batch_size``).

    Raises:
        ValueError: If the config file is empty or is not a mapping.
        KeyError: If a required section or key is missing.
    """
    cfg = yaml.safe_load(config_path.read_text(encoding='utf-8'))
    if not isinstance(cfg, dict):
        raise ValueError(f"Config file is empty or not a mapping: {config_path}")

    # Read every required key up front so that a broken config fails
    # immediately rather than after earlier stages have already run.
    data_cfg = cfg['data']
    paths_cfg = cfg['paths']
    senti_cfg = cfg['sentiment']
    max_length = str(senti_cfg['max_length'])
    batch_size = str(senti_cfg['batch_size'])

    input_csv = Path(data_cfg['input_csv'])
    intermediate = Path(data_cfg['intermediate_dir'])
    output_csv = Path(data_cfg['output_csv'])
    scripts_dir = Path(paths_cfg['scripts_dir'])
    model_dir = Path(paths_cfg['model_dir'])

    # Column names travel as command-line arguments, so they must be strings;
    # a YAML null falls back to the same default as a missing key.
    id_col = data_cfg.get('id_col', 'ID')
    if id_col is None:
        id_col = 'ID'
    time_col = data_cfg.get('time_col', '')
    if time_col is None:
        time_col = ''

    intermediate.mkdir(parents=True, exist_ok=True)
    # Make sure the final stage has somewhere to write its result.
    output_csv.parent.mkdir(parents=True, exist_ok=True)

    total_start = time.perf_counter()
    durations = {}

    # Step 1: text cleaning.
    clean_out = intermediate / 'step1_clean.csv'
    durations['clean'] = run_step(
        scripts_dir / 'text_cleaner.py',
        ['--input', str(input_csv), '--output', str(clean_out)],
    )

    # Step 2: sentence splitting of the cleaned text.
    split_out = intermediate / 'step2_split.csv'
    durations['split'] = run_step(
        scripts_dir / 'sentence_splitter.py',
        [
            '--input', str(clean_out),
            '--output', str(split_out),
            '--id-col', str(id_col),
            '--time-col', str(time_col),
            '--text-col', 'cleaned',
            '--output-col', 'divided_comment',
        ],
    )

    # Step 3: sentiment classification of each split sentence.
    senti_out = intermediate / 'step3_sentiment.csv'
    durations['sentiment'] = run_step(
        scripts_dir / 'sentiment.py',
        [
            '--input', str(split_out),
            '--output', str(senti_out),
            '--model-dir', str(model_dir),
            '--text-col', 'divided_comment',
            '--output-col', 'sentiment',
            '--max-length', max_length,
            '--batch-size', batch_size,
        ],
    )

    # Step 4: keyword classification; writes the final output CSV.
    durations['keyword'] = run_step(
        scripts_dir / 'keyword_classifier.py',
        [
            '--input', str(senti_out),
            '--output', str(output_csv),
            '--text-col', 'divided_comment',
        ],
    )
    total_elapsed = time.perf_counter() - total_start

    # NOTE(review): the Korean report text below was restored from a
    # mis-encoded (mojibake) rendering; confirm wording and column spacing.
    report_rows = (
        ("1) 텍스트 클리닝", 'clean'),
        ("2) 문장 분리", 'split'),
        ("3) 감성 분류", 'sentiment'),
        ("4) 키워드 분류", 'keyword'),
    )
    print("\n=== TIME REPORT ===")
    for label, key in report_rows:
        print(f"{label} : {durations[key]:.2f}s")
    print("-----------------------------")
    print(f"총 소요 시간 : {total_elapsed:.2f}s")

    print(f"\n 텍스트 클리닝 → 문장 분리 → 감성 분류 → 키워드 분류 끝... Final output : {output_csv}")
|
|
|
if __name__ == '__main__':
    # Command-line entry point: parse --config/-c and run the whole pipeline.
    # NOTE(review): the Korean description/help text was restored from a
    # mis-encoded (mojibake) rendering; confirm the exact wording.
    parser = argparse.ArgumentParser(description='한 번에 전체 파이프라인 실행하는 스크립트')
    parser.add_argument(
        '--config', '-c',
        default='C:/Users/parkm/NLP/config/default.yaml',
        help='config YAML 파일 경로 (기본: C:/Users/parkm/NLP/config/default.yaml)'
    )
    args = parser.parse_args()

    # The default is a machine-specific absolute path, so report a missing
    # file as a usage error instead of an unhandled traceback.
    config_path = Path(args.config)
    if not config_path.is_file():
        parser.error(f"config file not found: {config_path}")

    main(config_path)
|
|
|