Spaces:

kland
/

llm_eval3

Running

App Files Files Community

llm_eval3 / templates /index.html

kland

Upload 4 files

6c886f2 verified 16 days ago

raw

history blame contribute delete

33.3 kB

	<!DOCTYPE html>
	<html lang="ko">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>㈜강원랜드 생성형 AI 모델 실증 평가</title>
	<style>
	body { font-family: 'Malgun Gothic', sans-serif; background-color: #f4f7f6; color: #333; margin: 0; padding: 20px; display: flex; justify-content: center; }
	.container { max-width: 900px; width: 100%; }
	h1, h2 { text-align: center; color: #2c3e50; }
	h1 { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin-bottom: 30px; }
	h2 { margin-top: 40px; border-top: 2px solid #3498db; padding-top: 20px;}
	.eval-section { background-color: #ffffff; border-radius: 8px; padding: 25px; margin-bottom: 25px; box-shadow: 0 4px 8px rgba(0,0,0,0.1); }
	h3 { margin-top: 20px; color: #7f8c8d; display: flex; justify-content: space-between; align-items: center; }
	h4 { color: #34495e; margin-bottom: 5px; }
	textarea, input[type="text"] { width: 100%; padding: 10px; border: 1px solid #ccc; border-radius: 4px; box-sizing: border-box; font-size: 1em; margin-top: 10px; }
	button { background-color: #3498db; color: white; border: none; padding: 12px 20px; border-radius: 4px; cursor: pointer; font-size: 1em; font-weight: bold; display: block; margin: 20px 0 10px 0; transition: background-color 0.3s; }
	button:hover { background-color: #2980b9; }
	.result { margin-top: 15px; padding: 15px; background-color: #ecf0f1; border-radius: 4px; white-space: pre-wrap; font-family: 'D2Coding', monospace; font-size: 1.1em; line-height: 1.6; color: #2c3e50; }
	.result .detail { font-size: 0.9em; color: #555; }
	.error { color: #e74c3c; font-weight: bold; }
	.status { text-align: center; padding: 10px; background-color: #27ae60; color: white; font-weight: bold; border-radius: 5px; margin-bottom: 20px; }
	.status.error { background-color: #e74c3c; }
	.criteria-box { background-color: #f8f9fa; border: 1px solid #dee2e6; border-radius: 4px; padding: 15px; margin-top: 20px; }
	.criteria-box p, .criteria-box code { font-size: 0.9em; color: #212529; margin: 0 0 10px 0; }
	.criteria-box code { display: block; background-color: #e9ecef; padding: 10px; border-radius: 4px; white-space: pre-wrap; font-size: 1em; }
	.criteria-box .scoring { background-color: #fff3cd; padding: 10px; border-radius: 4px; margin-top: 10px; border: 1px solid #ffc107; }
	.info-box { background-color: #d1ecf1; border: 1px solid #bee5eb; border-radius: 4px; padding: 12px; margin: 10px 0; }
	.info-box h5 { margin: 0 0 8px 0; color: #0c5460; }
	.info-box p { margin: 0; font-size: 0.9em; color: #0c5460; }
	.rubric { line-height: 1.7; }
	.final-actions { text-align: center; padding: 20px; }
	.final-actions a { text-decoration: none; display: inline-block; background-color: #2ecc71; color: white; padding: 15px 30px; font-size: 1.2em; border-radius: 5px; margin: 0 10px; }
	.final-actions a.reset { background-color: #e74c3c; }
	.expand-btn { padding: 2px 8px; font-size: 0.8em; background-color: #7f8c8d; border-radius: 10px; cursor: pointer; color: white; border: none; margin: 0; font-weight: normal; }
	textarea.collapsible { height: 120px; transition: height 0.3s ease-in-out; overflow-y: auto; }
	textarea.collapsible.expanded { height: 400px; }
	.radio-group { display: flex; justify-content: space-around; padding: 10px; background-color: #f8f9fa; border-radius: 5px; margin-top: 15px; }
	.radio-group label { cursor: pointer; }
	.modal { display: none; position: fixed; z-index: 1000; left: 0; top: 0; width: 100%; height: 100%; overflow: auto; background-color: rgba(0,0,0,0.6); }
	.modal-content { background-color: #fefefe; margin: 15% auto; padding: 20px; border: 1px solid #888; width: 80%; max-width: 500px; border-radius: 8px; text-align: center; }
	.modal-content h3 { color: #e74c3c; }
	.modal-content ul { list-style-type: none; padding: 0; }
	.modal-content li { margin: 5px 0; font-weight: bold; }
	.close-btn { color: #aaa; float: right; font-size: 28px; font-weight: bold; cursor: pointer; }
	.close-btn:hover, .close-btn:focus { color: black; }
	.example-box { background-color: #e8f5e9; border: 1px solid #4caf50; border-radius: 4px; padding: 10px; margin-top: 10px; font-size: 0.85em; }
	.metric-badge { display: inline-block; background-color: #6c757d; color: white; padding: 3px 8px; border-radius: 4px; font-size: 0.85em; margin-left: 10px; }
	</style>
	<!-- The polyfill.io loader that used to precede MathJax was removed: the domain changed ownership and was publicly reported to serve malicious code in 2024, and the inline script of this page already relies on ES6 syntax (arrow functions, const/let) that a polyfill cannot provide. -->
	<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
	</head>
	<body>

	<!-- Validation modal. Hidden by default (the .modal rule sets display: none). The click handler on the report link in the inline script at the bottom of this file fills the missing-items list and shows this modal when at least one metric has no usable result. -->
	<div id="validation-modal" class="modal">
	<div class="modal-content">
	<!-- Close control; the inline script hides the modal when it is clicked. -->
	<span class="close-btn">×</span>
	<h3>평가 미완료 항목</h3>
	<p>보고서를 출력하려면 아래 항목의 평가를 먼저 진행해주세요.</p>
	<!-- Populated at runtime with one list item per missing metric name. -->
	<ul id="missing-items-list"></ul>
	</div>
	</div>

	<div class="container">
	<h1>㈜강원랜드 생성형 AI 모델 실증 평가</h1>

	{% if model_loaded %}
	<p class="status">✅ AI 모델이 성공적으로 로딩되었습니다.</p>
	{% else %}
	<p class="status error">❌ AI 모델 로딩 실패! 서버 로그를 확인하세요.</p>
	{% endif %}

	<!-- Shared target-model URL. The inline script copies this value into a hidden "target_url" field of whichever evaluation form is submitted. The heading text is a real label so the input has an accessible name beyond its placeholder. -->
	<div class="eval-section">
	<h3><label for="target_url_input">평가 대상 LLM 모델 주소</label></h3>
	<input type="text" id="target_url_input" placeholder="예: https://chat.openai.com" value="{{ all_results.target_url if all_results.target_url }}">
	</div>

	<h2>정량 평가 (자동 계산)</h2>

	<div class="eval-section" id="perplexity">
	<h4>1. Perplexity (언어 모델 품질) <span class="metric-badge">배점: 3점</span></h4>

	<div class="info-box">
	<h5>📊 Perplexity(PPL)란?</h5>
	<p>AI가 텍스트를 얼마나 자연스럽게 이해하고 생성하는지를 측정하는 지표입니다. 낮은 값일수록 모델이 언어를 더 잘 이해한다는 의미입니다.</p>
	<p style="margin-top: 5px;"><strong>평가 방법:</strong> 다양한 주제의 긴 텍스트를 입력하면, AI가 각 단어를 예측하는 난이도를 계산합니다.</p>
	</div>

	<div class="criteria-box">
	<p><strong>측정 원리:</strong> 다양한 n-gram(연속된 단어 조합) 반복도와 어휘 다양성을 종합하여 조정된 PPL 점수를 산출합니다.</p>
	<div class="scoring">
	<strong>배점 기준 (5단계):</strong><br>
	• 15 미만: 3.0점 (100%) - 매우 우수<br>
	• 15~30: 2.4점 (80%) - 우수<br>
	• 30~50: 1.8점 (60%) - 보통<br>
	• 50~100: 1.2점 (40%) - 미흡<br>
	• 100 이상: 0.6점 (20%) - 매우 미흡
	</div>
	</div>

	<div class="example-box">
	<strong>💡 입력 예시:</strong> 뉴스 기사, 소설 일부, 기술 문서 등 2,000자 이상의 다양한 내용을 포함한 텍스트를 입력하세요.
	</div>

	<form action="/evaluate" method="post">
	<input type="hidden" name="metric" value="perplexity">
	<h3><span>평가할 텍스트</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="ppl_text" class="collapsible" placeholder="정확한 측정을 위해 2,000자 이상의 다양한 내용으로 구성된 텍스트를 입력해주세요.">{{ input_texts.ppl_text if input_texts.ppl_text }}</textarea>
	<button type="submit">Perplexity 값 계산</button>
	</form>
	{# Perplexity result panel: shows the error message when one is present, otherwise the adjusted score, the final score formatted to two decimals, and the reference details. Line breaks inside .result are significant because that class uses white-space: pre-wrap. #}
	{% if all_results.perplexity %}
	<div class="result">
	{% if all_results.perplexity.error %}<p class="error">{{ all_results.perplexity.error }}</p>
	{% else %}
	<b>조정된 Perplexity: {{ all_results.perplexity.score_display }}</b>
	<b>최종 점수: {{ "%.2f"|format(all_results.perplexity.final_score) }}점</b>
	<hr><span class="detail">(참고) 기본 PPL: {{ all_results.perplexity.details.base_ppl }} | 페널티 계수: {{ all_results.perplexity.details.penalty_factor }} | 토큰 수: {{ all_results.perplexity.details.token_count }}</span>
	{% endif %}
	</div>
	{% endif %}
	</div>

	<div class="eval-section" id="rouge">
	<h4>2. ROUGE (요약 능력) <span class="metric-badge">배점: 3점</span></h4>

	<div class="info-box">
	<h5>📝 ROUGE란?</h5>
	<p>AI가 긴 문서를 얼마나 정확하게 요약하는지 평가하는 지표입니다. 생성된 요약문과 정답 요약문 간의 단어/구문 일치도를 측정합니다.</p>
	<p style="margin-top: 5px;"><strong>평가 방법:</strong> AI가 생성한 요약문과 사람이 작성한 정답 요약문을 비교하여 점수를 산출합니다.</p>
	</div>

	<div class="criteria-box">
	<p><strong>측정 원리:</strong> ROUGE-1(단어), ROUGE-2(2단어 조합), ROUGE-L(최장 공통 부분)을 종합 평가하며, 요약문 길이 적절성도 반영합니다.</p>
	<div class="scoring">
	<strong>배점 기준 (5단계):</strong><br>
	• 0.60 이상: 3.0점 (100%) - 매우 우수<br>
	• 0.50~0.60: 2.4점 (80%) - 우수<br>
	• 0.40~0.50: 1.8점 (60%) - 보통<br>
	• 0.30~0.40: 1.2점 (40%) - 미흡<br>
	• 0.30 미만: 0.6점 (20%) - 매우 미흡
	</div>
	</div>

	<div class="example-box">
	<strong>💡 평가 팁:</strong> 원문의 핵심 내용을 포함하면서도 간결한 요약문이 높은 점수를 받습니다.
	</div>

	<form action="/evaluate" method="post">
	<input type="hidden" name="metric" value="rouge">
	<h3><span>모델이 생성한 요약문</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="rouge_generated" class="collapsible" placeholder="모델이 생성한 요약문을 여기에 입력하세요.">{{ input_texts.rouge_generated if input_texts.rouge_generated }}</textarea>
	<h3><span>참조(정답) 요약문</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="rouge_reference" class="collapsible" placeholder="정답 요약문을 여기에 입력하세요.">{{ input_texts.rouge_reference if input_texts.rouge_reference }}</textarea>
	<button type="submit">ROUGE 점수 계산</button>
	</form>
	{# ROUGE result panel: shows the error message when one is present, otherwise the final ROUGE score, the final score formatted to two decimals, and the per-variant details. Line breaks inside .result are significant because that class uses white-space: pre-wrap. #}
	{% if all_results.rouge %}
	<div class="result">
	{% if all_results.rouge.error %}<p class="error">{{ all_results.rouge.error }}</p>
	{% else %}
	<b>최종 ROUGE 점수: {{ all_results.rouge.score_display }}</b>
	<b>최종 점수: {{ "%.2f"|format(all_results.rouge.final_score) }}점</b>
	<hr><span class="detail">(참고) ROUGE-1: {{ all_results.rouge.details.rouge1 }} | ROUGE-2: {{ all_results.rouge.details.rouge2 }} | ROUGE-L: {{ all_results.rouge.details.rougeL }} | 가중 평균: {{ all_results.rouge.details.weighted_avg }} | 길이 페널티: {{ all_results.rouge.details.length_penalty }}</span>
	{% endif %}
	</div>
	{% endif %}
	</div>

	<div class="eval-section" id="bleu">
	<h4>3. BLEU (번역 정확성) <span class="metric-badge">배점: 2점</span></h4>

	<div class="info-box">
	<h5>🌐 BLEU란?</h5>
	<p>AI의 번역 품질을 평가하는 국제 표준 지표입니다. 기계 번역문과 전문가가 번역한 정답 번역문의 유사도를 측정합니다.</p>
	<p style="margin-top: 5px;"><strong>평가 방법:</strong> AI가 생성한 번역문을 2개의 참조 번역문과 비교하여 단어 및 구문 일치도를 계산합니다.</p>
	</div>

	<div class="criteria-box">
	<p><strong>측정 원리:</strong> 1~4개 단어 조합(n-gram)의 일치도를 종합 평가하며, 2개의 참조 번역문을 사용하여 더 정확한 평가를 수행합니다.</p>
	<div class="scoring">
	<strong>배점 기준 (5단계):</strong><br>
	• 0.50 이상: 2.0점 (100%) - 매우 우수<br>
	• 0.40~0.50: 1.6점 (80%) - 우수<br>
	• 0.30~0.40: 1.2점 (60%) - 보통<br>
	• 0.20~0.30: 0.8점 (40%) - 미흡<br>
	• 0.20 미만: 0.4점 (20%) - 매우 미흡
	</div>
	</div>

	<div class="example-box">
	<strong>💡 평가 팁:</strong> 서로 다른 스타일의 2개 참조 번역문을 준비하면 더 공정한 평가가 가능합니다.
	</div>

	<form action="/evaluate" method="post">
	<input type="hidden" name="metric" value="bleu">
	<h3><span>모델이 생성한 번역문 (영문)</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="bleu_generated" class="collapsible" placeholder="모델이 생성한 번역문을 여기에 입력하세요.">{{ input_texts.bleu_generated if input_texts.bleu_generated }}</textarea>

	<h3><span>참조(정답) 번역문 1</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="bleu_reference1" class="collapsible" placeholder="첫 번째 정답 번역문을 입력하세요.">{{ input_texts.bleu_reference1 if input_texts.bleu_reference1 }}</textarea>

	<h3><span>참조(정답) 번역문 2</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="bleu_reference2" class="collapsible" placeholder="두 번째 정답 번역문을 입력하세요.">{{ input_texts.bleu_reference2 if input_texts.bleu_reference2 }}</textarea>

	<button type="submit">BLEU 점수 계산</button>
	</form>
	{# BLEU result panel: shows the error message when one is present, otherwise the BLEU score and the final score formatted to two decimals. Line breaks inside .result are significant because that class uses white-space: pre-wrap. #}
	{% if all_results.bleu %}
	<div class="result">
	{% if all_results.bleu.error %}<p class="error">{{ all_results.bleu.error }}</p>
	{% else %}
	<b>BLEU Score: {{ all_results.bleu.score_display }}</b>
	<b>최종 점수: {{ "%.2f"|format(all_results.bleu.final_score) }}점</b>
	{% endif %}
	</div>
	{% endif %}
	</div>

	<h2>정성 평가 (심사위원 평가)</h2>

	<div class="eval-section" id="mmlu">
	<h4>4. MMLU (복합 문제 해결) <span class="metric-badge">배점: 3점</span></h4>
	<form action="/evaluate" method="post">
	<input type="hidden" name="metric" value="mmlu">
	<div class="criteria-box">
	<p><strong>평가 내용:</strong> 데이터 가공(CSV→MD), 적합 차트 추천, 표준 용어집 기반 문장 오류 탐지 등 복합 문제 해결 능력 평가</p>
	<div class="scoring">
	<strong>배점 기준 (5단계):</strong><br>
	• 수: 3.0점 (100%) - 매우 우수<br>
	• 우: 2.4점 (80%) - 우수<br>
	• 미: 1.8점 (60%) - 보통<br>
	• 양: 1.2점 (40%) - 미흡<br>
	• 가: 0.6점 (20%) - 매우 미흡
	</div>
	</div>
	<h3><span>모델 생성 결과</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="mmlu_generated" class="collapsible" placeholder="모델이 생성한 답변이나 결과를 여기에 입력하세요.">{{ input_texts.mmlu_generated if input_texts.mmlu_generated }}</textarea>
	<h3><span>참조(정답)</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="mmlu_reference" class="collapsible" placeholder="참조할 정답이나 평가 기준을 입력하세요.">{{ input_texts.mmlu_reference if input_texts.mmlu_reference }}</textarea>
	<div class="radio-group">
	<label><input type="radio" name="mmlu_grade" value="수" {% if all_results.mmlu and all_results.mmlu.grade == '수' %}checked{% endif %}> 수</label>
	<label><input type="radio" name="mmlu_grade" value="우" {% if all_results.mmlu and all_results.mmlu.grade == '우' %}checked{% endif %}> 우</label>
	<label><input type="radio" name="mmlu_grade" value="미" {% if all_results.mmlu and all_results.mmlu.grade == '미' %}checked{% endif %}> 미</label>
	<label><input type="radio" name="mmlu_grade" value="양" {% if all_results.mmlu and all_results.mmlu.grade == '양' %}checked{% endif %}> 양</label>
	<label><input type="radio" name="mmlu_grade" value="가" {% if all_results.mmlu and all_results.mmlu.grade == '가' %}checked{% endif %}> 가</label>
	</div>
	<button type="submit">정성 평가 점수 저장</button>
	</form>
	{# MMLU result panel: shows the saved grade and the final score formatted to two decimals. Line breaks inside .result are significant because that class uses white-space: pre-wrap. #}
	{% if all_results.mmlu %}
	<div class="result">
	<b>평가 등급: {{ all_results.mmlu.grade }}</b>
	<b>최종 점수: {{ "%.2f"|format(all_results.mmlu.final_score) }}점</b>
	</div>
	{% endif %}
	</div>

	<div class="eval-section" id="truthfulqa">
	<h4>5. TruthfulQA (사실 기반 답변) <span class="metric-badge">배점: 3점</span></h4>
	<form action="/evaluate" method="post">
	<input type="hidden" name="metric" value="truthfulqa">
	<div class="criteria-box">
	<p><strong>평가 내용:</strong> 급여 대장, 업무분장표 등 제시된 파일 내에서 "특정 조직의 담당 업무는?" 등 정확한 사실을 찾아 답변하는 능력 (환각 현상 검증)</p>
	<div class="scoring">
	<strong>배점 기준 (5단계):</strong><br>
	• 수: 3.0점 (100%) - 매우 우수<br>
	• 우: 2.4점 (80%) - 우수<br>
	• 미: 1.8점 (60%) - 보통<br>
	• 양: 1.2점 (40%) - 미흡<br>
	• 가: 0.6점 (20%) - 매우 미흡
	</div>
	</div>
	<h3><span>모델 생성 결과</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="truthfulqa_generated" class="collapsible" placeholder="모델이 생성한 답변이나 결과를 여기에 입력하세요.">{{ input_texts.truthfulqa_generated if input_texts.truthfulqa_generated }}</textarea>
	<h3><span>참조(정답)</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="truthfulqa_reference" class="collapsible" placeholder="참조할 정답이나 평가 기준을 입력하세요.">{{ input_texts.truthfulqa_reference if input_texts.truthfulqa_reference }}</textarea>
	<div class="radio-group">
	<label><input type="radio" name="truthfulqa_grade" value="수" {% if all_results.truthfulqa and all_results.truthfulqa.grade == '수' %}checked{% endif %}> 수</label>
	<label><input type="radio" name="truthfulqa_grade" value="우" {% if all_results.truthfulqa and all_results.truthfulqa.grade == '우' %}checked{% endif %}> 우</label>
	<label><input type="radio" name="truthfulqa_grade" value="미" {% if all_results.truthfulqa and all_results.truthfulqa.grade == '미' %}checked{% endif %}> 미</label>
	<label><input type="radio" name="truthfulqa_grade" value="양" {% if all_results.truthfulqa and all_results.truthfulqa.grade == '양' %}checked{% endif %}> 양</label>
	<label><input type="radio" name="truthfulqa_grade" value="가" {% if all_results.truthfulqa and all_results.truthfulqa.grade == '가' %}checked{% endif %}> 가</label>
	</div>
	<button type="submit">정성 평가 점수 저장</button>
	</form>
	{# TruthfulQA result panel: shows the saved grade and the final score formatted to two decimals. Line breaks inside .result are significant because that class uses white-space: pre-wrap. #}
	{% if all_results.truthfulqa %}
	<div class="result">
	<b>평가 등급: {{ all_results.truthfulqa.grade }}</b>
	<b>최종 점수: {{ "%.2f"|format(all_results.truthfulqa.final_score) }}점</b>
	</div>
	{% endif %}
	</div>

	<div class="eval-section" id="drop">
	<h4>6. DROP (문서 독해/추론) <span class="metric-badge">배점: 3점</span></h4>
	<form action="/evaluate" method="post">
	<input type="hidden" name="metric" value="drop">
	<div class="criteria-box">
	<p><strong>평가 내용:</strong> 공지사항 목록, 논문 DB 등에서 "A와 B 공지 사이에 무슨 일이 있었나?" 와 같이 문서의 맥락과 관계를 추론하는 능력 평가</p>
	<div class="scoring">
	<strong>배점 기준 (5단계):</strong><br>
	• 수: 3.0점 (100%) - 매우 우수<br>
	• 우: 2.4점 (80%) - 우수<br>
	• 미: 1.8점 (60%) - 보통<br>
	• 양: 1.2점 (40%) - 미흡<br>
	• 가: 0.6점 (20%) - 매우 미흡
	</div>
	</div>
	<h3><span>모델 생성 결과</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="drop_generated" class="collapsible" placeholder="모델이 생성한 답변이나 결과를 여기에 입력하세요.">{{ input_texts.drop_generated if input_texts.drop_generated }}</textarea>
	<h3><span>참조(정답)</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="drop_reference" class="collapsible" placeholder="참조할 정답이나 평가 기준을 입력하세요.">{{ input_texts.drop_reference if input_texts.drop_reference }}</textarea>
	<div class="radio-group">
	<label><input type="radio" name="drop_grade" value="수" {% if all_results.drop and all_results.drop.grade == '수' %}checked{% endif %}> 수</label>
	<label><input type="radio" name="drop_grade" value="우" {% if all_results.drop and all_results.drop.grade == '우' %}checked{% endif %}> 우</label>
	<label><input type="radio" name="drop_grade" value="미" {% if all_results.drop and all_results.drop.grade == '미' %}checked{% endif %}> 미</label>
	<label><input type="radio" name="drop_grade" value="양" {% if all_results.drop and all_results.drop.grade == '양' %}checked{% endif %}> 양</label>
	<label><input type="radio" name="drop_grade" value="가" {% if all_results.drop and all_results.drop.grade == '가' %}checked{% endif %}> 가</label>
	</div>
	<button type="submit">정성 평가 점수 저장</button>
	</form>
	{# DROP result panel: shows the saved grade and the final score formatted to two decimals. Line breaks inside .result are significant because that class uses white-space: pre-wrap. #}
	{% if all_results.drop %}
	<div class="result">
	<b>평가 등급: {{ all_results.drop.grade }}</b>
	<b>최종 점수: {{ "%.2f"|format(all_results.drop.final_score) }}점</b>
	</div>
	{% endif %}
	</div>

	<div class="eval-section" id="mbpp_humaneval">
	<h4>7. MBPP & HumanEval (코드 생성/업무 자동화) <span class="metric-badge">배점: 3점</span></h4>
	<form action="/evaluate" method="post">
	<input type="hidden" name="metric" value="mbpp_humaneval">
	<div class="criteria-box">
	<p><strong>평가 내용:</strong> 태양광 발전 실적 데이터 분석, 복잡한 조건(연령, 국가별 최고점)을 만족하는 참가자 필터링 등 실제 업무용 Python 코드 생성 능력 평가</p>
	<div class="scoring">
	<strong>배점 기준 (5단계):</strong><br>
	• 수: 3.0점 (100%) - 매우 우수<br>
	• 우: 2.4점 (80%) - 우수<br>
	• 미: 1.8점 (60%) - 보통<br>
	• 양: 1.2점 (40%) - 미흡<br>
	• 가: 0.6점 (20%) - 매우 미흡
	</div>
	</div>
	<h3><span>모델 생성 결과</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="mbpp_humaneval_generated" class="collapsible" placeholder="모델이 생성한 답변이나 결과를 여기에 입력하세요.">{{ input_texts.mbpp_humaneval_generated if input_texts.mbpp_humaneval_generated }}</textarea>
	<h3><span>참조(정답)</span> <button type="button" class="expand-btn">확장</button></h3>
	<textarea name="mbpp_humaneval_reference" class="collapsible" placeholder="참조할 정답이나 평가 기준을 입력하세요.">{{ input_texts.mbpp_humaneval_reference if input_texts.mbpp_humaneval_reference }}</textarea>
	<div class="radio-group">
	<label><input type="radio" name="mbpp_humaneval_grade" value="수" {% if all_results.mbpp_humaneval and all_results.mbpp_humaneval.grade == '수' %}checked{% endif %}> 수</label>
	<label><input type="radio" name="mbpp_humaneval_grade" value="우" {% if all_results.mbpp_humaneval and all_results.mbpp_humaneval.grade == '우' %}checked{% endif %}> 우</label>
	<label><input type="radio" name="mbpp_humaneval_grade" value="미" {% if all_results.mbpp_humaneval and all_results.mbpp_humaneval.grade == '미' %}checked{% endif %}> 미</label>
	<label><input type="radio" name="mbpp_humaneval_grade" value="양" {% if all_results.mbpp_humaneval and all_results.mbpp_humaneval.grade == '양' %}checked{% endif %}> 양</label>
	<label><input type="radio" name="mbpp_humaneval_grade" value="가" {% if all_results.mbpp_humaneval and all_results.mbpp_humaneval.grade == '가' %}checked{% endif %}> 가</label>
	</div>
	<button type="submit">정성 평가 점수 저장</button>
	</form>
	{# MBPP and HumanEval result panel: shows the saved grade and the final score formatted to two decimals. Line breaks inside .result are significant because that class uses white-space: pre-wrap. #}
	{% if all_results.mbpp_humaneval %}
	<div class="result">
	<b>평가 등급: {{ all_results.mbpp_humaneval.grade }}</b>
	<b>최종 점수: {{ "%.2f"|format(all_results.mbpp_humaneval.final_score) }}점</b>
	</div>
	{% endif %}
	</div>


	<!-- Final actions. A click on the report link is intercepted by the inline script below, which opens /report in a new tab only when no metric is missing and otherwise shows the validation modal. The reset link navigates to /reset. -->
	<div class="final-actions">
	<a href="/report" id="report-link" target="_blank">최종 결과 보고서 출력</a>
	<a href="/reset" class="reset">평가 초기화</a>
	</div>

	</div>

	<script>
	document.addEventListener('DOMContentLoaded', function() {
	// Wire up every expand button: a click toggles the "expanded" class on the
	// textarea that directly follows the button's enclosing <h3>, and flips the
	// button caption between the expand and collapse wording.
	const expandButtons = document.querySelectorAll('.expand-btn');
	for (const expandButton of expandButtons) {
	  expandButton.addEventListener('click', function () {
	    const target = this.closest('h3').nextElementSibling;
	    // Do nothing when the heading is not immediately followed by a textarea.
	    if (!target || target.tagName !== 'TEXTAREA') {
	      return;
	    }
	    // classList.toggle returns true when the class is present afterwards.
	    const isExpanded = target.classList.toggle('expanded');
	    this.textContent = isExpanded ? '축소' : '확장';
	  });
	}

	// The target model URL is typed once at the top of the page; it is copied
	// into a hidden "target_url" field of a form right before that form submits.
	const mainUrlInput = document.getElementById('target_url_input');
	const forms = document.querySelectorAll('form');

	// Ensure the given form has a hidden "target_url" input (creating and
	// appending one if absent) and set it to the current value of the shared input.
	function addOrUpdateHiddenUrlField(form) {
	  const existing = form.querySelector('input[name="target_url"]');
	  const urlField = existing || document.createElement('input');
	  if (!existing) {
	    urlField.type = 'hidden';
	    urlField.name = 'target_url';
	    form.appendChild(urlField);
	  }
	  urlField.value = mainUrlInput.value;
	}

	forms.forEach(function (form) {
	  form.addEventListener('submit', () => addOrUpdateHiddenUrlField(form));
	});

	const reportLink = document.getElementById('report-link');
	const modal = document.getElementById('validation-modal');
	const closeBtn = document.querySelector('.close-btn');
	const missingItemsList = document.getElementById('missing-items-list');

	reportLink.addEventListener('click', function(event) {
	event.preventDefault();

	// 서버에서 렌더링된 결과를 직접 확인
	const all_results = {};

	// 각 항목의 결과 확인
	{% if all_results.perplexity %}
	all_results.perplexity = {
	final_score: {{ all_results.perplexity.final_score if all_results.perplexity.final_score else 'null' }},
	error: {{ 'true' if all_results.perplexity.error else 'false' }}
	};
	{% endif %}

	{% if all_results.rouge %}
	all_results.rouge = {
	final_score: {{ all_results.rouge.final_score if all_results.rouge.final_score else 'null' }},
	error: {{ 'true' if all_results.rouge.error else 'false' }}
	};
	{% endif %}

	{% if all_results.bleu %}
	all_results.bleu = {
	final_score: {{ all_results.bleu.final_score if all_results.bleu.final_score else 'null' }},
	error: {{ 'true' if all_results.bleu.error else 'false' }}
	};
	{% endif %}

	{% if all_results.mmlu %}
	all_results.mmlu = {
	grade: "{{ all_results.mmlu.grade if all_results.mmlu.grade else '' }}",
	final_score: {{ all_results.mmlu.final_score if all_results.mmlu.final_score else 'null' }}
	};
	{% endif %}

	{% if all_results.truthfulqa %}
	all_results.truthfulqa = {
	grade: "{{ all_results.truthfulqa.grade if all_results.truthfulqa.grade else '' }}",
	final_score: {{ all_results.truthfulqa.final_score if all_results.truthfulqa.final_score else 'null' }}
	};
	{% endif %}

	{% if all_results.drop %}
	all_results.drop = {
	grade: "{{ all_results.drop.grade if all_results.drop.grade else '' }}",
	final_score: {{ all_results.drop.final_score if all_results.drop.final_score else 'null' }}
	};
	{% endif %}

	{% if all_results.mbpp_humaneval %}
	all_results.mbpp_humaneval = {
	grade: "{{ all_results.mbpp_humaneval.grade if all_results.mbpp_humaneval.grade else '' }}",
	final_score: {{ all_results.mbpp_humaneval.final_score if all_results.mbpp_humaneval.final_score else 'null' }}
	};
	{% endif %}

	const required_metrics = [
	{key: 'perplexity', name: '1. Perplexity', type: 'quantitative'},
	{key: 'rouge', name: '2. ROUGE', type: 'quantitative'},
	{key: 'bleu', name: '3. BLEU', type: 'quantitative'},
	{key: 'mmlu', name: '4. MMLU', type: 'qualitative'},
	{key: 'truthfulqa', name: '5. TruthfulQA', type: 'qualitative'},
	{key: 'drop', name: '6. DROP', type: 'qualitative'},
	{key: 'mbpp_humaneval', name: '7. MBPP & HumanEval', type: 'qualitative'}
	];

	let missingItems = [];
	required_metrics.forEach(metric => {
	const result = all_results[metric.key];
	let isMissing = false;

	if (!result) {
	isMissing = true;
	} else {
	if (metric.type === 'quantitative') {
	if (result.error \|\| typeof result.final_score === 'undefined') {
	isMissing = true;
	}
	} else {
	if (!result.grade) {
	isMissing = true;
	}
	}
	}

	if (isMissing) {
	missingItems.push(metric.name);
	}
	});

	if (missingItems.length > 0) {
	missingItemsList.innerHTML = '';
	missingItems.forEach(item => {
	const li = document.createElement('li');
	li.textContent = item;
	missingItemsList.appendChild(li);
	});
	modal.style.display = 'block';
	} else {
	window.open(this.href, '_blank');
	}
	});

	// Dismissing the validation modal: the close control, a click on the
	// backdrop (the modal element itself, not its content), or the Escape key.
	// Listeners are added with addEventListener so they do not replace any
	// other click handler registered on window or on the close control.
	function hideValidationModal() {
	  modal.style.display = 'none';
	}
	closeBtn.addEventListener('click', hideValidationModal);
	window.addEventListener('click', function (event) {
	  if (event.target === modal) {
	    hideValidationModal();
	  }
	});
	document.addEventListener('keydown', function (event) {
	  if (event.key === 'Escape') {
	    hideValidationModal();
	  }
	});
	});
	</script>

	</body>
	</html>