{# Repository page residue (not part of the report markup), kept for reference:
llm_eval3 / templates /report.html
kland's picture
Upload 4 files
6c886f2 verified
-#}
<!DOCTYPE html>
<html lang="ko">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>㈜강원랜드 생성형 AI 모델 실증 평가 보고서</title>
  <style>
    /* Print-only rules: keep background colours, hide on-screen controls,
       and start a new page at every .page-break marker. */
    @media print {
      /* The standard property is listed next to the -webkit- one; without it
         some engines drop the h1 gradient and the white title text vanishes. */
      body { -webkit-print-color-adjust: exact; print-color-adjust: exact; }
      .no-print { display: none; }
      /* Legacy property kept as a fallback for the modern break-before. */
      .page-break { page-break-before: always; break-before: page; }
    }

    /* Page frame */
    body { font-family: 'Malgun Gothic', sans-serif; margin: 0; padding: 0; background-color: #fff; }
    .container { max-width: 800px; margin: 40px auto; padding: 40px; border: 1px solid #ccc; box-shadow: 0 0 10px rgba(0,0,0,0.1); }

    /* Headings */
    h1, h2 { text-align: center; border-bottom: 2px solid #000; padding-bottom: 10px; margin-bottom: 40px; }
    h1 { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border: none; }
    h2 { border-top: 2px solid #ccc; margin-top: 40px; }

    /* Score table */
    table { width: 100%; border-collapse: collapse; margin-bottom: 30px; }
    th, td { border: 1px solid #999; padding: 10px; text-align: left; word-break: break-all; }
    th { background-color: #f2f2f2; font-weight: bold; text-align: center; }
    .label { font-weight: bold; color: #333; }
    .score { font-size: 1.1em; color: #0056b3; font-weight: bold; }
    .final-score { font-size: 1.2em; color: #d9534f; font-weight: bold; }
    .total-score-row th, .total-score-row td { background-color: #e9ecef; font-weight: bold; font-size: 1.1em; }

    /* Sign-off footer */
    .footer { margin-top: 80px; text-align: center; }
    /* Contain the floated signature box so it stays inside the bordered container. */
    .footer::after { content: ""; display: block; clear: both; }
    .signature-box { margin-top: 60px; float: right; }

    /* On-screen print button */
    .button-container { text-align: center; margin-top: 40px; }
    button { background-color: #007bff; color: white; border: none; padding: 10px 20px; font-size: 1em; cursor: pointer; border-radius: 5px; }

    /* Detail pages */
    .details-section h3 { color: #34495e; border-bottom: 1px solid #ccc; padding-bottom: 5px; }
    .details-section pre { background-color: #f8f9fa; padding: 15px; border-radius: 5px; white-space: pre-wrap; word-wrap: break-word; font-size: 0.9em; }
  </style>
</head>
<body>
<div class="container">
{# Report title; text restored from mis-decoded UTF-8. #}
<h1>㈜강원랜드 생성형 AI 모델 실증 평가 보고서</h1>
{#
  Summary score table.
  Reads `all_results`, a mapping with an optional `target_url` and one optional
  entry per metric. Quantitative metrics (perplexity, rouge, bleu) are shown
  only when the entry exists and has no truthy `error`; qualitative metrics
  (mmlu, truthfulqa, drop, mbpp_humaneval) show `grade` and `final_score`.
  Anything missing renders as '-' or 0.00.
#}
<table>
  <tr>
    <th scope="row" style="width: 30%;">평가 대상 LLM 모델 주소</th>
    <td colspan="3">{{ all_results.target_url if all_results.target_url else '입력되지 않음' }}</td>
  </tr>
  <tr>
    <th scope="col">평가 영역</th>
    <th scope="col">평가 항목</th>
    <th scope="col">결과값</th>
    <th scope="col">최종 점수</th>
  </tr>
  {# Quantitative metrics. `default(0, true)` keeps the number formatting from
     failing when an entry exists but carries no final_score. #}
  <tr>
    <th scope="row" rowspan="3" style="vertical-align: middle;">정량 평가 (8점)</th>
    <td>1. Perplexity</td>
    <td style="text-align: center;"><span class="score">{{ all_results.perplexity.score_display if all_results.perplexity and not all_results.perplexity.error else '-' }}</span></td>
    <td style="text-align: center;"><span class="final-score">{{ "%.2f"|format(all_results.perplexity.final_score|default(0, true) if all_results.perplexity and not all_results.perplexity.error else 0) }} / 3</span></td>
  </tr>
  <tr>
    <td>2. ROUGE</td>
    <td style="text-align: center;"><span class="score">{{ all_results.rouge.score_display if all_results.rouge and not all_results.rouge.error else '-' }}</span></td>
    <td style="text-align: center;"><span class="final-score">{{ "%.2f"|format(all_results.rouge.final_score|default(0, true) if all_results.rouge and not all_results.rouge.error else 0) }} / 3</span></td>
  </tr>
  <tr>
    <td>3. BLEU</td>
    <td style="text-align: center;"><span class="score">{{ all_results.bleu.score_display if all_results.bleu and not all_results.bleu.error else '-' }}</span></td>
    <td style="text-align: center;"><span class="final-score">{{ "%.2f"|format(all_results.bleu.final_score|default(0, true) if all_results.bleu and not all_results.bleu.error else 0) }} / 2</span></td>
  </tr>
  {# Qualitative metrics. #}
  <tr>
    <th scope="row" rowspan="4" style="vertical-align: middle;">정성 평가 (12점)</th>
    <td>4. MMLU</td>
    <td style="text-align: center;"><span class="score">{{ all_results.mmlu.grade if all_results.mmlu and all_results.mmlu.grade else '-' }}</span></td>
    <td style="text-align: center;"><span class="final-score">{{ "%.2f"|format(all_results.mmlu.final_score if all_results.mmlu and all_results.mmlu.final_score else 0) }} / 3</span></td>
  </tr>
  <tr>
    <td>5. TruthfulQA</td>
    <td style="text-align: center;"><span class="score">{{ all_results.truthfulqa.grade if all_results.truthfulqa and all_results.truthfulqa.grade else '-' }}</span></td>
    <td style="text-align: center;"><span class="final-score">{{ "%.2f"|format(all_results.truthfulqa.final_score if all_results.truthfulqa and all_results.truthfulqa.final_score else 0) }} / 3</span></td>
  </tr>
  <tr>
    <td>6. DROP</td>
    <td style="text-align: center;"><span class="score">{{ all_results.drop.grade if all_results.drop and all_results.drop.grade else '-' }}</span></td>
    <td style="text-align: center;"><span class="final-score">{{ "%.2f"|format(all_results.drop.final_score if all_results.drop and all_results.drop.final_score else 0) }} / 3</span></td>
  </tr>
  <tr>
    <td>7. MBPP &amp; HumanEval</td>
    <td style="text-align: center;"><span class="score">{{ all_results.mbpp_humaneval.grade if all_results.mbpp_humaneval and all_results.mbpp_humaneval.grade else '-' }}</span></td>
    <td style="text-align: center;"><span class="final-score">{{ "%.2f"|format(all_results.mbpp_humaneval.final_score if all_results.mbpp_humaneval and all_results.mbpp_humaneval.final_score else 0) }} / 3</span></td>
  </tr>
  {# Total out of 20. Each term checks that the metric entry exists before
     reading its attributes, matching the rows above: reading an attribute of
     a missing entry raises under Jinja's default undefined handling, and the
     `default` filter cannot catch that. #}
  {% set total_score =
    (all_results.perplexity.final_score|default(0, true) if all_results.perplexity and not all_results.perplexity.error else 0) +
    (all_results.rouge.final_score|default(0, true) if all_results.rouge and not all_results.rouge.error else 0) +
    (all_results.bleu.final_score|default(0, true) if all_results.bleu and not all_results.bleu.error else 0) +
    (all_results.mmlu.final_score|default(0, true) if all_results.mmlu else 0) +
    (all_results.truthfulqa.final_score|default(0, true) if all_results.truthfulqa else 0) +
    (all_results.drop.final_score|default(0, true) if all_results.drop else 0) +
    (all_results.mbpp_humaneval.final_score|default(0, true) if all_results.mbpp_humaneval else 0) %}
  <tr class="total-score-row">
    <th scope="row" colspan="3">총점</th>
    <td style="text-align: center;"><span class="final-score">{{ "%.2f"|format(total_score) }} / 20</span></td>
  </tr>
</table>
{# Sign-off footer. The #report-date span is empty in the markup and is filled
   in by the inline script at the end of the page. #}
<div class="footer">
  <p>상기 내용과 같이 생성형 AI 모델 실증 평가를 완료하였음을 확인합니다.</p>
  <p class="eval-date"><b>평가일자:</b> <span id="report-date"></span></p>
  <div class="signature-box">
    <p><b>평가자:</b> ____________________ (서명)</p>
  </div>
</div>
</div>
<div class="page-break"></div>
<div class="container details-section">
{# Heading of the detail pages; text restored from mis-decoded UTF-8. #}
<h2>상세 평가 내용</h2>
{# Perplexity detail: the evaluated input text plus, when present, the raw
   measurements from all_results.perplexity.details. Lines inside <pre> stay
   unindented because whitespace there is rendered literally. #}
<div class="detail-item">
  <h3>1. Perplexity (언어 모델 품질)</h3>
  <h4>입력 텍스트:</h4>
  <pre>{{ input_texts.ppl_text if input_texts.ppl_text else '(평가되지 않음)' }}</pre>
  {% if all_results.perplexity and all_results.perplexity.details %}
  <h4>세부 측정값:</h4>
  <pre>기본 PPL: {{ all_results.perplexity.details.base_ppl }}
페널티 계수: {{ all_results.perplexity.details.penalty_factor }}
토큰 수: {{ all_results.perplexity.details.token_count }}
계산 시간: {{ all_results.perplexity.details.calc_time }}</pre>
  {% endif %}
</div>
{# ROUGE detail: generated and reference summaries plus, when present, the
   per-variant scores from all_results.rouge.details. Lines inside <pre> stay
   unindented because whitespace there is rendered literally. #}
<div class="detail-item">
  <h3>2. ROUGE (요약 능력)</h3>
  <h4>모델 생성 요약문:</h4>
  <pre>{{ input_texts.rouge_generated if input_texts.rouge_generated else '(평가되지 않음)' }}</pre>
  <h4>참조 요약문:</h4>
  <pre>{{ input_texts.rouge_reference if input_texts.rouge_reference else '(평가되지 않음)' }}</pre>
  {% if all_results.rouge and all_results.rouge.details %}
  {% set rouge_details = all_results.rouge.details %}
  <h4>세부 측정값:</h4>
  {# Only a missing, null or empty value becomes '-'. A plain truthiness test
     would also hide a legitimate score of 0. #}
  <pre>ROUGE-1: {{ '-' if rouge_details.rouge1 is undefined or rouge_details.rouge1 is none or rouge_details.rouge1 == '' else rouge_details.rouge1 }}
ROUGE-2: {{ '-' if rouge_details.rouge2 is undefined or rouge_details.rouge2 is none or rouge_details.rouge2 == '' else rouge_details.rouge2 }}
ROUGE-L: {{ '-' if rouge_details.rougeL is undefined or rouge_details.rougeL is none or rouge_details.rougeL == '' else rouge_details.rougeL }}
가중 평균: {{ rouge_details.weighted_avg }}
길이 페널티: {{ rouge_details.length_penalty }}</pre>
  {% endif %}
</div>
{# BLEU detail: the generated translation and the two reference translations.
   Each falls back to a "not evaluated" placeholder when empty. #}
<div class="detail-item">
  <h3>3. BLEU (번역 정확성)</h3>
  <h4>모델 생성 번역문:</h4>
  <pre>{{ input_texts.bleu_generated if input_texts.bleu_generated else '(평가되지 않음)' }}</pre>
  <h4>참조 번역문 1:</h4>
  <pre>{{ input_texts.bleu_reference1 if input_texts.bleu_reference1 else '(평가되지 않음)' }}</pre>
  <h4>참조 번역문 2:</h4>
  <pre>{{ input_texts.bleu_reference2 if input_texts.bleu_reference2 else '(평가되지 않음)' }}</pre>
</div>
{# MMLU detail: model output and reference answer, each falling back to a
   "not evaluated" placeholder when empty. #}
<div class="detail-item">
  <h3>4. MMLU (복합 문제 해결)</h3>
  <h4>모델 생성 결과:</h4>
  <pre>{{ input_texts.mmlu_generated if input_texts.mmlu_generated else '(평가되지 않음)' }}</pre>
  <h4>참조(정답):</h4>
  <pre>{{ input_texts.mmlu_reference if input_texts.mmlu_reference else '(평가되지 않음)' }}</pre>
</div>
{# TruthfulQA detail: model output and reference answer, each falling back to
   a "not evaluated" placeholder when empty. #}
<div class="detail-item">
  <h3>5. TruthfulQA (사실 기반 답변)</h3>
  <h4>모델 생성 결과:</h4>
  <pre>{{ input_texts.truthfulqa_generated if input_texts.truthfulqa_generated else '(평가되지 않음)' }}</pre>
  <h4>참조(정답):</h4>
  <pre>{{ input_texts.truthfulqa_reference if input_texts.truthfulqa_reference else '(평가되지 않음)' }}</pre>
</div>
{# DROP detail: model output and reference answer, each falling back to a
   "not evaluated" placeholder when empty. #}
<div class="detail-item">
  <h3>6. DROP (문서 독해/추론)</h3>
  <h4>모델 생성 결과:</h4>
  <pre>{{ input_texts.drop_generated if input_texts.drop_generated else '(평가되지 않음)' }}</pre>
  <h4>참조(정답):</h4>
  <pre>{{ input_texts.drop_reference if input_texts.drop_reference else '(평가되지 않음)' }}</pre>
</div>
{# MBPP / HumanEval detail: model output and reference answer, each falling
   back to a "not evaluated" placeholder when empty. #}
<div class="detail-item">
  <h3>7. MBPP &amp; HumanEval (코드 생성/업무 자동화)</h3>
  <h4>모델 생성 결과:</h4>
  <pre>{{ input_texts.mbpp_humaneval_generated if input_texts.mbpp_humaneval_generated else '(평가되지 않음)' }}</pre>
  <h4>참조(정답):</h4>
  <pre>{{ input_texts.mbpp_humaneval_reference if input_texts.mbpp_humaneval_reference else '(평가되지 않음)' }}</pre>
</div>
</div>
{# On-screen print control; the .no-print class hides it in the printed report. #}
<div class="button-container no-print">
  <button type="button" onclick="window.print()">보고서 인쇄하기</button>
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {
const now = new Date();
const year = now.getFullYear();
const month = String(now.getMonth() + 1).padStart(2, '0');
const day = String(now.getDate()).padStart(2, '0');
const hours = String(now.getHours()).padStart(2, '0');
const minutes = String(now.getMinutes()).padStart(2, '0');
const seconds = String(now.getSeconds()).padStart(2, '0');
const formattedDate = `${year}λ…„ ${month}μ›” ${day}일 ${hours}:${minutes}:${seconds}`;
document.getElementById('report-date').textContent = formattedDate;
});
</script>
</body>
</html>