"""
PLOBIN
"""
import difflib
import streamlit as st
import streamlit.components.v1 as components
import fitz # PyMuPDF
import chromadb
from sentence_transformers import SentenceTransformer, util
import requests
import os
import re
import shutil
from collections import Counter
import numpy as np
from typing import List, Dict, Tuple
import base64
from dotenv import load_dotenv
import json
from difflib import SequenceMatcher
import pdfplumber
def get_svg_content(svg_path):
with open(svg_path, "r", encoding="utf-8") as f:
return f.read()
plobin_logo_svg = get_svg_content("img/plobin.svg")
load_dotenv()
GROK_API_KEY = os.getenv("GROK_API_KEY")
GROK_API_BASE = "https://api.x.ai/v1"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = "https://api.openai.com/v1"
CHROMA_DIR = "./chroma_db"
EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask'
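# Highlight appearance settings; currently just the default highlight color (RGB yellow).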
class HighlightConfig:
def __init__(self):
self.color = [1.0, 1.0, 0.0]
st.set_page_config(
page_title="PLOBIN",
page_icon="img/plobin-left-only.png",
layout="wide",
initial_sidebar_state="expanded"
)
st.markdown("""
<style>
[data-testid="stSidebar"] {
background: linear-gradient(180deg,
#f9f9f9 0%,
#f9f9f9 100%);
box-shadow: none;
border-right: 1px solid #ededed;
width: 280px !important;
}
[data-testid="stSidebar"] h1 {
color: white !important;
font-weight: 900 !important;
text-shadow:
0 0 30px rgba(255,255,255,0.6),
0 0 50px rgba(102,126,234,0.4),
3px 3px 40px rgba(0,0,0,0.4);
animation: sidebarTitlePulse 4s ease-in-out infinite;
letter-spacing: 2px;
}
@keyframes sidebarTitlePulse {
0%, 100% {
transform: scale(1);
text-shadow:
0 0 30px rgba(255,255,255,0.6),
0 0 50px rgba(102,126,234,0.4),
3px 3px 40px rgba(0,0,0,0.4);
}
50% {
transform: scale(1.03);
text-shadow:
0 0 40px rgba(255,255,255,0.8),
0 0 70px rgba(102,126,234,0.6),
0 0 100px rgba(118,75,162,0.4),
3px 3px 40px rgba(0,0,0,0.4);
}
}
[data-testid="stSidebar"] [data-testid="stFileUploader"] {
background: rgba(198,198,198,0.15);
border-radius: 15px;
padding: 1.5rem;
border: 1.5px dashed rgba(198,198,198,0.4);
transition: all 0.3s ease;
backdrop-filter: blur(10px);
}
[data-testid="stFileUploader"] > section {
background: transparent !important;
}
[data-testid="stFileUploader"] > section > div {
background: transparent !important;
}
[data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] {
color: #c6c6c6;
}
/* Change the user chat message icon */
[data-testid="stChatMessage"][data-testid="user"]
[data-testid="chat-message-avatar"] img {
content: url("https://your-image-url.com/user-icon.png") !important;
}
[data-testid="stSidebar"] [data-testid="stFileUploader"] > section,
[data-testid="stSidebar"] [data-testid="stFileUploader"] section > div {
background: transparent !important;
border: none !important;
}
[data-testid="stSidebar"] [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] p {
color: #555555 !important;
}
[data-testid="stSidebar"] [data-testid="stFileUploader"] button[kind="secondary"] {
background: rgba(127,128,134,0.2) !important;
color: #8A8A8A !important;
border: 1px solid rgba(127,128,134,0.3) !important;
}
[data-testid="stSidebar"] .stButton button {
background: rgba(127,128,134,0.15) !important;
color: #555555 !important;
border: 2px solid rgba(127,128,134,0.4) !important;
border-radius: 12px !important;
font-weight: 700 !important;
padding: 0.75rem 1.5rem !important;
backdrop-filter: blur(10px) !important;
transition: all 0.3s ease !important;
box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
}
[data-testid="stSidebar"] .stButton button:hover {
background: rgba(255, 36, 36,0.25) !important;
border-color: rgba(255, 36, 36,0.6) !important;
transform: translateY(-2px) scale(1.02) !important;
}
[data-testid="stSidebar"] .stButton button:active {
transform: translateY(0px) scale(0.98) !important;
}
[data-testid="stSidebar"] .stButton button[kind="primary"] {
background: rgba(255,255,255,0.25) !important;
border: 2px solid rgba(255,255,255,0.5) !important;
font-size: 1.05rem !important;
}
[data-testid="stSidebar"] .stButton button[kind="primary"]:hover {
background: rgba(255,255,255,0.35) !important;
border-color: rgba(255,255,255,0.7) !important;
}
[data-testid="stSidebar"] [data-testid="stAlert"] {
background-color: #f2f2f2 !important;
border-radius: 0.5rem !important;
}
[data-testid="stSidebar"] [data-testid="stFileUploader"] button {
display: block;
}
/* ์‚ฌ์ด๋“œ๋ฐ” ์ ‘๊ธฐ/ํŽผ์น˜๊ธฐ ๋ฒ„ํŠผ ํ•ญ์ƒ ๋ณด์ด๊ฒŒ */
[data-testid="stSidebarCollapseButton"] {
opacity: 1 !important;
visibility: visible !important;
transition: opacity 0.2s ease !important;
}
/* No hover effect; keep it fully visible */
[data-testid="stSidebarCollapseButton"]:hover {
opacity: 1 !important;
}
[data-testid="stAlert"] p {
color: #747474;
}
/* ์‚ฌ์ด๋“œ๋ฐ” Alert ์ „์ฒด ๋ฐ•์Šค ๊ฐ•์ œ ์Šคํƒ€์ผ */
[data-testid="stSidebar"] [data-testid="stAlert"] {
background-color: #f2f2f2 !important; /* desired background color */
border-radius: 0.5rem !important;
}
/* Apply the color to the alert's inner container as well */
[data-testid="stSidebar"] [data-testid="stAlert"] > div {
background-color: #f2f2f2 !important;
}
/* Innermost alert message box */
[data-testid="stSidebar"] [data-testid="stAlert"] [role="alert"] {
background-color: #f2f2f2 !important;
}
.main .block-container {
max-width: 100%;
padding-left: 2rem;
padding-right: 2rem;
}
.plobin-header {
padding: 1.5rem 2rem;
margin-bottom: 2rem;
}
.plobin-logo {
display: block;
margin: 0 auto;
height: 60px;
}
.plobin-title {
font-size: 2.5rem;
font-weight: bold;
color: white;
margin: 0;
text-align: center;
text-shadow: 2px 2px 8px rgba(0, 0, 0, 0.4),
0 0 20px rgba(102, 126, 234, 0.4);
}
.plobin-subtitle {
font-size: 1rem;
color: rgba(255, 255, 255, 0.9);
text-align: center;
margin-top: 0.5rem;
text-shadow: 1px 1px 6px rgba(0, 0, 0, 0.4);
}
[data-testid="stFileUploader"] {
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
border: 3px dashed #667eea;
border-radius: 1rem;
padding: 3rem 2rem;
}
[data-testid="stFileUploader"] > div {
text-align: center;
}
[data-testid="stFileUploader"] label {
font-size: 1.2rem !important;
color: #2D3748 !important;
font-weight: 600 !important;
}
.pdf-container {
border: 2px solid #E2E8F0;
border-radius: 0.5rem;
padding: 0.5rem;
height: 706px;
overflow-y: auto;
background: white;
}
.chat-container {
border: 2px solid #E2E8F0;
border-radius: 0.5rem;
padding: 1rem;
height: 650px;
overflow-y: auto;
background: white;
margin-bottom: 0.5rem;
}
[data-testid="stChatInput"] {
margin-top: 0 !important;
padding-top: 0 !important;
}
.source-box {
background: #F1F5F9;
padding: 1rem;
border-radius: 0.5rem;
margin: 0.5rem 0;
border-left: 3px solid #667eea;
}
.source-title {
font-weight: bold;
color: #667eea;
margin-bottom: 0.5rem;
}
.page-indicator {
background: #667eea;
color: white;
padding: 0.3rem 0.8rem;
border-radius: 1rem;
font-size: 0.85rem;
display: inline-block;
margin: 0.2rem;
}
.highlight-indicator {
background: #FEF08A;
color: #854D0E;
padding: 0.5rem 1rem;
border-radius: 0.5rem;
margin: 0.5rem 0;
font-weight: bold;
border-left: 4px solid #EAB308;
}
.usage-guide {
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
padding: 2rem;
border-radius: 1rem;
margin-bottom: 2rem;
height: 100%;
}
.guide-step {
display: flex;
align-items: center;
margin: 1.5rem 0;
margin-left: 3.5rem;
font-size: 1.1rem;
color: #2D3748;
}
.step-number {
background: #667eea;
color: white;
width: 2.5rem;
height: 2.5rem;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-weight: bold;
font-size: 1.2rem;
margin-right: 1rem;
flex-shrink: 0;
}
.viewer-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 1rem;
}
@keyframes pulse {
0%, 100% {
box-shadow: 0 0 0 0 rgba(16, 185, 129, 0.7);
}
50% {
box-shadow: 0 0 20px 10px rgba(16, 185, 129, 0);
}
}
.chat-title {
color: black !important;
font-weight: 900 !important;
font-size: 1.75rem !important;
margin-bottom: 1rem !important;
text-shadow:
0 0 30px rgba(255,255,255,0.6),
0 0 50px rgba(102,126,234,0.4),
3px 3px 40px rgba(0,0,0,0.4);
letter-spacing: 2px;
}
[data-testid="column"] button[kind="secondary"] {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
color: white !important;
border: none !important;
border-radius: 0.5rem !important;
padding: 0.6rem 1rem !important;
font-weight: bold !important;
font-size: 0.95rem !important;
text-align: left !important;
box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
transition: all 0.2s ease !important;
cursor: pointer !important;
}
[data-testid="column"] button[kind="secondary"]:hover {
transform: translateY(-2px) !important;
box-shadow: 0 4px 8px rgba(102, 126, 234, 0.3) !important;
background: linear-gradient(135deg, #7c8ff5 0%, #8a5db8 100%) !important;
}
[data-testid="column"] button[kind="primary"] {
background: linear-gradient(135deg, #FEF08A 0%, #FDE047 100%) !important;
color: #854D0E !important;
border: 2px solid #EAB308 !important;
border-radius: 0.5rem !important;
padding: 0.6rem 1rem !important;
font-weight: bold !important;
font-size: 0.95rem !important;
text-align: left !important;
box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
transition: all 0.2s ease !important;
cursor: pointer !important;
}
[data-testid="column"] button[kind="primary"]:hover {
transform: translateY(-2px) !important;
box-shadow: 0 4px 8px rgba(234, 179, 8, 0.3) !important;
background: linear-gradient(135deg, #FDE047 0%, #FACC15 100%) !important;
}
/* ์ฑ„ํŒ… ์ž…๋ ฅ์ฐฝ ํฌ์ปค์Šค ์‹œ ํ…Œ๋‘๋ฆฌ ์ƒ‰์ƒ ๋ณ€๊ฒฝ */
[data-testid="stChatInput"] textarea:focus {
border-color: #3f3f3f !important;
box-shadow: 0 0 0 1px #3f3f3f !important;
}
/* Chat input default state */
[data-testid="stChatInput"] textarea {
border-color: #3f3f3f !important;
transition: border-color 0.2s ease;
}
/* Hover state */
[data-testid="stChatInput"] textarea:hover {
border-color: #3f3f3f !important;
}
/* Hide the default Streamlit avatar */
[data-testid="stChatMessage"][data-testid="user"]
[data-testid="chat-message-avatar"] img {
display: none !important;
}
/* Replace it with the desired icon */
[data-testid="stChatMessage"][data-testid="user"]
[data-testid="chat-message-avatar"] {
background-image: url("final/img/user-profile.png");
background-size: cover;
background-position: center;
width: 36px !important;
height: 36px !important;
border-radius: 50%; /* circular */
}
/* Remove the default avatar */
[data-testid="stChatMessage"][data-testid="assistant"]
[data-testid="chat-message-avatar"] img {
display: none !important;
}
/* Set a custom icon */
[data-testid="stChatMessage"][data-testid="assistant"]
[data-testid="chat-message-avatar"] {
background-image: url("final/img/cloud.png");
background-size: cover;
background-position: center;
width: 36px !important;
height: 36px !important;
border-radius: 50%;
}
</style>
""", unsafe_allow_html=True)
SPACE_RE = re.compile(r'\s+')
def normalize_for_search(text: str) -> str:
"""
Normalize text for search/matching:
- strip leading/trailing whitespace
- convert to lowercase
- remove all whitespace characters (ignore spacing differences)
"""
text = text.strip().lower()
text = SPACE_RE.sub('', text) # drop all whitespace
return text
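# Initialize every st.session_state key the app relies on with a sensible default.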
def init_session():
if 'processed' not in st.session_state:
st.session_state.processed = False
if 'vector_db' not in st.session_state:
st.session_state.vector_db = None
if 'embedder' not in st.session_state:
st.session_state.embedder = None
if 'chat_history' not in st.session_state:
st.session_state.chat_history = []
if 'doc_metadata' not in st.session_state:
st.session_state.doc_metadata = {}
if 'pdf_bytes' not in st.session_state:
st.session_state.pdf_bytes = None
if 'pdf_pages_text' not in st.session_state:
st.session_state.pdf_pages_text = {}
if 'current_highlights' not in st.session_state:
st.session_state.current_highlights = []
if 'zoom_level' not in st.session_state:
st.session_state.zoom_level = 2.0
if 'highlight_config' not in st.session_state:
st.session_state.highlight_config = HighlightConfig()
if 'processing_query' not in st.session_state:
st.session_state.processing_query = None
if 'scroll_to_page' not in st.session_state:
st.session_state.scroll_to_page = None
def extract_table_image_as_base64(pdf_bytes: bytes, page_num: int, bbox: tuple) -> str:
"""
Extract a table region from a PDF page as an image and encode it as base64.
Args:
pdf_bytes: raw PDF bytes
page_num: page number (0-based)
bbox: (x0, y0, x1, y1) table region coordinates
Returns:
base64-encoded image string
"""
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
page = doc[page_num]
# Render the bbox region to an image (high resolution)
rect = fitz.Rect(bbox)
pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0), clip=rect)
img_bytes = pix.tobytes("png")
doc.close()
# Encode as base64
img_base64 = base64.b64encode(img_bytes).decode('utf-8')
return img_base64
def convert_table_to_markdown_with_vision(
pdf_bytes: bytes,
page_num: int,
bbox: tuple,
api_key: str
) -> str:
"""
Convert a table image to Markdown using the OpenAI Vision API.
Args:
pdf_bytes: raw PDF bytes
page_num: page number
bbox: table region coordinates
api_key: OpenAI API key
Returns:
the table in Markdown format
"""
# Extract the table region as an image
img_base64 = extract_table_image_as_base64(pdf_bytes, page_num, bbox)
# Call the OpenAI Vision API
prompt = """์ด ์ด๋ฏธ์ง€๋Š” PDF ๋ฌธ์„œ์˜ ํ‘œ์ž…๋‹ˆ๋‹ค.
ํ‘œ์˜ ๋‚ด์šฉ์„ ์ •ํ™•ํ•˜๊ฒŒ ๋งˆํฌ๋‹ค์šด ํ‘œ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜ํ•ด์ฃผ์„ธ์š”.
๊ทœ์น™:
1. ์…€ ๋ณ‘ํ•ฉ์ด ์žˆ์œผ๋ฉด ์ ์ ˆํžˆ ์ฒ˜๋ฆฌ
2. ์ค‘์ฒฉ๋œ ํ‘œ๊ฐ€ ์žˆ์œผ๋ฉด ํ…์ŠคํŠธ๋กœ ํ‘œํ˜„
3. ๋นˆ ์…€์€ ๋นˆ ์นธ์œผ๋กœ ์œ ์ง€
4. ํ‘œ ํ˜•์‹๋งŒ ๋ฐ˜ํ™˜ (์ถ”๊ฐ€ ์„ค๋ช… ์—†์ด)
๋งˆํฌ๋‹ค์šด ํ‘œ ํ˜•์‹:
| ์—ด1 | ์—ด2 | ์—ด3 |
| --- | --- | --- |
| ๋ฐ์ดํ„ฐ1 | ๋ฐ์ดํ„ฐ2 | ๋ฐ์ดํ„ฐ3 |"""
try:
response = requests.post(
f"{OPENAI_API_BASE}/chat/completions",
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
},
json={
"model": "gpt-4o", # gpt-4o or gpt-4o-mini
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{img_base64}",
"detail": "high" # "low", "high", "auto"
}
}
]
}
],
"temperature": 0.1,
"max_tokens": 2000
},
timeout=120
)
if response.status_code == 200:
result = response.json()
markdown_table = result['choices'][0]['message']['content']
# Strip code fences
markdown_table = re.sub(r'```markdown\s*|\s*```', '', markdown_table)
markdown_table = re.sub(r'```\s*|\s*```', '', markdown_table)
return markdown_table.strip()
else:
# Print error details
error_detail = response.text
print(f"OpenAI API ์˜ค๋ฅ˜: {response.status_code}")
print(f"์ƒ์„ธ: {error_detail}")
return f"[ํ‘œ ๋ณ€ํ™˜ ์‹คํŒจ: {response.status_code} - {error_detail[:200]}]"
except Exception as e:
return f"[ํ‘œ ๋ณ€ํ™˜ ์‹คํŒจ: {str(e)}]"
def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]:
"""
Extract text and tables from the PDF (tables are processed with the OpenAI Vision API).
"""
pdf_bytes = pdf_file.read()
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
chunks = []
metadata_list = []
pages_text = {}
CHUNK_SIZE = 800
OVERLAP_SIZE = 150
# Open the PDF with pdfplumber
pdf_file.seek(0)
with pdfplumber.open(pdf_file) as pdf_plumber:
for page_num in range(len(doc)):
# Extract text with PyMuPDF
fitz_page = doc[page_num]
text = fitz_page.get_text("text")
# Detect tables with pdfplumber
tables_markdown = []
if page_num < len(pdf_plumber.pages):
plumber_page = pdf_plumber.pages[page_num]
# ํ‘œ ํƒ์ง€
table_settings = {
"vertical_strategy": "lines",
"horizontal_strategy": "lines",
"snap_tolerance": 3,
"join_tolerance": 3,
}
tables = plumber_page.find_tables(table_settings=table_settings)
# ๊ฐ ํ‘œ๋ฅผ Vision API๋กœ ์ฒ˜๋ฆฌ
for idx, table in enumerate(tables):
bbox = table.bbox # (x0, y0, x1, y1)
# Convert to Markdown with the OpenAI Vision API
markdown_table = convert_table_to_markdown_with_vision(
pdf_bytes,
page_num,
bbox,
OPENAI_API_KEY
)
tables_markdown.append(f"\n\n**[ํ‘œ {idx + 1}]**\n{markdown_table}\n")
# Combine text and tables
combined_content = text
if tables_markdown:
combined_content += "\n\n" + "\n".join(tables_markdown)
pages_text[page_num + 1] = combined_content
if not combined_content.strip():
continue
# ์ฒญํฌ๋กœ ๋ถ„ํ• 
lines = [line.strip() for line in combined_content.split('\n') if line.strip()]
cleaned_text = '\n'.join(lines)
# If table markers are present, split on them first
if "**[ํ‘œ" in cleaned_text:
# Split by table
table_pattern = r'\*\*\[ํ‘œ \d+\]\*\*'
parts = re.split(f'({table_pattern})', cleaned_text)
current_chunk = ""
for part in parts:
part = part.strip()
if not part:
continue
# ํ‘œ ์„น์…˜์ธ ๊ฒฝ์šฐ
if re.match(table_pattern, part):
if current_chunk:
chunks.append(current_chunk.strip())
metadata_list.append({
"page": page_num + 1,
"source": pdf_file.name,
"chunk_type": "text"
})
current_chunk = ""
current_chunk = part
else:
# ํ‘œ ๋‚ด์šฉ์ด๊ฑฐ๋‚˜ ์ผ๋ฐ˜ ํ…์ŠคํŠธ
if current_chunk and re.match(table_pattern, current_chunk):
# ์ด์ „์ด ํ‘œ ๋งˆ์ปค์˜€๋‹ค๋ฉด ํ‘œ ๋‚ด์šฉ ์ถ”๊ฐ€
current_chunk += "\n" + part
chunks.append(current_chunk.strip())
metadata_list.append({
"page": page_num + 1,
"source": pdf_file.name,
"chunk_type": "table"
})
current_chunk = ""
else:
# ์ผ๋ฐ˜ ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ
if len(current_chunk) + len(part) > CHUNK_SIZE:
if current_chunk:
chunks.append(current_chunk.strip())
metadata_list.append({
"page": page_num + 1,
"source": pdf_file.name,
"chunk_type": "text"
})
current_chunk = part
else:
current_chunk += "\n" + part if current_chunk else part
if current_chunk:
chunk_type = "table" if re.match(table_pattern, current_chunk) else "text"
chunks.append(current_chunk.strip())
metadata_list.append({
"page": page_num + 1,
"source": pdf_file.name,
"chunk_type": chunk_type
})
else:
# No tables on this page: plain text handling
sentences = re.split(r'([.!?]\s+|\n{2,})', cleaned_text)
sentences = [s for s in sentences if s.strip()]
current_chunk = ""
current_length = 0
for sentence in sentences:
sentence_length = len(sentence)
if current_length + sentence_length > CHUNK_SIZE and current_chunk:
chunks.append(current_chunk.strip())
metadata_list.append({
"page": page_num + 1,
"source": pdf_file.name,
"chunk_type": "text"
})
overlap_text = current_chunk[-OVERLAP_SIZE:] if len(current_chunk) > OVERLAP_SIZE else current_chunk
current_chunk = overlap_text + sentence
current_length = len(current_chunk)
else:
current_chunk += sentence
current_length += sentence_length
if current_chunk.strip():
chunks.append(current_chunk.strip())
metadata_list.append({
"page": page_num + 1,
"source": pdf_file.name,
"chunk_type": "text"
})
doc.close()
return chunks, metadata_list, pdf_bytes, pages_text
def save_extracted_text_to_file(chunks: List[str], metadata_list: List[Dict], filename: str):
"""
Save the extracted text to a local file.
"""
import os
from datetime import datetime
# Create the output directory
output_dir = "extracted_text"
os.makedirs(output_dir, exist_ok=True)
# ํŒŒ์ผ๋ช… ์ƒ์„ฑ (ํƒ€์ž„์Šคํƒฌํ”„ ํฌํ•จ)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base_name = os.path.splitext(filename)[0]
output_file = os.path.join(output_dir, f"{base_name}_{timestamp}.txt")
# Write the text
with open(output_file, 'w', encoding='utf-8') as f:
f.write(f"=" * 80 + "\n")
f.write(f"๋ฌธ์„œ๋ช…: {filename}\n")
f.write(f"์ถ”์ถœ ์‹œ๊ฐ„: {timestamp}\n")
f.write(f"์ด ์ฒญํฌ ์ˆ˜: {len(chunks)}\n")
f.write(f"=" * 80 + "\n\n")
for idx, (chunk, meta) in enumerate(zip(chunks, metadata_list), 1):
f.write(f"\n{'='*80}\n")
f.write(f"์ฒญํฌ #{idx}\n")
f.write(f"ํŽ˜์ด์ง€: {meta.get('page', 'N/A')}\n")
f.write(f"ํƒ€์ž…: {meta.get('chunk_type', 'text')}\n")
f.write(f"{'-'*80}\n")
f.write(chunk)
f.write(f"\n{'='*80}\n")
return output_file
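# Cache the SentenceTransformer model across Streamlit reruns so it is only loaded once.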
@st.cache_resource(show_spinner=False)
def load_embedding_model():
return SentenceTransformer(EMBEDDING_MODEL)
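# Build an in-memory Chroma collection (cosine space) and add the chunks with batched embeddings.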
def create_vector_db(chunks: List[str], metadata_list: List[Dict]):
embedder = load_embedding_model()
client = chromadb.EphemeralClient(
settings=chromadb.Settings(
anonymized_telemetry=False,
allow_reset=True
)
)
try:
client.delete_collection("rfx_docs")
except Exception:
pass
collection = client.create_collection(
name="rfx_docs",
metadata={"hnsw:space": "cosine"}
)
batch_size = 32
all_embeddings = []
for i in range(0, len(chunks), batch_size):
batch = chunks[i:i + batch_size]
embeddings = embedder.encode(batch, show_progress_bar=False, convert_to_numpy=True)
all_embeddings.extend(embeddings)
ids = [f"doc_{i}" for i in range(len(chunks))]
collection.add(
embeddings=[emb.tolist() for emb in all_embeddings],
documents=chunks,
metadatas=metadata_list,
ids=ids
)
return collection, embedder
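# Extract query keywords: candidate Korean words are scored as
# 0.7 * cosine similarity to the full query + 0.3 * log1p(frequency) / 10,
# and number-bearing tokens (up to 3) are always placed first.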
def extract_keywords_semantic(text: str, embedder, top_n: int = 5) -> List[str]:
words_with_numbers = re.findall(r'[๊ฐ€-ํžฃ]*\d+[๊ฐ€-ํžฃ]*', text)
candidate_words = re.findall(r'[๊ฐ€-ํžฃ]{2,}', text)
if not candidate_words:
return words_with_numbers[:top_n]
word_freq = Counter(candidate_words)
text_embedding = embedder.encode([text], convert_to_numpy=True)[0]
word_embeddings = embedder.encode(list(word_freq.keys()), convert_to_numpy=True)
similarities = util.cos_sim(text_embedding, word_embeddings)[0].numpy()
scored_words = []
for idx, (word, freq) in enumerate(word_freq.items()):
semantic_score = similarities[idx]
frequency_score = np.log1p(freq) / 10.0
combined_score = 0.7 * semantic_score + 0.3 * frequency_score
scored_words.append((word, combined_score))
scored_words.sort(key=lambda x: x[1], reverse=True)
result = []
for word in words_with_numbers[:3]:
if word and word not in result:
result.append(word)
for word, score in scored_words:
if word not in result:
result.append(word)
if len(result) >= top_n:
break
return result[:top_n]
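# Hybrid retrieval: pull the top 20 vector matches from Chroma, then re-rank with
# 0.7 * vector similarity + 0.3 * keyword hit ratio and keep the top_k results.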
def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict:
query_embedding = embedder.encode([query], convert_to_numpy=True)[0]
vector_results = collection.query(
query_embeddings=[query_embedding.tolist()],
n_results=20,
include=["documents", "metadatas", "distances"]
)
keywords = extract_keywords_semantic(query, embedder, top_n=5)
hybrid_results = []
for i, doc_id in enumerate(vector_results['ids'][0]):
doc = vector_results['documents'][0][i]
metadata = vector_results['metadatas'][0][i]
vector_score = 1 - vector_results['distances'][0][i]
keyword_score = 0
# Prepare both the raw and the normalized version
doc_lower = doc.lower()
doc_norm = normalize_for_search(doc) # whitespace-stripped version
for keyword in keywords:
kw_lower = keyword.lower()
kw_norm = normalize_for_search(keyword)
# 1) Original check: plain substring match
# 2) Whitespace-stripped check: matches even if the text is joined or oddly spaced
if kw_lower in doc_lower or kw_norm in doc_norm:
keyword_score += 1
keyword_score = keyword_score / len(keywords) if keywords else 0
hybrid_score = 0.7 * vector_score + 0.3 * keyword_score
hybrid_results.append({
'id': doc_id,
'document': doc,
'metadata': metadata,
'hybrid_score': hybrid_score,
'vector_score': vector_score,
'keyword_score': keyword_score
})
hybrid_results.sort(key=lambda x: x['hybrid_score'], reverse=True)
top_results = hybrid_results[:top_k]
return {
'documents': [[r['document'] for r in top_results]],
'metadatas': [[r['metadata'] for r in top_results]],
'scores': [r['hybrid_score'] for r in top_results],
'keywords': keywords
}
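# Ask the Grok chat API to pick the single most relevant passage (returned as JSON)
# from the retrieved documents; the page it reports is later used to auto-scroll the viewer.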
def grok_verify_and_extract(query: str, search_results: Dict, api_key: str) -> Dict:
docs = search_results['documents'][0]
metas = search_results['metadatas'][0]
formatted_docs = []
for i, (doc, meta) in enumerate(zip(docs, metas), 1):
formatted_docs.append(f"[๋ฌธ์„œ {i}] (ํŽ˜์ด์ง€ {meta['page']})\n{doc}")
context = "\n\n".join(formatted_docs)
system_prompt = """๋‹น์‹ ์€ RFx ๋ฌธ์„œ ๋ถ„์„ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.
์ฃผ์–ด์ง„ 3๊ฐœ์˜ ๋ฌธ์„œ ์ค‘์—์„œ ์‚ฌ์šฉ์ž ์งˆ๋ฌธ๊ณผ **๊ฐ€์žฅ ๊ด€๋ จ ์žˆ๋Š” ๋‹จ 1๊ฐœ์˜ ํ•ต์‹ฌ ์ •๋ณด**๋งŒ ์„ ํƒํ•˜์„ธ์š”.
**์ค‘์š” ๊ทœ์น™:**
1. ๋ฐ˜๋“œ์‹œ **1๊ฐœ์˜ ํ…์ŠคํŠธ**๋งŒ ์ถ”์ถœ
2. ๊ฐ€์žฅ ์ง์ ‘์ ์œผ๋กœ ์งˆ๋ฌธ์— ๋‹ตํ•˜๋Š” ์ •๋ณด ์„ ํƒ
3. ๊ธˆ์•ก, ๋‚ ์งœ, ์ˆ˜๋Ÿ‰ ๋“ฑ ๊ตฌ์ฒด์ ์ธ ์ˆซ์ž ์ •๋ณด ์šฐ์„ 
4. ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๋Š” ์›๋ฌธ ๊ทธ๋Œ€๋กœ ์œ ์ง€ (150์ž ์ด๋‚ด)
5. JSON ํ˜•์‹์œผ๋กœ๋งŒ ์‘๋‹ต
**์‘๋‹ต ํ˜•์‹:**
{
"selected_text": "์„ ํƒ๋œ ํ…์ŠคํŠธ (์›๋ฌธ ๊ทธ๋Œ€๋กœ)",
"page": ํŽ˜์ด์ง€๋ฒˆํ˜ธ,
"relevance_reason": "์ด ํ…์ŠคํŠธ๋ฅผ ์„ ํƒํ•œ ์ด์œ "
}"""
user_prompt = f"""<์งˆ๋ฌธ>
{query}
</์งˆ๋ฌธ>
<๊ฒ€์ƒ‰๋œ ๋ฌธ์„œ๋“ค>
{context}
</๊ฒ€์ƒ‰๋œ ๋ฌธ์„œ๋“ค>
์œ„ 3๊ฐœ ๋ฌธ์„œ์—์„œ ์งˆ๋ฌธ์— ๊ฐ€์žฅ ์ •ํ™•ํ•˜๊ฒŒ ๋‹ตํ•˜๋Š” **๋‹จ 1๊ฐœ์˜ ํ•ต์‹ฌ ์ •๋ณด**๋ฅผ JSON ํ˜•์‹์œผ๋กœ ์„ ํƒํ•˜์„ธ์š”.
์„ ํƒํ•œ ํ…์ŠคํŠธ๋Š” 150์ž ์ด๋‚ด๋กœ ํ•˜์„ธ์š”."""
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}
payload = {
"model": "grok-3",
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
"temperature": 0.1,
"max_tokens": 1000,
"stream": False
}
try:
response = requests.post(
f"{GROK_API_BASE}/chat/completions",
headers=headers,
json=payload,
timeout=120
)
if response.status_code != 200:
return {"error": f"API ์˜ค๋ฅ˜: {response.status_code}"}
result = response.json()
content = result["choices"][0]["message"]["content"]
content = content.replace("```json", "").replace("```", "").strip()
extracted_data = json.loads(content)
return extracted_data
except Exception as e:
return {"error": f"์˜ค๋ฅ˜: {str(e)}"}
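# Concatenate the retrieved documents into one prompt context, truncating once max_length characters is reached.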
def build_context(search_results: Dict, max_length: int = 3000) -> str:
context_parts = []
current_length = 0
docs = search_results['documents'][0]
metas = search_results['metadatas'][0]
for i, (doc, meta) in enumerate(zip(docs, metas), 1):
part = f"[๋ฌธ์„œ {i}] (ํŽ˜์ด์ง€ {meta['page']})\n{doc}\n"
part_length = len(part)
if current_length + part_length > max_length:
remaining = max_length - current_length
if remaining > 200:
part = f"[๋ฌธ์„œ {i}] (ํŽ˜์ด์ง€ {meta['page']})\n{doc[:remaining-50]}...\n"
context_parts.append(part)
break
context_parts.append(part)
current_length += part_length
return "\n".join(context_parts)
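# Generate the final answer with the Grok chat API from the retrieved context; the prompt
# requires [ํŽ˜์ด์ง€ X] citations and verbatim quotes, which are later parsed for highlighting.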
def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
context = build_context(search_results, max_length=4000)
system_prompt = """๋‹น์‹ ์€ ์ž๋™์ฐจ ์ œ์กฐ์—… RFx ๋ฌธ์„œ ์ „๋ฌธ ๋ถ„์„๊ฐ€์ž…๋‹ˆ๋‹ค.
**์‚ฐ์—… ํŠนํ™” ์ง€์นจ:**
1. **์ž๋™์ฐจ ์ œ์กฐ์—… ์€์–ดยท์•ฝ์–ด ํ•ด์„**: ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์—๋Š” ์ž๋™์ฐจ ์ œ์กฐ์—… ํŠน์œ ์˜ ์€์–ดยท์•ฝ์–ดยท์ „๋ฌธ์šฉ์–ด๊ฐ€ ํฌํ•จ๋  ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ์‚ฐ์—… ๋ฌธ๋งฅ์— ๋งž๊ฒŒ ์ •ํ™•ํžˆ ํ•ด์„ํ•˜๋ผ.
2. **์–ธ์–ด ํ˜ผ์šฉ ๋ฐ ๋น„๋ฌธ ๋Œ€์‘**: ์‚ฌ์šฉ์ž์˜ ๋ฌธ์žฅ์€ ํ•œ๊ตญ์–ด์™€ ์˜์–ด๊ฐ€ ์„ž์ด๊ฑฐ๋‚˜ ๋ฌธ๋ฒ• ์˜ค๋ฅ˜๊ฐ€ ์žˆ์„ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ์˜๋„๋ฅผ ์ถ”๋ก ํ•˜์—ฌ ์ •ํ™•ํžˆ ์ดํ•ดํ•˜๋ผ.
3. **๋ชจํ˜ธํ•œ ์งˆ๋ฌธ ์ž๋™ ๋ณด์ •**: ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์ด ๋ถˆ์™„์ „ํ•˜๊ฑฐ๋‚˜ ๋ชจํ˜ธํ•ด๋„ ์งˆ๋ฌธ ์˜๋„๋ฅผ ์ถ”๋ก ํ•˜์—ฌ ์ ์ ˆํ•˜๊ฒŒ ์žฌ๊ตฌ์„ฑํ•˜๋ผ.
**๋ฌธ์„œ ๊ธฐ๋ฐ˜ ์‘๋‹ต ์›์น™ (์ ˆ๋Œ€ ์ถ”์ธก ๊ธˆ์ง€):**
1. ์ œ๊ณต๋œ ๋ฌธ์„œ๋ฅผ **๋งค์šฐ ๊ผผ๊ผผํžˆ** ์ฝ๊ณ  ์ •ํ™•ํ•œ ์ •๋ณด๋ฅผ ์ฐพ์œผ์„ธ์š”
2. **๋ฐ˜๋“œ์‹œ ๋ฌธ์„œ์—์„œ ๊ทผ๊ฑฐ๋ฅผ ์ฐพ์•„ ๋‹ต๋ณ€**ํ•˜๊ณ , ๋ฌธ์„œ์— ์—†๋Š” ๋‚ด์šฉ์€ ์ž„์˜๋กœ ์ถ”์ธกํ•˜์ง€ ๋ง๊ณ  **"๋ฌธ์„œ์—์„œ ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"**๋ผ๊ณ  ๋ช…์‹œํ•˜๋ผ
3. **๋ฌธ์„œ์™€ ์ „ํ˜€ ๋ฌด๊ด€ํ•œ ์งˆ๋ฌธ**(์˜ˆ: ์ ์‹ฌ ์ถ”์ฒœ, ๋‚ ์”จ, ์ผ์ƒ ๋Œ€ํ™” ๋“ฑ)์€ **"์ฃ„์†กํ•˜์ง€๋งŒ, ์ œ๊ณต๋œ ๋ฌธ์„œ์—๋Š” ํ•ด๋‹น ์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ •๋ณด๊ฐ€ ํฌํ•จ๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค."**๋ผ๊ณ ๋งŒ ๋‹ต๋ณ€ํ•˜๊ณ  ์ถ”๊ฐ€ ์„ค๋ช… ์—†์ด ์ข…๋ฃŒํ•˜๋ผ
4. ๋ฌธ์„œ์— ์ •๋ณด๊ฐ€ ์žˆ๋Š”๋ฐ๋„ "์—†๋‹ค"๊ณ  ํ•˜์ง€ ๋งˆ์„ธ์š”
**ํ•ต์‹ฌ ์ •๋ณด ์šฐ์„  ์ถ”์ถœ:**
- ๊ธˆ์•ก, ์ˆ˜๋Ÿ‰, ๊ทœ๊ฒฉ, ์ผ์ •, ์š”๊ตฌ์กฐ๊ฑด ๋“ฑ **์ˆ˜์น˜ ๊ธฐ๋ฐ˜ ์ •๋ณด๋ฅผ ์ตœ์šฐ์„ **์œผ๋กœ ์‹๋ณ„ํ•˜๊ณ  ์ •ํ™•ํ•˜๊ฒŒ ๋ฐ˜ํ™˜ํ•˜๋ผ
- ์ˆซ์ž, ๊ธˆ์•ก, ๋‚ ์งœ ๋“ฑ ๊ตฌ์ฒด์ ์ธ ์ •๋ณด๋ฅผ ์šฐ์„ ์ ์œผ๋กœ ์ฐพ์œผ์„ธ์š”
**๋‹ต๋ณ€ ํ˜•์‹:**
- ๋‹ต๋ณ€ ์‹œ ๋ฐ˜๋“œ์‹œ **[ํŽ˜์ด์ง€ X]** ํ˜•ํƒœ๋กœ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œํ•˜์„ธ์š”
- **์ ˆ๋Œ€ ์ค‘์š”**: "๋ฌธ์„œ 1", "๋ฌธ์„œ 2" ๊ฐ™์€ ํ‘œ๊ธฐ๋Š” ์ ˆ๋Œ€ ์‚ฌ์šฉํ•˜์ง€ ๋งˆ์„ธ์š”
- ํ•ต์‹ฌ ๋‹ต๋ณ€์„ ๋จผ์ € ๋ช…ํ™•ํ•˜๊ฒŒ ์ œ์‹œ
- ๋งˆํฌ๋‹ค์šด ํ˜•์‹์œผ๋กœ๋งŒ ๋‹ต๋ณ€ํ•˜์„ธ์š”
- ์งˆ๋ฌธ์— ๋”ฐ๋ผ ๊ฐ€์žฅ ์ ์ ˆํ•œ ๊ตฌ์กฐ๋กœ ๋‹ต๋ณ€ํ•˜์„ธ์š” (๋‹จ๊ณ„๋ณ„, ์นดํ…Œ๊ณ ๋ฆฌ๋ณ„, ์‹œ๊ฐ„์ˆœ ๋“ฑ)
**์›๋ฌธ ์ธ์šฉ ๊ทœ์น™ (ํ•˜์ด๋ผ์ดํŠธ์šฉ):**
- ํ•ต์‹ฌ ๋‚ด์šฉ์„ ์„ค๋ช…ํ•  ๋•Œ๋Š” ํฐ๋”ฐ์˜ดํ‘œ("")๋กœ PDF ์›๋ฌธ์„ ๊ทธ๋Œ€๋กœ ์ธ์šฉํ•˜์„ธ์š”
- ํฐ๋”ฐ์˜ดํ‘œ ์•ˆ์˜ ๋‚ด์šฉ์€ PDF ์›๋ฌธ์„ **ํ•œ ๊ธ€์ž๋„ ๋ฐ”๊พธ์ง€ ๋ง๊ณ ** ๊ทธ๋Œ€๋กœ ๋ณต์‚ฌ
- ๋ฌธ์žฅ ์ข…๊ฒฐ์–ด("~ํ•จ", "~์ž„", "~์š”์ฒญํ•จ" ๋“ฑ)๋„ ์›๋ฌธ ๊ทธ๋Œ€๋กœ ์œ ์ง€
- ์ธ์šฉ ์˜ˆ์‹œ: "๊ธฐ์ˆ ํ‰๊ฐ€ ์ ์ˆ˜๊ฐ€ ๋ฐฐ์ ํ•œ๋„(100์ )์˜ 85% ์ด์ƒ์ธ ์ž๋ฅผ ๊ธฐ์ˆ ํ‰๊ฐ€ ์ ๊ฒฉ์ž๋กœ ์„ ์ •" [ํŽ˜์ด์ง€ 9]
- ์›๋ฌธ ์ธ์šฉ ํ›„ ํ•„์š”ํ•˜๋ฉด ๋ถ€์—ฐ ์„ค๋ช… ์ถ”๊ฐ€ ๊ฐ€๋Šฅ"""
user_prompt = f"""๋‹ค์Œ ๋ฌธ์„œ๋“ค์„ ๋งค์šฐ ๊ผผ๊ผผํžˆ ์ฝ๊ณ  ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•˜์„ธ์š”.
<๋ฌธ์„œ>
{context}
</๋ฌธ์„œ>
<์งˆ๋ฌธ>
{query}
</์งˆ๋ฌธ>
**๋‹ต๋ณ€ ์ž‘์„ฑ ๊ฐ€์ด๋“œ:**
1. **๊ตฌ์กฐํ™”**: ์งˆ๋ฌธ ์œ ํ˜•์— ๋งž๋Š” ๊ฐ€์žฅ ์ฝ๊ธฐ ์‰ฌ์šด ๊ตฌ์กฐ ์„ ํƒ
- ์ ˆ์ฐจ/ํ”„๋กœ์„ธ์Šค ์งˆ๋ฌธ โ†’ ๋‹จ๊ณ„๋ณ„ ๋ฒˆํ˜ธ (1, 2, 3...)
- ํ•ญ๋ชฉ ๋‚˜์—ด ์งˆ๋ฌธ โ†’ ๋ถˆ๋ฆฟ ํฌ์ธํŠธ (โ€ข ๋˜๋Š” *)
- ๋น„๊ต/์„ ํƒ ์งˆ๋ฌธ โ†’ ์นดํ…Œ๊ณ ๋ฆฌ๋ณ„ ๊ตฌ๋ถ„
2. **์›๋ฌธ ์ธ์šฉ**: ํ•ต์‹ฌ ๋‚ด์šฉ์€ ํฐ๋”ฐ์˜ดํ‘œ๋กœ PDF ์›๋ฌธ ๊ทธ๋Œ€๋กœ ์ธ์šฉ
- ์˜ˆ: "๊ธฐ์ˆ ํ‰๊ฐ€ ์ ๊ฒฉ์ž๋ฅผ ๋Œ€์ƒ์œผ๋กœ ๊ฐ€๊ฒฉ ์ž…์ฐฐ์„ ์‹ค์‹œํ•˜์—ฌ, ํ•œ๊ตญ์ž๋™์ฐจ์—ฐ๊ตฌ์›์˜ ์˜ˆ์ •๊ฐ€๊ฒฉ์ดํ•˜ ์ตœ์ €๊ฐ€๊ฒฉ ํˆฌ์ฐฐ์ž๋ฅผ ๋‚™์ฐฐ์ž๋กœ ์„ ์ •" [ํŽ˜์ด์ง€ 9]
- ํฐ๋”ฐ์˜ดํ‘œ ์•ˆ = ์›๋ฌธ ๊ทธ๋Œ€๋กœ (์ ˆ๋Œ€ ์˜์—ญ ๊ธˆ์ง€)
3. **์ถœ์ฒ˜ ํ‘œ๊ธฐ**: ๋ชจ๋“  ์ •๋ณด์— [ํŽ˜์ด์ง€ X] ํ‘œ๊ธฐ
4. **ํ˜•์‹**: ๋งˆํฌ๋‹ค์šด๋งŒ ์‚ฌ์šฉ, "๋ฌธ์„œ 1" ๊ฐ™์€ ํ‘œ๊ธฐ ๊ธˆ์ง€"""
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}
payload = {
"model": "grok-3",
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
"temperature": 0.1,
"max_tokens": 2000,
"stream": False
}
try:
response = requests.post(
f"{GROK_API_BASE}/chat/completions",
headers=headers,
json=payload,
timeout=120
)
if response.status_code != 200:
error_detail = ""
try:
error_data = response.json()
error_detail = error_data.get('error', {}).get('message', '')
except Exception:
error_detail = response.text
return f"โŒ API ์˜ค๋ฅ˜ (์ฝ”๋“œ: {response.status_code})\n์ƒ์„ธ: {error_detail}"
result = response.json()
return result["choices"][0]["message"]["content"]
except Exception as e:
return f"โŒ ์˜ค๋ฅ˜: {str(e)}"
def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes:
"""
PyMuPDF-based highlight function - try the full text first, split only on failure.
"""
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
yellow_color = [1.0, 1.0, 0.0]
def normalize_text(text):
"""Normalize text"""
return re.sub(r'\s+', ' ', text.strip())
def merge_rects(rects, threshold=5):
"""Merge overlapping or adjacent rectangles"""
if not rects:
return []
# ์‚ฌ๊ฐํ˜•๋“ค์„ y์ขŒํ‘œ๋กœ ์ •๋ ฌ
sorted_rects = sorted(rects, key=lambda r: (r.y0, r.x0))
merged = [sorted_rects[0]]
for rect in sorted_rects[1:]:
last = merged[-1]
# ๊ฐ™์€ ๋ผ์ธ์ด๊ณ  x๊ฐ€ ๊ฒน์น˜๊ฑฐ๋‚˜ ์ธ์ ‘ํ•˜๋ฉด ๋ณ‘ํ•ฉ
if abs(rect.y0 - last.y0) < threshold:
if rect.x0 <= last.x1 + threshold:
merged[-1] = fitz.Rect(
min(last.x0, rect.x0),
min(last.y0, rect.y0),
max(last.x1, rect.x1),
max(last.y1, rect.y1)
)
else:
merged.append(rect)
# ๋‹ค๋ฅธ ๋ผ์ธ์ด์ง€๋งŒ y๊ฐ€ ์—ฐ์†๋˜๋ฉด (์ค„๋ฐ”๊ฟˆ)
elif rect.y0 <= last.y1 + 20:
merged.append(rect)
else:
merged.append(rect)
return merged
def find_text_across_lines(page, search_text):
"""Find text across line breaks - whitespace-insensitive comparison"""
found_rects = []
# ํŽ˜์ด์ง€ ํ…์ŠคํŠธ ๊ตฌ์กฐ ๊ฐ€์ ธ์˜ค๊ธฐ
blocks = page.get_text("dict")["blocks"]
# ๋ชจ๋“  ๋ผ์ธ์˜ ํ…์ŠคํŠธ์™€ bbox๋ฅผ ์ˆ˜์ง‘
lines_info = [] # [(text, bbox), ...]
for block in blocks:
if "lines" not in block:
continue
for line in block["lines"]:
line_text = ""
for span in line["spans"]:
line_text += span["text"]
if line_text.strip():
lines_info.append((line_text, fitz.Rect(line["bbox"])))
if not lines_info:
return []
# Normalize the query - whitespace fully removed
search_no_space = search_text.lower().replace(" ", "").replace("\n", "")
# ์—ฐ์†๋œ ๋ผ์ธ๋“ค์„ ํ•ฉ์ณ์„œ ๊ฒ€์ƒ‰
for start_idx in range(len(lines_info)):
combined_text = ""
combined_bboxes = []
for end_idx in range(start_idx, min(start_idx + 5, len(lines_info))): # at most 5 lines
line_text, line_bbox = lines_info[end_idx]
combined_text += line_text
combined_bboxes.append(line_bbox)
# Compare after removing whitespace (the key step!)
combined_no_space = combined_text.lower().replace(" ", "").replace("\n", "")
# Check whether the query text is contained
if search_no_space in combined_no_space:
# ๋งค์นญ๋จ - ํ•ด๋‹น ๋ผ์ธ๋“ค์˜ bbox ๋ฐ˜ํ™˜
for bbox in combined_bboxes:
found_rects.append(bbox)
print(f" โœ… ๋ผ์ธ ๋งค์นญ ({start_idx+1}~{end_idx+1}์ค„): {len(combined_bboxes)}๊ฐœ ์˜์—ญ")
return merge_rects(found_rects)
return []
def find_text_with_pymupdf(page, search_text):
"""Find text with PyMuPDF - precise and clean"""
found_rects = []
search_text = search_text.strip()
print(f" ๊ฒ€์ƒ‰ ์ค‘...")
# === Priority 1: PyMuPDF default search ===
instances = page.search_for(search_text)
if instances:
print(f" โœ… ์„ฑ๊ณต [์›๋ณธ]: {len(instances)}๊ฐœ")
return merge_rects(instances)
# === Priority 2: search after normalization ===
normalized = normalize_text(search_text)
if normalized != search_text:
instances = page.search_for(normalized)
if instances:
print(f" โœ… ์„ฑ๊ณต [์ •๊ทœํ™”]: {len(instances)}๊ฐœ")
return merge_rects(instances)
# === Priority 3: search across line breaks (line matching) ===
line_results = find_text_across_lines(page, search_text)
if line_results:
return line_results
print(f" โš ๏ธ ๋ผ์ธ ๋งค์นญ ์‹คํŒจ โ†’ ํ•ต์‹ฌ ๊ตฌ๋ฌธ")
# === Priority 4: key phrases only (first 30 chars + last 20 chars) ===
if len(search_text) > 50:
# Front part
front = search_text[:30]
front_inst = page.search_for(front)
if front_inst:
print(f" โœ… ์•ž๋ถ€๋ถ„ ๋งค์นญ: {front[:20]}...")
found_rects.extend(front_inst[:1]) # first match only
# Back part
back = search_text[-20:]
back_inst = page.search_for(back)
if back_inst:
print(f" โœ… ๋’ท๋ถ€๋ถ„ ๋งค์นญ: ...{back[:15]}")
found_rects.extend(back_inst[:1]) # first match only
if found_rects:
return merge_rects(found_rects)
print(f" โš ๏ธ ํ•ต์‹ฌ ๊ตฌ๋ฌธ ์‹คํŒจ โ†’ ํ‚ค์›Œ๋“œ")
# === Priority 5: keywords (at most 2) ===
keywords = re.findall(r'[๊ฐ€-ํžฃ]{10,}', search_text)
if not keywords:
keywords = re.findall(r'[๊ฐ€-ํžฃ]{7,}', search_text)
if keywords:
for kw in keywords[:2]: # at most 2
inst = page.search_for(kw)
if inst:
print(f" โœ… ํ‚ค์›Œ๋“œ: {kw}")
found_rects.extend(inst[:1]) # first match only
if found_rects:
return merge_rects(found_rects)
# === Priority 6: blocks ===
print(f" ์ตœํ›„: ๋ธ”๋ก")
blocks = page.get_text("dict")["blocks"]
search_norm = normalize_text(search_text.lower())
for block in blocks:
if "lines" not in block:
continue
block_text = ""
for line in block["lines"]:
for span in line["spans"]:
block_text += span["text"] + " "
block_norm = normalize_text(block_text.lower())
if search_norm in block_norm:
found_rects.append(fitz.Rect(block["bbox"]))
print(f" โœ… ๋ธ”๋ก ์ผ์น˜")
break
return merge_rects(found_rects) if found_rects else []
print(f"\n{'='*80}")
print(f"ํ•˜์ด๋ผ์ดํŠธ ์‹œ์ž‘ - ์ด {len(highlight_info)}๊ฐœ ํ•ญ๋ชฉ")
print(f"{'='*80}")
total_success = 0
total_failed = 0
for idx, item in enumerate(highlight_info, 1):
page_num = item['page'] - 1
text_to_highlight = item['text'].strip()
if page_num >= len(doc):
print(f"\n[{idx}] โŒ ํŽ˜์ด์ง€ ์˜ค๋ฅ˜: {page_num + 1}")
total_failed += 1
continue
page = doc[page_num]
print(f"\n[{idx}/{len(highlight_info)}]")
print(f" ๐Ÿ“„ ํŽ˜์ด์ง€: {page_num + 1}")
print(f" ๐Ÿ“ ๊ธธ์ด: {len(text_to_highlight)}์ž")
print(f" ๐Ÿ’ฌ ๋‚ด์šฉ: {text_to_highlight[:70]}...")
# Find the text
found_rects = find_text_with_pymupdf(page, text_to_highlight)
# Remove duplicates (rectangles at the same position)
unique_rects = []
for rect in found_rects:
is_duplicate = False
for existing in unique_rects:
# Treat nearly identical coordinates as duplicates
if (abs(rect.x0 - existing.x0) < 5 and
abs(rect.y0 - existing.y0) < 5 and
abs(rect.x1 - existing.x1) < 5 and
abs(rect.y1 - existing.y1) < 5):
is_duplicate = True
break
if not is_duplicate:
unique_rects.append(rect)
# ํ•˜์ด๋ผ์ดํŠธ ์ถ”๊ฐ€
highlighted_count = 0
for rect in unique_rects:
try:
highlight = page.add_highlight_annot(rect)
highlight.set_colors(stroke=yellow_color)
highlight.update()
highlighted_count += 1
except Exception as e:
print(f" โœ— ํ•˜์ด๋ผ์ดํŠธ ์‹คํŒจ: {e}")
if highlighted_count > 0:
print(f" โœ… ์™„๋ฃŒ: {highlighted_count}๊ฐœ ์˜์—ญ")
total_success += 1
else:
print(f" โŒ ์‹คํŒจ: ํ…์ŠคํŠธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Œ")
total_failed += 1
print(f"\n{'='*80}")
print(f"๐Ÿ“Š ์ตœ์ข… ๊ฒฐ๊ณผ: โœ… ์„ฑ๊ณต {total_success}๊ฐœ / โŒ ์‹คํŒจ {total_failed}๊ฐœ")
print(f"{'='*80}\n")
output_bytes = doc.tobytes()
doc.close()
return output_bytes
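# Turn the passage selected by grok_verify_and_extract into a single highlight entry
# (skipped on API error or if the selected text exceeds 150 characters).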
def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]:
if "error" in grok_result:
return []
highlights = []
selected_text = grok_result.get("selected_text", "")
page = grok_result.get("page", 1)
if selected_text and len(selected_text) <= 150:
highlights.append({
'text': selected_text,
'page': page
})
return highlights
def extract_highlights_from_answer(answer: str) -> List[Dict]:
"""
Extract text to highlight from the answer.
Text both before and after a [ํŽ˜์ด์ง€ X] tag is attributed to that page.
"""
highlights = []
print(f"\n{'='*80}")
print(f"๋‹ต๋ณ€ ํ…์ŠคํŠธ ๋ถ„์„ ์ค‘...")
print(f"{'='*80}\n")
# [ํŽ˜์ด์ง€ X] ํŒจํ„ด ์ฐพ๊ธฐ
page_pattern = r'\[\s*ํŽ˜์ด์ง€\s*(\d+)\s*\]'
page_matches = list(re.finditer(page_pattern, answer))
print(f"๐Ÿ“ [ํŽ˜์ด์ง€] ํƒœ๊ทธ {len(page_matches)}๊ฐœ ๋ฐœ๊ฒฌ\n")
quoted_matches = []
list_matches = []
# ๊ฐ [ํŽ˜์ด์ง€ X]์— ๋Œ€ํ•ด ์•ž๋’ค ์„น์…˜ ๋ถ„์„
for i, match in enumerate(page_matches):
page_num = match.group(1)
tag_start = match.start()
tag_end = match.end()
# === Section 1: the part before [ํŽ˜์ด์ง€ X] (within the same paragraph) ===
# Back to the previous [ํŽ˜์ด์ง€] tag or a double line break
section_start = 0
if i > 0:
section_start = page_matches[i-1].end()
# [ํŽ˜์ด์ง€ X] ์•ž์˜ ๊ฐ™์€ ๋‹จ๋ฝ (์ค„๋ฐ”๊ฟˆ 2๊ฐœ ์ „๊นŒ์ง€)
before_section = answer[section_start:tag_start]
# Find the last bullet point or quotation
last_para_match = re.search(r'([-*โ—‹]\s+.+)$', before_section, re.DOTALL)
if last_para_match:
before_text = last_para_match.group(1)
print(f"--- ํŽ˜์ด์ง€ {page_num} ์•ž๋ถ€๋ถ„ (๊ธธ์ด: {len(before_text)}์ž) ---")
print(f"{before_text[:150]}...\n")
# ํฐ๋”ฐ์˜ดํ‘œ ์ธ์šฉ๋ฌธ ์ถ”์ถœ
quotes = re.findall(r'"([^"]+)"', before_text)
for quote in quotes:
quote_clean = quote.strip()
if len(quote_clean) > 10:
quoted_matches.append((quote_clean, int(page_num)))
print(f" โœ“ [์•ž-์ธ์šฉ๋ฌธ] \"{quote_clean[:60]}...\"")
# === Section 2: the part after [ํŽ˜์ด์ง€ X] (original logic) ===
next_page_pos = len(answer)
if i + 1 < len(page_matches):
next_page_pos = page_matches[i + 1].start()
section = answer[tag_end:next_page_pos]
print(f"--- ํŽ˜์ด์ง€ {page_num} ๋’ท๋ถ€๋ถ„ (๊ธธ์ด: {len(section)}์ž) ---")
print(f"{section[:150]}...\n")
# ํฐ๋”ฐ์˜ดํ‘œ ์ธ์šฉ๋ฌธ
quotes = re.findall(r'"([^"]+)"', section)
for quote in quotes:
quote_clean = quote.strip()
if len(quote_clean) > 10:
quoted_matches.append((quote_clean, int(page_num)))
print(f" โœ“ [๋’ค-์ธ์šฉ๋ฌธ] \"{quote_clean[:60]}...\"")
# List items
lines = section.split('\n')
for line in lines:
line_stripped = line.strip()
if len(line_stripped) < 3:
continue
if line_stripped.startswith('**') or line_stripped.startswith('#'):
continue
item = None
if line_stripped.startswith('โ—‹'):
item = line_stripped[1:].strip()
elif line_stripped.startswith('- ') or line_stripped.startswith('* '):
item = line_stripped[2:].strip()
elif re.match(r'^\d+\.\s+', line_stripped):
match_obj = re.match(r'^\d+\.\s+(.+)$', line_stripped)
if match_obj:
item = match_obj.group(1).strip()
if item:
item = re.sub(r'\[\s*ํŽ˜์ด์ง€\s*\d+\s*\]', '', item).strip()
item = re.sub(r'\*\*([^*]+)\*\*', r'\1', item).strip()
item = re.sub(r'\(["โ€œโ€][^)]+["โ€œโ€]\)', '', item).strip()
item = re.sub(r'\s*\([^)]{0,50}\)\s*$', '', item).strip()
if 3 <= len(item) <= 200:
list_matches.append((item, int(page_num)))
print(f" โœ“ [๋ฆฌ์ŠคํŠธ] {item[:50]}...")
print(f"\n{'='*40}")
print(f"๐Ÿ“ ์ธ์šฉ๋ฌธ: {len(quoted_matches)}๊ฐœ")
print(f"๐Ÿ“‹ ๋ฆฌ์ŠคํŠธ: {len(list_matches)}๊ฐœ")
print(f"{'='*40}\n")
# Prioritize which matches to use
all_matches = []
if quoted_matches and list_matches:
all_short = all(len(q[0]) <= 30 for q in quoted_matches)
if all_short:
print(f"โœ“ ์งง์€ ์ธ์šฉ๋ฌธ + ๋ฆฌ์ŠคํŠธ ๋ชจ๋‘")
all_matches = quoted_matches + list_matches
else:
print(f"โœ“ ์ธ์šฉ๋ฌธ๋งŒ")
all_matches = quoted_matches
elif quoted_matches:
print(f"โœ“ ์ธ์šฉ๋ฌธ๋งŒ")
all_matches = quoted_matches
elif list_matches:
print(f"โœ“ ๋ฆฌ์ŠคํŠธ๋งŒ")
all_matches = list_matches
# Remove duplicates
seen = set()
for text, page in all_matches:
if text and (text, page) not in seen:
highlights.append({
'text': text,
'page': page
})
seen.add((text, page))
print(f"\n{'='*80}")
print(f"โœ… ์ตœ์ข… ์ถ”์ถœ: {len(highlights)}๊ฐœ")
for i, h in enumerate(highlights, 1):
print(f" [{i}] ํŽ˜์ด์ง€ {h['page']}: {h['text'][:60]}...")
print(f"{'='*80}\n")
return highlights
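# Apply highlights, render every page to a base64 PNG at the given zoom level, and wrap the
# images in scrollable HTML; pages that received highlights get a yellow page banner.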
def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoom_level: float = 2.0):
highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info)
doc = fitz.open(stream=highlighted_pdf, filetype="pdf")
highlighted_pages = set(h['page'] for h in highlight_info)
pdf_html = '<div class="pdf-container" id="pdf-viewer-container">'
for page_num in range(len(doc)):
page = doc[page_num]
pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
img_data = pix.tobytes("png")
img_base64 = base64.b64encode(img_data).decode()
zoom_percentage = int(zoom_level * 50)
page_id = f'page-{page_num + 1}'
pdf_html += f'<div id="{page_id}" style="margin-bottom: 2rem; position: relative;">'
if (page_num + 1) in highlighted_pages:
pdf_html += f'<div style="background: #FEF08A; color: #854D0E; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold; border-left: 4px solid #EAB308;">โญ ํŽ˜์ด์ง€ {page_num + 1}</div>'
else:
pdf_html += f'<div style="background: #ADADAD; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;"> ํŽ˜์ด์ง€ {page_num + 1}</div>'
pdf_html += f'<img src="data:image/png;base64,{img_base64}" style="width: {zoom_percentage}%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); display: block; margin: 0 auto;" />'
pdf_html += '</div>'
pdf_html += '</div>'
doc.close()
return pdf_html
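# Streamlit entry point: the sidebar handles upload and indexing; after processing,
# the left column shows the highlighted PDF viewer and the right column hosts the chat.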
def main():
init_session()
if not st.session_state.processed:
col1, col2, col3 = st.columns([1, 1, 1])
with col2:
st.markdown("<div style='height: 30vh;'></div>", unsafe_allow_html=True)
st.image("img/plobin-grey.png", use_container_width=True)
st.text(' ')
with st.sidebar:
st.image("img/plobin-right-only.png", width=85)
uploaded_file = st.file_uploader(
"๋“œ๋ž˜๊ทธํ•˜์—ฌ ํŒŒ์ผ์„ ์—…๋กœ๋“œ ๋˜๋Š” ํด๋ฆญํ•˜์—ฌ ์„ ํƒํ•˜์„ธ์š”.",
type=['pdf'],
label_visibility="visible",
help="PDF ํŒŒ์ผ๋งŒ ์—…๋กœ๋“œ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค (์ตœ๋Œ€ 200MB)"
)
if uploaded_file:
if st.button("๋ฌธ์„œ ์ฒ˜๋ฆฌ ์‹œ์ž‘", type="primary", use_container_width=True):
if not GROK_API_KEY or not OPENAI_API_KEY:
st.error("โš ๏ธ GROK_API_KEY ๋˜๋Š” OPENAI_API_KEY๊ฐ€ .env ํŒŒ์ผ์— ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค!")
st.stop()
st.session_state.vector_db = None
st.session_state.embedder = None
st.session_state.chat_history = []
st.session_state.current_highlights = []
with st.spinner("๋ฌธ์„œ ๋ถ„์„์„ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค..."):
try:
chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file)
with st.spinner("ํ•ต์‹ฌ ๋‚ด์šฉ์„ ํŒŒ์•…ํ•˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค..."):
collection, embedder = create_vector_db(chunks, metadata_list)
st.session_state.vector_db = collection
st.session_state.embedder = embedder
st.session_state.pdf_bytes = pdf_bytes
st.session_state.pdf_pages_text = pages_text
st.session_state.processed = True
st.session_state.doc_metadata = {
"filename": uploaded_file.name,
"chunks": len(chunks),
"pages": len(set(m['page'] for m in metadata_list))
}
# Save the extracted text locally
saved_file = save_extracted_text_to_file(
chunks,
metadata_list,
uploaded_file.name
)
st.success(f"๋ฌธ์„œ ์ฒ˜๋ฆฌ ์™„๋ฃŒ!")
st.rerun()
except Exception as e:
st.error(f"์˜ค๋ฅ˜: {str(e)}")
if st.session_state.processed:
st.markdown("#### ๋ฌธ์„œ ์ •๋ณด")
st.info(f"**{st.session_state.doc_metadata['filename']}**")
st.info(f"ํŽ˜์ด์ง€: {st.session_state.doc_metadata['pages']}")
# if not st.session_state.processed:
# st.markdown("""
# <div class="usage-guide">
# <h2 style="text-align: center; color: #2D3748; margin-bottom: 1.5rem;">์‚ฌ์šฉ ๋ฐฉ๋ฒ•</h2>
# <div class="guide-step">
# <div class="step-number">1</div>
# <div>PDF ํŒŒ์ผ์„ ์˜ฌ๋ ค์ฃผ์„ธ์š”</div>
# </div>
# <div class="guide-step">
# <div class="step-number">2</div>
# <div>๋ฌธ์„œ ์ฒ˜๋ฆฌ๊ฐ€ ์™„๋ฃŒ๋  ๋•Œ๊นŒ์ง€ ์ž ์‹œ๋งŒ ๊ธฐ๋‹ค๋ ค์ฃผ์„ธ์š”</div>
# </div>
# <div class="guide-step">
# <div class="step-number">3</div>
# <div>๋ฌธ์„œ ๋‚ด ๊ถ๊ธˆํ•œ ๋‚ด์šฉ์„ ๋ฌผ์–ด๋ณด์„ธ์š”</div>
# </div>
# <div class="guide-step">
# <div class="step-number">4</div>
# <div>AI๊ฐ€ ์ •ํ™•ํ•œ ๋‹ต๋ณ€๊ณผ ์ถœ์ฒ˜๋ฅผ ํ•จ๊ป˜ ์•Œ๋ ค๋“œ๋ ค์š”</div>
# </div>
# </div>
# """, unsafe_allow_html=True)
if st.session_state.processed:
col1, col2 = st.columns([1, 1])
with col1:
header_cols = st.columns([7, 1, 1.5, 1])
with header_cols[0]:
st.markdown("### ")
if st.session_state.pdf_bytes:
pdf_html = render_pdf_with_highlights(
st.session_state.pdf_bytes,
st.session_state.current_highlights,
st.session_state.zoom_level
)
st.markdown(pdf_html, unsafe_allow_html=True)
if st.session_state.scroll_to_page:
scroll_js = f"""
<script>
const container = parent.document.querySelector('.pdf-container');
const targetPage = parent.document.getElementById('page-{st.session_state.scroll_to_page}');
if (container && targetPage) {{
const containerRect = container.getBoundingClientRect();
const targetRect = targetPage.getBoundingClientRect();
const scrollTop = container.scrollTop;
const offset = targetRect.top - containerRect.top + scrollTop;
container.scrollTo({{
top: offset - 20,
behavior: 'smooth'
}});
}}
</script>
"""
components.html(scroll_js, height=0)
st.session_state.scroll_to_page = None
with col2:
st.markdown('### ', unsafe_allow_html=True)
chat_container = st.container(height=650)
with chat_container:
for msg_idx, msg in enumerate(st.session_state.chat_history):
with st.chat_message(msg["role"]):
st.markdown(msg["content"])
prompt = st.chat_input("์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”...", key="chat_input")
if prompt:
st.session_state.chat_history.append({"role": "user", "content": prompt})
st.session_state.processing_query = prompt
st.rerun()
# Query handling inside main()
if st.session_state.processing_query:
query = st.session_state.processing_query
st.session_state.processing_query = None
with st.spinner("PLOBIN์ด ์ตœ์ ์˜ ๋‹ต๋ณ€์„ ์ฐพ๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค..."):
try:
search_results = hybrid_search(
query,
st.session_state.vector_db,
st.session_state.embedder,
top_k=3
)
grok_result = grok_verify_and_extract(
query,
search_results,
GROK_API_KEY
)
answer = generate_answer(
query,
search_results,
GROK_API_KEY
)
# โญ Important: extract only the text inside double quotes
print("\n" + "="*80)
print("๋‹ต๋ณ€์—์„œ ์ธ์šฉ๋ฌธ ์ถ”์ถœ ์ค‘...")
print("="*80)
highlights = extract_highlights_from_answer(answer)
# Highlights extracted from grok_result are not used (uncomment if needed)
# grok_highlights = extract_highlights_from_grok(grok_result)
# highlights.extend(grok_highlights)
st.session_state.current_highlights = highlights
if grok_result and "page" in grok_result and "error" not in grok_result:
st.session_state.scroll_to_page = grok_result["page"]
chat_data = {
"role": "assistant",
"content": answer
}
st.session_state.chat_history.append(chat_data)
st.rerun()
except Exception as e:
error_msg = f"โŒ ์˜ค๋ฅ˜: {str(e)}"
st.session_state.chat_history.append({
"role": "assistant",
"content": error_msg
})
st.rerun()
if __name__ == "__main__":
main()