# File size: 8,464 Bytes
# 6169786
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import gradio as gr
import bs4

from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

# Read the Groq API key from the GROQ_API_KEY environment variable
# (empty string when the variable is not set).
groq_api_key = os.getenv("GROQ_API_KEY", "")

# National Archives (archives.go.kr) web pages to load, in load order.
_ARCHIVES_ROOT = "https://archives.go.kr/next"

# `id` query values for the listSubjectDescription.do pages.
_SUBJECT_IDS = (
    "003140", "003288", "003290", "003292", "008757",
    "003293", "003294", "003295", "003289", "010816",
    "010817", "009154", "003260", "003278", "003281",
    "003283", "003284", "003280", "003282", "003287",
    "003286", "003285", "003279", "003141", "003143",
    "003144", "003142", "008653", "010827", "008582",
    "008663", "008581", "010828", "010830", "010831",
    "003145", "009425", "003146", "010821", "003151",
    "003149", "003148", "008655", "008654", "003150",
)

# `guideSeq` query values for the searchGuideDetail.do pages.
_GUIDE_SEQS = (441, 381, 341, 261, 227, 59, 30, 64, 321, 124, 267, 141, 149, 22)

urls = [
    _ARCHIVES_ROOT + "/newsearch/listSubjectContent.do?subjectFieldId=000011",
    *(
        f"{_ARCHIVES_ROOT}/newsearch/listSubjectDescription.do"
        f"?id={subject_id}&pageFlag=A&sitePage=1-2-1"
        for subject_id in _SUBJECT_IDS
    ),
    _ARCHIVES_ROOT + "/newmanager/recodeRegister.do",
    _ARCHIVES_ROOT + "/newtour/tourCourse.do",
    _ARCHIVES_ROOT + "/newrecordsMngPro/recordsDonateInfo.do",
    _ARCHIVES_ROOT + "/newdata/pepoleRecodPresentIntro.do",
    _ARCHIVES_ROOT + "/newsearch/searchGuideList.do",
    _ARCHIVES_ROOT + "/newsearch/searchGuideList.do?page=2",
    *(
        f"{_ARCHIVES_ROOT}/newsearch/searchGuideDetail.do?guideSeq={guide_seq}"
        for guide_seq in _GUIDE_SEQS
    ),
]

# Load the web documents listed in `urls`. The SoupStrainer is constructed
# without any filter arguments.
loader = WebBaseLoader(
    web_paths=urls,
    bs_kwargs={"parse_only": bs4.SoupStrainer()},
)
docs = loader.load()

# Split the loaded documents on newlines (chunk_size=500, chunk_overlap=50).
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=50,
)
split_docs = splitter.split_documents(docs)

# Embed the chunks, store them in a FAISS vector store, and expose it as a retriever.
embedding_model = HuggingFaceEmbeddings(
    model_name="snunlp/KR-SBERT-V40K-klueNLI-augSTS",
)
vectorstore = FAISS.from_documents(split_docs, embedding_model)
retriever = vectorstore.as_retriever()

# LLM + QA chain: a Groq chat model wired to the retriever with chain_type="stuff".
llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192")
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

# Gradio chat handler
def chat_with_history(user_input, history):
    """Answer one chat message through the QA chain and record the exchange.

    Args:
        user_input: Text submitted from the message box.
        history: List of ``(user_message, answer)`` tuples accumulated so far,
            or ``None`` to start from an empty list.

    Returns:
        A 3-tuple ``("", history, history)``: the empty string that replaces
        the submitted text, followed by the updated history twice (one copy
        per output component wired to this handler).
    """
    if history is None:
        history = []

    question = (user_input or "").strip()
    if not question:
        # Blank submission: skip the retrieval/LLM round-trip and leave the
        # conversation untouched.
        return "", history, history

    # The appended suffix means "Please answer in Korean."
    query = question + " 한국어로 답해주세요."
    # `Chain.__call__` is deprecated in LangChain; `invoke` is its replacement.
    result = qa_chain.invoke({"query": query})
    # Fallback text means "No answer could be found."; it is also used when
    # the chain returns an empty answer string.
    answer = result.get("result") or "답변을 찾을 수 없습니다."
    history.append((user_input, answer))
    return "", history, history

# Gradio interface layout
with gr.Blocks() as demo:
    # Heading text: "National Archives information chatbot".
    gr.Markdown("## 📚 국가기록원 정보 챗봇")
    # Label: "Archives chatbot".
    chatbot = gr.Chatbot(label="기록원 챗봇")
    # Placeholder: "Enter your question"; label: "Question input".
    msg = gr.Textbox(placeholder="질문을 입력하세요", label="💬 질문 입력")
    # Conversation history handed to chat_with_history; starts as an empty list.
    state = gr.State([])
    # Submitting the textbox calls the handler, whose three return values
    # update the textbox, the chatbot display, and the stored history.
    msg.submit(chat_with_history, inputs=[msg, state], outputs=[msg, chatbot, state])

demo.launch()