File size: 7,753 Bytes
80f4fb3 897ec15 80f4fb3 897ec15 80f4fb3 897ec15 80f4fb3 897ec15 80f4fb3 ec113e6 80f4fb3 ec113e6 80f4fb3 ec113e6 80f4fb3 897ec15 80f4fb3 f1ec1d5 80f4fb3 f1ec1d5 af69459 f1ec1d5 80f4fb3 af69459 897ec15 80f4fb3 897ec15 80f4fb3 897ec15 80f4fb3 897ec15 80f4fb3 1138359 780971d ba021ee 80f4fb3 897ec15 80f4fb3 897ec15 80f4fb3 897ec15 80f4fb3 b701d33 80f4fb3 780971d 80f4fb3 897ec15 80f4fb3 f5f9605 80f4fb3 f5f9605 80f4fb3 f5f9605 80f4fb3 f5f9605 80f4fb3 f5f9605 80f4fb3 f5f9605 80f4fb3 f5f9605 80f4fb3 f5f9605 80f4fb3 f5f9605 80f4fb3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loading transformer models.
from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
import tempfile # μμ νμΌμ μμ±νκΈ° μν λΌμ΄λΈλ¬λ¦¬μ
λλ€.
import os
# Extract text from an uploaded PDF document.
def get_pdf_text(pdf_docs):
    """Write the uploaded PDF to a temp file and load it with PyPDFLoader.

    Args:
        pdf_docs: a Streamlit UploadedFile containing PDF bytes.

    Returns:
        List of langchain Documents (one per page) as produced by
        PyPDFLoader.load().
    """
    # Use a context manager so the temporary directory is removed
    # deterministically, instead of waiting for garbage collection
    # of the TemporaryDirectory object.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_filepath = os.path.join(temp_dir, pdf_docs.name)
        # PyPDFLoader needs a filesystem path, so persist the in-memory
        # upload first.
        with open(temp_filepath, "wb") as f:
            f.write(pdf_docs.getvalue())
        pdf_loader = PyPDFLoader(temp_filepath)
        pdf_doc = pdf_loader.load()  # loads all pages before the dir is cleaned up
    return pdf_doc
# Assignment: implement the text extraction functions below.
def get_text_file(docs):
    """Extract text from an uploaded plain-text (.txt) file.

    Args:
        docs: a Streamlit UploadedFile.

    Returns:
        A one-element list holding the decoded file content, or an empty
        list when the file is not text/plain (a warning is shown).
    """
    if docs.type == 'text/plain':
        # errors='replace' keeps a partially corrupt upload from crashing the app.
        return [docs.getvalue().decode('utf-8', errors='replace')]
    st.warning("Unsupported file type for get_text_file")
    # BUGFIX: return [] instead of an implicit None so callers can
    # safely doc_list.extend() the result.
    return []
def get_csv_file(docs):
    """Extract text from an uploaded CSV (.csv) file.

    Args:
        docs: a Streamlit UploadedFile.

    Returns:
        A list with one space-joined string per CSV row, or an empty
        list when the file is not text/csv (a warning is shown).
    """
    if docs.type == 'text/csv':
        # BUGFIX: CSVLoader requires a filesystem path and crashes on an
        # in-memory UploadedFile; parse the upload with the stdlib csv
        # module instead.
        import csv
        import io
        text = docs.getvalue().decode('utf-8', errors='replace')
        reader = csv.reader(io.StringIO(text))
        return [' '.join(map(str, row)) for row in reader]
    st.warning("Unsupported file type for get_csv_file")
    # Empty list (not None) so callers can safely extend().
    return []
def get_json_file(docs):
    """Extract text from an uploaded JSON (.json) file.

    Args:
        docs: a Streamlit UploadedFile.

    Returns:
        A list of JSON-serialized strings: one per element when the
        top-level value is an array, otherwise a single string for the
        whole document. Empty list (plus a warning) for non-JSON uploads.
    """
    if docs.type == 'application/json':
        # BUGFIX: the original called json.dumps without ever importing
        # json (NameError), and JSONLoader requires a file path plus a jq
        # schema; the stdlib json module handles the in-memory upload.
        import json
        data = json.loads(docs.getvalue().decode('utf-8'))
        items = data if isinstance(data, list) else [data]
        return [json.dumps(item, ensure_ascii=False) for item in items]
    st.warning("Unsupported file type for get_json_file")
    # Empty list (not None) so callers can safely extend().
    return []
# Split the extracted documents into overlapping text chunks.
def get_text_chunks(documents):
    """Split a mixed list of langchain Documents and plain strings into chunks.

    Args:
        documents: iterable containing langchain Documents and/or str.

    Returns:
        List of langchain Documents of at most ~1000 characters with
        200 characters of overlap.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    # Normalize every entry to plain text; entries of any other type
    # are silently skipped (same as the original behavior).
    texts = []
    for doc in documents:
        if hasattr(doc, 'page_content'):
            texts.append(doc.page_content)
        elif isinstance(doc, str):
            texts.append(doc)
    # BUGFIX: split_documents() expects Document objects and raises on
    # plain strings; create_documents() accepts raw texts and wraps the
    # chunks as Documents — which FAISS.from_documents needs downstream.
    return text_splitter.create_documents(texts)
# Build a vector store from the text chunks.
def get_vectorstore(text_chunks):
    """Embed the chunks with OpenAI (Embedding models - Ada v2) and index them in FAISS."""
    return FAISS.from_documents(text_chunks, OpenAIEmbeddings())
def get_conversation_chain(vectorstore):
    """Create (or refresh) the conversational retrieval chain.

    Args:
        vectorstore: FAISS store built over the freshly processed documents.

    Returns:
        The chain stored in st.session_state.conversation, or None if
        initialization failed and no previous chain exists.
    """
    try:
        # BUGFIX: rebuild on every call so newly processed documents are
        # actually retrieved; the old `is None` guard kept the first chain
        # (and its stale retriever) forever. NOTE(review): this resets the
        # conversation memory on each Process — intended, since the
        # document set changed.
        llm = ChatOpenAI(model_name='gpt-3.5-turbo')
        # Memory holds the chat history for follow-up questions.
        memory = ConversationBufferMemory(
            memory_key='chat_history', return_messages=True)
        st.session_state.conversation = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            memory=memory
        )
    except Exception as e:
        # Keep any previously working chain; report the failure.
        print(f"Error during conversation initialization: {e}")
    # BUGFIX: the old fallback `ConversationalRetrievalChain()` itself
    # raises (the chain has required constructor fields); return whatever
    # is in session state instead.
    return st.session_state.conversation
# Handle a question typed by the user.
def handle_userinput(user_question):
    """Run the question through the conversation chain and render the chat."""
    # Ask the conversational retrieval chain for an answer.
    response = st.session_state.conversation({'question': user_question})
    # Persist the updated history in session state.
    st.session_state.chat_history = response['chat_history']
    # Even indices hold user turns, odd indices hold bot replies.
    for idx, message in enumerate(st.session_state.chat_history):
        template = user_template if idx % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content),
                 unsafe_allow_html=True)
def main():
    """Streamlit entry point: upload documents, build the index, then chat."""
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple Files :)",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # First run (or reset): make sure both session keys exist.
    if "conversation" not in st.session_state or st.session_state.conversation is None:
        st.session_state.conversation = None
        st.session_state.chat_history = None

    st.header("Chat with multiple Files :")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        openai_key = st.text_input("Paste your OpenAI API key (sk-...)")
        if openai_key:
            os.environ["OPENAI_API_KEY"] = openai_key

        st.subheader("Your documents")
        docs = st.file_uploader(
            "Upload your documents here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # MIME type -> extraction function (PDFs may arrive as
                # application/octet-stream depending on the browser).
                extractors = {
                    'text/plain': get_text_file,
                    'text/csv': get_csv_file,
                    'application/json': get_json_file,
                    'application/octet-stream': get_pdf_text,
                    'application/pdf': get_pdf_text,
                }
                # Collect the extracted text from every uploaded file;
                # unrecognized types are skipped, as before.
                doc_list = []
                for file in docs:
                    extract = extractors.get(file.type)
                    if extract is not None:
                        doc_list.extend(extract(file))
                # Chunk, embed, and wire up the conversation chain.
                text_chunks = get_text_chunks(doc_list)
                vectorstore = get_vectorstore(text_chunks)
                st.session_state.conversation = get_conversation_chain(vectorstore)


if __name__ == '__main__':
    main()
|