tony346 committed on
Commit
897ec15
•
1 Parent(s): 38e323c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -46
app.py CHANGED
@@ -11,78 +11,64 @@ from langchain.chains import ConversationalRetrievalChain
11
  from htmlTemplates import css, bot_template, user_template
12
  from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loading transformer models.
13
  from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
14
- from tempfile import NamedTemporaryFile
15
  import os
16
 
17
 
 
18
  def get_pdf_text(pdf_docs):
19
- with NamedTemporaryFile() as temp_file:
20
- temp_file.write(pdf_docs.getvalue())
21
- temp_file.seek(0)
22
- pdf_loader = PyPDFLoader(temp_file.name)
23
- # print('pdf_loader = ', pdf_loader)
24
- pdf_doc = pdf_loader.load()
25
- # print('pdf_doc = ',pdf_doc)
26
- return pdf_doc
27
 
 
 
28
 
29
  def get_text_file(docs):
30
- with NamedTemporaryFile() as temp_file:
31
- temp_file.write(docs.getvalue())
32
- temp_file.seek(0)
33
- text_loader = TextLoader(temp_file.name)
34
- text_doc = text_loader.load()
35
-
36
- return text_doc
37
 
38
 
39
  def get_csv_file(docs):
40
- with NamedTemporaryFile() as temp_file:
41
- temp_file.write(docs.getvalue())
42
- temp_file.seek(0)
43
- text_loader = CSVLoader(temp_file.name)
44
- text_doc = text_loader.load()
45
-
46
- return text_doc
47
-
48
 
49
  def get_json_file(docs):
50
- with NamedTemporaryFile() as temp_file:
51
- temp_file.write(docs.getvalue())
52
- temp_file.seek(0)
53
- json_loader = JSONLoader(temp_file.name,
54
- jq_schema='.scans[].relationships',
55
- text_content=False)
56
- json_doc = json_loader.load()
57
-
58
- return json_doc
59
-
60
 
 
 
61
  def get_text_chunks(documents):
62
  text_splitter = RecursiveCharacterTextSplitter(
63
- chunk_size=1000,
64
- chunk_overlap=200,
65
- length_function=len
66
  )
67
 
68
- documents = text_splitter.split_documents(documents)
69
- return documents
70
 
71
 
 
72
  def get_vectorstore(text_chunks):
73
- # Load the desired embeddings model.
74
 
75
  embeddings = OpenAIEmbeddings()
76
- vectorstore = FAISS.from_documents(text_chunks, embeddings)
77
 
78
- return vectorstore
79
 
80
 
81
  def get_conversation_chain(vectorstore):
82
  gpt_model_name = 'gpt-3.5-turbo'
83
- llm = ChatOpenAI(model_name = gpt_model_name)
 
 
84
  memory = ConversationBufferMemory(
85
  memory_key='chat_history', return_messages=True)
 
86
  conversation_chain = ConversationalRetrievalChain.from_llm(
87
  llm=llm,
88
  retriever=vectorstore.as_retriever(),
@@ -90,9 +76,11 @@ def get_conversation_chain(vectorstore):
90
  )
91
  return conversation_chain
92
 
93
-
94
  def handle_userinput(user_question):
 
95
  response = st.session_state.conversation({'question': user_question})
 
96
  st.session_state.chat_history = response['chat_history']
97
 
98
  for i, message in enumerate(st.session_state.chat_history):
@@ -106,7 +94,7 @@ def handle_userinput(user_question):
106
 
107
  def main():
108
  load_dotenv()
109
- st.set_page_config(page_title="Chat with multiple PDFs",
110
  page_icon=":books:")
111
  st.write(css, unsafe_allow_html=True)
112
 
@@ -115,7 +103,7 @@ def main():
115
  if "chat_history" not in st.session_state:
116
  st.session_state.chat_history = None
117
 
118
- st.header("Chat with multiple PDFs :books:")
119
  user_question = st.text_input("Ask a question about your documents:")
120
  if user_question:
121
  handle_userinput(user_question)
 
11
  from htmlTemplates import css, bot_template, user_template
12
  from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loading transformer models.
13
  from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
14
+ import tempfile # 임시 파일을 생성하기 위한 라이브러리입니다.
15
  import os
16
 
17
 
18
+ # PDF 문서로부터 텍스트를 추출하는 함수입니다.
19
  def get_pdf_text(pdf_docs):
20
+ temp_dir = tempfile.TemporaryDirectory() # 임시 디렉토리를 생성합니다.
21
+ temp_filepath = os.path.join(temp_dir.name, pdf_docs.name) # 임시 파일 경로를 생성합니다.
22
+ with open(temp_filepath, "wb") as f: # 임시 파일을 바이너리 쓰기 모드로 엽니다.
23
+ f.write(pdf_docs.getvalue()) # PDF 문서의 내용을 임시 파일에 씁니다.
24
+ pdf_loader = PyPDFLoader(temp_filepath) # PyPDFLoader를 사용해 PDF를 로드합니다.
25
+ pdf_doc = pdf_loader.load() # 텍스트를 추출합니다.
26
+ return pdf_doc # 추출한 텍스트를 반환합니다.
 
27
 
28
+ # 과제
29
+ # 아래 텍스트 추출 함수를 작성
30
 
31
  def get_text_file(docs):
32
+ pass
 
 
 
 
 
 
33
 
34
 
35
  def get_csv_file(docs):
36
+ pass
 
 
 
 
 
 
 
37
 
38
  def get_json_file(docs):
39
+ pass
 
 
 
 
 
 
 
 
 
40
 
41
+
42
+ # λ¬Έμ„œλ“€μ„ μ²˜λ¦¬ν•˜μ—¬ ν…μŠ€νŠΈ 청크둜 λ‚˜λˆ„λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
43
  def get_text_chunks(documents):
44
  text_splitter = RecursiveCharacterTextSplitter(
45
+ chunk_size=1000, # 청크의 크기를 지정합니다.
46
+ chunk_overlap=200, # 청크 사이의 중복을 지정합니다.
47
+ length_function=len # 텍스트의 길이를 측정하는 함수를 지정합니다.
48
  )
49
 
50
+ documents = text_splitter.split_documents(documents) # 문서들을 청크로 나눕니다
51
+ return documents # 나눈 청크를 반환합니다.
52
 
53
 
54
+ # ν…μŠ€νŠΈ μ²­ν¬λ“€λ‘œλΆ€ν„° 벑터 μŠ€ν† μ–΄λ₯Ό μƒμ„±ν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
55
  def get_vectorstore(text_chunks):
56
+ # OpenAI 임베딩 모델을 로드합니다. (Embedding models - Ada v2)
57
 
58
  embeddings = OpenAIEmbeddings()
59
+ vectorstore = FAISS.from_documents(text_chunks, embeddings) # FAISS 벡터 스토어를 생성합니다.
60
 
61
+ return vectorstore # 생성된 벡터 스토어를 반환합니다.
62
 
63
 
64
  def get_conversation_chain(vectorstore):
65
  gpt_model_name = 'gpt-3.5-turbo'
66
+ llm = ChatOpenAI(model_name = gpt_model_name) #gpt-3.5 모델 로드
67
+
68
+ # 대화 기록을 저장하기 위한 메모리를 생성합니다.
69
  memory = ConversationBufferMemory(
70
  memory_key='chat_history', return_messages=True)
71
+ # 대화 검색 체인을 생성합니다.
72
  conversation_chain = ConversationalRetrievalChain.from_llm(
73
  llm=llm,
74
  retriever=vectorstore.as_retriever(),
 
76
  )
77
  return conversation_chain
78
 
79
+ # 사용자 입력을 처리하는 함수입니다.
80
  def handle_userinput(user_question):
81
+ # 대화 체인을 사용하여 사용자 질문에 대한 응답을 생성합니다.
82
  response = st.session_state.conversation({'question': user_question})
83
+ # 대화 기록을 저장합니다.
84
  st.session_state.chat_history = response['chat_history']
85
 
86
  for i, message in enumerate(st.session_state.chat_history):
 
94
 
95
  def main():
96
  load_dotenv()
97
+ st.set_page_config(page_title="Chat with multiple Files",
98
  page_icon=":books:")
99
  st.write(css, unsafe_allow_html=True)
100
 
 
103
  if "chat_history" not in st.session_state:
104
  st.session_state.chat_history = None
105
 
106
+ st.header("Chat with multiple Files :")
107
  user_question = st.text_input("Ask a question about your documents:")
108
  if user_question:
109
  handle_userinput(user_question)