lds204 committed on
Commit
170ba17
•
1 Parent(s): 11f8ff9
Files changed (3) hide show
  1. README.md +4 -4
  2. app.py +27 -31
  3. requirements.txt +0 -1
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Basic DAG AI Chatbot With ChatGPT
3
- emoji: 📚
4
- colorFrom: purple
5
- colorTo: purple
6
  sdk: streamlit
7
  sdk_version: 1.27.2
8
  app_file: app.py
 
1
  ---
2
+ title: Basic DAG AI Chatbot With Llama2
3
+ emoji: 🔥
4
+ colorFrom: green
5
+ colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.27.2
8
  app_file: app.py
app.py CHANGED
@@ -1,19 +1,16 @@
1
  import streamlit as st
2
  from dotenv import load_dotenv
3
- from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
- from langchain.vectorstores import FAISS, Chroma
7
  from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
8
- from langchain.chat_models import ChatOpenAI
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
  from htmlTemplates import css, bot_template, user_template
12
- from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers # For loading transformer models.
13
  from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
14
  import tempfile # 임시 파일을 생성하기 위한 라이브러리입니다.
15
  import os
16
-
17
 
18
  # PDF 문서로부터 텍스트를 추출하는 함수입니다.
19
  def get_pdf_text(pdf_docs):
@@ -27,11 +24,9 @@ def get_pdf_text(pdf_docs):
27
 
28
  # 과제
29
  # 아래 텍스트 추출 함수를 작성
30
-
31
  def get_text_file(docs):
32
- pass
33
-
34
-
35
  def get_csv_file(docs):
36
  pass
37
 
@@ -42,29 +37,33 @@ def get_json_file(docs):
42
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
43
  def get_text_chunks(documents):
44
  text_splitter = RecursiveCharacterTextSplitter(
45
- chunk_size=1000, # 청크의 크기를 지정합니다.
46
- chunk_overlap=200, # 청크 사이의 중복을 지정합니다.
47
- length_function=len # 텍스트의 길이를 측정하는 함수를 지정합니다.
48
  )
49
 
50
- documents = text_splitter.split_documents(documents) # 문서들을 청크로 나눕니다
51
- return documents # 나눈 청크를 반환합니다.
52
 
53
 
54
  # ν…μŠ€νŠΈ μ²­ν¬λ“€λ‘œλΆ€ν„° 벑터 μŠ€ν† μ–΄λ₯Ό μƒμ„±ν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
55
  def get_vectorstore(text_chunks):
56
- # OpenAI 임베딩 모델을 로드합니다. (Embedding models - Ada v2)
57
-
58
- embeddings = OpenAIEmbeddings()
59
- vectorstore = FAISS.from_documents(text_chunks, embeddings) # FAISS 벡터 스토어를 생성합니다.
60
-
61
- return vectorstore # 생성된 벡터 스토어를 반환합니다.
62
 
63
 
64
  def get_conversation_chain(vectorstore):
65
- gpt_model_name = 'gpt-3.5-turbo'
66
- llm = ChatOpenAI(model_name = gpt_model_name) #gpt-3.5 모델 로드
67
-
 
 
 
 
 
68
  # 대화 기록을 저장하기 위한 메모리를 생성합니다.
69
  memory = ConversationBufferMemory(
70
  memory_key='chat_history', return_messages=True)
@@ -74,10 +73,11 @@ def get_conversation_chain(vectorstore):
74
  retriever=vectorstore.as_retriever(),
75
  memory=memory
76
  )
77
- return conversation_chain
78
 
79
  # 사용자 입력을 처리하는 함수입니다.
80
  def handle_userinput(user_question):
 
81
  # 대화 체인을 사용하여 사용자 질문에 대한 응답을 생성합니다.
82
  response = st.session_state.conversation({'question': user_question})
83
  # 대화 기록을 저장합니다.
@@ -103,16 +103,12 @@ def main():
103
  if "chat_history" not in st.session_state:
104
  st.session_state.chat_history = None
105
 
106
- st.header("Chat with multiple Files :")
107
  user_question = st.text_input("Ask a question about your documents:")
108
  if user_question:
109
  handle_userinput(user_question)
110
 
111
  with st.sidebar:
112
- openai_key = st.text_input("Paste your OpenAI API key (sk-...)")
113
- if openai_key:
114
- os.environ["OPENAI_API_KEY"] = openai_key
115
-
116
  st.subheader("Your documents")
117
  docs = st.file_uploader(
118
  "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
@@ -148,4 +144,4 @@ def main():
148
 
149
 
150
  if __name__ == '__main__':
151
- main()
 
1
  import streamlit as st
2
  from dotenv import load_dotenv
 
3
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import FAISS
 
5
  from langchain.embeddings import HuggingFaceEmbeddings # General embeddings from HuggingFace models.
 
6
  from langchain.memory import ConversationBufferMemory
7
  from langchain.chains import ConversationalRetrievalChain
8
  from htmlTemplates import css, bot_template, user_template
9
+ from langchain.llms import LlamaCpp # For loading transformer models.
10
  from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
11
  import tempfile # 임시 파일을 생성하기 위한 라이브러리입니다.
12
  import os
13
+ from huggingface_hub import hf_hub_download # Hugging Face Hub에서 모델을 다운로드하기 위한 함수입니다.
14
 
15
  # PDF 문서로부터 텍스트를 추출하는 함수입니다.
16
  def get_pdf_text(pdf_docs):
 
24
 
25
  # 과제
26
  # 아래 텍스트 추출 함수를 작성
 
27
  def get_text_file(docs):
28
+ pass
29
+
 
30
  def get_csv_file(docs):
31
  pass
32
 
 
37
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
38
  def get_text_chunks(documents):
39
  text_splitter = RecursiveCharacterTextSplitter(
40
+ chunk_size=1000, # 청크의 크기를 지정합니다.
41
+ chunk_overlap=200, # 청크 사이의 중복을 지정합니다.
42
+ length_function=len # 텍스트의 길이를 측정하는 함수를 지정합니다.
43
  )
44
 
45
+ documents = text_splitter.split_documents(documents) # 문서들을 청크로 나눕니다.
46
+ return documents # 나눈 청크를 반환합니다.
47
 
48
 
49
  # ν…μŠ€νŠΈ μ²­ν¬λ“€λ‘œλΆ€ν„° 벑터 μŠ€ν† μ–΄λ₯Ό μƒμ„±ν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
50
  def get_vectorstore(text_chunks):
51
+ # μ›ν•˜λŠ” μž„λ² λ”© λͺ¨λΈμ„ λ‘œλ“œν•©λ‹ˆλ‹€.
52
+ embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L12-v2',
53
+ model_kwargs={'device': 'cpu'}) # 임베딩 모델을 설정합니다.
54
+ vectorstore = FAISS.from_documents(text_chunks, embeddings) # FAISS 벡터 스토어를 생성합니다.
55
+ return vectorstore # 생성된 벡터 스토어를 반환합니다.
 
56
 
57
 
58
  def get_conversation_chain(vectorstore):
59
+ model_name_or_path = 'TheBloke/Llama-2-7B-chat-GGUF'
60
+ model_basename = 'llama-2-7b-chat.Q2_K.gguf'
61
+ model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
62
+
63
+ llm = LlamaCpp(model_path=model_path,
64
+ n_ctx=4086,
65
+ input={"temperature": 0.75, "max_length": 2000, "top_p": 1},
66
+ verbose=True, )
67
  # 대화 기록을 저장하기 위한 메모리를 생성합니다.
68
  memory = ConversationBufferMemory(
69
  memory_key='chat_history', return_messages=True)
 
73
  retriever=vectorstore.as_retriever(),
74
  memory=memory
75
  )
76
+ return conversation_chain # 생성된 대화 체인을 반환합니다.
77
 
78
  # 사용자 입력을 처리하는 함수입니다.
79
  def handle_userinput(user_question):
80
+ print('user_question => ', user_question)
81
  # 대화 체인을 사용하여 사용자 질문에 대한 응답을 생성합니다.
82
  response = st.session_state.conversation({'question': user_question})
83
  # 대화 기록을 저장합니다.
 
103
  if "chat_history" not in st.session_state:
104
  st.session_state.chat_history = None
105
 
106
+ st.header("Chat with multiple Files:")
107
  user_question = st.text_input("Ask a question about your documents:")
108
  if user_question:
109
  handle_userinput(user_question)
110
 
111
  with st.sidebar:
 
 
 
 
112
  st.subheader("Your documents")
113
  docs = st.file_uploader(
114
  "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
 
144
 
145
 
146
  if __name__ == '__main__':
147
+ main()
requirements.txt CHANGED
@@ -11,4 +11,3 @@ streamlit-extras
11
  InstructorEmbedding
12
  sentence-transformers
13
  jq
14
- openai
 
11
  InstructorEmbedding
12
  sentence-transformers
13
  jq