RAHMAN00700 committed on
Commit
0de8564
·
1 Parent(s): 123e33d

changes made in repo

Browse files
Files changed (6) hide show
  1. app.py +3 -0
  2. app1.py +0 -176
  3. appcsvhtml.py +0 -220
  4. appfinal.py +0 -193
  5. appfinalokokok.py +0 -199
  6. sample env.txt +2 -0
app.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
5
  import json
6
  import xml.etree.ElementTree as ET
7
  import yaml
 
8
  from bs4 import BeautifulSoup
9
  from pptx import Presentation
10
  from docx import Document
@@ -124,6 +125,8 @@ def load_file(file_name, file_type):
124
  return None
125
 
126
  # Watsonx API setup
 
 
127
  watsonx_api_key = os.getenv("WATSONX_API_KEY")
128
  watsonx_project_id = os.getenv("WATSONX_PROJECT_ID")
129
 
 
5
  import json
6
  import xml.etree.ElementTree as ET
7
  import yaml
8
+ from dotenv import load_dotenv
9
  from bs4 import BeautifulSoup
10
  from pptx import Presentation
11
  from docx import Document
 
125
  return None
126
 
127
  # Watsonx API setup
128
+ load_dotenv()
129
+
130
  watsonx_api_key = os.getenv("WATSONX_API_KEY")
131
  watsonx_project_id = os.getenv("WATSONX_PROJECT_ID")
132
 
app1.py DELETED
@@ -1,176 +0,0 @@
1
- import os
2
- import tempfile
3
- from dotenv import load_dotenv
4
- import streamlit as st
5
- from langchain.document_loaders import PyPDFLoader, TextLoader
6
- from langchain.indexes import VectorstoreIndexCreator
7
- from langchain.chains import RetrievalQA
8
- from langchain.text_splitter import RecursiveCharacterTextSplitter
9
- from langchain.embeddings import HuggingFaceEmbeddings
10
- from langchain.chains import LLMChain
11
- from langchain.prompts import PromptTemplate
12
-
13
- from ibm_watson_machine_learning.foundation_models import Model
14
- from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
15
- from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
16
- from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
17
- from pptx import Presentation
18
- from docx import Document
19
-
20
- # Load environment variables
21
- load_dotenv()
22
-
23
- # Watsonx API setup
24
- watsonx_api_key = os.getenv("API_KEY")
25
- watsonx_project_id = os.getenv("PROJECT_ID")
26
- watsonx_url = "https://us-south.ml.cloud.ibm.com"
27
-
28
- if not watsonx_api_key or not watsonx_project_id:
29
- st.error("API Key or Project ID is not set. Please set them as environment variables.")
30
-
31
- # Custom loader for DOCX files
32
- class DocxLoader:
33
- def __init__(self, file_path):
34
- self.file_path = file_path
35
-
36
- def load(self):
37
- document = Document(self.file_path)
38
- text_content = [para.text for para in document.paragraphs]
39
- return " ".join(text_content)
40
-
41
- # Custom loader for PPTX files
42
- class PptxLoader:
43
- def __init__(self, file_path):
44
- self.file_path = file_path
45
-
46
- def load(self):
47
- presentation = Presentation(self.file_path)
48
- text_content = []
49
- for slide in presentation.slides:
50
- for shape in slide.shapes:
51
- if hasattr(shape, "text"):
52
- text_content.append(shape.text)
53
- return " ".join(text_content)
54
-
55
- # Caching function to load various file types
56
- @st.cache_resource
57
- def load_file(uploaded_file, file_type):
58
- loaders = []
59
-
60
- # Save uploaded file to a temporary path
61
- with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_type}") as temp_file:
62
- temp_file.write(uploaded_file.read())
63
- temp_file_path = temp_file.name
64
-
65
- if file_type == "pdf":
66
- loaders = [PyPDFLoader(temp_file_path)]
67
- elif file_type == "docx":
68
- loader = DocxLoader(temp_file_path)
69
- text = loader.load()
70
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_txt_file:
71
- temp_txt_file.write(text.encode("utf-8"))
72
- temp_txt_file_path = temp_txt_file.name
73
- loaders = [TextLoader(temp_txt_file_path)]
74
- elif file_type == "txt":
75
- loaders = [TextLoader(temp_file_path)]
76
- elif file_type == "pptx":
77
- loader = PptxLoader(temp_file_path)
78
- text = loader.load()
79
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_txt_file:
80
- temp_txt_file.write(text.encode("utf-8"))
81
- temp_txt_file_path = temp_txt_file.name
82
- loaders = [TextLoader(temp_txt_file_path)]
83
- else:
84
- st.error("Unsupported file type.")
85
- return None
86
-
87
- # Create the index with the loaded documents
88
- index = VectorstoreIndexCreator(
89
- embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2"),
90
- text_splitter=RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50)
91
- ).from_loaders(loaders)
92
-
93
- return index
94
-
95
- # Prompt template
96
- prompt_template = PromptTemplate(
97
- input_variables=["context", "question"],
98
- template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
99
- I am a helpful assistant.
100
-
101
- <|eot_id|>
102
- {context}
103
- <|start_header_id|>user<|end_header_id|>
104
- {question}<|eot_id|>
105
- """
106
- )
107
-
108
- # Sidebar settings
109
- with st.sidebar:
110
- st.title("Watsonx RAG Demo")
111
- model_name = st.selectbox("Model", ["meta-llama/llama-3-405b-instruct", "codellama/codellama-34b-instruct-hf", "ibm/granite-20b-multilingual"])
112
- max_new_tokens = st.slider("Max output tokens", min_value=100, max_value=1000, value=300, step=100)
113
- decoding_method = st.radio("Decoding Method", [DecodingMethods.GREEDY.value, DecodingMethods.SAMPLE.value])
114
- st.info("Upload a PDF, DOCX, TXT, or PPTX file for RAG")
115
- uploaded_file = st.file_uploader("Upload file", accept_multiple_files=False, type=["pdf", "docx", "txt", "pptx"])
116
-
117
- if uploaded_file:
118
- file_type = uploaded_file.name.split('.')[-1].lower()
119
- index = load_file(uploaded_file, file_type)
120
-
121
- # Watsonx Model setup with UI feedback
122
- credentials = {
123
- "url": watsonx_url,
124
- "apikey": watsonx_api_key
125
- }
126
- parameters = {
127
- GenParams.DECODING_METHOD: decoding_method,
128
- GenParams.MAX_NEW_TOKENS: max_new_tokens,
129
- GenParams.MIN_NEW_TOKENS: 1,
130
- GenParams.TEMPERATURE: 0.7,
131
- GenParams.TOP_K: 50,
132
- GenParams.TOP_P: 1,
133
- GenParams.REPETITION_PENALTY: 1.0
134
- }
135
-
136
- # Display setup status
137
- status_placeholder = st.empty()
138
- status_placeholder.markdown("**Setting up Watsonx...**")
139
-
140
- try:
141
- model = WatsonxLLM(Model(model_name, credentials, parameters, project_id=watsonx_project_id))
142
- status_placeholder.markdown(f"**Model [{model_name}] ready.**")
143
- except Exception as e:
144
- st.error(f"Failed to initialize model: {str(e)}")
145
-
146
- # Chat History Setup
147
- if "messages" not in st.session_state:
148
- st.session_state.messages = []
149
-
150
- # Display chat messages from history on app rerun
151
- for message in st.session_state.messages:
152
- st.chat_message(message["role"]).markdown(message["content"])
153
-
154
- # User Input
155
- prompt = st.chat_input("Ask your question here", disabled=False if model else True)
156
-
157
- # Process User Input
158
- if prompt:
159
- st.chat_message("user").markdown(prompt)
160
-
161
- if index:
162
- rag_chain = RetrievalQA.from_chain_type(
163
- llm=model,
164
- chain_type="stuff",
165
- retriever=index.vectorstore.as_retriever(),
166
- chain_type_kwargs={"prompt": prompt_template},
167
- verbose=True
168
- )
169
- response_text = rag_chain.run(prompt).strip()
170
- else:
171
- chain = LLMChain(llm=model, prompt=prompt_template)
172
- response_text = chain.run(context="", question=prompt).strip("<|start_header_id|>assistant<|end_header_id|>").strip("<|eot_id|>")
173
-
174
- st.session_state.messages.append({'role': 'user', 'content': prompt})
175
- st.chat_message("assistant").markdown(response_text)
176
- st.session_state.messages.append({'role': 'assistant', 'content': response_text})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
appcsvhtml.py DELETED
@@ -1,220 +0,0 @@
1
- import os
2
- import streamlit as st
3
- import tempfile
4
- import pandas as pd
5
- import json
6
- import xml.etree.ElementTree as ET
7
- import yaml
8
- from bs4 import BeautifulSoup
9
- from pptx import Presentation
10
- from docx import Document
11
-
12
- from langchain.document_loaders import PyPDFLoader, TextLoader
13
- from langchain.indexes import VectorstoreIndexCreator
14
- from langchain.chains import RetrievalQA
15
- from langchain.text_splitter import RecursiveCharacterTextSplitter
16
- from langchain.embeddings import HuggingFaceEmbeddings
17
- from langchain.chains import LLMChain
18
- from langchain.prompts import PromptTemplate
19
-
20
- from ibm_watson_machine_learning.foundation_models import Model
21
- from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
22
- from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
23
- from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
24
-
25
- # Initialize index to None
26
- index = None
27
- rag_chain = None # Initialize rag_chain as None by default
28
-
29
- # Custom loader for DOCX files
30
- class DocxLoader:
31
- def __init__(self, file_path):
32
- self.file_path = file_path
33
-
34
- def load(self):
35
- document = Document(self.file_path)
36
- text_content = [para.text for para in document.paragraphs]
37
- return " ".join(text_content)
38
-
39
- # Custom loader for PPTX files
40
- class PptxLoader:
41
- def __init__(self, file_path):
42
- self.file_path = file_path
43
-
44
- def load(self):
45
- presentation = Presentation(self.file_path)
46
- text_content = [shape.text for slide in presentation.slides for shape in slide.shapes if hasattr(shape, "text")]
47
- return " ".join(text_content)
48
-
49
- # Custom loader for additional file types
50
- def load_csv(file_path):
51
- df = pd.read_csv(file_path)
52
- return df.to_string(index=False)
53
-
54
- def load_json(file_path):
55
- with open(file_path, 'r') as file:
56
- data = json.load(file)
57
- return json.dumps(data, indent=2)
58
-
59
- def load_xml(file_path):
60
- tree = ET.parse(file_path)
61
- root = tree.getroot()
62
- return ET.tostring(root, encoding="unicode")
63
-
64
- def load_yaml(file_path):
65
- with open(file_path, 'r') as file:
66
- data = yaml.safe_load(file)
67
- return yaml.dump(data)
68
-
69
- def load_html(file_path):
70
- with open(file_path, 'r', encoding='utf-8') as file:
71
- soup = BeautifulSoup(file, 'html.parser')
72
- return soup.get_text()
73
-
74
- # Caching function to load various file types
75
- @st.cache_resource
76
- def load_file(file_name, file_type):
77
- loaders = []
78
-
79
- if file_type == "pdf":
80
- loaders = [PyPDFLoader(file_name)]
81
- elif file_type == "docx":
82
- loader = DocxLoader(file_name)
83
- text = loader.load()
84
- elif file_type == "pptx":
85
- loader = PptxLoader(file_name)
86
- text = loader.load()
87
- elif file_type == "txt":
88
- loaders = [TextLoader(file_name)]
89
- elif file_type == "csv":
90
- text = load_csv(file_name)
91
- elif file_type == "json":
92
- text = load_json(file_name)
93
- elif file_type == "xml":
94
- text = load_xml(file_name)
95
- elif file_type == "yaml":
96
- text = load_yaml(file_name)
97
- elif file_type == "html":
98
- text = load_html(file_name)
99
- else:
100
- st.error("Unsupported file type.")
101
- return None
102
-
103
- # Use TextLoader for intermediate text files from custom loaders
104
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
105
- temp_file.write(text.encode("utf-8"))
106
- temp_file_path = temp_file.name
107
- loaders = [TextLoader(temp_file_path)]
108
-
109
- index = VectorstoreIndexCreator(
110
- embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2"),
111
- text_splitter=RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50)
112
- ).from_loaders(loaders)
113
- return index
114
-
115
- # Watsonx API setup
116
- watsonx_api_key = os.getenv("WATSONX_API_KEY")
117
- watsonx_project_id = os.getenv("WATSONX_PROJECT_ID")
118
-
119
- if not watsonx_api_key or not watsonx_project_id:
120
- st.error("API Key or Project ID is not set. Please set them as environment variables.")
121
-
122
- prompt_template_br = PromptTemplate(
123
- input_variables=["context", "question"],
124
- template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
125
- I am a helpful assistant.
126
-
127
- <|eot_id|>
128
- {context}
129
- <|start_header_id|>user<|end_header_id|>
130
- {question}<|eot_id|>
131
- """
132
- )
133
-
134
- with st.sidebar:
135
- st.title("Watsonx RAG with Multiple docs")
136
- watsonx_model = st.selectbox("Model", ["meta-llama/llama-3-405b-instruct", "codellama/codellama-34b-instruct-hf", "ibm/granite-20b-multilingual"])
137
- max_new_tokens = st.slider("Max output tokens", min_value=100, max_value=4000, value=600, step=100)
138
- decoding_method = st.radio("Decoding", (DecodingMethods.GREEDY.value, DecodingMethods.SAMPLE.value))
139
- parameters = {
140
- GenParams.DECODING_METHOD: decoding_method,
141
- GenParams.MAX_NEW_TOKENS: max_new_tokens,
142
- GenParams.MIN_NEW_TOKENS: 1,
143
- GenParams.TEMPERATURE: 0,
144
- GenParams.TOP_K: 50,
145
- GenParams.TOP_P: 1,
146
- GenParams.STOP_SEQUENCES: [],
147
- GenParams.REPETITION_PENALTY: 1
148
- }
149
- st.info("Upload a file to use RAG")
150
- uploaded_file = st.file_uploader("Upload file", accept_multiple_files=False, type=["pdf", "docx", "txt", "pptx", "csv", "json", "xml", "yaml", "html"])
151
-
152
- if uploaded_file is not None:
153
- bytes_data = uploaded_file.read()
154
- st.write("Filename:", uploaded_file.name)
155
-
156
- with open(uploaded_file.name, 'wb') as f:
157
- f.write(bytes_data)
158
-
159
- file_type = uploaded_file.name.split('.')[-1].lower()
160
- index = load_file(uploaded_file.name, file_type)
161
-
162
- model_name = watsonx_model
163
-
164
- st.info("Setting up Watsonx...")
165
- my_credentials = {
166
- "url": "https://us-south.ml.cloud.ibm.com",
167
- "apikey": watsonx_api_key
168
- }
169
- params = parameters
170
- project_id = watsonx_project_id
171
- space_id = None
172
- verify = False
173
- model = WatsonxLLM(model=Model(model_name, my_credentials, params, project_id, space_id, verify))
174
-
175
- if model:
176
- st.info(f"Model {model_name} ready.")
177
- chain = LLMChain(llm=model, prompt=prompt_template_br, verbose=True)
178
-
179
- if chain and index is not None:
180
- rag_chain = RetrievalQA.from_chain_type(
181
- llm=model,
182
- chain_type="stuff",
183
- retriever=index.vectorstore.as_retriever(),
184
- chain_type_kwargs={"prompt": prompt_template_br},
185
- return_source_documents=False,
186
- verbose=True
187
- )
188
- st.info("Document-based retrieval is ready.")
189
- else:
190
- st.warning("No document uploaded or chain setup issue.")
191
-
192
- # Chat loop
193
- if "messages" not in st.session_state:
194
- st.session_state.messages = []
195
-
196
- for message in st.session_state.messages:
197
- st.chat_message(message["role"]).markdown(message["content"])
198
-
199
- prompt = st.chat_input("Ask your question here", disabled=False if chain else True)
200
-
201
- if prompt:
202
- st.chat_message("user").markdown(prompt)
203
- if rag_chain:
204
- response_text = rag_chain.run(prompt).strip()
205
- else:
206
- response_text = chain.run(question=prompt, context="").strip()
207
-
208
- st.session_state.messages.append({'role': 'User', 'content': prompt})
209
- st.chat_message("assistant").markdown(response_text)
210
- st.session_state.messages.append({'role': 'Assistant', 'content': response_text})
211
-
212
- # requirements.txt
213
- # Streamlit
214
- # pandas
215
- # beautifulsoup4
216
- # ibm-watson-machine-learning
217
- # python-pptx
218
- # python-docx
219
- # PyYAML
220
- # xml
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
appfinal.py DELETED
@@ -1,193 +0,0 @@
1
- import os
2
- from langchain.document_loaders import PyPDFLoader, TextLoader
3
- from langchain.indexes import VectorstoreIndexCreator
4
- from langchain.chains import RetrievalQA
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain.embeddings import HuggingFaceEmbeddings
7
- from langchain.chains import LLMChain
8
- from langchain.prompts import PromptTemplate
9
- import streamlit as st
10
- import tempfile
11
-
12
- from ibm_watson_machine_learning.foundation_models import Model
13
- from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
14
- from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
15
- from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
16
-
17
- from pptx import Presentation
18
- from docx import Document
19
-
20
- # Initialize index to None
21
- index = None
22
-
23
- # Custom loader for DOCX files
24
- class DocxLoader:
25
- def __init__(self, file_path):
26
- self.file_path = file_path
27
-
28
- def load(self):
29
- document = Document(self.file_path)
30
- text_content = []
31
- for para in document.paragraphs:
32
- text_content.append(para.text)
33
- return " ".join(text_content)
34
-
35
- # Custom loader for PPTX files
36
- class PptxLoader:
37
- def __init__(self, file_path):
38
- self.file_path = file_path
39
-
40
- def load(self):
41
- presentation = Presentation(self.file_path)
42
- text_content = []
43
- for slide in presentation.slides:
44
- for shape in slide.shapes:
45
- if hasattr(shape, "text"):
46
- text_content.append(shape.text)
47
- return " ".join(text_content)
48
-
49
- # Caching function to load various file types
50
- @st.cache_resource
51
- def load_file(file_name, file_type):
52
- loaders = []
53
-
54
- if file_type == "pdf":
55
- loaders = [PyPDFLoader(file_name)]
56
- elif file_type == "docx":
57
- loader = DocxLoader(file_name)
58
- text = loader.load()
59
-
60
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
61
- temp_file.write(text.encode("utf-8"))
62
- temp_file_path = temp_file.name
63
- loaders = [TextLoader(temp_file_path)]
64
-
65
- elif file_type == "txt":
66
- loaders = [TextLoader(file_name)]
67
-
68
- elif file_type == "pptx":
69
- loader = PptxLoader(file_name)
70
- text = loader.load()
71
-
72
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
73
- temp_file.write(text.encode("utf-8"))
74
- temp_file_path = temp_file.name
75
- loaders = [TextLoader(temp_file_path)]
76
-
77
- else:
78
- st.error("Unsupported file type.")
79
- return None
80
-
81
- index = VectorstoreIndexCreator(
82
- embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2"),
83
- text_splitter=RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50)
84
- ).from_loaders(loaders)
85
- return index
86
-
87
- def format_history():
88
- return ""
89
-
90
- # Watsonx API setup using environment variables
91
- watsonx_api_key = os.getenv("WATSONX_API_KEY")
92
- watsonx_project_id = os.getenv("WATSONX_PROJECT_ID")
93
-
94
- if not watsonx_api_key or not watsonx_project_id:
95
- st.error("API Key or Project ID is not set. Please set them as environment variables.")
96
-
97
- prompt_template_br = PromptTemplate(
98
- input_variables=["context", "question"],
99
- template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
100
- I am a helpful assistant.
101
-
102
- <|eot_id|>
103
- {context}
104
- <|start_header_id|>user<|end_header_id|>
105
- {question}<|eot_id|>
106
- """
107
- )
108
-
109
- with st.sidebar:
110
- st.title("Watsonx RAG with Multiple docs")
111
- watsonx_model = st.selectbox("Model", ["meta-llama/llama-3-405b-instruct", "codellama/codellama-34b-instruct-hf", "ibm/granite-20b-multilingual"])
112
- max_new_tokens = st.slider("Max output tokens", min_value=100, max_value=4000, value=600, step=100)
113
- decoding_method = st.radio("Decoding", (DecodingMethods.GREEDY.value, DecodingMethods.SAMPLE.value))
114
- parameters = {
115
- GenParams.DECODING_METHOD: decoding_method,
116
- GenParams.MAX_NEW_TOKENS: max_new_tokens,
117
- GenParams.MIN_NEW_TOKENS: 1,
118
- GenParams.TEMPERATURE: 0,
119
- GenParams.TOP_K: 50,
120
- GenParams.TOP_P: 1,
121
- GenParams.STOP_SEQUENCES: [],
122
- GenParams.REPETITION_PENALTY: 1
123
- }
124
- st.info("Upload a PDF, DOCX, TXT, or PPTX file to use RAG")
125
- uploaded_file = st.file_uploader("Upload file", accept_multiple_files=False, type=["pdf", "docx", "txt", "pptx"])
126
- if uploaded_file is not None:
127
- bytes_data = uploaded_file.read()
128
- st.write("Filename:", uploaded_file.name)
129
-
130
- with open(uploaded_file.name, 'wb') as f:
131
- f.write(bytes_data)
132
-
133
- file_type = uploaded_file.name.split('.')[-1].lower()
134
- index = load_file(uploaded_file.name, file_type)
135
-
136
- model_name = watsonx_model
137
-
138
- def clear_messages():
139
- st.session_state.messages = []
140
-
141
- st.button('Clear messages', on_click=clear_messages)
142
-
143
- st.info("Setting up Watsonx...")
144
-
145
- my_credentials = {
146
- "url": "https://us-south.ml.cloud.ibm.com",
147
- "apikey": watsonx_api_key
148
- }
149
- params = parameters
150
- project_id = watsonx_project_id
151
- space_id = None
152
- verify = False
153
- model = WatsonxLLM(model=Model(model_name, my_credentials, params, project_id, space_id, verify))
154
-
155
- if model:
156
- st.info(f"Model {model_name} ready.")
157
- chain = LLMChain(llm=model, prompt=prompt_template_br, verbose=True)
158
-
159
- if chain:
160
- st.info("Chat ready.")
161
- if index:
162
- rag_chain = RetrievalQA.from_chain_type(
163
- llm=model,
164
- chain_type="stuff",
165
- retriever=index.vectorstore.as_retriever(),
166
- chain_type_kwargs={"prompt": prompt_template_br},
167
- return_source_documents=False,
168
- verbose=True
169
- )
170
- st.info("Chat with document ready.")
171
-
172
- if "messages" not in st.session_state:
173
- st.session_state.messages = []
174
-
175
- for message in st.session_state.messages:
176
- st.chat_message(message["role"]).markdown(message["content"])
177
-
178
- prompt = st.chat_input("Ask your question here", disabled=False if chain else True)
179
-
180
- if prompt:
181
- st.chat_message("user").markdown(prompt)
182
-
183
- response_text = None
184
- if rag_chain:
185
- response_text = rag_chain.run(prompt).strip()
186
-
187
- if not response_text:
188
- response = chain.run(question=prompt, context=format_history())
189
- response_text = response.strip("<|start_header_id|>assistant<|end_header_id|>").strip("<|eot_id|>")
190
-
191
- st.session_state.messages.append({'role': 'User', 'content': prompt })
192
- st.chat_message("assistant").markdown(response_text)
193
- st.session_state.messages.append({'role': 'Assistant', 'content': response_text })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
appfinalokokok.py DELETED
@@ -1,199 +0,0 @@
1
- import os
2
- import streamlit as st
3
- import tempfile
4
- from pptx import Presentation
5
- from docx import Document
6
-
7
- from langchain.document_loaders import PyPDFLoader, TextLoader
8
- from langchain.indexes import VectorstoreIndexCreator
9
- from langchain.chains import RetrievalQA
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- from langchain.embeddings import HuggingFaceEmbeddings
12
- from langchain.chains import LLMChain
13
- from langchain.prompts import PromptTemplate
14
-
15
- from ibm_watson_machine_learning.foundation_models import Model
16
- from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
17
- from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
18
- from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
19
-
20
- # Initialize index to None
21
- index = None
22
- rag_chain = None # Initialize rag_chain as None by default
23
-
24
- # Custom loader for DOCX files
25
- class DocxLoader:
26
- def __init__(self, file_path):
27
- self.file_path = file_path
28
-
29
- def load(self):
30
- document = Document(self.file_path)
31
- text_content = []
32
- for para in document.paragraphs:
33
- text_content.append(para.text)
34
- return " ".join(text_content)
35
-
36
- # Custom loader for PPTX files
37
- class PptxLoader:
38
- def __init__(self, file_path):
39
- self.file_path = file_path
40
-
41
- def load(self):
42
- presentation = Presentation(self.file_path)
43
- text_content = []
44
- for slide in presentation.slides:
45
- for shape in slide.shapes:
46
- if hasattr(shape, "text"):
47
- text_content.append(shape.text)
48
- return " ".join(text_content)
49
-
50
- # Caching function to load various file types
51
- @st.cache_resource
52
- def load_file(file_name, file_type):
53
- loaders = []
54
-
55
- if file_type == "pdf":
56
- loaders = [PyPDFLoader(file_name)]
57
- elif file_type == "docx":
58
- loader = DocxLoader(file_name)
59
- text = loader.load()
60
-
61
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
62
- temp_file.write(text.encode("utf-8"))
63
- temp_file_path = temp_file.name
64
- loaders = [TextLoader(temp_file_path)]
65
-
66
- elif file_type == "txt":
67
- loaders = [TextLoader(file_name)]
68
-
69
- elif file_type == "pptx":
70
- loader = PptxLoader(file_name)
71
- text = loader.load()
72
-
73
- with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
74
- temp_file.write(text.encode("utf-8"))
75
- temp_file_path = temp_file.name
76
- loaders = [TextLoader(temp_file_path)]
77
-
78
- else:
79
- st.error("Unsupported file type.")
80
- return None
81
-
82
- index = VectorstoreIndexCreator(
83
- embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2"),
84
- text_splitter=RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50)
85
- ).from_loaders(loaders)
86
- return index
87
-
88
- def format_history():
89
- return ""
90
-
91
- # Watsonx API setup using environment variables
92
- watsonx_api_key = os.getenv("WATSONX_API_KEY")
93
- watsonx_project_id = os.getenv("WATSONX_PROJECT_ID")
94
-
95
- if not watsonx_api_key or not watsonx_project_id:
96
- st.error("API Key or Project ID is not set. Please set them as environment variables.")
97
-
98
- prompt_template_br = PromptTemplate(
99
- input_variables=["context", "question"],
100
- template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
101
- I am a helpful assistant.
102
-
103
- <|eot_id|>
104
- {context}
105
- <|start_header_id|>user<|end_header_id|>
106
- {question}<|eot_id|>
107
- """
108
- )
109
-
110
- with st.sidebar:
111
- st.title("Watsonx RAG with Multiple docs")
112
- watsonx_model = st.selectbox("Model", ["meta-llama/llama-3-405b-instruct", "codellama/codellama-34b-instruct-hf", "ibm/granite-20b-multilingual"])
113
- max_new_tokens = st.slider("Max output tokens", min_value=100, max_value=4000, value=600, step=100)
114
- decoding_method = st.radio("Decoding", (DecodingMethods.GREEDY.value, DecodingMethods.SAMPLE.value))
115
- parameters = {
116
- GenParams.DECODING_METHOD: decoding_method,
117
- GenParams.MAX_NEW_TOKENS: max_new_tokens,
118
- GenParams.MIN_NEW_TOKENS: 1,
119
- GenParams.TEMPERATURE: 0,
120
- GenParams.TOP_K: 50,
121
- GenParams.TOP_P: 1,
122
- GenParams.STOP_SEQUENCES: [],
123
- GenParams.REPETITION_PENALTY: 1
124
- }
125
- st.info("Upload a PDF, DOCX, TXT, or PPTX file to use RAG")
126
- uploaded_file = st.file_uploader("Upload file", accept_multiple_files=False, type=["pdf", "docx", "txt", "pptx"])
127
- if uploaded_file is not None:
128
- bytes_data = uploaded_file.read()
129
- st.write("Filename:", uploaded_file.name)
130
-
131
- with open(uploaded_file.name, 'wb') as f:
132
- f.write(bytes_data)
133
-
134
- file_type = uploaded_file.name.split('.')[-1].lower()
135
- index = load_file(uploaded_file.name, file_type)
136
-
137
- model_name = watsonx_model
138
-
139
- def clear_messages():
140
- st.session_state.messages = []
141
-
142
- st.button('Clear messages', on_click=clear_messages)
143
-
144
- st.info("Setting up Watsonx...")
145
-
146
- my_credentials = {
147
- "url": "https://us-south.ml.cloud.ibm.com",
148
- "apikey": watsonx_api_key
149
- }
150
- params = parameters
151
- project_id = watsonx_project_id
152
- space_id = None
153
- verify = False
154
- model = WatsonxLLM(model=Model(model_name, my_credentials, params, project_id, space_id, verify))
155
-
156
- if model:
157
- st.info(f"Model {model_name} ready.")
158
- chain = LLMChain(llm=model, prompt=prompt_template_br, verbose=True)
159
-
160
- if chain:
161
- st.info("Chat ready.")
162
-
163
- # Only create rag_chain if index is successfully created
164
- if index is not None:
165
- rag_chain = RetrievalQA.from_chain_type(
166
- llm=model,
167
- chain_type="stuff",
168
- retriever=index.vectorstore.as_retriever(),
169
- chain_type_kwargs={"prompt": prompt_template_br},
170
- return_source_documents=False,
171
- verbose=True
172
- )
173
- st.info("Document-based retrieval is ready.")
174
- else:
175
- st.warning("No document uploaded. Answering common queries without retrieval.")
176
-
177
- # Chat loop for handling queries
178
- if "messages" not in st.session_state:
179
- st.session_state.messages = []
180
-
181
- for message in st.session_state.messages:
182
- st.chat_message(message["role"]).markdown(message["content"])
183
-
184
- prompt = st.chat_input("Ask your question here", disabled=False if chain else True)
185
-
186
- if prompt:
187
- st.chat_message("user").markdown(prompt)
188
-
189
- # Answer based on availability of rag_chain or chain
190
- if rag_chain:
191
- response_text = rag_chain.run(prompt).strip()
192
- else:
193
- # Use general model-based response if rag_chain is not available
194
- response_text = chain.run(question=prompt, context=format_history()).strip("<|start_header_id|>assistant<|end_header_id|>").strip("<|eot_id|>")
195
-
196
- # Store and display conversation
197
- st.session_state.messages.append({'role': 'User', 'content': prompt})
198
- st.chat_message("assistant").markdown(response_text)
199
- st.session_state.messages.append({'role': 'Assistant', 'content': response_text})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sample env.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WATSONX_API_KEY=<your_watsonx_api_key>
2
+ WATSONX_PROJECT_ID=<your_watsonx_project_id>