RAHMAN00700 committed on
Commit
c0bb395
·
unverified ·
1 Parent(s): c47e53e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +196 -0
app.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import tempfile
4
+ import pandas as pd
5
+ import json
6
+ import xml.etree.ElementTree as ET
7
+ import yaml
8
+ from bs4 import BeautifulSoup
9
+ from pptx import Presentation
10
+ from docx import Document
11
+
12
+ from langchain.document_loaders import PyPDFLoader, TextLoader
13
+ from langchain.indexes import VectorstoreIndexCreator
14
+ from langchain.chains import RetrievalQA
15
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
16
+ from langchain.embeddings import HuggingFaceEmbeddings
17
+ from langchain.chains import LLMChain
18
+ from langchain.prompts import PromptTemplate
19
+
20
+ from ibm_watson_machine_learning.foundation_models import Model
21
+ from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
22
+ from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
23
+ from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
24
+
25
+ # Initialize index and chain to None
26
# Module-level placeholders; the file-upload flow rebinds `index` after a
# document has been indexed. `rag_chain` is not assigned anywhere else in the
# visible code.
index = None
rag_chain = None
28
+
29
+ # Custom loader for DOCX files
30
class DocxLoader:
    """Read a DOCX file and expose its paragraph text as one string."""

    def __init__(self, file_path):
        """Remember the path of the DOCX file to read.

        Args:
            file_path: Path handed to ``Document`` when ``load`` is called.
        """
        self.file_path = file_path

    def load(self):
        """Return the text of ``document.paragraphs`` joined by single spaces.

        Returns:
            A single string; empty when the document has no paragraphs.
        """
        document = Document(self.file_path)
        return " ".join(para.text for para in document.paragraphs)
38
+
39
+ # Custom loader for PPTX files
40
class PptxLoader:
    """Read a PPTX file and expose the text of its shapes as one string."""

    def __init__(self, file_path):
        """Remember the path of the PPTX file to read.

        Args:
            file_path: Path handed to ``Presentation`` when ``load`` is called.
        """
        self.file_path = file_path

    def load(self):
        """Return the text of every text-bearing shape, joined by single spaces.

        Slides are visited in order; shapes that have no ``text`` attribute
        are skipped.

        Returns:
            A single string; empty when no shape carries text.
        """
        presentation = Presentation(self.file_path)
        text_content = [
            shape.text
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return " ".join(text_content)
48
+
49
+ # Helper functions that load additional file types as plain text
50
def load_csv(file_path):
    """Preview a CSV file in the Streamlit UI and return it as text.

    The file is read into a DataFrame, one 100-row page of it is shown with a
    page-number selector, and the whole table is returned as a string.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The full DataFrame rendered with ``to_string(index=False)``.
    """
    df = pd.read_csv(file_path)
    page_size = 100
    # Ceiling division with a floor of one page. The previous
    # `len(df) // page_size + 1` offered an empty trailing page whenever the
    # row count was an exact multiple of the page size.
    page_count = max(1, -(-len(df) // page_size))
    # NOTE(review): this function is called from the `st.cache_resource`
    # decorated `load_file`; Streamlit restricts widgets inside cached
    # functions - confirm the selector behaves as intended there.
    page_number = st.number_input(
        "Page number", min_value=1, max_value=page_count, step=1, value=1
    )
    start_index = (page_number - 1) * page_size
    st.dataframe(df.iloc[start_index:start_index + page_size])
    return df.to_string(index=False)
58
+
59
+ def load_json(file_path):
60
+ with open(file_path, 'r') as file:
61
+ data = json.load(file)
62
+ return json.dumps(data, indent=2)
63
+
64
def load_xml(file_path):
    """Parse an XML file and return its root element serialised as a string.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        The root element (and its subtree) as a ``str``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # NOTE(review): the input comes from a user upload, and the Python docs
    # warn that ElementTree is not hardened against maliciously constructed
    # XML - consider a hardened parser before exposing this publicly.
    root = ET.parse(file_path).getroot()
    return ET.tostring(root, encoding="unicode")
68
+
69
+ def load_yaml(file_path):
70
+ with open(file_path, 'r') as file:
71
+ data = yaml.safe_load(file)
72
+ return yaml.dump(data)
73
+
74
def load_html(file_path):
    """Read an HTML file as UTF-8 and return its text content.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The result of ``BeautifulSoup.get_text()`` for the parsed document.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    return soup.get_text()
78
+
79
+ # Caching function to load various file types
80
@st.cache_resource
def load_file(file_name, file_type):
    """Build a vector-store index for one file on disk.

    PDF and TXT files are handed straight to their LangChain loaders. Every
    other supported type is first converted to plain text by a helper, written
    to a temporary ``.txt`` file and indexed through ``TextLoader``.

    Args:
        file_name: Path of the file to index.
        file_type: Lower-case extension without the dot (e.g. ``"pdf"``).

    Returns:
        The index produced by ``VectorstoreIndexCreator.from_loaders``, or
        ``None`` when the type is unsupported or no text could be extracted.
    """
    # NOTE(review): the cache key is (file_name, file_type) only, so a
    # different upload that reuses a file name is presumably served the
    # previously built index - confirm whether that is acceptable.

    # Types that are converted to plain text before indexing.
    text_extractors = {
        "docx": lambda path: DocxLoader(path).load(),
        "pptx": lambda path: PptxLoader(path).load(),
        "csv": load_csv,
        "json": load_json,
        "xml": load_xml,
        "yaml": load_yaml,
        "yml": load_yaml,
        "html": load_html,
        "htm": load_html,
    }

    loaders = []
    temp_file_path = None

    if file_type == "pdf":
        loaders = [PyPDFLoader(file_name)]
    elif file_type == "txt":
        loaders = [TextLoader(file_name)]
    elif file_type in text_extractors:
        text = text_extractors[file_type](file_name)
        if text:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
                temp_file.write(text.encode("utf-8"))
                temp_file_path = temp_file.name
            # The temp file is written as UTF-8 above, so read it back the
            # same way instead of with the platform default encoding.
            loaders = [TextLoader(temp_file_path, encoding="utf-8")]
    else:
        st.error("Unsupported file type.")
        return None

    if not loaders:
        return None

    try:
        built_index = VectorstoreIndexCreator(
            embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2"),
            text_splitter=RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50)
        ).from_loaders(loaders)
    finally:
        # The temp file was created with delete=False and was never removed;
        # clean it up once indexing has finished or failed.
        # NOTE(review): assumes from_loaders reads its loaders eagerly.
        if temp_file_path is not None:
            os.remove(temp_file_path)

    st.success("Index created successfully!")
    return built_index
125
+
126
+ # Watsonx API setup
127
# Credentials are read from the environment. When either is missing the app
# shows an error but keeps running.
watsonx_api_key = os.getenv("WATSONX_API_KEY")
watsonx_project_id = os.getenv("WATSONX_PROJECT_ID")

if not (watsonx_api_key and watsonx_project_id):
    st.error("API Key or Project ID is not set. Please set them as environment variables.")
132
+
133
# Prompt with a fixed system message, followed by the retrieved context and
# the user's question.
# NOTE(review): the special tokens look like a Llama-3-style chat format, and
# the template ends without an assistant header - confirm this is intended
# for the selected models.
prompt_template_br = PromptTemplate(
    input_variables=["context", "question"],
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
I am a helpful assistant.

<|eot_id|>
{context}
<|start_header_id|>user<|end_header_id|>
{question}<|eot_id|>
"""
)
144
+
145
+ # Sidebar tab selection
146
# The sidebar holds the app title and the switch between the two modes
# handled further down.
with st.sidebar:
    st.title("Watsonx RAG: Multi-Document Retrieval")
    tab = st.radio("Select Mode", ["Watsonx Discovery", "File Upload"])
149
+
150
+ # Handle Watsonx Discovery tab
151
if tab == "Watsonx Discovery":
    # Placeholder flow: no Discovery or LLM call is made yet; the context and
    # the answer below are hard-coded sample strings.
    st.info("Ask a question using Watsonx Discovery")
    uploaded_file = st.text_input("Enter document ID from Watsonx Discovery")

    if uploaded_file:
        # TODO: fetch the document from Watsonx Discovery using the ID above
        # and replace the sample context with the real response.
        st.info(f"Querying Watsonx Discovery with document ID: {uploaded_file}")

        # Assumes the response will arrive as a list of documents or text.
        context = "Sample response from Watsonx Discovery based on the doc ID."
        question = st.text_input("Ask your question about the document")

        if question:
            st.info(f"Question: {question}")
            # TODO: call the Watsonx LLM with `context` and `question`.
            response = "Answer from LLM based on Discovery content."
            st.write(response)

# Handle File Upload tab
elif tab == "File Upload":
    with st.sidebar:
        watsonx_model = st.selectbox("Model", ["meta-llama/llama-3-405b-instruct", "codellama/codellama-34b-instruct-hf", "ibm/granite-20b-multilingual"])
        max_new_tokens = st.slider("Max output tokens", min_value=100, max_value=4000, value=600, step=100)
        decoding_method = st.radio("Decoding", (DecodingMethods.GREEDY.value, DecodingMethods.SAMPLE.value))

    # "htm" is offered as well because load_file already handles it.
    uploaded_file = st.file_uploader("Upload a file", accept_multiple_files=False, type=["pdf", "docx", "txt", "pptx", "csv", "json", "xml", "yaml", "html", "htm"])

    if uploaded_file is not None:
        # getvalue() returns the whole buffer regardless of the read position,
        # whereas read() can return b"" if the buffer was already consumed.
        bytes_data = uploaded_file.getvalue()
        st.write("Filename:", uploaded_file.name)
        # The name is supplied by the client: keep only its final component so
        # the file cannot be written outside the working directory.
        local_name = os.path.basename(uploaded_file.name)
        with open(local_name, 'wb') as f:
            f.write(bytes_data)
        file_type = local_name.split('.')[-1].lower()
        index = load_file(local_name, file_type)

    # NOTE(review): indentation was lost in the source; this assignment is
    # placed at branch level - confirm against the original file.
    model_name = watsonx_model

    # Watsonx Model Setup (similar to your existing code)

    # Other code related to file handling and LLM interaction remains the same...