Prathamesh Sable committed on
Commit
d3ab78b
·
1 Parent(s): 0b42653

working single time use without session

Browse files
Files changed (5) hide show
  1. .gitignore +2 -1
  2. __pycache__/utils.cpython-312.pyc +0 -0
  3. app.py +35 -164
  4. templates/index.html +2 -0
  5. utils.py +93 -67
.gitignore CHANGED
@@ -7,4 +7,5 @@ chroma
7
  uploads/
8
  /flask_session
9
  log.txt
10
- *.db
 
 
7
  uploads/
8
  /flask_session
9
  log.txt
10
+ *.db
11
+ __pycache__
__pycache__/utils.cpython-312.pyc CHANGED
Binary files a/__pycache__/utils.cpython-312.pyc and b/__pycache__/utils.cpython-312.pyc differ
 
app.py CHANGED
@@ -1,27 +1,22 @@
1
- from flask import Flask,request, jsonify,session
2
  from flask import render_template
3
- from flask_session import Session
4
  from werkzeug.utils import secure_filename
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
 
7
  from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
8
  import google.generativeai as genai
 
 
9
 
10
  from langchain_chroma import Chroma
11
 
12
- from utils import add_file_to_chroma,remove_file_from_chroma,generate_query_response,remove_session_data_from_chroma
13
-
14
- import sqlite3
15
-
16
  import os
17
  from dotenv import load_dotenv
18
- import time
19
- import shutil
20
  import logging
21
  from flask_cors import CORS
22
 
23
- logging.basicConfig(filename='log.txt',filemode='w', level=logging.DEBUG,
24
- format='%(asctime)s - %(levelname)s - %(message)s')
25
  logger = logging.getLogger()
26
 
27
  load_dotenv()
@@ -30,7 +25,6 @@ HF_TOKEN = os.getenv('HF_TOKEN')
30
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
31
  CHROMA_PATH = "chroma"
32
  UPLOAD_FOLDER = "uploads"
33
- SESSION_TIMEOUT = 2 * 60 * 60 # 2 hours
34
  PROMPT_TEMPLATE = """
35
  Answer the given query based only on the context given below.
36
  context:
@@ -49,93 +43,21 @@ hugging_face_ef = HuggingFaceInferenceAPIEmbeddings(
49
  api_key=HF_TOKEN,
50
  model_name="sentence-transformers/all-mpnet-base-v2"
51
  )
 
52
  # initialize LLM
53
  genai.configure(api_key=GOOGLE_API_KEY)
54
  llm_model = genai.GenerativeModel("gemini-1.5-flash")
55
 
56
  app = Flask(__name__)
57
-
58
- app.secret_key = os.getenv('SECRET_KEY', 'default_secret_key')
59
- CORS(app,supports_credentials=True)
60
-
61
- # # initialize session
62
- # app.config["SESSION_PERMANENT"] = True
63
- # # app.config["SESSION_TYPE"] = "filesystem"
64
- # app.config['SESSION_COOKIE_SECURE'] = False # Set to True if using HTTPS
65
- # app.config['SESSION_COOKIE_HTTPONLY'] = True
66
- # app.config['SESSION_COOKIE_SAMESITE'] = 'Lax'
67
- # app.config["SESSION_USE_SIGNER"] = True
68
-
69
- app.config["SESSION_TYPE"] = "sqlalchemy"
70
- app.config["SESSION_SQLALCHEMY_TABLE"] = "flask_session"
71
- app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///flask_session.db"
72
-
73
- Session(app)
74
 
75
  # Initialize ChromaDB client
76
  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hugging_face_ef)
77
 
78
- sqldb = sqlite3.connect("sessions.db",check_same_thread=False)
79
- cursor = sqldb.cursor()
80
-
81
- def init_db(sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
82
- cursor.execute("""
83
- CREATE TABLE IF NOT EXISTS sessions (
84
- session_id TEXT PRIMARY KEY,
85
- last_accessed DATETIME DEFAULT CURRENT_TIMESTAMP,
86
- CREATED_AT DATETIME DEFAULT CURRENT_TIMESTAMP
87
- );""")
88
- cursor.execute("""CREATE TABLE IF NOT EXISTS files (
89
- id INTEGER PRIMARY KEY AUTOINCREMENT,
90
- session_id TEXT,
91
- file_id TEXT,
92
- file_path TEXT,
93
- file_name TEXT,
94
- FOREIGN KEY (session_id) REFERENCES sessions(session_id) ON DELETE CASCADE
95
- );""")
96
- sqldb.commit()
97
-
98
- init_db(sqldb,cursor)
99
-
100
-
101
- def create_or_update_session():
102
- session_id = session.sid
103
- if cursor.execute("SELECT * FROM sessions WHERE session_id = ?", (session_id,)).fetchone() is None:
104
- cursor.execute("INSERT INTO sessions (session_id) VALUES (?)", (session_id,))
105
- sqldb.commit()
106
- logger.info(f"CREATED NEW SESSION with ID {session_id}")
107
- else:
108
- cursor.execute("UPDATE sessions SET last_accessed = CURRENT_TIMESTAMP WHERE session_id = ?", (session_id,))
109
- sqldb.commit()
110
- logger.info(f"UPDATED SESSION with ID {session_id}")
111
-
112
- def pure_update_session(session_id,cursor):
113
- cursor.execute("UPDATE sessions SET last_accessed = CURRENT_TIMESTAMP WHERE session_id = ?", (session_id,))
114
-
115
- def add_file_to_session(session_id, file_id, file_path, file_name,sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
116
- cursor.execute("INSERT INTO files (session_id, file_id, file_path, file_name) VALUES (?, ?, ?, ?)", (session_id, file_id, file_path, file_name))
117
- pure_update_session(session_id,cursor)
118
- sqldb.commit()
119
- logger.info(f"ADDED FILE with ID {file_id} to SESSION with ID {session_id}")
120
-
121
- def remove_file_from_session(session_id, file_id,sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
122
- cursor.execute("DELETE FROM files WHERE session_id = ? AND file_id = ?", (session_id, file_id))
123
- pure_update_session(session_id,cursor)
124
- sqldb.commit()
125
- logger.info(f"REMOVED FILE with ID {file_id} from SESSION with ID {session_id}")
126
-
127
-
128
- def get_file_list(session_id,sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
129
- cursor.execute("SELECT file_id, file_path, file_name FROM files WHERE session_id = ?", (session_id,))
130
- return cursor.fetchall()
131
-
132
-
133
  @app.route('/')
134
  def index():
135
- create_or_update_session()
136
- return render_template('index.html') # Serve the HTML file we created
137
 
138
- # add files
139
  @app.route('/upload-file', methods=['POST'])
140
  def upload_file():
141
  """Handle file uploads."""
@@ -144,53 +66,42 @@ def upload_file():
144
 
145
  file = request.files['file']
146
  file_id = request.form.get('file_count')
147
- session_id = session.sid
148
 
149
  if not file or not file.filename:
150
  return jsonify({'error': 'No file selected', 'status': 'error'}), 400
151
 
152
-
153
  filename = secure_filename(file.filename)
154
  file_path = os.path.join(UPLOAD_FOLDER, filename)
155
  file.save(file_path)
156
-
157
- # Update session data
158
- add_file_to_session(session_id, file_id, file_path, filename,sqldb,cursor)
159
 
160
- # Add file chunks to ChromaDB
161
- add_file_to_chroma(file_path, file_id, session_id,hugging_face_ef,db,logger)
162
-
163
- return jsonify({'message': 'File uploaded successfully', 'status': 'success'}), 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
-
166
- @app.route('/get-files',methods=["GET"])
167
- def get_files():
168
- return jsonify({"files":get_file_list(session.sid,sqldb,cursor)}),200
169
-
170
-
171
- @app.route('/status',methods=["GET"])
172
- def status():
173
- print(request.cookies.keys())
174
- # return all data from chroma db
175
- return jsonify({
176
- "current_session":session.sid,
177
- "z-chroma_data":db.get()
178
- }),200
179
- @app.after_request
180
- def check_response_cookie(response):
181
- logger.debug(f"Response Cookies: {response.headers.get('Set-Cookie')}")
182
- return response
183
-
184
- @app.route('/remove-file',methods=["POST"])
185
  def remove_file():
186
  file_id = request.form.get('file_id')
187
- session_id = session.sid
188
 
189
- # remove file entry from session
190
- remove_file_from_session(session_id, file_id,sqldb,cursor)
191
-
192
- # remove file chunks from chroma
193
- if remove_file_from_chroma(file_id,session_id,db):
194
  return jsonify({
195
  'message': 'File deleted successfully',
196
  'status': 'success'
@@ -201,51 +112,11 @@ def remove_file():
201
  'status': 'fail'
202
  }), 404
203
 
204
- # Clean up expired files and ChromaDB collections
205
- def cleanup_resources():
206
- """Clean up expired files and ChromaDB collections."""
207
- now = time.time()
208
- # get time before all sessions are expired
209
- last_update_time_required = now - SESSION_TIMEOUT
210
-
211
- # get session to delete
212
- cursor.execute("SELECT session_id FROM sessions WHERE last_accessed < ?", (last_update_time_required,))
213
- expired_sessions = cursor.fetchall()
214
- logger.info(f"Expired sessions: {expired_sessions}")
215
-
216
- # Remove expired sessions
217
- cursor.execute("DELETE FROM sessions WHERE session_id IN (?)", (expired_sessions,))
218
-
219
- sqldb.commit()
220
-
221
- # Remove expired files chunk from chroma
222
- remove_session_data_from_chroma(expired_sessions,db,logger)
223
-
224
-
225
  @app.route("/ask_query", methods=['POST'])
226
  def ask_query():
227
  query = request.form.get("query")
228
-
229
- resp = generate_query_response(query,session.sid,db,llm_model,PROMPT_TEMPLATE)
230
-
231
- return jsonify(resp),200
232
-
233
- """
234
- # Start the scheduler
235
- scheduler = BackgroundScheduler()
236
- scheduler.add_job(cleanup_resources, 'interval', minutes=5) # Run every 5 minutes
237
- scheduler.start()
238
-
239
- # Ensure scheduler stops on app exit
240
- @app.teardown_appcontext
241
- def shutdown_scheduler(exception=None):
242
- if exception is not None:
243
- logger.error("Scheduler shutdown failed", exc_info=exception)
244
- if scheduler.running:
245
- scheduler.shutdown()
246
 
247
- """
248
  if __name__ == "__main__":
249
- app.run(host="0.0.0.0",port=8000,debug=True,threaded=True)
250
-
251
-
 
1
+ from flask import Flask, request, jsonify
2
  from flask import render_template
 
3
  from werkzeug.utils import secure_filename
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
 
6
  from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
7
  import google.generativeai as genai
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from utils import add_file_to_chroma,remove_file_from_chroma,generate_query_response
10
 
11
  from langchain_chroma import Chroma
12
 
 
 
 
 
13
  import os
14
  from dotenv import load_dotenv
 
 
15
  import logging
16
  from flask_cors import CORS
17
 
18
+ logging.basicConfig(filename='log.txt', filemode='w', level=logging.DEBUG,
19
+ format='%(asctime)s - %(levelname)s - %(message)s')
20
  logger = logging.getLogger()
21
 
22
  load_dotenv()
 
25
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
26
  CHROMA_PATH = "chroma"
27
  UPLOAD_FOLDER = "uploads"
 
28
  PROMPT_TEMPLATE = """
29
  Answer the given query based only on the context given below.
30
  context:
 
43
  api_key=HF_TOKEN,
44
  model_name="sentence-transformers/all-mpnet-base-v2"
45
  )
46
+
47
  # initialize LLM
48
  genai.configure(api_key=GOOGLE_API_KEY)
49
  llm_model = genai.GenerativeModel("gemini-1.5-flash")
50
 
51
  app = Flask(__name__)
52
+ CORS(app)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  # Initialize ChromaDB client
55
  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hugging_face_ef)
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
@app.route('/')
def index():
    """Render the ``index.html`` template as the landing page."""
    landing_page = render_template('index.html')
    return landing_page
 
60
 
 
61
  @app.route('/upload-file', methods=['POST'])
62
  def upload_file():
63
  """Handle file uploads."""
 
66
 
67
  file = request.files['file']
68
  file_id = request.form.get('file_count')
 
69
 
70
  if not file or not file.filename:
71
  return jsonify({'error': 'No file selected', 'status': 'error'}), 400
72
 
 
73
  filename = secure_filename(file.filename)
74
  file_path = os.path.join(UPLOAD_FOLDER, filename)
75
  file.save(file_path)
 
 
 
76
 
77
+ try:
78
+ # Add file chunks to ChromaDB
79
+ add_file_to_chroma(file_path, file_id, hugging_face_ef, db, logger)
80
+
81
+ return jsonify({
82
+ 'message': 'File uploaded successfully',
83
+ 'status': 'success',
84
+ 'file_info': {
85
+ 'file_id': file_id,
86
+ 'file_name': filename
87
+ }
88
+ }), 200
89
+ except ValueError as e:
90
+ return jsonify({
91
+ 'error': str(e),
92
+ 'status': 'error'
93
+ }), 400
94
+ except Exception as e:
95
+ return jsonify({
96
+ 'error': str(e),
97
+ 'status': 'error'
98
+ }), 500
99
 
100
+ @app.route('/remove-file', methods=["POST"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def remove_file():
102
  file_id = request.form.get('file_id')
 
103
 
104
+ if remove_file_from_chroma(file_id, db):
 
 
 
 
105
  return jsonify({
106
  'message': 'File deleted successfully',
107
  'status': 'success'
 
112
  'status': 'fail'
113
  }), 404
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
@app.route("/ask_query", methods=['POST'])
def ask_query():
    """Answer a user query using the documents stored in ChromaDB.

    Reads the ``query`` form field and delegates to
    ``generate_query_response``.

    Returns:
        A ``(json, status)`` pair: 400 with an error payload when the query
        is missing or blank, otherwise 200 with the helper's response dict.
    """
    query = request.form.get("query")

    # Reject missing/blank input up front instead of spending an embedding
    # lookup and an LLM call on a request that cannot be answered.
    if query is None or not query.strip():
        return jsonify({'error': 'No query provided', 'status': 'error'}), 400

    resp = generate_query_response(query, db, llm_model, PROMPT_TEMPLATE)
    return jsonify(resp), 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
 
if __name__ == "__main__":
    # The Werkzeug interactive debugger lets anyone who can reach the server
    # execute arbitrary Python, so it must not be switched on unconditionally
    # while binding to all interfaces. Opt in with FLASK_DEBUG=1 (for example
    # in the .env file) during local development.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=8000, debug=debug_enabled, threaded=True)
 
 
templates/index.html CHANGED
@@ -244,6 +244,8 @@
244
  const messageInput = document.getElementById('message-input');
245
  const message = messageInput.value.trim();
246
 
 
 
247
  add_user_message(message)
248
 
249
  if (message) {
 
244
  const messageInput = document.getElementById('message-input');
245
  const message = messageInput.value.trim();
246
 
247
+ messageInput.value = "";
248
+
249
  add_user_message(message)
250
 
251
  if (message) {
utils.py CHANGED
@@ -1,31 +1,15 @@
1
  from langchain.text_splitter import RecursiveCharacterTextSplitter
2
  from langchain_community.document_loaders import PyPDFLoader,UnstructuredWordDocumentLoader,TextLoader,UnstructuredHTMLLoader,UnstructuredMarkdownLoader
3
  import os
 
4
 
5
- def remove_file_from_chroma(file_id,session_id,db):
6
- # Get chunks for session
7
- session_chunks = db.get(where={"session_id": session_id})
8
-
9
- # Further filter by file_id
10
- ids_to_del = []
11
- for i in range(len(session_chunks['ids'])):
12
- if session_chunks['metadatas'][i]['file_id'] == str(file_id):
13
- ids_to_del.append(session_chunks['ids'][i])
14
-
15
- # delete chunks from db where metadata file_id is equal to file_id if there are ;)
16
- if len(ids_to_del) > 0:
17
- db.delete(ids=ids_to_del)
18
- return True
19
- return False
20
-
21
- def remove_session_data_from_chroma(session_ids,db,logger):
22
- db.delete(where={"session_id": {"$in": session_ids}})
23
- logger.info(f"Deleted ChromaDB chunks for sessions: {session_ids}")
24
-
25
 
26
- def add_file_to_chroma(file_path, file_id, session_id,hugging_face_ef,db,logger):
27
- """Add file chunks to ChromaDB."""
28
- extension = file_path.split(".")[-1]
29
  loader_map = {
30
  "pdf": PyPDFLoader,
31
  "docx": UnstructuredWordDocumentLoader,
@@ -33,52 +17,94 @@ def add_file_to_chroma(file_path, file_id, session_id,hugging_face_ef,db,logger)
33
  "html": UnstructuredHTMLLoader,
34
  "md": UnstructuredMarkdownLoader,
35
  }
 
36
  if extension not in loader_map:
37
  raise ValueError(f"Unsupported file type: {extension}")
38
 
39
- loader = loader_map[extension](file_path)
40
- documents = loader.load()
41
-
42
- text_splitter = RecursiveCharacterTextSplitter(
43
- chunk_size=1500,
44
- chunk_overlap=200,
45
- length_function=len,
46
- add_start_index=True
47
- )
48
- texts = text_splitter.split_documents(documents)
49
-
50
- # Add metadata
51
- for text in texts:
52
- text.metadata.update({"file_id": file_id, "session_id": session_id})
53
-
54
- # Save to ChromaDB
55
- db.add_documents(texts,embedding=hugging_face_ef)
56
-
57
- # delete file
58
- if os.path.exists(file_path):
59
- os.remove(file_path)
60
- logger.info(f"Added file '{file_path}' to ChromaDB for session '{session_id}'.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- def generate_query_response(query,session_id,db,llm_model,PROMPT_TEMPLATE):
63
- response = dict()
64
- top_related = db.similarity_search_with_relevance_scores(query,filter={"session_id": session_id},k=4)
65
-
66
- response['is_relevant'] = top_related[0][1] >= 0.6
67
-
68
- # filter chunks with score > 0.3
69
- # top_related = [chunk for chunk in top_related if chunk[1] > 0.3]
70
 
71
- context = "\n".join([chunk[0].page_content for chunk in top_related])
72
-
73
- prompt = PROMPT_TEMPLATE.format(context = context,query = query)
74
-
75
- # print(top_related)
76
-
77
- response['answer'] = llm_model.generate_content(prompt).text
78
- response['sources'] = [{
79
- "page_content":chunk[0].page_content,
80
- "score" : chunk[1],
81
- "metadata":chunk[0].metadata
82
- } for chunk in top_related]
83
-
84
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from langchain.text_splitter import RecursiveCharacterTextSplitter
2
  from langchain_community.document_loaders import PyPDFLoader,UnstructuredWordDocumentLoader,TextLoader,UnstructuredHTMLLoader,UnstructuredMarkdownLoader
3
  import os
4
+ import logging
5
 
6
+ logging.basicConfig(filename='log.txt', filemode='w', level=logging.DEBUG,
7
+ format='%(asctime)s - %(levelname)s - %(message)s')
8
+ logger = logging.getLogger()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ def add_file_to_chroma(file_path, file_id, hugging_face_ef, db, logger):
11
+ """Add file chunks to ChromaDB with advanced document handling."""
12
+ extension = file_path.split(".")[-1].lower()
13
  loader_map = {
14
  "pdf": PyPDFLoader,
15
  "docx": UnstructuredWordDocumentLoader,
 
17
  "html": UnstructuredHTMLLoader,
18
  "md": UnstructuredMarkdownLoader,
19
  }
20
+
21
  if extension not in loader_map:
22
  raise ValueError(f"Unsupported file type: {extension}")
23
 
24
+ try:
25
+ # Load document using appropriate loader
26
+ loader = loader_map[extension](file_path)
27
+ documents = loader.load()
28
+
29
+ # Split text into chunks
30
+ text_splitter = RecursiveCharacterTextSplitter(
31
+ chunk_size=1500,
32
+ chunk_overlap=200,
33
+ length_function=len,
34
+ add_start_index=True
35
+ )
36
+ texts = text_splitter.split_documents(documents)
37
+
38
+ # Add metadata
39
+ for text in texts:
40
+ text.metadata.update({
41
+ "file_id": str(file_id),
42
+ "file_name": os.path.basename(file_path),
43
+ "file_type": extension
44
+ })
45
+
46
+ # Save to ChromaDB
47
+ db.add_documents(texts, embedding=hugging_face_ef)
48
+
49
+ # Clean up uploaded file
50
+ if os.path.exists(file_path):
51
+ os.remove(file_path)
52
+
53
+ logger.info(f"Added file '{file_path}' to ChromaDB")
54
+ return True
55
+
56
+ except Exception as e:
57
+ logger.error(f"Error processing file {file_path}: {str(e)}")
58
+ if os.path.exists(file_path):
59
+ os.remove(file_path)
60
+ raise e
61
 
62
def remove_file_from_chroma(file_id, db):
    """Remove every chunk tagged with ``file_id`` from ChromaDB.

    Args:
        file_id: Identifier of the file; converted to ``str`` before being
            used in the metadata filter.
        db: Store object exposing ``get(where=...)`` and ``delete(ids=...)``.

    Returns:
        True if at least one chunk matched and was deleted; False if nothing
        matched or the store raised an error (the error is logged).
    """
    try:
        # Look up the ids of all chunks whose metadata carries this file_id.
        matches = db.get(where={"file_id": str(file_id)})
        chunk_ids = matches.get("ids") if matches else None

        if not chunk_ids:
            return False

        db.delete(ids=chunk_ids)
        return True

    except Exception:
        # logger.exception keeps the traceback, which a plain str(e) loses.
        logger.exception("Error removing file %s from ChromaDB", file_id)
        return False
76
+
77
def generate_query_response(query, db, llm_model, PROMPT_TEMPLATE):
    """Generate an answer for ``query`` from the documents in ChromaDB.

    Args:
        query: The user's question.
        db: Store exposing ``similarity_search_with_relevance_scores``.
        llm_model: Model exposing ``generate_content(prompt)`` whose result
            has a ``text`` attribute.
        PROMPT_TEMPLATE: Format string with ``{context}`` and ``{query}``
            placeholders.

    Returns:
        A dict with ``is_relevant``, ``answer`` and ``sources``. When the
        query is blank or processing fails, the dict also carries an
        ``error`` key and ``sources`` is an empty list, so every response
        has the same core keys.
    """
    # Do not spend an embedding lookup and an LLM call on an empty question.
    if not query or not str(query).strip():
        return {
            "is_relevant": False,
            "answer": "Please enter a question.",
            "error": "Empty query",
            "sources": [],
        }

    try:
        # Each hit is a (document, relevance_score) pair.
        top_related = db.similarity_search_with_relevance_scores(query, k=4)

        # Relevance is judged on the best hit only; bool() keeps the flag a
        # plain Python bool for JSON serialization.
        is_relevant = bool(top_related and top_related[0][1] >= 0.6)

        context = "\n".join(doc.page_content for doc, _score in top_related)
        prompt = PROMPT_TEMPLATE.format(context=context, query=query)
        answer = llm_model.generate_content(prompt).text

        return {
            "is_relevant": is_relevant,
            "answer": answer,
            "sources": [
                {
                    "page_content": doc.page_content,
                    "score": score,
                    "metadata": doc.metadata,
                }
                for doc, score in top_related
            ],
        }

    except Exception as e:
        # Best-effort: report the failure in the payload instead of raising,
        # but keep the traceback in the log.
        logger.exception("Error generating response")
        return {
            "is_relevant": False,
            "answer": "An error occurred while processing your query.",
            "error": str(e),
            "sources": [],
        }