oceansweep committed
Commit: a324812
Parent(s): 04171c3

Upload 3 files

App_Function_Libraries/RAG/ChromaDB_Library.py CHANGED
@@ -11,6 +11,7 @@ from itertools import islice
 #
 # Local Imports:
 from App_Function_Libraries.Chunk_Lib import chunk_for_embedding, chunk_options
+from App_Function_Libraries.DB.DB_Manager import get_unprocessed_media, mark_media_as_processed
 from App_Function_Libraries.DB.SQLite_DB import process_chunks
 from App_Function_Libraries.RAG.Embeddings_Create import create_embeddings_batch
 # FIXME - related to Chunking
@@ -47,6 +48,40 @@ embedding_api_url = config.get('Embeddings', 'api_url', fallback='')
 #
 # Functions:

+
+# Function to preprocess and store all existing content in the database
+def preprocess_all_content(database, create_contextualized=True, api_name="gpt-3.5-turbo"):
+    unprocessed_media = get_unprocessed_media(db=database)
+    total_media = len(unprocessed_media)
+
+    for index, row in enumerate(unprocessed_media, 1):
+        media_id, content, media_type, file_name = row
+        collection_name = f"{media_type}_{media_id}"
+
+        logger.info(f"Processing media {index} of {total_media}: ID {media_id}, Type {media_type}")
+
+        try:
+            process_and_store_content(
+                database=database,
+                content=content,
+                collection_name=collection_name,
+                media_id=media_id,
+                file_name=file_name or f"{media_type}_{media_id}",
+                create_embeddings=True,
+                create_contextualized=create_contextualized,
+                api_name=api_name
+            )
+
+            # Mark the media as processed in the database
+            mark_media_as_processed(database, media_id)
+
+            logger.info(f"Successfully processed media ID {media_id}")
+        except Exception as e:
+            logger.error(f"Error processing media ID {media_id}: {str(e)}")
+
+    logger.info("Finished preprocessing all unprocessed content")
+
+
 def batched(iterable, n):
     "Batch data into lists of length n. The last batch may be shorter."
     it = iter(iterable)
@@ -57,27 +92,55 @@ def batched(iterable, n):
         yield batch


-# FIXME - Fix summarization of entire document/storign in chunk issue
+def situate_context(api_name, doc_content: str, chunk_content: str) -> str:
+    doc_content_prompt = f"""
+    <document>
+    {doc_content}
+    </document>
+    """
+
+    chunk_context_prompt = f"""
+    \n\n\n\n\n
+    Here is the chunk we want to situate within the whole document
+    <chunk>
+    {chunk_content}
+    </chunk>
+
+    Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.
+    Answer only with the succinct context and nothing else.
+    """
+
+    response = summarize(chunk_context_prompt, doc_content_prompt, api_name, api_key=None, temp=0, system_message=None)
+    return response
+
+
 # FIXME - update all uses to reflect 'api_name' parameter
 def process_and_store_content(database, content: str, collection_name: str, media_id: int, file_name: str,
-                              create_embeddings: bool = False, create_summary: bool = False, api_name: str = None,
-                              chunk_options: Dict = None, embedding_provider: str = None,
+                              create_embeddings: bool = True, create_contextualized: bool = True, api_name: str = "gpt-3.5-turbo",
+                              chunk_options=None, embedding_provider: str = None,
                               embedding_model: str = None, embedding_api_url: str = None):
     try:
         logger.info(f"Processing content for media_id {media_id} in collection {collection_name}")

-        full_summary = None
-        if create_summary and api_name:
-            full_summary = summarize(content, None, api_name, None, None, None)
-
-        chunks = chunk_for_embedding(content, file_name, full_summary, chunk_options)
+        chunks = chunk_for_embedding(content, file_name, chunk_options)

         # Process chunks synchronously
         process_chunks(database, chunks, media_id)

         if create_embeddings:
-            texts = [chunk['text'] for chunk in chunks]
-            embeddings = create_embeddings_batch(texts, embedding_provider, embedding_model, embedding_api_url)
+            texts = []
+            contextualized_chunks = []
+            for chunk in chunks:
+                chunk_text = chunk['text']
+                if create_contextualized:
+                    context = situate_context(api_name, content, chunk_text)
+                    contextualized_text = f"{chunk_text}\n\nContextual Summary: {context}"
+                    contextualized_chunks.append(contextualized_text)
+                else:
+                    contextualized_chunks.append(chunk_text)
+                texts.append(chunk_text)  # Store original text for database
+
+            embeddings = create_embeddings_batch(contextualized_chunks, embedding_provider, embedding_model, embedding_api_url)
             ids = [f"{media_id}_chunk_{i}" for i in range(1, len(chunks) + 1)]
             metadatas = [{
                 "media_id": str(media_id),
@@ -85,11 +148,17 @@ def process_and_store_content(database, content: str, collection_name: str, medi
                 "total_chunks": len(chunks),
                 "start_index": int(chunk['metadata']['start_index']),
                 "end_index": int(chunk['metadata']['end_index']),
-                "file_name": str(file_name),
-                "relative_position": float(chunk['metadata']['relative_position'])
+                "file_name": str(chunk['metadata']['file_name']),
+                "relative_position": float(chunk['metadata']['relative_position']),
+                "contextualized": create_contextualized,
+                "original_text": chunk['text'],
+                "contextual_summary": contextualized_chunks[i-1].split("\n\nContextual Summary: ")[-1] if create_contextualized else ""
             } for i, chunk in enumerate(chunks, 1)]

-            store_in_chroma(collection_name, texts, embeddings, ids, metadatas)
+            store_in_chroma(collection_name, contextualized_chunks, embeddings, ids, metadatas)
+
+            # Mark the media as processed
+            mark_media_as_processed(database, media_id)

         # Update full-text search index
         database.execute_query(
@@ -168,11 +237,13 @@ def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[Lis

         # Verify storage
         for doc_id in ids:
-            result = collection.get(ids=[doc_id], include=["embeddings"])
+            result = collection.get(ids=[doc_id], include=["documents", "embeddings", "metadatas"])
             if not result['embeddings'] or result['embeddings'][0] is None:
                 logging.error(f"Failed to store embedding for {doc_id}")
             else:
                 logging.info(f"Embedding stored successfully for {doc_id}")
+                logging.debug(f"Stored document: {result['documents'][0][:100]}...")
+                logging.debug(f"Stored metadata: {result['metadatas'][0]}")

     except Exception as e:
         logging.error(f"Error storing embeddings in ChromaDB: {str(e)}")
@@ -194,9 +265,9 @@ def vector_search(collection_name: str, query: str, k: int = 10) -> List[Dict[st
         logging.error(f"Error in vector_search: {str(e)}")
         raise

-def schedule_embedding(media_id: int, content: str, media_name: str, summary: str):
+def schedule_embedding(media_id: int, content: str, media_name: str):
     try:
-        chunks = chunk_for_embedding(content, media_name, summary, chunk_options)
+        chunks = chunk_for_embedding(content, media_name, chunk_options)
         texts = [chunk['text'] for chunk in chunks]
         embeddings = create_embeddings_batch(texts, embedding_provider, embedding_model, embedding_api_url)
         ids = [f"{media_id}_chunk_{i}" for i in range(len(chunks))]
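
For orientation, a minimal sketch of how the new contextual-embedding path above might be driven; the `db` handle and the `backfill_unprocessed_media` wrapper are hypothetical placeholders, not part of this commit:

# Sketch only: drive the preprocessing path added in ChromaDB_Library.py.
# Assumes `db` is an already-open handle from the app's SQLite DB layer (hypothetical here).
from App_Function_Libraries.RAG.ChromaDB_Library import preprocess_all_content

def backfill_unprocessed_media(db):
    # Per the diff above: chunk each unprocessed media item, run situate_context()
    # per chunk, embed the contextualized text, store in ChromaDB, then mark processed.
    preprocess_all_content(db, create_contextualized=True, api_name="gpt-3.5-turbo")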
App_Function_Libraries/RAG/Embeddings_Create.py CHANGED
@@ -35,7 +35,6 @@ overlap = loaded_config['Embeddings']['overlap']

 # FIXME - Add logging

-
 class HuggingFaceEmbedder:
     def __init__(self, model_name, timeout_seconds=120):  # Default timeout of 2 minutes
         self.model_name = model_name
@@ -154,6 +153,7 @@ List[List[float]]:
     else:
         raise ValueError(f"Unsupported embedding provider: {provider}")

+
 def create_embedding(text: str, provider: str, model: str, api_url: str) -> List[float]:
     return create_embeddings_batch([text], provider, model, api_url)[0]

@@ -185,40 +185,6 @@ def create_openai_embedding(text: str, model: str) -> List[float]:
     embedding = get_openai_embeddings(text, model)
     return embedding

-
-
-
-#Dead
-# def create_local_embedding(text: str, model: str, api_url: str, api_key: str) -> List[float]:
-#     response = requests.post(
-#         api_url,
-#         json={"text": text, "model": model},
-#         headers={"Authorization": f"Bearer {api_key}"}
-#     )
-#     response.raise_for_status()
-#     return response.json().get('embedding', None)
-
-# Dead
-# def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
-#     response = requests.post(
-#         api_url,
-#         json={"input": text}
-#     )
-#     response.raise_for_status()
-#     return response.json()['embedding']
-
-# dead
-# def create_huggingface_embedding(text: str, model: str) -> List[float]:
-#     tokenizer = AutoTokenizer.from_pretrained(model)
-#     model = AutoModel.from_pretrained(model)
-#
-#     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-#     with torch.no_grad():
-#         outputs = model(**inputs)
-#
-#     embeddings = outputs.last_hidden_state.mean(dim=1)
-#     return embeddings[0].tolist()
-
 #
 # End of File.
 #######################################################################################################################
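
Beyond dead-code removal, this file's diff leaves `create_embedding` as a thin wrapper over `create_embeddings_batch`. A hedged usage sketch; the provider, model, and api_url values below are illustrative stand-ins for what config.txt actually supplies:

# Sketch only: embed a single string via the batch code path shown above.
# "openai" / "text-embedding-3-small" are example values, not repo defaults.
from App_Function_Libraries.RAG.Embeddings_Create import create_embedding

vector = create_embedding("example chunk text", "openai", "text-embedding-3-small", "")
print(len(vector))  # dimensionality of the returned embedding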
App_Function_Libraries/RAG/RAG_Libary_2.py CHANGED
@@ -9,8 +9,7 @@ from typing import Dict, Any, List, Optional
 # Local Imports
 from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
 from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_article
-from App_Function_Libraries.DB.DB_Manager import add_media_to_database, search_db, get_unprocessed_media, \
-    fetch_keywords_for_media
+from App_Function_Libraries.DB.DB_Manager import search_db, fetch_keywords_for_media
 from App_Function_Libraries.Utils.Utils import load_comprehensive_config
 #
 # 3rd-Party Imports
@@ -32,71 +31,79 @@ config = configparser.ConfigParser()
 # Read the configuration file
 config.read('config.txt')

-# Main RAG pipeline function
-def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
-    try:
-        # Extract content
-        try:
-            article_data = scrape_article(url)
-            content = article_data['content']
-            title = article_data['title']
-        except Exception as e:
-            logging.error(f"Error scraping article: {str(e)}")
-            return {"error": "Failed to scrape article", "details": str(e)}
-
-        # Store the article in the database and get the media_id
-        try:
-            media_id = add_media_to_database(url, title, 'article', content)
-        except Exception as e:
-            logging.error(f"Error adding article to database: {str(e)}")
-            return {"error": "Failed to store article in database", "details": str(e)}
-
-        # Process and store content
-        collection_name = f"article_{media_id}"
-        try:
-            # FIXME
-            # def process_and_store_content(content: str, collection_name: str, media_id: int, file_name: str,
-            #                               create_embeddings: bool = False, create_summary: bool = False,
-            #                               api_name: str = None):
-            process_and_store_content(content, collection_name, media_id, title)
-        except Exception as e:
-            logging.error(f"Error processing and storing content: {str(e)}")
-            return {"error": "Failed to process and store content", "details": str(e)}
-
-        # Perform searches
-        try:
-            vector_results = vector_search(collection_name, query, k=5)
-            fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
-        except Exception as e:
-            logging.error(f"Error performing searches: {str(e)}")
-            return {"error": "Failed to perform searches", "details": str(e)}
-
-        # Combine results with error handling for missing 'content' key
-        all_results = []
-        for result in vector_results + fts_results:
-            if isinstance(result, dict) and 'content' in result:
-                all_results.append(result['content'])
-            else:
-                logging.warning(f"Unexpected result format: {result}")
-                all_results.append(str(result))
-
-        context = "\n".join(all_results)
-
-        # Generate answer using the selected API
-        try:
-            answer = generate_answer(api_choice, context, query)
-        except Exception as e:
-            logging.error(f"Error generating answer: {str(e)}")
-            return {"error": "Failed to generate answer", "details": str(e)}
-
-        return {
-            "answer": answer,
-            "context": context
-        }
-
-    except Exception as e:
-        logging.error(f"Unexpected error in rag_pipeline: {str(e)}")
-        return {"error": "An unexpected error occurred", "details": str(e)}
+# RAG pipeline function for web scraping
+# def rag_web_scraping_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
+#     try:
+#         # Extract content
+#         try:
+#             article_data = scrape_article(url)
+#             content = article_data['content']
+#             title = article_data['title']
+#         except Exception as e:
+#             logging.error(f"Error scraping article: {str(e)}")
+#             return {"error": "Failed to scrape article", "details": str(e)}
+#
+#         # Store the article in the database and get the media_id
+#         try:
+#             media_id = add_media_to_database(url, title, 'article', content)
+#         except Exception as e:
+#             logging.error(f"Error adding article to database: {str(e)}")
+#             return {"error": "Failed to store article in database", "details": str(e)}
+#
+#         # Process and store content
+#         collection_name = f"article_{media_id}"
+#         try:
+#             # Assuming you have a database object available, let's call it 'db'
+#             db = get_database_connection()
+#
+#             process_and_store_content(
+#                 database=db,
+#                 content=content,
+#                 collection_name=collection_name,
+#                 media_id=media_id,
+#                 file_name=title,
+#                 create_embeddings=True,
+#                 create_contextualized=True,
+#                 api_name=api_choice
+#             )
+#         except Exception as e:
+#             logging.error(f"Error processing and storing content: {str(e)}")
+#             return {"error": "Failed to process and store content", "details": str(e)}
+#
+#         # Perform searches
+#         try:
+#             vector_results = vector_search(collection_name, query, k=5)
+#             fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)
+#         except Exception as e:
+#             logging.error(f"Error performing searches: {str(e)}")
+#             return {"error": "Failed to perform searches", "details": str(e)}
+#
+#         # Combine results with error handling for missing 'content' key
+#         all_results = []
+#         for result in vector_results + fts_results:
+#             if isinstance(result, dict) and 'content' in result:
+#                 all_results.append(result['content'])
+#             else:
+#                 logging.warning(f"Unexpected result format: {result}")
+#                 all_results.append(str(result))
+#
+#         context = "\n".join(all_results)
+#
+#         # Generate answer using the selected API
+#         try:
+#             answer = generate_answer(api_choice, context, query)
+#         except Exception as e:
+#             logging.error(f"Error generating answer: {str(e)}")
+#             return {"error": "Failed to generate answer", "details": str(e)}
+#
+#         return {
+#             "answer": answer,
+#             "context": context
+#         }
+#
+#     except Exception as e:
+#         logging.error(f"Unexpected error in rag_pipeline: {str(e)}")
+#         return {"error": "An unexpected error occurred", "details": str(e)}



@@ -213,21 +220,6 @@ def generate_answer(api_choice: str, context: str, query: str) -> str:
     else:
         raise ValueError(f"Unsupported API choice: {api_choice}")

-# Function to preprocess and store all existing content in the database
-def preprocess_all_content():
-    unprocessed_media = get_unprocessed_media()
-    for row in unprocessed_media:
-        media_id = row[0]
-        content = row[1]
-        media_type = row[2]
-        collection_name = f"{media_type}_{media_id}"
-        # FIXME
-        # def process_and_store_content(content: str, collection_name: str, media_id: int, file_name: str,
-        #                               create_embeddings: bool = False, create_summary: bool = False,
-        #                               api_name: str = None):
-        process_and_store_content(content, collection_name, media_id, "")
-
-
 def perform_vector_search(query: str, relevant_media_ids: List[str] = None) -> List[Dict[str, Any]]:
     all_collections = chroma_client.list_collections()
     vector_results = []
@@ -303,30 +295,42 @@ def extract_media_id_from_result(result: str) -> Optional[int]:
         logging.error(f"Failed to extract media_id from result: {result}")
         return None

+#
+#
+########################################################################################################################


-
-# Example usage:
-# 1. Initialize the system:
-# create_tables(db) # Ensure FTS tables are set up
+# Function to preprocess and store all existing content in the database
+# def preprocess_all_content(database, create_contextualized=True, api_name="gpt-3.5-turbo"):
+#     unprocessed_media = get_unprocessed_media()
+#     total_media = len(unprocessed_media)
 #
-# 2. Create ChromaDB
-# chroma_client = ChromaDBClient()
+#     for index, row in enumerate(unprocessed_media, 1):
+#         media_id, content, media_type, file_name = row
+#         collection_name = f"{media_type}_{media_id}"
 #
-# 3. Create Embeddings
-# Store embeddings in ChromaDB
-# preprocess_all_content() or create_embeddings()
+#         logger.info(f"Processing media {index} of {total_media}: ID {media_id}, Type {media_type}")
 #
-# 4. Perform RAG search across all content:
-# result = rag_search("What are the key points about climate change?")
-# print(result['answer'])
+#         try:
+#             process_and_store_content(
+#                 database=database,
+#                 content=content,
+#                 collection_name=collection_name,
+#                 media_id=media_id,
+#                 file_name=file_name or f"{media_type}_{media_id}",
+#                 create_embeddings=True,
+#                 create_contextualized=create_contextualized,
+#                 api_name=api_name
+#             )
 #
-# (Extra)5. Perform RAG on a specific URL:
-# result = rag_pipeline("https://example.com/article", "What is the main topic of this article?")
-# print(result['answer'])
+#             # Mark the media as processed in the database
+#             mark_media_as_processed(database, media_id)
 #
-########################################################################################################################
-
+#             logger.info(f"Successfully processed media ID {media_id}")
+#         except Exception as e:
+#             logger.error(f"Error processing media ID {media_id}: {str(e)}")
+#
+#     logger.info("Finished preprocessing all unprocessed content")

 ############################################################################################################
 #
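
With `rag_pipeline` and the old `preprocess_all_content` now commented out, `perform_vector_search` remains the live search entry point in this module. A hedged usage sketch; the result shape and any `relevant_media_ids` filtering semantics are assumed from the signature only:

# Sketch only: query across all ChromaDB collections.
# The structure of each returned dict is not shown in this diff.
from App_Function_Libraries.RAG.RAG_Libary_2 import perform_vector_search

hits = perform_vector_search("key points about climate change")
for hit in hits[:3]:
    print(hit)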