oceansweep committed
Commit cfbac61
1 Parent(s): 0c0fe5b

Upload 6 files
App_Function_Libraries/RAG/ChromaDB_Library.py ADDED
@@ -0,0 +1,287 @@
import configparser
import logging
import sqlite3
from typing import List, Dict, Any

import chromadb
import requests
from chromadb import Settings

from App_Function_Libraries.Chunk_Lib import improved_chunking_process
from App_Function_Libraries.DB.DB_Manager import add_media_chunk, update_fts_for_media
from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings

#######################################################################################################################
#
# Functions for ChromaDB

# Get ChromaDB settings
# Load configuration
config = configparser.ConfigParser()
config.read('config.txt')
chroma_db_path = config.get('Database', 'chroma_db_path', fallback='chroma_db')
chroma_client = chromadb.PersistentClient(path=chroma_db_path, settings=Settings(anonymized_telemetry=False))

# Get embedding settings
embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
embedding_model = config.get('Embeddings', 'model', fallback='text-embedding-3-small')
embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
embedding_api_url = config.get('Embeddings', 'api_url', fallback='')

# Get chunking options
chunk_options = {
    'method': config.get('Chunking', 'method', fallback='words'),
    'max_size': config.getint('Chunking', 'max_size', fallback=400),
    'overlap': config.getint('Chunking', 'overlap', fallback=200),
    'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
    'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
    'language': config.get('Chunking', 'language', fallback='english')
}

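# The reads above assume a config.txt along these lines. This is a sketch, not a file shipped
# in this commit; the values shown are simply the fallbacks used in the code:
#
# [Database]
# chroma_db_path = chroma_db
#
# [Embeddings]
# provider = openai
# model = text-embedding-3-small
# api_key = <provider API key>
# api_url = <endpoint URL, used when provider = local>
#
# [Chunking]
# method = words
# max_size = 400
# overlap = 200
# adaptive = False
# multi_level = False
# language = english
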
def auto_update_chroma_embeddings(media_id: int, content: str):
    """
    Automatically update ChromaDB embeddings when a new item is ingested into the SQLite database.

    :param media_id: The ID of the newly ingested media item
    :param content: The content of the newly ingested media item
    """
    collection_name = f"media_{media_id}"

    # Initialize or get the ChromaDB collection
    collection = chroma_client.get_or_create_collection(name=collection_name)

    # Check if embeddings already exist for this media_id.
    # Chunk IDs are assigned sequentially in process_and_store_content, so the presence of
    # chunk 0 means the item has already been embedded.
    existing_embeddings = collection.get(ids=[f"{media_id}_chunk_0"])

    if existing_embeddings['ids']:
        logging.info(f"Embeddings already exist for media ID {media_id}, skipping...")
    else:
        # Process and store content if embeddings do not already exist
        process_and_store_content(content, collection_name, media_id)
        logging.info(f"Updated ChromaDB embeddings for media ID: {media_id}")


# Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite
def process_and_store_content(content: str, collection_name: str, media_id: int):
    # Process the content into chunks
    chunks = improved_chunking_process(content, chunk_options)
    texts = [chunk['text'] for chunk in chunks]

    # Generate embeddings for each chunk
    embeddings = [create_embedding(text) for text in texts]

    # Create unique IDs for each chunk using the media_id and chunk index
    ids = [f"{media_id}_chunk_{i}" for i in range(len(texts))]

    # Store the texts, embeddings, and IDs in ChromaDB
    store_in_chroma(collection_name, texts, embeddings, ids)

    # Store the chunk metadata in SQLite
    for i, chunk in enumerate(chunks):
        add_media_chunk(media_id, chunk['text'], chunk['start'], chunk['end'], ids[i])

    # Update the FTS table
    update_fts_for_media(media_id)


# Function to store documents and their embeddings in ChromaDB
def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str]):
    collection = chroma_client.get_or_create_collection(name=collection_name)
    collection.add(
        documents=texts,
        embeddings=embeddings,
        ids=ids
    )


# Function to perform vector search using ChromaDB
def vector_search(collection_name: str, query: str, k: int = 10) -> List[str]:
    query_embedding = create_embedding(query)
    collection = chroma_client.get_collection(name=collection_name)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k
    )
    return results['documents'][0]

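# A minimal usage sketch for the two functions above (hypothetical media ID and text; assumes
# the corresponding row already exists in the SQLite Media table):
#
# media_id = 42
# collection_name = f"media_{media_id}"
# process_and_store_content("Full article or transcript text ...", collection_name, media_id)
# top_chunks = vector_search(collection_name, "What does the article say about pricing?", k=5)
# for chunk in top_chunks:
#     print(chunk)
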
def create_embedding(text: str) -> List[float]:
    global embedding_provider, embedding_model, embedding_api_url, embedding_api_key

    if embedding_provider == 'openai':
        return get_openai_embeddings(text, embedding_model)
    elif embedding_provider == 'local':
        response = requests.post(
            embedding_api_url,
            json={"text": text, "model": embedding_model},
            headers={"Authorization": f"Bearer {embedding_api_key}"}
        )
        return response.json()['embedding']
    elif embedding_provider == 'huggingface':
        from transformers import AutoTokenizer, AutoModel
        import torch

        # Note: the tokenizer and model are re-loaded on every call here; consider caching them
        # if this provider is used heavily.
        tokenizer = AutoTokenizer.from_pretrained(embedding_model)
        model = AutoModel.from_pretrained(embedding_model)

        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the mean of the last hidden state as the sentence embedding
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings[0].tolist()  # Convert to list for consistency
    else:
        raise ValueError(f"Unsupported embedding provider: {embedding_provider}")

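# For the 'local' provider branch above, any HTTP endpoint that accepts {"text": ..., "model": ...}
# and returns {"embedding": [...]} will work. A hypothetical server sketch (not part of this repo;
# assumes Flask and sentence-transformers are installed):
#
# from flask import Flask, jsonify, request
# from sentence_transformers import SentenceTransformer
#
# app = Flask(__name__)
# st_model = SentenceTransformer("all-MiniLM-L6-v2")
#
# @app.route("/embed", methods=["POST"])
# def embed():
#     payload = request.get_json()
#     vector = st_model.encode(payload["text"]).tolist()
#     return jsonify({"embedding": vector})
#
# # Point config.txt's [Embeddings] api_url at this endpoint, e.g. http://localhost:5000/embed
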
def create_all_embeddings(api_choice: str, model_or_url: str) -> str:
    try:
        all_content = get_all_content_from_database()

        if not all_content:
            return "No content found in the database."

        texts_to_embed = []
        embeddings_to_store = []
        ids_to_store = []
        collection_name = "all_content_embeddings"

        # Initialize or get the ChromaDB collection
        collection = chroma_client.get_or_create_collection(name=collection_name)

        for content_item in all_content:
            media_id = content_item['id']
            text = content_item['content']

            # Check if the embedding already exists in ChromaDB
            embedding_exists = collection.get(ids=[f"doc_{media_id}"])

            if embedding_exists['ids']:
                logging.info(f"Embedding already exists for media ID {media_id}, skipping...")
                continue  # Skip if embedding already exists

            # Create the embedding
            if api_choice == "openai":
                embedding = create_openai_embedding(text, model_or_url)
            else:  # Llama.cpp
                embedding = create_llamacpp_embedding(text, model_or_url)

            # Collect the text, embedding, and ID for batch storage
            texts_to_embed.append(text)
            embeddings_to_store.append(embedding)
            ids_to_store.append(f"doc_{media_id}")

        # Store all new embeddings in ChromaDB
        if texts_to_embed and embeddings_to_store:
            store_in_chroma(collection_name, texts_to_embed, embeddings_to_store, ids_to_store)

        return "Embeddings created and stored successfully for all new content."
    except Exception as e:
        logging.error(f"Error during embedding creation: {str(e)}")
        return f"Error: {str(e)}"


def create_openai_embedding(text: str, model: str) -> List[float]:
    openai_api_key = config['API']['openai_api_key']
    embedding = get_openai_embeddings(text, model)
    return embedding


def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
    response = requests.post(
        api_url,
        json={"input": text}
    )
    if response.status_code == 200:
        return response.json()['embedding']
    else:
        raise Exception(f"Error from Llama.cpp API: {response.text}")


def get_all_content_from_database() -> List[Dict[str, Any]]:
    """
    Retrieve all media content from the database that requires embedding.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries, each containing the media ID, content, title, and other relevant fields.
    """
    try:
        from App_Function_Libraries.DB.DB_Manager import db
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, content, title, author, type
                FROM Media
                WHERE is_trash = 0  -- Exclude items marked as trash
            """)
            media_items = cursor.fetchall()

            # Convert the results into a list of dictionaries
            all_content = [
                {
                    'id': item[0],
                    'content': item[1],
                    'title': item[2],
                    'author': item[3],
                    'type': item[4]
                }
                for item in media_items
            ]

        return all_content

    except sqlite3.Error as e:
        logging.error(f"Error retrieving all content from database: {e}")
        from App_Function_Libraries.DB.SQLite_DB import DatabaseError
        raise DatabaseError(f"Error retrieving all content from database: {e}")


def store_in_chroma_with_citation(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str], sources: List[str]):
    collection = chroma_client.get_or_create_collection(name=collection_name)
    collection.add(
        documents=texts,
        embeddings=embeddings,
        ids=ids,
        metadatas=[{'source': source} for source in sources]
    )


def check_embedding_status(selected_item):
    if not selected_item:
        return "Please select an item", ""
    item_id = selected_item.split('(')[0].strip()
    collection = chroma_client.get_or_create_collection(name="all_content_embeddings")
    # Ask Chroma to return the stored embeddings explicitly; get() omits them by default.
    result = collection.get(ids=[f"doc_{item_id}"], include=["embeddings"])
    if result['ids']:
        embedding = result['embeddings'][0]
        embedding_preview = str(embedding[:50])  # Convert first 50 elements to string
        return f"Embedding exists for item: {item_id}", f"Embedding preview: {embedding_preview}..."
    else:
        return f"No embedding found for item: {item_id}", ""


def create_new_embedding(selected_item, api_choice, openai_model, llamacpp_url):
    if not selected_item:
        return "Please select an item"
    item_id = selected_item.split('(')[0].strip()
    items = get_all_content_from_database()
    item = next((item for item in items if item['title'] == item_id), None)
    if not item:
        return f"Item not found: {item_id}"

    try:
        if api_choice == "OpenAI":
            embedding = create_embedding(item['content'])
        else:  # Llama.cpp
            # Both branches currently delegate to create_embedding, which picks the provider
            # from config.txt; api_choice, openai_model and llamacpp_url are unused here.
            embedding = create_embedding(item['content'])

        collection_name = "all_content_embeddings"
        store_in_chroma(collection_name, [item['content']], [embedding], [f"doc_{item['id']}"])
        return f"New embedding created and stored for item: {item_id}"
    except Exception as e:
        return f"Error creating embedding: {str(e)}"


#
# End of Functions for ChromaDB
#######################################################################################################################
App_Function_Libraries/RAG/RAG_Examples.md ADDED
@@ -0,0 +1,556 @@

```
##################################################################################################################
# RAG Pipeline 1
# 0.62 0.61 0.75 63402.0
# from langchain_openai import ChatOpenAI
#
# from langchain_community.document_loaders import WebBaseLoader
# from langchain_openai import OpenAIEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_chroma import Chroma
#
# from langchain_community.retrievers import BM25Retriever
# from langchain.retrievers import ParentDocumentRetriever
# from langchain.storage import InMemoryStore
# import os
# from operator import itemgetter
# from langchain import hub
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
# from langchain.retrievers import MergerRetriever
# from langchain.retrievers.document_compressors import DocumentCompressorPipeline


# def rag_pipeline():
#     try:
#         def format_docs(docs):
#             return "\n".join(doc.page_content for doc in docs)
#
#         llm = ChatOpenAI(model='gpt-4o-mini')
#
#         loader = WebBaseLoader('https://en.wikipedia.org/wiki/European_debt_crisis')
#         docs = loader.load()
#
#         embedding = OpenAIEmbeddings(model='text-embedding-3-large')
#
#         splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=200)
#         splits = splitter.split_documents(docs)
#         c = Chroma.from_documents(documents=splits, embedding=embedding,
#                                   collection_name='testindex-ragbuilder-1724657573', )
#         retrievers = []
#         retriever = c.as_retriever(search_type='mmr', search_kwargs={'k': 10})
#         retrievers.append(retriever)
#         retriever = BM25Retriever.from_documents(docs)
#         retrievers.append(retriever)
#
#         parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=600)
#         splits = parent_splitter.split_documents(docs)
#         store = InMemoryStore()
#         retriever = ParentDocumentRetriever(vectorstore=c, docstore=store, child_splitter=splitter,
#                                             parent_splitter=parent_splitter)
#         retriever.add_documents(docs)
#         retrievers.append(retriever)
#         retriever = MergerRetriever(retrievers=retrievers)
#         prompt = hub.pull("rlm/rag-prompt")
#         rag_chain = (
#             RunnableParallel(context=retriever, question=RunnablePassthrough())
#             .assign(context=itemgetter("context") | RunnableLambda(format_docs))
#             .assign(answer=prompt | llm | StrOutputParser())
#             .pick(["answer", "context"]))
#         return rag_chain
#     except Exception as e:
#         print(f"An error occurred: {e}")


# To get the answer and context, use the following code
# res = rag_pipeline().invoke("your prompt here")
# print(res["answer"])
# print(res["context"])

############################################################################################################


############################################################################################################
# RAG Pipeline 2

# 0.6 0.73 0.68 3125.0
# from langchain_openai import ChatOpenAI
#
# from langchain_community.document_loaders import WebBaseLoader
# from langchain_openai import OpenAIEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_chroma import Chroma
# from langchain.retrievers.multi_query import MultiQueryRetriever
# from langchain.retrievers import ParentDocumentRetriever
# from langchain.storage import InMemoryStore
# from langchain_community.document_transformers import EmbeddingsRedundantFilter
# from langchain.retrievers.document_compressors import LLMChainFilter
# from langchain.retrievers.document_compressors import EmbeddingsFilter
# from langchain.retrievers import ContextualCompressionRetriever
# import os
# from operator import itemgetter
# from langchain import hub
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
# from langchain.retrievers import MergerRetriever
# from langchain.retrievers.document_compressors import DocumentCompressorPipeline


# def rag_pipeline():
#     try:
#         def format_docs(docs):
#             return "\n".join(doc.page_content for doc in docs)
#
#         llm = ChatOpenAI(model='gpt-4o-mini')
#
#         loader = WebBaseLoader('https://en.wikipedia.org/wiki/European_debt_crisis')
#         docs = loader.load()
#
#         embedding = OpenAIEmbeddings(model='text-embedding-3-large')
#
#         splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=200)
#         splits = splitter.split_documents(docs)
#         c = Chroma.from_documents(documents=splits, embedding=embedding,
#                                   collection_name='testindex-ragbuilder-1724650962', )
#         retrievers = []
#         retriever = MultiQueryRetriever.from_llm(c.as_retriever(search_type='similarity', search_kwargs={'k': 10}),
#                                                  llm=llm)
#         retrievers.append(retriever)
#
#         parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=600)
#         splits = parent_splitter.split_documents(docs)
#         store = InMemoryStore()
#         retriever = ParentDocumentRetriever(vectorstore=c, docstore=store, child_splitter=splitter,
#                                             parent_splitter=parent_splitter)
#         retriever.add_documents(docs)
#         retrievers.append(retriever)
#         retriever = MergerRetriever(retrievers=retrievers)
#         arr_comp = []
#         arr_comp.append(EmbeddingsRedundantFilter(embeddings=embedding))
#         arr_comp.append(LLMChainFilter.from_llm(llm))
#         pipeline_compressor = DocumentCompressorPipeline(transformers=arr_comp)
#         retriever = ContextualCompressionRetriever(base_retriever=retriever, base_compressor=pipeline_compressor)
#         prompt = hub.pull("rlm/rag-prompt")
#         rag_chain = (
#             RunnableParallel(context=retriever, question=RunnablePassthrough())
#             .assign(context=itemgetter("context") | RunnableLambda(format_docs))
#             .assign(answer=prompt | llm | StrOutputParser())
#             .pick(["answer", "context"]))
#         return rag_chain
#     except Exception as e:
#         print(f"An error occurred: {e}")


# To get the answer and context, use the following code
# res = rag_pipeline().invoke("your prompt here")
# print(res["answer"])
# print(res["context"])

#
#
#
############################################################################################################
# Plain bm25 retriever
# class BM25Retriever(BaseRetriever):
#     """`BM25` retriever without Elasticsearch."""
#
#     vectorizer: Any
#     """ BM25 vectorizer."""
#     docs: List[Document] = Field(repr=False)
#     """ List of documents."""
#     k: int = 4
#     """ Number of documents to return."""
#     preprocess_func: Callable[[str], List[str]] = default_preprocessing_func
#     """ Preprocessing function to use on the text before BM25 vectorization."""
#
#     class Config:
#         arbitrary_types_allowed = True
#
#     @classmethod
#     def from_texts(
#         cls,
#         texts: Iterable[str],
#         metadatas: Optional[Iterable[dict]] = None,
#         bm25_params: Optional[Dict[str, Any]] = None,
#         preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
#         **kwargs: Any,
#     ) -> BM25Retriever:
#         """
#         Create a BM25Retriever from a list of texts.
#         Args:
#             texts: A list of texts to vectorize.
#             metadatas: A list of metadata dicts to associate with each text.
#             bm25_params: Parameters to pass to the BM25 vectorizer.
#             preprocess_func: A function to preprocess each text before vectorization.
#             **kwargs: Any other arguments to pass to the retriever.
#
#         Returns:
#             A BM25Retriever instance.
#         """
#         try:
#             from rank_bm25 import BM25Okapi
#         except ImportError:
#             raise ImportError(
#                 "Could not import rank_bm25, please install with `pip install "
#                 "rank_bm25`."
#             )
#
#         texts_processed = [preprocess_func(t) for t in texts]
#         bm25_params = bm25_params or {}
#         vectorizer = BM25Okapi(texts_processed, **bm25_params)
#         metadatas = metadatas or ({} for _ in texts)
#         docs = [Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)]
#         return cls(
#             vectorizer=vectorizer, docs=docs, preprocess_func=preprocess_func, **kwargs
#         )
#
#     @classmethod
#     def from_documents(
#         cls,
#         documents: Iterable[Document],
#         *,
#         bm25_params: Optional[Dict[str, Any]] = None,
#         preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
#         **kwargs: Any,
#     ) -> BM25Retriever:
#         """
#         Create a BM25Retriever from a list of Documents.
#         Args:
#             documents: A list of Documents to vectorize.
#             bm25_params: Parameters to pass to the BM25 vectorizer.
#             preprocess_func: A function to preprocess each text before vectorization.
#             **kwargs: Any other arguments to pass to the retriever.
#
#         Returns:
#             A BM25Retriever instance.
#         """
#         texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
#         return cls.from_texts(
#             texts=texts,
#             bm25_params=bm25_params,
#             metadatas=metadatas,
#             preprocess_func=preprocess_func,
#             **kwargs,
#         )
#
#     def _get_relevant_documents(
#         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
#     ) -> List[Document]:
#         processed_query = self.preprocess_func(query)
#         return_docs = self.vectorizer.get_top_n(processed_query, self.docs, n=self.k)
#         return return_docs
############################################################################################################

############################################################################################################
# ElasticSearch BM25 Retriever
# class ElasticSearchBM25Retriever(BaseRetriever):
#     """`Elasticsearch` retriever that uses `BM25`.
#
#     To connect to an Elasticsearch instance that requires login credentials,
#     including Elastic Cloud, use the Elasticsearch URL format
#     https://username:password@es_host:9243. For example, to connect to Elastic
#     Cloud, create the Elasticsearch URL with the required authentication details and
#     pass it to the ElasticVectorSearch constructor as the named parameter
#     elasticsearch_url.
#
#     You can obtain your Elastic Cloud URL and login credentials by logging in to the
#     Elastic Cloud console at https://cloud.elastic.co, selecting your deployment, and
#     navigating to the "Deployments" page.
#
#     To obtain your Elastic Cloud password for the default "elastic" user:
#
#     1. Log in to the Elastic Cloud console at https://cloud.elastic.co
#     2. Go to "Security" > "Users"
#     3. Locate the "elastic" user and click "Edit"
#     4. Click "Reset password"
#     5. Follow the prompts to reset the password
#
#     The format for Elastic Cloud URLs is
#     https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.
#     """
#
#     client: Any
#     """Elasticsearch client."""
#     index_name: str
#     """Name of the index to use in Elasticsearch."""
#
#     @classmethod
#     def create(
#         cls, elasticsearch_url: str, index_name: str, k1: float = 2.0, b: float = 0.75
#     ) -> ElasticSearchBM25Retriever:
#         """
#         Create a ElasticSearchBM25Retriever from a list of texts.
#
#         Args:
#             elasticsearch_url: URL of the Elasticsearch instance to connect to.
#             index_name: Name of the index to use in Elasticsearch.
#             k1: BM25 parameter k1.
#             b: BM25 parameter b.
#
#         Returns:
#
#         """
#         from elasticsearch import Elasticsearch
#
#         # Create an Elasticsearch client instance
#         es = Elasticsearch(elasticsearch_url)
#
#         # Define the index settings and mappings
#         settings = {
#             "analysis": {"analyzer": {"default": {"type": "standard"}}},
#             "similarity": {
#                 "custom_bm25": {
#                     "type": "BM25",
#                     "k1": k1,
#                     "b": b,
#                 }
#             },
#         }
#         mappings = {
#             "properties": {
#                 "content": {
#                     "type": "text",
#                     "similarity": "custom_bm25",  # Use the custom BM25 similarity
#                 }
#             }
#         }
#
#         # Create the index with the specified settings and mappings
#         es.indices.create(index=index_name, mappings=mappings, settings=settings)
#         return cls(client=es, index_name=index_name)
#
#     def add_texts(
#         self,
#         texts: Iterable[str],
#         refresh_indices: bool = True,
#     ) -> List[str]:
#         """Run more texts through the embeddings and add to the retriever.
#
#         Args:
#             texts: Iterable of strings to add to the retriever.
#             refresh_indices: bool to refresh ElasticSearch indices
#
#         Returns:
#             List of ids from adding the texts into the retriever.
#         """
#         try:
#             from elasticsearch.helpers import bulk
#         except ImportError:
#             raise ImportError(
#                 "Could not import elasticsearch python package. "
#                 "Please install it with `pip install elasticsearch`."
#             )
#         requests = []
#         ids = []
#         for i, text in enumerate(texts):
#             _id = str(uuid.uuid4())
#             request = {
#                 "_op_type": "index",
#                 "_index": self.index_name,
#                 "content": text,
#                 "_id": _id,
#             }
#             ids.append(_id)
#             requests.append(request)
#         bulk(self.client, requests)
#
#         if refresh_indices:
#             self.client.indices.refresh(index=self.index_name)
#         return ids
#
#     def _get_relevant_documents(
#         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
#     ) -> List[Document]:
#         query_dict = {"query": {"match": {"content": query}}}
#         res = self.client.search(index=self.index_name, body=query_dict)
#
#         docs = []
#         for r in res["hits"]["hits"]:
#             docs.append(Document(page_content=r["_source"]["content"]))
#         return docs
############################################################################################################


############################################################################################################
# Multi Query Retriever
# class MultiQueryRetriever(BaseRetriever):
#     """Given a query, use an LLM to write a set of queries.
#
#     Retrieve docs for each query. Return the unique union of all retrieved docs.
#     """
#
#     retriever: BaseRetriever
#     llm_chain: Runnable
#     verbose: bool = True
#     parser_key: str = "lines"
#     """DEPRECATED. parser_key is no longer used and should not be specified."""
#     include_original: bool = False
#     """Whether to include the original query in the list of generated queries."""
#
#     @classmethod
#     def from_llm(
#         cls,
#         retriever: BaseRetriever,
#         llm: BaseLanguageModel,
#         prompt: BasePromptTemplate = DEFAULT_QUERY_PROMPT,
#         parser_key: Optional[str] = None,
#         include_original: bool = False,
#     ) -> "MultiQueryRetriever":
#         """Initialize from llm using default template.
#
#         Args:
#             retriever: retriever to query documents from
#             llm: llm for query generation using DEFAULT_QUERY_PROMPT
#             prompt: The prompt which aims to generate several different versions
#                 of the given user query
#             include_original: Whether to include the original query in the list of
#                 generated queries.
#
#         Returns:
#             MultiQueryRetriever
#         """
#         output_parser = LineListOutputParser()
#         llm_chain = prompt | llm | output_parser
#         return cls(
#             retriever=retriever,
#             llm_chain=llm_chain,
#             include_original=include_original,
#         )
#
#     async def _aget_relevant_documents(
#         self,
#         query: str,
#         *,
#         run_manager: AsyncCallbackManagerForRetrieverRun,
#     ) -> List[Document]:
#         """Get relevant documents given a user query.
#
#         Args:
#             query: user query
#
#         Returns:
#             Unique union of relevant documents from all generated queries
#         """
#         queries = await self.agenerate_queries(query, run_manager)
#         if self.include_original:
#             queries.append(query)
#         documents = await self.aretrieve_documents(queries, run_manager)
#         return self.unique_union(documents)
#
#     async def agenerate_queries(
#         self, question: str, run_manager: AsyncCallbackManagerForRetrieverRun
#     ) -> List[str]:
#         """Generate queries based upon user input.
#
#         Args:
#             question: user query
#
#         Returns:
#             List of LLM generated queries that are similar to the user input
#         """
#         response = await self.llm_chain.ainvoke(
#             {"question": question}, config={"callbacks": run_manager.get_child()}
#         )
#         if isinstance(self.llm_chain, LLMChain):
#             lines = response["text"]
#         else:
#             lines = response
#         if self.verbose:
#             logger.info(f"Generated queries: {lines}")
#         return lines
#
#     async def aretrieve_documents(
#         self, queries: List[str], run_manager: AsyncCallbackManagerForRetrieverRun
#     ) -> List[Document]:
#         """Run all LLM generated queries.
#
#         Args:
#             queries: query list
#
#         Returns:
#             List of retrieved Documents
#         """
#         document_lists = await asyncio.gather(
#             *(
#                 self.retriever.ainvoke(
#                     query, config={"callbacks": run_manager.get_child()}
#                 )
#                 for query in queries
#             )
#         )
#         return [doc for docs in document_lists for doc in docs]
#
#     def _get_relevant_documents(
#         self,
#         query: str,
#         *,
#         run_manager: CallbackManagerForRetrieverRun,
#     ) -> List[Document]:
#         """Get relevant documents given a user query.
#
#         Args:
#             query: user query
#
#         Returns:
#             Unique union of relevant documents from all generated queries
#         """
#         queries = self.generate_queries(query, run_manager)
#         if self.include_original:
#             queries.append(query)
#         documents = self.retrieve_documents(queries, run_manager)
#         return self.unique_union(documents)
#
#     def generate_queries(
#         self, question: str, run_manager: CallbackManagerForRetrieverRun
#     ) -> List[str]:
#         """Generate queries based upon user input.
#
#         Args:
#             question: user query
#
#         Returns:
#             List of LLM generated queries that are similar to the user input
#         """
#         response = self.llm_chain.invoke(
#             {"question": question}, config={"callbacks": run_manager.get_child()}
#         )
#         if isinstance(self.llm_chain, LLMChain):
#             lines = response["text"]
#         else:
#             lines = response
#         if self.verbose:
#             logger.info(f"Generated queries: {lines}")
#         return lines
#
#     def retrieve_documents(
#         self, queries: List[str], run_manager: CallbackManagerForRetrieverRun
#     ) -> List[Document]:
#         """Run all LLM generated queries.
#
#         Args:
#             queries: query list
#
#         Returns:
#             List of retrieved Documents
#         """
#         documents = []
#         for query in queries:
#             docs = self.retriever.invoke(
#                 query, config={"callbacks": run_manager.get_child()}
#             )
#             documents.extend(docs)
#         return documents
#
#     def unique_union(self, documents: List[Document]) -> List[Document]:
#         """Get unique Documents.
#
#         Args:
#             documents: List of retrieved Documents
#
#         Returns:
#             List of unique retrieved Documents
#         """
#         return _unique_documents(documents)
############################################################################################################
```
App_Function_Libraries/RAG/RAG_Libary_2.py ADDED
@@ -0,0 +1,172 @@
# RAG_Library_2.py
# Description: This script contains the main RAG pipeline function and related functions for the RAG pipeline.
#
# Import necessary modules and functions
import configparser
from typing import Dict, Any
# Local Imports
from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content, vector_search, chroma_client
from App_Function_Libraries.Article_Extractor_Lib import scrape_article
from App_Function_Libraries.DB.DB_Manager import add_media_to_database, search_db, get_unprocessed_media
# 3rd-Party Imports
import openai
#
########################################################################################################################
#
# Functions:

# Initialize OpenAI client (adjust this based on your API key management)
openai.api_key = "your-openai-api-key"

config = configparser.ConfigParser()
config.read('config.txt')

# Main RAG pipeline function
def rag_pipeline(url: str, query: str, api_choice=None) -> Dict[str, Any]:
    # Extract content
    article_data = scrape_article(url)
    content = article_data['content']
    title = article_data['title']

    # Store the article in the database and get the media_id
    media_id = add_media_to_database(url, title, 'article', content)

    # Process and store content
    collection_name = f"article_{media_id}"
    process_and_store_content(content, collection_name, media_id)

    # Perform searches
    vector_results = vector_search(collection_name, query, k=5)
    fts_results = search_db(query, ["content"], "", page=1, results_per_page=5)

    # Combine results
    all_results = vector_results + [result['content'] for result in fts_results]
    context = "\n".join(all_results)

    # Generate answer using the selected API
    answer = generate_answer(api_choice, context, query)

    return {
        "answer": answer,
        "context": context
    }


def generate_answer(api_choice: str, context: str, query: str) -> str:
    prompt = f"Context: {context}\n\nQuestion: {query}"
    if api_choice == "OpenAI":
        from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai
        return summarize_with_openai(config['API']['openai_api_key'], prompt, "")
    elif api_choice == "Anthropic":
        from App_Function_Libraries.Summarization_General_Lib import summarize_with_anthropic
        return summarize_with_anthropic(config['API']['anthropic_api_key'], prompt, "")
    elif api_choice == "Cohere":
        from App_Function_Libraries.Summarization_General_Lib import summarize_with_cohere
        return summarize_with_cohere(config['API']['cohere_api_key'], prompt, "")
    elif api_choice == "Groq":
        from App_Function_Libraries.Summarization_General_Lib import summarize_with_groq
        return summarize_with_groq(config['API']['groq_api_key'], prompt, "")
    elif api_choice == "OpenRouter":
        from App_Function_Libraries.Summarization_General_Lib import summarize_with_openrouter
        return summarize_with_openrouter(config['API']['openrouter_api_key'], prompt, "")
    elif api_choice == "HuggingFace":
        from App_Function_Libraries.Summarization_General_Lib import summarize_with_huggingface
        return summarize_with_huggingface(config['API']['huggingface_api_key'], prompt, "")
    elif api_choice == "DeepSeek":
        from App_Function_Libraries.Summarization_General_Lib import summarize_with_deepseek
        return summarize_with_deepseek(config['API']['deepseek_api_key'], prompt, "")
    elif api_choice == "Mistral":
        from App_Function_Libraries.Summarization_General_Lib import summarize_with_mistral
        return summarize_with_mistral(config['API']['mistral_api_key'], prompt, "")
    elif api_choice == "Local-LLM":
        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_local_llm
        return summarize_with_local_llm(config['API']['local_llm_path'], prompt, "")
    elif api_choice == "Llama.cpp":
        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama
        return summarize_with_llama(config['API']['llama_api_key'], prompt, "")
    elif api_choice == "Kobold":
        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_kobold
        return summarize_with_kobold(config['API']['kobold_api_key'], prompt, "")
    elif api_choice == "Ooba":
        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_oobabooga
        return summarize_with_oobabooga(config['API']['ooba_api_key'], prompt, "")
    elif api_choice == "TabbyAPI":
        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_tabbyapi
        return summarize_with_tabbyapi(config['API']['tabby_api_key'], prompt, "")
    elif api_choice == "vLLM":
        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_vllm
        return summarize_with_vllm(config['API']['vllm_api_key'], prompt, "")
    elif api_choice == "ollama":
        from App_Function_Libraries.Local_Summarization_Lib import summarize_with_ollama
        return summarize_with_ollama(config['API']['ollama_api_key'], prompt, "")
    else:
        raise ValueError(f"Unsupported API choice: {api_choice}")

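# generate_answer() above expects the provider keys to live in a config.txt [API] section.
# A sketch of that section (key names taken from the dispatch above; only the providers you
# actually select need to be filled in):
#
# [API]
# openai_api_key = ...
# anthropic_api_key = ...
# cohere_api_key = ...
# groq_api_key = ...
# openrouter_api_key = ...
# huggingface_api_key = ...
# deepseek_api_key = ...
# mistral_api_key = ...
# local_llm_path = /path/to/local/model
# llama_api_key = ...
# kobold_api_key = ...
# ooba_api_key = ...
# tabby_api_key = ...
# vllm_api_key = ...
# ollama_api_key = ...
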
# Function to preprocess and store all existing content in the database
def preprocess_all_content():
    unprocessed_media = get_unprocessed_media()
    for row in unprocessed_media:
        media_id = row[0]
        content = row[1]
        media_type = row[2]
        collection_name = f"{media_type}_{media_id}"
        process_and_store_content(content, collection_name, media_id)


# Function to perform RAG search across all stored content
def rag_search(query: str, api_choice: str) -> Dict[str, Any]:
    # Perform vector search across all collections
    all_collections = chroma_client.list_collections()
    vector_results = []
    for collection in all_collections:
        vector_results.extend(vector_search(collection.name, query, k=2))

    # Perform FTS search
    fts_results = search_db(query, ["content"], "", page=1, results_per_page=10)

    # Combine results
    all_results = vector_results + [result['content'] for result in fts_results]
    context = "\n".join(all_results[:10])  # Limit to top 10 results

    # Generate answer using the selected API
    answer = generate_answer(api_choice, context, query)

    return {
        "answer": answer,
        "context": context
    }


# Example usage:
# 1. Initialize the system:
#    create_tables(db)  # Ensure FTS tables are set up
#
# 2. Create ChromaDB
#    chroma_client = ChromaDBClient()
#
# 3. Create Embeddings
#    Store embeddings in ChromaDB
#    preprocess_all_content() or create_embeddings()
#
# 4. Perform RAG search across all content:
#    result = rag_search("What are the key points about climate change?")
#    print(result['answer'])
#
# (Extra) 5. Perform RAG on a specific URL:
#    result = rag_pipeline("https://example.com/article", "What is the main topic of this article?")
#    print(result['answer'])
#
########################################################################################################################


############################################################################################################
#
# ElasticSearch Retriever

# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-elasticsearch
#
# https://github.com/langchain-ai/langchain/tree/44e3e2391c48bfd0a8e6a20adde0b6567f4f43c3/templates/rag-self-query

#
# End of RAG_Library_2.py
############################################################################################################
App_Function_Libraries/RAG/RAG_Library.py ADDED
@@ -0,0 +1,396 @@
import numpy as np
from typing import List, Tuple, Dict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import math
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
import openai
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import re
import psycopg2
from psycopg2.extras import execute_values
import sqlite3
import logging


########################################################################################################################
#
# RAG Chunking
# To fully integrate this chunking system, you'd need to:
#
# Create the UnvectorizedMediaChunks table in your SQLite database (a sketch of the table follows below).
# Modify your document ingestion process to use chunk_and_store_unvectorized.
# Implement a background process that periodically calls vectorize_all_documents to process unvectorized chunks.

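# A sketch of the UnvectorizedMediaChunks table mentioned above. The column list is inferred from
# the INSERT/SELECT/UPDATE statements below and is not part of this commit; adjust types and
# defaults to taste:
#
# def create_unvectorized_chunks_table(db_connection):
#     db_connection.execute("""
#         CREATE TABLE IF NOT EXISTS UnvectorizedMediaChunks (
#             id INTEGER PRIMARY KEY AUTOINCREMENT,
#             media_id INTEGER NOT NULL,
#             chunk_text TEXT NOT NULL,
#             chunk_index INTEGER NOT NULL,
#             start_char INTEGER,
#             end_char INTEGER,
#             chunk_type TEXT,
#             metadata TEXT,
#             is_processed BOOLEAN DEFAULT FALSE,
#             last_modified TIMESTAMP
#         )
#     """)
#     db_connection.commit()
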
# This chunking is pretty weak and needs improvement
# See notes for improvements #FIXME
import json
from typing import List, Dict, Any, Optional
from datetime import datetime


def chunk_and_store_unvectorized(
        db_connection,
        media_id: int,
        text: str,
        chunk_size: int = 1000,
        overlap: int = 100,
        chunk_type: str = 'fixed-length'
) -> List[int]:
    chunks = create_chunks(text, chunk_size, overlap)
    return store_unvectorized_chunks(db_connection, media_id, chunks, chunk_type)


def create_chunks(text: str, chunk_size: int, overlap: int) -> List[Dict[str, Any]]:
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk_text = ' '.join(words[i:i + chunk_size])
        # Note: index() returns the first occurrence, so start_char can be wrong when the
        # leading word repeats earlier in the text (see the FIXME above).
        start_char = text.index(words[i])
        end_char = start_char + len(chunk_text)
        chunks.append({
            'text': chunk_text,
            'start_char': start_char,
            'end_char': end_char,
            'index': len(chunks)
        })
    return chunks


def store_unvectorized_chunks(
        db_connection,
        media_id: int,
        chunks: List[Dict[str, Any]],
        chunk_type: str
) -> List[int]:
    cursor = db_connection.cursor()
    chunk_ids = []
    for chunk in chunks:
        cursor.execute("""
            INSERT INTO UnvectorizedMediaChunks
            (media_id, chunk_text, chunk_index, start_char, end_char, chunk_type, metadata)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, (
            media_id,
            chunk['text'],
            chunk['index'],
            chunk['start_char'],
            chunk['end_char'],
            chunk_type,
            json.dumps({'length': len(chunk['text'])})  # Example metadata
        ))
        chunk_ids.append(cursor.lastrowid)
    db_connection.commit()
    return chunk_ids


def get_unvectorized_chunks(
        db_connection,
        media_id: Optional[int] = None,
        limit: int = 100,
        offset: int = 0
) -> List[Dict[str, Any]]:
    # media_id is optional so callers such as vectorize_all_documents() can fetch unprocessed
    # chunks across every media item in one pass.
    cursor = db_connection.cursor()
    if media_id is None:
        cursor.execute("""
            SELECT id, chunk_text, chunk_index, start_char, end_char, chunk_type, metadata
            FROM UnvectorizedMediaChunks
            WHERE is_processed = FALSE
            ORDER BY media_id, chunk_index
            LIMIT ? OFFSET ?
        """, (limit, offset))
    else:
        cursor.execute("""
            SELECT id, chunk_text, chunk_index, start_char, end_char, chunk_type, metadata
            FROM UnvectorizedMediaChunks
            WHERE media_id = ? AND is_processed = FALSE
            ORDER BY chunk_index
            LIMIT ? OFFSET ?
        """, (media_id, limit, offset))
    return [
        {
            'id': row[0],
            'text': row[1],
            'index': row[2],
            'start_char': row[3],
            'end_char': row[4],
            'type': row[5],
            'metadata': json.loads(row[6])
        }
        for row in cursor.fetchall()
    ]


def mark_chunks_as_processed(db_connection, chunk_ids: List[int]):
    cursor = db_connection.cursor()
    cursor.executemany("""
        UPDATE UnvectorizedMediaChunks
        SET is_processed = TRUE, last_modified = ?
        WHERE id = ?
    """, [(datetime.now(), chunk_id) for chunk_id in chunk_ids])
    db_connection.commit()


# Usage example
def process_media_chunks(db_connection, media_id: int, text: str):
    chunk_ids = chunk_and_store_unvectorized(db_connection, media_id, text)
    print(f"Stored {len(chunk_ids)} unvectorized chunks for media_id {media_id}")

    # Later, when you want to process these chunks:
    unprocessed_chunks = get_unvectorized_chunks(db_connection, media_id)
    # Process chunks (e.g., vectorize them)
    # ...
    # After processing, mark them as processed
    mark_chunks_as_processed(db_connection, [chunk['id'] for chunk in unprocessed_chunks])
###########################################################################################################################################################################################################
#
# RAG System

# To use this updated RAG system in your existing application:
#
# Install required packages:
#     pip install sentence-transformers psycopg2-binary scikit-learn transformers torch
# Set up PostgreSQL with pgvector:
#
#     Install PostgreSQL and the pgvector extension (a setup sketch follows below).
#     Create a new database for vector storage.
#
# Update your main application to use the RAG system:
#
#     Import the RAGSystem class from this new file.
#     Initialize the RAG system with your SQLite and PostgreSQL configurations.
#     Use the vectorize_all_documents method to initially vectorize your existing documents.
#
#
# Modify your existing PDF_Ingestion_Lib.py and Book_Ingestion_Lib.py:
#
#     After successfully ingesting a document into SQLite, call the vectorization method from the RAG system.

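# A hypothetical one-time setup snippet for the pgvector step above (database name and
# credentials are placeholders matching the pg_config example at the bottom of this file):
#
# import psycopg2
#
# with psycopg2.connect(dbname="your_db_name", user="your_username",
#                       password="your_password", host="localhost") as conn:
#     with conn.cursor() as cur:
#         cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
#     conn.commit()
# # RAGSystem._init_postgres() below then creates the document_vectors table itself.
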
# Example modification for ingest_text_file in Book_Ingestion_Lib.py:
# from RAG_Library import RAGSystem
#
# # Initialize RAG system (do this once in your main application)
# rag_system = RAGSystem(sqlite_path, pg_config)
#
# def ingest_text_file(file_path, title=None, author=None, keywords=None):
#     try:
#         # ... (existing code)
#
#         # Add the text file to the database
#         doc_id = add_media_with_keywords(
#             url=file_path,
#             title=title,
#             media_type='document',
#             content=content,
#             keywords=keywords,
#             prompt='No prompt for text files',
#             summary='No summary for text files',
#             transcription_model='None',
#             author=author,
#             ingestion_date=datetime.now().strftime('%Y-%m-%d')
#         )
#
#         # Vectorize the newly added document
#         rag_system.vectorize_document(doc_id, content)
#
#         return f"Text file '{title}' by {author} ingested and vectorized successfully."
#     except Exception as e:
#         logging.error(f"Error ingesting text file: {str(e)}")
#         return f"Error ingesting text file: {str(e)}"


# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
VECTOR_DIM = 384  # Dimension of the chosen embedding model


class RAGSystem:
    def __init__(self, sqlite_path: str, pg_config: Dict[str, str], cache_size: int = 100):
        self.sqlite_path = sqlite_path
        self.pg_config = pg_config
        self.model = SentenceTransformer(EMBEDDING_MODEL)
        self.cache_size = cache_size

        self._init_postgres()

    def _init_postgres(self):
        with psycopg2.connect(**self.pg_config) as conn:
            with conn.cursor() as cur:
                # The table needs chunk_index and metadata columns because vectorize_document
                # stores one row per chunk and upserts on (document_id, chunk_index).
                cur.execute("""
                    CREATE TABLE IF NOT EXISTS document_vectors (
                        id SERIAL PRIMARY KEY,
                        document_id INTEGER,
                        chunk_index INTEGER,
                        vector vector(384),
                        metadata JSONB,
                        UNIQUE (document_id, chunk_index)
                    )
                """)
            conn.commit()

    @lru_cache(maxsize=100)
    def _get_embedding(self, text: str) -> np.ndarray:
        return self.model.encode([text])[0]

    def vectorize_document(self, doc_id: int, content: str):
        chunks = create_chunks(content, chunk_size=1000, overlap=100)
        for chunk in chunks:
            vector = self._get_embedding(chunk['text'])

            with psycopg2.connect(**self.pg_config) as conn:
                with conn.cursor() as cur:
                    cur.execute("""
                        INSERT INTO document_vectors (document_id, chunk_index, vector, metadata)
                        VALUES (%s, %s, %s, %s)
                        ON CONFLICT (document_id, chunk_index) DO UPDATE SET vector = EXCLUDED.vector
                    """, (doc_id, chunk['index'], vector.tolist(), json.dumps(chunk)))
                conn.commit()

    def vectorize_all_documents(self):
        with sqlite3.connect(self.sqlite_path) as sqlite_conn:
            unprocessed_chunks = get_unvectorized_chunks(sqlite_conn, limit=1000)
            for chunk in unprocessed_chunks:
                self.vectorize_document(chunk['id'], chunk['text'])
            mark_chunks_as_processed(sqlite_conn, [chunk['id'] for chunk in unprocessed_chunks])

    def semantic_search(self, query: str, top_k: int = 5) -> List[Tuple[int, int, float]]:
        query_vector = self._get_embedding(query)

        with psycopg2.connect(**self.pg_config) as conn:
            with conn.cursor() as cur:
                # <-> is pgvector's distance operator; 1 - distance is reported as a similarity score.
                cur.execute("""
                    SELECT document_id, chunk_index, 1 - (vector <-> %s) AS similarity
                    FROM document_vectors
                    ORDER BY vector <-> %s ASC
                    LIMIT %s
                """, (query_vector.tolist(), query_vector.tolist(), top_k))
                results = cur.fetchall()

        return results

    def get_document_content(self, doc_id: int) -> str:
        with sqlite3.connect(self.sqlite_path) as conn:
            cur = conn.cursor()
            cur.execute("SELECT content FROM media WHERE id = ?", (doc_id,))
            result = cur.fetchone()
            return result[0] if result else ""

    def bm25_search(self, query: str, top_k: int = 5) -> List[Tuple[int, float]]:
        with sqlite3.connect(self.sqlite_path) as conn:
            cur = conn.cursor()
            cur.execute("SELECT id, content FROM media")
            documents = cur.fetchall()

        # Approximate BM25 scoring on top of a TF-IDF matrix (see the FIXME notes above).
        vectorizer = TfidfVectorizer(use_idf=True)
        tfidf_matrix = vectorizer.fit_transform([doc[1] for doc in documents])

        query_vector = vectorizer.transform([query])
        doc_lengths = tfidf_matrix.sum(axis=1).A1
        avg_doc_length = np.mean(doc_lengths)

        k1, b = 1.5, 0.75
        scores = []
        for i, doc_vector in enumerate(tfidf_matrix):
            score = np.sum(
                ((k1 + 1) * query_vector.multiply(doc_vector)).A1 /
                (k1 * (1 - b + b * doc_lengths[i] / avg_doc_length) + query_vector.multiply(doc_vector).A1)
            )
            scores.append((documents[i][0], score))

        return sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]

    def combine_search_results(self, bm25_results: List[Tuple[int, float]], vector_results: List[Tuple[int, float]],
                               alpha: float = 0.5) -> List[Tuple[int, float]]:
        # Weight scores by the list they came from: alpha for BM25 scores, (1 - alpha) for vector scores.
        combined_scores = {}
        for idx, score in bm25_results:
            combined_scores[idx] = combined_scores.get(idx, 0.0) + score * alpha
        for idx, score in vector_results:
            combined_scores[idx] = combined_scores.get(idx, 0.0) + score * (1 - alpha)
        return sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

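    # Worked example for the weighting above (hypothetical scores, alpha = 0.5):
    #   bm25_results   = [(1, 0.8), (2, 0.4)]
    #   vector_results = [(1, 0.9), (3, 0.7)]
    # doc 1 -> 0.8 * 0.5 + 0.9 * 0.5 = 0.85, doc 2 -> 0.4 * 0.5 = 0.2, doc 3 -> 0.7 * 0.5 = 0.35,
    # so the combined ranking is [(1, 0.85), (3, 0.35), (2, 0.2)].
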
    def expand_query(self, query: str) -> str:
        model = T5ForConditionalGeneration.from_pretrained("t5-small")
        tokenizer = T5Tokenizer.from_pretrained("t5-small")

        input_text = f"expand query: {query}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt")

        outputs = model.generate(input_ids, max_length=50, num_return_sequences=1)
        expanded_query = tokenizer.decode(outputs[0], skip_special_tokens=True)

        return f"{query} {expanded_query}"

    def cross_encoder_rerank(self, query: str, initial_results: List[Tuple[int, float]],
                             top_k: int = 5) -> List[Tuple[int, float]]:
        from sentence_transformers import CrossEncoder
        model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        candidate_docs = [self.get_document_content(doc_id) for doc_id, _ in initial_results[:top_k * 2]]
        pairs = [[query, doc] for doc in candidate_docs]
        scores = model.predict(pairs)

        reranked = sorted(zip(initial_results[:top_k * 2], scores), key=lambda x: x[1], reverse=True)
        return [(idx, score) for (idx, _), score in reranked[:top_k]]

    def rag_query(self, query: str, search_type: str = 'combined', top_k: int = 5, use_hyde: bool = False,
                  rerank: bool = False, expand: bool = False) -> List[Dict[str, any]]:
        try:
            if expand:
                query = self.expand_query(query)

            if use_hyde:
                # HyDE (hypothetical document embeddings) is not implemented yet.
                raise NotImplementedError("HyDE is not implemented yet.")

            if search_type == 'vector':
                # semantic_search returns (document_id, chunk_index, similarity); keep (document_id, similarity).
                results = [(doc_id, sim) for doc_id, _chunk_index, sim in self.semantic_search(query, top_k)]
            elif search_type == 'bm25':
                results = self.bm25_search(query, top_k)
            elif search_type == 'combined':
                bm25_results = self.bm25_search(query, top_k)
                vector_results = [(doc_id, sim) for doc_id, _chunk_index, sim in self.semantic_search(query, top_k)]
                results = self.combine_search_results(bm25_results, vector_results)
            else:
                raise ValueError("Invalid search type. Choose 'vector', 'bm25', or 'combined'.")

            if rerank:
                results = self.cross_encoder_rerank(query, results, top_k)

            enriched_results = []
            for doc_id, score in results:
                content = self.get_document_content(doc_id)
                enriched_results.append({
                    "document_id": doc_id,
                    "score": score,
                    "content": content[:500]  # Truncate content for brevity
                })

            return enriched_results
        except Exception as e:
            logger.error(f"An error occurred during RAG query: {str(e)}")
            return []


# Example usage
if __name__ == "__main__":
    sqlite_path = "path/to/your/sqlite/database.db"
    pg_config = {
        "dbname": "your_db_name",
        "user": "your_username",
        "password": "your_password",
        "host": "localhost"
    }

    rag_system = RAGSystem(sqlite_path, pg_config)

    # Vectorize all documents (run this once or periodically)
    rag_system.vectorize_all_documents()

    # Example query
    query = "programming concepts for beginners"
    results = rag_system.rag_query(query, search_type='combined', expand=True, rerank=True)

    print(f"Search results for query: '{query}'\n")
    for i, result in enumerate(results, 1):
        print(f"Result {i}:")
        print(f"Document ID: {result['document_id']}")
        print(f"Score: {result['score']:.4f}")
        print(f"Content snippet: {result['content']}")
        print("---")
App_Function_Libraries/RAG/RAPTOR-Skeleton.py ADDED
@@ -0,0 +1,361 @@
1
+ # Requirements
2
+ # scikit-learn umap-learn nltk (joblib is also needed for the parallel-processing variant below)
3
+ from itertools import chain
4
+ from typing import List, Dict
5
+
6
+ from App_Function_Libraries.RAG.ChromaDB_Library import store_in_chroma, create_embedding, vector_search, chroma_client
7
+ from App_Function_Libraries.Chunk_Lib import improved_chunking_process, recursive_summarize_chunks
8
+ import logging
9
+ from sklearn.mixture import GaussianMixture
10
+ import umap
11
+ from nltk.corpus import wordnet
12
+
13
+
14
+ # Logging setup
15
+ logging.basicConfig(filename='raptor.log', level=logging.DEBUG)
16
+
17
+ # FIXME
18
+ MAX_LEVELS = 3
19
+
20
+
21
+ def log_and_summarize(text, prompt):
22
+ logging.debug(f"Summarizing text: {text[:100]} with prompt: {prompt}")
23
+ return dummy_summarize(text, prompt)
24
+
25
+ # 1. Data Preparation
26
+ def prepare_data(content: str, media_id: int, chunk_options: dict):
27
+ chunks = improved_chunking_process(content, chunk_options)
28
+ embeddings = [create_embedding(chunk['text']) for chunk in chunks]
29
+ return chunks, embeddings
30
+
31
+ # 2. Recursive Summarization
32
+ def recursive_summarization(chunks, summarize_func, custom_prompt):
33
+ summarized_chunks = recursive_summarize_chunks(
34
+ [chunk['text'] for chunk in chunks],
35
+ summarize_func=summarize_func,
36
+ custom_prompt=custom_prompt
37
+ )
38
+ return summarized_chunks
39
+
40
+ # Initial generation (first draft of the tree builder, kept commented out below for reference)
41
+ # 3. Tree Organization
42
+ #def build_tree_structure(chunks, embeddings, collection_name, level=0):
43
+ # if len(chunks) <= 1:
44
+ # return chunks # Base case: if chunks are small enough, return as is
45
+
46
+ # Recursive case: cluster and summarize
47
+ # summarized_chunks = recursive_summarization(chunks, summarize_func=dummy_summarize, custom_prompt="Summarize:")
48
+ # new_chunks, new_embeddings = prepare_data(' '.join(summarized_chunks), media_id, chunk_options)
49
+
50
+ # Store in ChromaDB
51
+ # ids = [f"{media_id}_L{level}_chunk_{i}" for i in range(len(new_chunks))]
52
+ # store_in_chroma(collection_name, [chunk['text'] for chunk in new_chunks], new_embeddings, ids)
53
+
54
+ # Recursively build tree
55
+ # return build_tree_structure(new_chunks, new_embeddings, collection_name, level+1)
56
+
57
+ # Second iteration
58
+ def build_tree_structure(chunks, collection_name, level=0):
59
+ # Dynamic clustering
60
+ clustered_texts = dynamic_clustering([chunk['text'] for chunk in chunks])
61
+
62
+ # Summarize each cluster
63
+ summarized_clusters = {}
64
+ for cluster_id, cluster_texts in clustered_texts.items():
65
+ summary = dummy_summarize(' '.join(cluster_texts), custom_prompt="Summarize:")
66
+ summarized_clusters[cluster_id] = summary
67
+
68
+ # Store summaries at current level
69
+ ids = []
70
+ embeddings = []
71
+ summaries = []
72
+ for cluster_id, summary in summarized_clusters.items():
73
+ ids.append(f"{collection_name}_L{level}_C{cluster_id}")
74
+ embeddings.append(create_embedding(summary))
75
+ summaries.append(summary)
76
+
77
+ store_in_chroma(collection_name, summaries, embeddings, ids)
78
+
79
+ # Recursively build tree structure if necessary
80
+ if level < MAX_LEVELS:
81
+ for cluster_id, cluster_texts in clustered_texts.items():
82
+ build_tree_structure([{'text': text} for text in cluster_texts], collection_name, level + 1)  # re-wrap plain strings as chunk dicts
83
+
84
+
85
+
86
+
87
+ # Dummy summarize function (replace with actual summarization)
88
+ def dummy_summarize(text, custom_prompt, temp=None, system_prompt=None):
89
+ return text # Replace this with actual call to summarization model (like GPT-3.5-turbo)
90
+
91
+ # 4. Retrieval
92
+ def raptor_retrieve(query, collection_name, level=0):
93
+ results = vector_search(collection_name, query, k=5)
94
+ return results
95
+
96
+ # Main function integrating RAPTOR
97
+ def raptor_pipeline(media_id, content, chunk_options):
98
+ collection_name = f"media_{media_id}_raptor"
99
+
100
+ # Step 1: Prepare Data
101
+ chunks, embeddings = prepare_data(content, media_id, chunk_options)
102
+
103
+ # Step 2: Build Tree
104
+ build_tree_structure(chunks, collection_name)  # second-iteration builder takes (chunks, collection_name); the Step 1 embeddings are unused here
105
+
106
+ # Step 3: Retrieve Information
107
+ query = "Your query here"
108
+ result = raptor_retrieve(query, collection_name)
109
+ print(result)
110
+
111
+ # Example usage
112
+ content = "Your long document content here"
113
+ chunk_options = {
114
+ 'method': 'sentences',
115
+ 'max_size': 300,
116
+ 'overlap': 50
117
+ }
118
+ media_id = 1
119
+ raptor_pipeline(media_id, content, chunk_options)
120
+
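Note that content, chunk_options, media_id, and the raptor_pipeline(...) call above run at import time because they sit at module level (as does the collapsed-tree test further down). If this skeleton is imported elsewhere, the demo could be guarded, for example:

if __name__ == "__main__":
    raptor_pipeline(media_id, content, chunk_options)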
121
+
122
+ #
123
+ #
124
+ ###################################################################################################################
125
+ #
126
+ # Additions:
127
+
128
+
129
+ def dynamic_clustering(texts, n_components=2):
130
+ # Step 1: Convert text to embeddings
131
+ embeddings = [create_embedding(text) for text in texts]
132
+
133
+ # Step 2: Dimensionality reduction (UMAP)
134
+ reducer = umap.UMAP(n_components=n_components)
135
+ reduced_embeddings = reducer.fit_transform(embeddings)
136
+
137
+ # Step 3: Find optimal number of clusters using BIC
138
+ best_gmm = None
139
+ best_bic = float('inf')
140
+ n_clusters = range(2, 10)
141
+ for n in n_clusters:
142
+ gmm = GaussianMixture(n_components=n, covariance_type='full')
143
+ gmm.fit(reduced_embeddings)
144
+ bic = gmm.bic(reduced_embeddings)
145
+ if bic < best_bic:
146
+ best_bic = bic
147
+ best_gmm = gmm
148
+
149
+ # Step 4: Cluster the reduced embeddings
150
+ cluster_labels = best_gmm.predict(reduced_embeddings)
151
+ clustered_texts = {i: [] for i in range(best_gmm.n_components)}
152
+ for label, text in zip(cluster_labels, texts):
153
+ clustered_texts[label].append(text)
154
+
155
+ return clustered_texts
156
+
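dynamic_clustering always tries 2-9 mixture components with UMAP's default n_neighbors of 15; both assume a reasonably sized corpus (scikit-learn requires n_samples >= n_components, and UMAP expects the neighbourhood size to stay below the sample count). A hedged guard for small inputs might look like:

# Sketch: cap the cluster search and the UMAP neighbourhood by the corpus size
max_clusters = min(9, len(texts))                      # GaussianMixture needs n_samples >= n_components
n_clusters = range(2, max_clusters + 1)                # empty when fewer than two texts; skip clustering then
reducer = umap.UMAP(n_components=n_components,
                    n_neighbors=min(15, max(2, len(texts) - 1)))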
157
+
158
+ def tree_traversal_retrieve(query, collection_name, max_depth=3):
159
+ logging.info(f"Starting tree traversal for query: {query}")
160
+ results = []
161
+ current_level = 0
162
+ current_nodes = [collection_name + '_L0']
163
+
164
+ while current_level <= max_depth and current_nodes:
165
+ next_level_nodes = []
166
+ for node_id in current_nodes:
167
+ documents = vector_search(node_id, query, k=5)
168
+ results.extend(documents)
169
+ next_level_nodes.extend([doc['id'] for doc in documents]) # Assuming your doc structure includes an 'id' field
170
+ current_nodes = next_level_nodes
171
+ current_level += 1
172
+
173
+ logging.info(f"Tree traversal completed with {len(results)} results")
174
+ return results
175
+
176
+
177
+ def collapsed_tree_retrieve(query, collection_name):
178
+ all_layers = [f"{collection_name}_L{level}" for level in range(MAX_LEVELS)]
179
+ all_results = []
180
+
181
+ for layer in all_layers:
182
+ all_results.extend(vector_search(layer, query, k=5))
183
+
184
+ # Sort and rank results by relevance
185
+ sorted_results = sorted(all_results, key=lambda x: x['relevance'], reverse=True) # Assuming 'relevance' is a key
186
+ return sorted_results[:5] # Return top 5 results
187
+
188
+ # Test collapsed tree retrieval
189
+ query = "Your broad query here"
190
+ results = collapsed_tree_retrieve(query, collection_name=f"media_{media_id}_raptor")
191
+ print(results)
192
+
193
+
194
+ # Parallel processing
195
+ # pip install joblib
196
+ from joblib import Parallel, delayed
197
+
198
+ def parallel_process_chunks(chunks):
199
+ return Parallel(n_jobs=-1)(delayed(create_embedding)(chunk['text']) for chunk in chunks)
200
+
201
+ def build_tree_structure(chunks, collection_name, level=0):  # parallel variant; shadows the earlier definition of the same name
202
+ clustered_texts = dynamic_clustering([chunk['text'] for chunk in chunks])
203
+
204
+ summarized_clusters = {}
205
+ for cluster_id, cluster_texts in clustered_texts.items():
206
+ summary = dummy_summarize(' '.join(cluster_texts), custom_prompt="Summarize:")
207
+ summarized_clusters[cluster_id] = summary
208
+
209
+ # Parallel processing of embeddings
210
+ embeddings = parallel_process_chunks([{'text': summary} for summary in summarized_clusters.values()])
211
+
212
+ ids = [f"{collection_name}_L{level}_C{cluster_id}" for cluster_id in summarized_clusters.keys()]
213
+ store_in_chroma(collection_name, list(summarized_clusters.values()), embeddings, ids)
214
+
215
+ if len(summarized_clusters) > 1 and level < MAX_LEVELS:
216
+ build_tree_structure([{'text': summary} for summary in summarized_clusters.values()], collection_name, level + 1)  # wrap summaries as chunk dicts
217
+
218
+ # Asynchronous processing
219
+ import asyncio
220
+
221
+ async def async_create_embedding(text):
222
+ return create_embedding(text)  # wraps the synchronous create_embedding; asyncio.to_thread would make this genuinely non-blocking
223
+
224
+ async def build_tree_structure_async(chunks, collection_name, level=0):
225
+ clustered_texts = dynamic_clustering([chunk['text'] for chunk in chunks])
226
+
227
+ summarized_clusters = {}
228
+ for cluster_id, cluster_texts in clustered_texts.items():
229
+ summary = dummy_summarize(' '.join(cluster_texts), custom_prompt="Summarize:")  # summarize the cluster text, as in the synchronous version (the original awaited an embedding here)
230
+ summarized_clusters[cluster_id] = summary
231
+
232
+ embeddings = await asyncio.gather(*[async_create_embedding(summary) for summary in summarized_clusters.values()])
233
+
234
+ ids = [f"{collection_name}_L{level}_C{cluster_id}" for cluster_id in summarized_clusters.keys()]
235
+ store_in_chroma(collection_name, list(summarized_clusters.values()), embeddings, ids)
236
+
237
+ if len(summarized_clusters) > 1 and level < MAX_LEVELS:
238
+ await build_tree_structure_async(summarized_clusters.values(), collection_name, level + 1)
239
+
240
+
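build_tree_structure_async has to be driven by an event loop; from synchronous code it could be started like this (the chunk list and collection name are illustrative):

import asyncio

# Run the asynchronous tree build to completion from synchronous code
asyncio.run(build_tree_structure_async(chunks, "media_1_raptor"))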
241
+ # User feedback Loop
242
+ def get_user_feedback(results):
243
+ print("Please review the following results:")
244
+ for i, result in enumerate(results):
245
+ print(f"{i + 1}: {result['text'][:100]}...")
246
+
247
+ feedback = input("Enter the numbers of the results that were relevant (comma-separated): ")
248
+ relevant_indices = [int(i.strip()) - 1 for i in feedback.split(",")]
249
+ return relevant_indices
250
+
251
+
252
+ def raptor_pipeline_with_feedback(media_id, content, chunk_options):
253
+ # ... Existing pipeline steps ...
254
+
255
+ query = "Your query here"
256
+ initial_results = tree_traversal_retrieve(query, collection_name=f"media_{media_id}_raptor")
257
+ relevant_indices = get_user_feedback(initial_results)
258
+
259
+ if relevant_indices:
260
+ relevant_results = [initial_results[i] for i in relevant_indices]
261
+ refined_query = " ".join([res['text'] for res in relevant_results])
262
+ try:
263
+ final_results = tree_traversal_retrieve(refined_query, collection_name=f"media_{media_id}_raptor")
264
+ except Exception as e:
265
+ logging.error(f"Error during retrieval: {str(e)}")
266
+ raise
267
+ print("Refined Results:", final_results)
268
+ else:
269
+ print("No relevant results were found in the initial search.")
270
+
271
+
272
+ def identify_uncertain_results(results):
273
+ threshold = 0.5 # Define a confidence threshold
274
+ uncertain_results = [res for res in results if res['confidence'] < threshold]
275
+ return uncertain_results
276
+
277
+
278
+ def raptor_pipeline_with_active_learning(media_id, content, chunk_options):
279
+ # ... Existing pipeline steps ...
280
+
281
+ query = "Your query here"
282
+ initial_results = tree_traversal_retrieve(query, collection_name=f"media_{media_id}_raptor")
283
+ uncertain_results = identify_uncertain_results(initial_results)
284
+
285
+ if uncertain_results:
286
+ print("The following results are uncertain. Please provide feedback:")
287
+ feedback_indices = get_user_feedback(uncertain_results)
288
+ # Use feedback to adjust retrieval or refine the query
289
+ refined_query = " ".join([uncertain_results[i]['text'] for i in feedback_indices])
290
+ final_results = tree_traversal_retrieve(refined_query, collection_name=f"media_{media_id}_raptor")
291
+ print("Refined Results:", final_results)
292
+ else:
293
+ print("No uncertain results were found.")
294
+
295
+
296
+ # Query Expansion
297
+ def expand_query_with_synonyms(query):
298
+ words = query.split()
299
+ expanded_query = []
300
+ for word in words:
301
+ synonyms = wordnet.synsets(word)
302
+ lemmas = set(chain.from_iterable([syn.lemma_names() for syn in synonyms]))
303
+ expanded_query.append(" ".join(lemmas))
304
+ return " ".join(expanded_query)
305
+
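expand_query_with_synonyms depends on the NLTK WordNet corpus, which is not bundled with the nltk package; a one-time download is needed before the first call:

import nltk
nltk.download('wordnet')   # one-time setup; newer NLTK versions may also need 'omw-1.4'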
306
+
307
+ def contextual_query_expansion(query, context):
308
+ # FIXME: Replace with actual contextual model
309
+ expanded_terms = some_contextual_model.get_expansions(query, context)
310
+ return query + " " + " ".join(expanded_terms)
311
+
312
+
313
+ def raptor_pipeline_with_query_expansion(media_id, content, chunk_options):
314
+ # ... Existing pipeline steps ...
315
+
316
+ query = "Your initial query"
317
+ expanded_query = expand_query_with_synonyms(query)
318
+ initial_results = tree_traversal_retrieve(expanded_query, collection_name=f"media_{media_id}_raptor")
319
+ # ... Continue with feedback loop ...
320
+
321
+
322
+ def generate_summary_with_citations(query: str, collection_name: str):
323
+ results = vector_search_with_citation(collection_name, query)
324
+ # FIXME
325
+ summary = summarize([res['text'] for res in results])
326
+ # Deduplicate sources
327
+ sources = list(set(res['source'] for res in results))
328
+ return f"{summary}\n\nCitations:\n" + "\n".join(sources)
329
+
330
+
331
+ def vector_search_with_citation(collection_name: str, query: str, k: int = 10) -> List[Dict[str, str]]:
332
+ query_embedding = create_embedding(query)
333
+ collection = chroma_client.get_collection(name=collection_name)
334
+ results = collection.query(
335
+ query_embeddings=[query_embedding],
336
+ n_results=k
337
+ )
338
+ return [{'text': doc, 'source': meta['source']} for doc, meta in zip(results['documents'][0], results['metadatas'][0])]  # Chroma nests results per query embedding
339
+
340
+
341
+ def generate_summary_with_footnotes(query: str, collection_name: str):
342
+ results = vector_search_with_citation(collection_name, query)
343
+ summary_parts = []
344
+ citations = []
345
+ for i, res in enumerate(results):
346
+ summary_parts.append(f"{res['text']} [{i + 1}]")
347
+ citations.append(f"[{i + 1}] {res['source']}")
348
+ return " ".join(summary_parts) + "\n\nFootnotes:\n" + "\n".join(citations)
349
+
350
+
351
+ def generate_summary_with_hyperlinks(query: str, collection_name: str):
352
+ results = vector_search_with_citation(collection_name, query)
353
+ summary_parts = []
354
+ for res in results:
355
+ summary_parts.append(f'<a href="{res["source"]}">{res["text"][:100]}...</a>')
356
+ return " ".join(summary_parts)
357
+
358
+
359
+ #
360
+ # End of Additions
361
+ #######################################################################################################################
App_Function_Libraries/RAG/__init__.py ADDED
File without changes