|
from src.domain.block import Block |
|
from src.domain.doc import Doc |
|
from src.llm.llm_tools import summarize_paragraph_v2 |
|
import gradio as gr |
|
|
|
class Retriever:
    """Summarize a document's blocks into a collection and search that collection.

    When constructed with a document, each block of ``doc.blocks_requirements``
    is summarized with ``summarize_paragraph_v2`` and the summary is added to
    ``collection`` together with the block's ``to_dict()`` metadata. Extra
    summaries are then added per hierarchy level (see
    ``summarize_by_hierarchy``). When constructed without a document, the
    instance only wraps ``collection`` so that it can be queried.

    Attributes:
        collection: The collection object that stores summaries and metadata
            and that ``similarity_search`` queries.
    """

    # Blocks whose content is longer than this many characters are split with
    # ``separate_1_block_in_n`` before being summarized.
    MAX_BLOCK_CHARS = 4500

    def __init__(self, doc: "Doc" = None, collection=None):
        """Wrap ``collection`` and, if ``doc`` is given, index the document into it.

        Args:
            doc: A document exposing ``blocks_requirements`` and ``title``.
                When ``None`` nothing is summarized or stored.
            collection: The collection that receives the summaries. Required
                whenever ``doc`` is given.

        Raises:
            ValueError: If ``doc`` is given without a ``collection``.
        """
        self.collection = collection
        if doc is None:
            # Nothing to index: the instance is only used for searching.
            return
        if collection is None:
            # Fail before any summary is requested instead of crashing with an
            # AttributeError on ``None`` once the first summary is ready.
            raise ValueError("A collection is required to index a document")

        blocks = doc.blocks_requirements
        gr.Info("Please wait while the database is being created")

        for block in blocks:
            print(f"block index : {block.index}")
            self._index_block(block, doc.title)

        self.summarize_by_hierarchy(blocks, doc.title)
        gr.Info(f"The collection {collection.name} has been added to the database")

    @staticmethod
    def _summarize(content, doc_title, para_title):
        """Return the summary of ``content`` without its leading ``<summary>`` tag."""
        summary = summarize_paragraph_v2(
            prompt=content, title_doc=doc_title, title_para=para_title
        )
        if "<summary>" in summary:
            # Keep only the text that follows the first opening tag.
            summary = summary.split("<summary>")[1]
        return summary

    def _index_block(self, block, doc_title):
        """Summarize one block (split first if it is too long) and store the result."""
        if len(block.content) > self.MAX_BLOCK_CHARS:
            pieces = block.separate_1_block_in_n(max_size=self.MAX_BLOCK_CHARS)
        else:
            pieces = [block]

        for piece in pieces:
            # Pieces of a split block are summarized under the title of the
            # block they were cut from.
            summary = self._summarize(piece.content, doc_title, block.title)
            self.collection.add(
                documents=[summary],
                ids=[piece.index],
                metadatas=[piece.to_dict()],
            )

    def summarize_by_hierarchy(self, blocks, doc_title):
        """Store one extra summary per hierarchy level that groups several blocks.

        A level is summarized only when it contains more than one block and at
        least one of them is a deepest block. The summary is stored under the
        id ``summary_<level>`` with the metadata of a block built from the
        level's first block and the concatenated content of the whole level.

        Args:
            blocks: The blocks to organize and summarize.
            doc_title: The title of the document being processed.
        """
        hierarchy = self.create_hierarchy(blocks)
        deepest_indices = self.find_deepest_blocks(blocks)
        print("Hierarchy levels identified:", hierarchy.keys())
        # ``find_deepest_blocks`` returns index strings, not blocks.
        print("Deepest block indices:", sorted(deepest_indices))

        for level, level_blocks in hierarchy.items():
            if len(level_blocks) > 1 and any(
                block.index in deepest_indices for block in level_blocks
            ):
                level_content = " ".join(block.content for block in level_blocks)
                print(
                    f"Summarizing level {level} with content from blocks: "
                    f"{[block.index for block in level_blocks]}"
                )
                level_summary = self._summarize(
                    level_content, doc_title, f"Summary of section : {level}"
                )

                first_block = level_blocks[0]
                combined_block = Block(
                    doc=first_block.doc,
                    title=first_block.title,
                    content=level_content,
                    index=first_block.index,
                    rank=first_block.rank,
                    level=first_block.level,
                    distance=first_block.distance,
                )

                self.collection.add(
                    documents=[level_summary],
                    ids=[f"summary_{level}"],
                    metadatas=[combined_block.to_dict()],
                )
                print(f"Added summary for level {level} to the collection.")
            else:
                print(f"Skipping level {level} as it is deepest blocks.")

    def create_hierarchy(self, blocks):
        """Group blocks under every hierarchy level their index belongs to.

        Args:
            blocks: The blocks to organize; each must have a dotted string
                ``index``.

        Returns:
            A dictionary mapping each level (for example ``"1"`` and ``"1.2"``
            for the index ``"1.2"``) to the list of blocks at or below it, in
            input order.
        """
        hierarchy = {}
        for block in blocks:
            for level in self.extract_levels(block.index):
                hierarchy.setdefault(level, []).append(block)
        return hierarchy

    def extract_levels(self, index):
        """Return every dotted prefix of ``index``, shortest first.

        Args:
            index: The index string of a block, e.g. ``"1.2.3"``.

        Returns:
            The list of levels, e.g. ``["1", "1.2", "1.2.3"]``.
        """
        parts = index.split('.')
        return ['.'.join(parts[:i]) for i in range(1, len(parts) + 1)]

    def find_deepest_blocks(self, blocks):
        """Return the indices of the blocks that have no descendant block.

        A block is a descendant of another when its index starts with the
        other block's index followed by a dot.

        Args:
            blocks: The blocks to inspect.

        Returns:
            A set of index strings of the deepest blocks.
        """
        block_indices = {block.index for block in blocks}
        # Every proper dotted prefix of an index is an ancestor of that index;
        # an index that appears as an ancestor cannot be a deepest block.
        ancestors = set()
        for index in block_indices:
            parts = index.split('.')
            ancestors.update('.'.join(parts[:i]) for i in range(1, len(parts)))
        return block_indices - ancestors

    def similarity_search(self, queries: "str | list[str]", n_results: int = 5) -> "list[Block]":
        """Query the collection and return the matching blocks.

        Args:
            queries: The query text(s), passed to the collection as
                ``query_texts``. Only the results of the first query are
                returned.
            n_results: The number of results requested from the collection.

        Returns:
            A list of blocks rebuilt from the stored metadata, each with its
            ``distance`` set to the distance reported by the collection.
        """
        res = self.collection.query(query_texts=queries, n_results=n_results)
        # Index 0 selects the result list that belongs to the first query.
        block_dict_sources = res['metadatas'][0]
        distances = res['distances'][0]

        blocks = []
        for block_dict, distance in zip(block_dict_sources, distances):
            block = Block().from_dict(block_dict)
            block.distance = distance
            blocks.append(block)
        return blocks
|
|