anyuanay committed on
Commit
f13eeb8
·
verified ·
1 Parent(s): d0f6105

upload 3 main files

Files changed (3)
  1. app.py +108 -0
  2. knowledge_triples_utils.py +1190 -0
  3. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,108 @@
1
+ import pandas as pd
2
+
3
+ import os, sys
4
+
5
+ import ast
6
+
7
+ import gradio as gr
8
+
9
+ import google.generativeai as genai
10
+
11
+ GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
12
+
13
+ genai.configure(api_key=GOOGLE_API_KEY)
14
+
15
+ gemini_pro = genai.GenerativeModel(model_name="models/gemini-pro")
16
+ gemini_pro_vision = genai.GenerativeModel(model_name="models/gemini-pro-vision")
17
+
18
+
19
+ import knowledge_triples_utils as kutils
20
+
21
+
22
+ all_nodes_csv_path = "AHU_17_All_Nodes_embedding.csv"
23
+
24
+ all_nodes_df = pd.read_csv(all_nodes_csv_path)
25
+
26
+ all_nodes_df['node_embedding'] = all_nodes_df['node_embedding'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
27
+
28
+
29
+ all_images_csv_path = "AHU_17_All_Images_embeddings_hf.csv"
30
+
31
+ all_images_df = pd.read_csv(all_images_csv_path)
32
+
33
+ all_images_df['desc_title_embedding'] = all_images_df['desc_title_embedding'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
34
+
35
+
36
+ # answer query by gemini
37
+ def answer_query(query):
38
+
39
+ # Matching user text query with "node_embedding" to find relevant chunks.
40
+ matching_results_text = kutils.get_similar_text_from_query(
41
+ query,
42
+ all_nodes_df,
43
+ column_name="node_embedding",
44
+ top_n=3,
45
+ print_citation=False,
46
+ )
47
+
48
+ # Matching user text query with "desc_title_embedding" to find relevant images.
49
+ matching_results_images = kutils.get_relevant_images_from_query(
50
+ query,
51
+ all_images_df,
52
+ column_name="desc_title_embedding",
53
+ top_n=3,
54
+ )
55
+
56
+ # combine all the selected relevant text chunks
57
+ context_text = []
58
+ for key, value in matching_results_text.items():
59
+ context_text.append(value["node_text"])
60
+ final_context_text = "\n".join(context_text)
61
+
62
+ # combine all the relevant images and their description generated by Gemini
63
+ context_images = []
64
+ for key, value in matching_results_images.items():
65
+ context_images.extend(
66
+ ["Image: ", value["image_object"], "Caption: ", value["image_description"]]
67
+ )
68
+
69
+ instructions = '''
70
+ You will answer the query based on the text context given in "text_context" and Image context given
71
+ in "image_context" along with its Caption:\n
72
+ Base your response on "text_context" and "image_context". Do not use any numbers or percentages that are
73
+ not present in the "image_context".
74
+ Context:
75
+ '''
76
+
77
+ final_prompt = [
78
+ "QUERY: " + query + " ANSWER: ",
79
+ instructions,
80
+ "text_context:",
81
+ "\n".join(context_text),
82
+ "image_context:",
83
+ ]
84
+ final_prompt.extend(context_images)
85
+
86
+ response = gemini_pro_vision.generate_content(
87
+ final_prompt,
88
+ stream=True,
89
+ )
90
+
91
+ response_list = []
92
+ for chunk in response:
93
+ response_list.append(chunk.text)
94
+ response = "".join(response_list)
95
+
96
+
97
+ return response, matching_results_images[0]["image_object"]
98
+
99
+
100
+ demo = gr.Interface(
101
+ fn=answer_query,
102
+ inputs="textbox",
103
+ outputs=["textbox", "image"]
104
+ )
105
+
106
+ if __name__ == "__main__":
107
+ demo.launch()
108
+
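app.py wires the retrieval helpers to Gemini: it embeds the user query, pulls the top-3 matching text chunks and images from the pre-embedded CSVs, and streams a gemini-pro-vision answer grounded in that context, returning the answer text plus the best-matching image. A minimal sketch of exercising it outside the Gradio UI, assuming GOOGLE_API_KEY is exported and both CSV files sit next to app.py (the query string is only an illustration):

import app  # module import loads the two CSVs and configures google.generativeai

answer, top_image = app.answer_query("Which rooms does AHU 17 serve?")  # illustrative query
print(answer)      # text answer grounded in the retrieved chunks and image captions
top_image.show()   # best-matching image (a PIL.Image) returned alongside the answer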
knowledge_triples_utils.py ADDED
@@ -0,0 +1,1190 @@
1
+ import os, random, re
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+
8
+ import llama_index
9
+ from llama_index import Document
10
+
11
+ import google.generativeai as genai
12
+
13
+ from llama_index.schema import MetadataMode, NodeRelationship
14
+ from llama_index.text_splitter import TokenTextSplitter
15
+ from llama_index import SimpleDirectoryReader
16
+
17
+ from copy import deepcopy
18
+
19
+ import time
20
+ import fitz
21
+ import errno
22
+ import typing
23
+ import requests
24
+
25
+ import networkx as nx
26
+ from base64 import b64encode
27
+
28
+ from typing import Optional
29
+ from typing import Tuple, List
30
+ from typing import Dict, List, Union, Any, Iterable
31
+
32
+ from IPython.display import Markdown, display
33
+
34
+ import PIL
35
+ from PIL import Image
36
+
37
+ from tqdm import tqdm
38
+
39
+ import io # needed for Image.open(io.BytesIO(...)) in the image helpers below
+ import json
40
+
41
+ # llama_index Documents in info213_docs
42
+ # fitz_docs which is opened by fitz.open(path_input)
43
+ # both list of docs should have the same page numbers
44
+
45
+
46
+ def classify_image(image_path:str, model:genai.GenerativeModel) -> str:
47
+ """
48
+ Given an image path, classify the image as floor plan, equipment, etc...
49
+ INPUT: image_path: the path to the image
50
+ model: LLM model
51
+ OUTPUT: the type of the image in a string
52
+ """
53
+
54
+ image_for_gemini = Image.open(image_path)
55
+
56
+
57
+ # Specify the image description prompt.
58
+ image_description_prompt = """
59
+ Analyze and classify the image into one of the following categories:
60
+ floor plan, flow chart, HVAC equipment, sign, and other. Output one and
61
+ only one category name.
62
+ """
63
+
64
+ model_input = [image_description_prompt, image_for_gemini]
65
+
66
+ response = model.generate_content(
67
+ model_input
68
+ )
69
+
70
+ return response.text
71
+
72
+ # Combine node's keywords, triples, questions, and text from a row
73
+ def combine_node_fields(row):
74
+ result = ""
75
+ result = result + "KEYWORDS: " + row['node_keywords'] + ";\n"
76
+
77
+ result = result + "TRIPLES: " + row['node_triples'] + ";\n"
78
+
79
+ result = result + "ANSWERABLE_QUESTIONS: " + row['node_answerable_questions'] + ";\n"
80
+
81
+ result = result + "TEXT: " + row['node_text'] +".\n"
82
+
83
+ return result
84
+
85
+ def display_images(
86
+ images: Iterable[Union[str, PIL.Image.Image]], resize_ratio: float = 0.5
87
+ ) -> None:
88
+ """
89
+ Displays a series of images provided as paths or PIL Image objects.
90
+
91
+ Args:
92
+ images: An iterable of image paths or PIL Image objects.
93
+ resize_ratio: The factor by which to resize each image (default 0.5).
94
+
95
+ Returns:
96
+ None (displays images using IPython or Jupyter notebook).
97
+ """
98
+
99
+ # Convert paths to PIL images if necessary
100
+ pil_images = []
101
+ for image in images:
102
+ if isinstance(image, str):
103
+ pil_images.append(PIL.Image.open(image))
104
+ else:
105
+ pil_images.append(image)
106
+
107
+ # Resize and display each image
108
+ for img in pil_images:
109
+ original_width, original_height = img.size
110
+ new_width = int(original_width * resize_ratio)
111
+ new_height = int(original_height * resize_ratio)
112
+ resized_img = img.resize((new_width, new_height))
113
+ display(resized_img)
114
+ print("\n")
115
+
116
+
117
+ def doc_images_description_dict(fdocs:fitz.Document, fpage: fitz.Page, lpage:
118
+ llama_index.Document, image_save_dir:str,
119
+ image_description_prompt:str, model:genai.GenerativeModel) -> List[dict]:
120
+
121
+ file_name = lpage.metadata['file_name']
122
+ page_label = lpage.metadata['page_label']
123
+
124
+ images = fpage.get_images()
125
+
126
+ dict_list = []
127
+
128
+ for image_no, image in enumerate(images):
129
+
130
+ image_dict = {}
131
+
132
+ xref = image[0]
133
+ pix = fitz.Pixmap(fdocs, xref)  # fdocs is the open fitz.Document passed in
134
+
135
+ # Create the image file name
136
+ image_name = f"{image_save_dir}/{file_name}_image_{page_label}_{image_no}_{xref}.jpeg"
137
+
138
+ # Save the image to the specified location
139
+ pix.save(image_name)
140
+
141
+ # Load the saved image as a Gemini Image Object
142
+ image_for_gemini = Image.open(io.BytesIO(pix.tobytes("jpeg")))
143
+
144
+ model_input = [image_description_prompt, image_for_gemini]
145
+
146
+ response = model.generate_content(
147
+ model_input
148
+ )
149
+
150
+ image_dict['doc_id'] = lpage.doc_id
151
+
152
+ image_dict['image_id'] = image_no
153
+
154
+ image_dict['image_name'] = image_name
155
+
156
+ mdict = lpage.metadata
157
+
158
+ image_dict['page_label'] = mdict['page_label']
159
+ image_dict['file_name'] = mdict['file_name']
160
+ image_dict['file_path'] = mdict['file_path']
161
+ image_dict['file_type'] = mdict['file_type']
162
+
163
+ image_dict['course_material_type'] = mdict['course_material_type']
164
+ image_dict['course_material_week'] = mdict['course_material_week']
165
+
166
+ image_dict['description'] = response.text
167
+
168
+ dict_list.append(image_dict)
169
+
170
+ return dict_list
171
+
172
+
173
+ def docs_to_df(docs:llama_index.schema.Document, gemini_pro:genai.GenerativeModel) -> pd.DataFrame:
174
+ """
175
+ extract titles for docs, embed the documents and titles, and convert it to dataframe
176
+ INPUT: docs: the documents extracted from a file
177
+ gemini_pro: genai gemini pro model
178
+ OUTPUT: docs_df: a dataframe containing the information of the docs extracted from the input file
179
+ """
180
+
181
+ docs_df = llamaindex_docs_df(docs)
182
+
183
+ tqdm.pandas(desc="Processing rows for extracting document titles...")
184
+
185
+ docs_df['doc_title'] = docs_df.progress_apply(lambda row: node_text_title(row['text'], gemini_pro), axis=1)
186
+
187
+ #tqdm.pandas(desc="Processing rows for summiarizing documents...")
188
+
189
+ #try:
190
+ # docs_df['doc_summary'] = docs_df.progress_apply(lambda row: text_summary(row['text'], gemini_pro), axis=1)
191
+ #except:
192
+ # docs_df['doc_summary'] = None
193
+
194
+ doc_summary_list = []
195
+ for _, row in tqdm(docs_df.iterrows(), total=len(docs_df)):
196
+ try:
197
+ doc_summary_list.append(text_summary(row['text'], gemini_pro))
198
+ except:
199
+ #print(row['page_label'], row['text'])
200
+ doc_summary_list.append(None)
201
+
202
+ docs_df['doc_summary'] = doc_summary_list
203
+
204
+ tqdm.pandas(desc="Processing rows for embedding documents and titles...")
205
+
206
+ docs_df['doc_embedding'] = docs_df.progress_apply(lambda row: text_retrieval_document_embedding(row['text'], row['doc_title']), axis=1)
207
+
208
+ return docs_df
209
+
210
+
211
+ def extract_image_description_df(image_path:str, category:str, model:genai.GenerativeModel) -> pd.DataFrame:
212
+ """
213
+ Extract description of the given image in the given category
214
+ INPUT: image_path: the path to the image
215
+ category: a string containing the category of the image
216
+ model: a generative model
217
+ OUTPUT: a DataFrame containing the metadata of the extracted images
218
+ """
219
+
220
+ image_for_gemini = Image.open(image_path)
221
+
222
+
223
+ # Specify the image description prompt.
224
+ image_description_prompt = """Explain what is going on in the image.
225
+ If it's a table, extract all elements of the table.
226
+ If it's a graph, explain the findings in the graph.
227
+ Do not include any numbers that are not mentioned in the image:
228
+ """
229
+
230
+ if "floor plan" in category.lower():
231
+ image_description_prompt = '''
232
+ Please analyze the provided floor plan image and extract the following information
233
+ related to rooms, locations, connections, HVAC equipment, and sensors:
234
+ 1. Room Labels/Names: Identify and list all room labels or names shown on the floor plan.
235
+ 2. Room Connectivity: Indicate how different rooms are connected (doors, hallways, openings, etc.).
236
+ 3. HVAC Equipment: Locate and list all HVAC equipment depicted on the floor plan (e.g., air handling units, ductwork, vents, thermostats, etc.).
237
+ 4. Sensor Locations: Note the locations of any sensors or control devices related to the HVAC system (e.g., temperature sensors, occupancy sensors, etc.).
238
+ 5. Zoning/Partitions: If the floor plan shows any zoning or partitions related to HVAC control, please describe them.
239
+ 6. Special Areas: Highlight any areas that may have unique HVAC requirements (e.g., server rooms, laboratories, etc.).
240
+ Please provide the extracted information in a structured format, separating the different categories as needed. Let me know if you need any clarification or have additional requirements for the information to be extracted from the floor plan.
241
+ '''
242
+ elif "flow chart" in category.lower():
243
+ image_description_prompt = '''
244
+ Please analyze the provided HVAC flow chart image and extract the following information:
245
+
246
+ 1. System Components: Identify and list all the major HVAC components shown in the flow chart (e.g., air handling units, chillers, boilers, pumps, cooling towers, etc.).
247
+ 2. Component Connections: Describe how the different HVAC components are connected, indicating the direction of airflow, water flow, refrigerant flow, etc.
248
+ 3. System Inputs/Outputs: Note any system inputs (e.g., outside air intake) or outputs (e.g., exhaust air) shown in the flow chart.
249
+ 4. Control Points: Locate any control points, sensors, or valves that regulate the flow or operation of the system components.
250
+ 5. Subsystems/Zones: If the flow chart illustrates subsystems or zones within the overall HVAC system, please describe them and their components.
251
+ 6. Operational Modes: Identify any operational modes or sequences depicted in the flow chart (e.g., heating mode, cooling mode, economizer mode, etc.).
252
+
253
+ Please provide the extracted information in a clear and structured format, separating the different categories as needed. If any abbreviations or symbols are used in the flow chart, please include a legend or clarify their meanings. Let me know if you need any clarification or have additional requirements for the information to be extracted.
254
+ '''
255
+ elif "hvac equipment" in category.lower():
256
+ image_description_prompt = '''
257
+ Please analyze the image I will provide, which contains HVAC (heating, ventilation, and
258
+ air conditioning) equipment. Describe the different components you can identify, such
259
+ as the type of equipment (furnace, air conditioner, ductwork, etc.), the apparent
260
+ condition of the equipment, and any other relevant details you can discern from the
261
+ image. Your analysis should help someone understand what is depicted in the HVAC system
262
+ shown in the picture.
263
+ '''
264
+ else:
265
+ image_description_prompt = '''Explain what is going on in the image.
266
+ If it's a table, extract all elements of the table.
267
+ If it's a graph, explain the findings in the graph.
268
+ Do not include any numbers that are not mentioned in the image:
269
+ '''
270
+
271
+ dict_list = []
272
+
273
+ path_last_sep_idx = image_path.rfind("/")
274
+ file_name = image_path[path_last_sep_idx+1:]
275
+ print("Processing the image: {}".format(file_name))
276
+
277
+ model_input = [image_description_prompt, image_for_gemini]
278
+
279
+ response = model.generate_content(
280
+ model_input
281
+ )
282
+
283
+ image_dict = {}
284
+
285
+ image_dict['image_path'] = image_path
286
+ image_dict['file_name'] = file_name
287
+
288
+ try:
289
+ image_dict['image_description'] = response.text
290
+ except Exception as e:
291
+ print("Some errors happened in the response from Gemini.")
292
+ image_dict['image_description'] = None
293
+
294
+ dict_list.append(image_dict)
295
+
296
+ return pd.DataFrame(dict_list)
297
+
298
+
299
+ def get_cosine_score(
300
+ dataframe: pd.DataFrame, column_name: str, input_text_embd: np.ndarray
301
+ ) -> float:
302
+ """
303
+ Calculates the cosine similarity between the user query embedding and the
304
+ dataframe embedding for a specific column.
305
+
306
+ Args:
307
+ dataframe: The pandas DataFrame containing the data to compare against.
308
+ column_name: The name of the column containing the embeddings to compare with.
309
+ input_text_embd: The NumPy array representing the user query embedding.
310
+
311
+ Returns:
312
+ The cosine similarity score (rounded to two decimal places) between the user query embedding and the dataframe embedding.
313
+ """
314
+
315
+ text_cosine_score = round(np.dot(dataframe[column_name], input_text_embd), 2)
316
+
317
+ return text_cosine_score
318
+
319
+ def get_cosine_score_lists(
320
+ dataframe: pd.DataFrame, column_name: str, query_embs: list
321
+ ) -> float:
322
+ """
323
+ Calculates the cosine similarity between the user query embedding and the dataframe embedding for a specific column. Both embeddings are in lists
324
+
325
+ Args:
326
+ dataframe: The pandas DataFrame containing the data to compare against.
327
+ column_name: The name of the column containing the embeddings to compare with.
328
+ query_embs: The query embedding as a list of numbers
329
+
330
+ Returns:
331
+ The cosine similarity score (rounded to two decimal places) between the user query embedding and the dataframe embedding.
332
+ """
333
+
334
+ text_cosine_score = round(np.dot(np.array(dataframe[column_name]), np.array(query_embs)), 2)
335
+ return text_cosine_score
336
+
337
+
338
+ def get_relevant_images_from_query(
339
+ query: str,
340
+ images_df: pd.DataFrame,
341
+ column_name: str = "",
342
+ top_n: int = 3,
343
+ embedding_size: int = 768,
344
+ print_citation: bool = True,
345
+ ) -> Dict[int, Dict[str, Any]]:
346
+ """
347
+ Finds the top N most similar images from a metadata DataFrame based on a text query.
348
+
349
+ Args:
350
+ query: The text query used for finding similar passages.
351
+ images_df: A Pandas DataFrame containing the image metadata to search.
352
+ column_name: The column name in the text_metadata_df containing the text embeddings or
353
+ text itself.
354
+ top_n: The number of most similar text passages to return.
355
+ embedding_size: The dimensionality of the text embeddings (only used if text embeddings
356
+ are stored in the column specified by `column_name`).
357
+ print_citation: Whether to immediately print formatted citations for the matched text
358
+ passages (True) or just return the dictionary (False).
359
+
360
+ Returns:
361
+ A dictionary containing information about the top N most similar images,
362
+ including cosine scores, image_path, file_name, and description text.
363
+
364
+ Raises:
365
+ KeyError: If the specified `column_name` is not present in the `text_metadata_df`.
366
+ """
367
+
368
+ if column_name not in images_df.columns:
369
+ raise KeyError(f"Column '{column_name}' not found in the 'images_df'")
370
+
371
+ query_embs = text_query_embedding(query)
372
+
373
+ # Calculate cosine similarity between query text and metadata text
374
+ cosine_scores = images_df.apply(
375
+ lambda row: get_cosine_score_lists(
376
+ row,
377
+ column_name,
378
+ query_embs,
379
+ ),
380
+ axis=1,
381
+ )
382
+
383
+ # Get top N cosine scores and their indices
384
+ top_n_indices = cosine_scores.nlargest(top_n).index.tolist()
385
+ top_n_scores = cosine_scores.nlargest(top_n).values.tolist()
386
+
387
+ # Create a dictionary to store matched images and their information
388
+ final_images = {}
389
+
390
+ for matched_no, index in enumerate(top_n_indices):
391
+ # Create a sub-dictionary for each matched image
392
+ final_images[matched_no] = {}
393
+
394
+ # Store image path
395
+ final_images[matched_no]["image_path"] = images_df.iloc[index][
396
+ "image_path"
397
+ ]
398
+
399
+ # Store cosine score
400
+ final_images[matched_no]["cosine_score"] = top_n_scores[matched_no]
401
+
402
+
403
+ # Store image file name
404
+ final_images[matched_no]["file_name"] = images_df.iloc[index]["file_name"]
405
+
406
+ # Store image description
407
+ final_images[matched_no]["image_description"] = images_df["image_description"][index]
408
+
409
+ # Store image object
410
+ final_images[matched_no]["image_object"] = Image.open(images_df.iloc[index]['image_path'])
411
+
412
+ # Optionally print citations immediately
413
+ if print_citation:
414
+ print_text_to_image_citation(final_images)
415
+
416
+ return final_images
417
+
418
+
419
+ def get_similar_text_from_query(
420
+ query: str,
421
+ nodes_df: pd.DataFrame,
422
+ column_name: str = "",
423
+ top_n: int = 3,
424
+ embedding_size: int = 768,
425
+ print_citation: bool = True,
426
+ ) -> Dict[int, Dict[str, Any]]:
427
+ """
428
+ Finds the top N most similar text passages from a metadata DataFrame based on a text query.
429
+
430
+ Args:
431
+ query: The text query used for finding similar passages.
432
+ nodes_df: A Pandas DataFrame containing the text metadata to search.
433
+ column_name: The column name in the text_metadata_df containing the text embeddings or
434
+ text itself.
435
+ top_n: The number of most similar text passages to return.
436
+ embedding_size: The dimensionality of the text embeddings (only used if text embeddings
437
+ are stored in the column specified by `column_name`).
438
+ print_citation: Whether to immediately print formatted citations for the matched text
439
+ passages (True) or just return the dictionary (False).
440
+
441
+ Returns:
442
+ A dictionary containing information about the top N most similar text passages,
443
+ including cosine scores, page numbers, node ids, and node text.
445
+
446
+ Raises:
447
+ KeyError: If the specified `column_name` is not present in the `text_metadata_df`.
448
+ """
449
+
450
+ if column_name not in nodes_df.columns:
451
+ raise KeyError(f"Column '{column_name}' not found in the 'nodes_df'")
452
+
453
+ query_embs = text_query_embedding(query)
454
+
455
+ # Calculate cosine similarity between query text and metadata text
456
+ cosine_scores = nodes_df.apply(
457
+ lambda row: get_cosine_score_lists(
458
+ row,
459
+ column_name,
460
+ query_embs,
461
+ ),
462
+ axis=1,
463
+ )
464
+
465
+ # Get top N cosine scores and their indices
466
+ top_n_indices = cosine_scores.nlargest(top_n).index.tolist()
467
+ top_n_scores = cosine_scores.nlargest(top_n).values.tolist()
468
+
469
+ # Create a dictionary to store matched text and their information
470
+ final_text = {}
471
+
472
+ for matched_textno, index in enumerate(top_n_indices):
473
+ # Create a sub-dictionary for each matched text
474
+ final_text[matched_textno] = {}
475
+
476
+ # Store page number
477
+ final_text[matched_textno]["page_num"] = nodes_df.iloc[index][
478
+ "page_label"
479
+ ]
480
+
481
+ # Store cosine score
482
+ final_text[matched_textno]["cosine_score"] = top_n_scores[matched_textno]
483
+
484
+
485
+ # Store node id
486
+ final_text[matched_textno]["node_id"] = nodes_df.iloc[index]["node_id"]
487
+
488
+ # Store node text
489
+ final_text[matched_textno]["node_text"] = nodes_df["node_text"][index]
490
+
491
+ # Optionally print citations immediately
492
+ if print_citation:
493
+ print_text_to_text_citation(final_text)
494
+
495
+ return final_text
496
+
497
+
498
+ def llamaindex_doc_dict(doc: llama_index.schema.Document) -> dict:
499
+ """
500
+ convert a LlamaIndex Document object to a dictionary
501
+ """
502
+
503
+ doc_dict = {}
504
+
505
+ doc_dict['doc_id'] = doc.doc_id
506
+
507
+ mdict = doc.metadata
508
+
509
+ doc_dict['page_label'] = mdict['page_label']
510
+ doc_dict['file_name'] = mdict['file_name']
511
+ doc_dict['file_path'] = mdict['file_path']
512
+ doc_dict['file_type'] = mdict['file_type']
513
+
514
+ doc_dict['file_title'] = mdict['file_title']
515
+ doc_dict['file_date'] = mdict['file_date']
516
+ doc_dict['file_subtitle'] = mdict['file_subtitle']
517
+ doc_dict['table_of_content'] = mdict['table_of_content']
518
+
519
+ doc_dict['text'] = doc.text
520
+
521
+ return doc_dict
522
+
523
+
524
+ def llamaindex_docs_df(docs: List[llama_index.schema.Document]) -> pd.DataFrame:
525
+ """
526
+ convert a list of LlamaIndex Document object to a Pandas DataFrame with columns
527
+ """
528
+
529
+ recs = []
530
+ for doc in docs:
531
+ recs.append(llamaindex_doc_dict(doc))
532
+
533
+ return pd.DataFrame(recs)
534
+
535
+
536
+ def llamaindex_docs_from_path(path_input:str,
537
+ gemini_pro:genai.GenerativeModel) -> llama_index.schema.Document:
538
+
539
+ """
540
+ extract llama_index Document from the file given the path_input
541
+ INPUT: path_input: the path pointing to the file in the disk
542
+ gemini_pro: the gemini pro model for extracting course metadata
543
+ OUTPUT: docs: llama_index Document extracted from the file by the path_input
544
+ """
545
+
546
+ docs = SimpleDirectoryReader(input_files=[path_input]).load_data()
547
+
548
+ first2pages = docs[0].text + " " + docs[1].text
549
+
550
+ metadata_extraction_sys_content = '''
551
+ You are a helpful assistant focusing on extracting the metadata describing the input document.
552
+ '''
553
+
554
+ metadata_extraction_prompt = '''
555
+ {}\n
556
+ Please perform metadata extraction on the given text.
557
+ Focus on the following metadata fields:
558
+ title: what the document is about;
559
+ date: when the document was created;
560
+ subtitle: what specific content the document is about;
561
+ table of content: section titles and their page numbers.
562
+ Output NA if there is no value for a metadata field.
563
+ Output the results in a dictionary.
564
+ TEXT: ```{}```
565
+ '''
566
+
567
+ msg = metadata_extraction_prompt.format(metadata_extraction_sys_content, first2pages)
568
+
569
+ response = gemini_pro.generate_content(
570
+ msg
571
+ )
572
+
573
+ response_string = response.text.strip('`')
574
+
575
+ extracted_meta_dict = {}
576
+
577
+ try:
578
+ extracted_meta_dict = json.loads(response_string)
579
+ except json.decoder.JSONDecodeError as e:
580
+ # Handling the JSON decoding error
581
+ extracted_meta_dict = {}
582
+
583
+ for doc in tqdm(docs, total=len(docs), desc="Adding metadata to docs..."):
584
+ if 'title' in extracted_meta_dict:
585
+ doc.metadata['file_title'] = extracted_meta_dict['title']
586
+ else:
587
+ doc.metadata['file_title'] = None
588
+
589
+ if 'date' in extracted_meta_dict:
590
+ doc.metadata['file_date'] = extracted_meta_dict['date']
591
+ else:
592
+ doc.metadata['file_date'] = None
593
+
594
+ if 'subtitle' in extracted_meta_dict:
595
+ doc.metadata['file_subtitle'] = extracted_meta_dict['subtitle']
596
+ else:
597
+ doc.metadata['file_subtitle'] = None
598
+
599
+ if 'table of content' in extracted_meta_dict:
600
+ doc.metadata['table_of_content'] = extracted_meta_dict['table of content']
601
+ else:
602
+ doc.metadata['table_of_content'] = None
603
+
604
+ return docs
605
+
606
+ def llamaindex_node_dict(node: llama_index.schema.TextNode) -> dict:
607
+ """
608
+ convert a LlamaIndex TextNode object to a dictionary
609
+ INPUT: node: a TextNode extracted from the parent document
612
+ OUTPUT: dictionary for the node's information
613
+ """
614
+
615
+ node_dict = {}
616
+
617
+ node_dict['node_id'] = node.node_id
618
+
619
+ mdict = node.metadata
620
+
621
+ node_dict['page_label'] = mdict['page_label']
622
+ node_dict['file_name'] = mdict['file_name']
623
+ node_dict['file_path'] = mdict['file_path']
624
+ node_dict['file_type'] = mdict['file_type']
625
+ #node_dict['document_title'] = mdict['document_title']
626
+ #node_dict['questions_this_excerpt_can_answer'] = mdict['questions_this_excerpt_can_answer']
627
+ #node_dict['section_summary'] = mdict['section_summary']
628
+
629
+ node_dict['file_title'] = mdict['file_title']
630
+ node_dict['file_date'] = mdict['file_date']
631
+ node_dict['file_subtitle'] = mdict['file_subtitle']
632
+
633
+ node_dict['node_text'] = node.text
634
+
635
+ node_dict['start_char_idx'] = node.start_char_idx
636
+ node_dict['end_char_idx'] = node.end_char_idx
637
+
638
+ rdict = node.relationships
639
+
640
+ if NodeRelationship.SOURCE in rdict.keys():
641
+ node_dict['doc_id'] = rdict[NodeRelationship.SOURCE].node_id
642
+ else:
643
+ node_dict['doc_id'] = None
644
+
645
+ if NodeRelationship.PREVIOUS in rdict.keys():
646
+ node_dict['previous_node'] = rdict[NodeRelationship.PREVIOUS].node_id
647
+ else:
648
+ node_dict['previous_node'] = None
649
+
650
+ if NodeRelationship.NEXT in rdict.keys():
651
+ node_dict['next_node'] = rdict[NodeRelationship.NEXT].node_id
652
+ else:
653
+ node_dict['next_node'] = None
654
+
655
+
656
+ return node_dict
657
+
658
+
659
+ def llamaindex_nodes_df(nodes: List[llama_index.schema.TextNode]) -> pd.DataFrame:
660
+ """
661
+ convert a list of LlamaIndex TextNode object to a Pandas DataFrame with columns
662
+ """
663
+
664
+ recs = []
665
+ for node in nodes:
666
+ recs.append(llamaindex_node_dict(node))
667
+
668
+ return pd.DataFrame(recs)
669
+
670
+
671
+ def node_text_title(text:str, model:genai.GenerativeModel) -> str:
672
+ """
673
+ use gemini to generate a title for the input text
674
+ """
675
+
676
+ prompt = '''
677
+ Please summarize the given input text
678
+ enclosed within the three backticks. Generate a short
679
+ title for the text. Correct misspellings and syntactic errors.
680
+ Output a short title string only.
681
+ TEXT: ```{}```
682
+ '''
683
+ msg = prompt.format(text)
684
+
685
+ response = model.generate_content(
686
+ msg
687
+ )
688
+
689
+ return response.text
690
+
691
+ def pdf_extract_images(pdf_path:str, image_save_dir:str):
692
+ """
693
+ Given a PDF path, extract images from the PDF file and save them to disk
694
+ INPUT: pdf_path: the path to the PDF file
695
+ image_save_dir: the directory for storing the extracted images
696
+ OUTPUT: None
697
+ """
698
+
699
+ fitz_docs = fitz.open(pdf_path)
700
+
701
+ path_last_sep_idx = pdf_path.rfind("/")
702
+ file_name = pdf_path[path_last_sep_idx+1:]
703
+ print("Processing the images from the pages of {}".format(file_name))
704
+
705
+ for idx, fpage in tqdm(enumerate(fitz_docs), total=len(fitz_docs)):
706
+
707
+ images = fpage.get_images()
708
+
709
+ page_label = idx + 1 # llamaindex document pages indexing start from 1
710
+
711
+ for image_no, image in enumerate(images):
712
+
713
+ xref = image[0]
714
+ pix = fitz.Pixmap(fitz_docs, xref)
715
+
716
+ # Create the image file name
717
+ image_name = f"{image_save_dir}/extracted_from_{file_name}_{page_label}_{image_no}_{xref}.jpeg"
718
+
719
+ # Save the image to the specified location
720
+ pix.save(image_name)
721
+
722
+
723
+
724
+ def pdf_images_description_df(pdf_path:str, docs_df_path:str, image_save_dir:str) -> pd.DataFrame:
725
+ """
726
+ Given a PDF path and the path to the DataFrame containing the metadata of the pages extracted from the PDF file, extract the metadata of images from the PDF file as a DataFrame
727
+ INPUT: pdf_path: the path to the PDF file
728
+ docs_df_path: the path to the DataFrame containing page metadata extracted from the PDF file
729
+ image_save_dir: the directory for storing the extracted images
730
+ OUTPUT: a DataFrame containing the metadata of the extracted images
731
+ """
732
+
733
+ fitz_docs = fitz.open(pdf_path)
734
+
735
+ doc_df = pd.read_csv(docs_df_path)
+
+ # the gemini-pro-vision model used below to describe each extracted image
+ gemini_pro_vision = genai.GenerativeModel(model_name="models/gemini-pro-vision")
736
+
737
+ # Specify the image description prompt.
738
+ image_description_prompt = """Explain what is going on in the image.
739
+ If it's a table, extract all elements of the table.
740
+ If it's a graph, explain the findings in the graph.
741
+ Do not include any numbers that are not mentioned in the image:
742
+ """
743
+
744
+ dict_list = []
745
+
746
+ path_last_sep_idx = pdf_path.rfind("/")
747
+ file_name = pdf_path[path_last_sep_idx+1:]
748
+ print("Processing the images from the pages of {}".format(file_name))
749
+
750
+ for idx, fpage in tqdm(enumerate(fitz_docs), total=len(fitz_docs)):
751
+
752
+ images = fpage.get_images()
753
+
754
+ page_label = idx + 1 # llamaindex document pages indexing start from 1
755
+
756
+ for image_no, image in enumerate(images):
757
+
758
+ image_dict = {}
759
+
760
+ xref = image[0]
761
+ pix = fitz.Pixmap(fitz_docs, xref)
762
+
763
+ # Create the image file name
764
+ image_name = f"{image_save_dir}/{file_name}_image_{page_label}_{image_no}_{xref}.jpeg"
765
+
766
+ # Save the image to the specified location
767
+ pix.save(image_name)
768
+
769
+ # Load the saved image as a Gemini Image Object
770
+ image_for_gemini = Image.open(io.BytesIO(pix.tobytes("jpeg")))
771
+
772
+ model_input = [image_description_prompt, image_for_gemini]
773
+
774
+ response = gemini_pro_vision.generate_content(
775
+ model_input
776
+ )
777
+
778
+ image_dict['image_id'] = image_no
779
+ image_dict['image_name'] = image_name
780
+ image_dict['page_label'] = page_label
781
+
782
+ try:
783
+ doc_page = doc_df[doc_df.page_label == page_label].iloc[0]
784
+
785
+ image_dict['doc_id'] = doc_page['doc_id']
786
+ image_dict['file_name'] = doc_page['file_name']
787
+ image_dict['file_path'] = doc_page['file_path']
788
+ image_dict['file_type'] = doc_page['file_type']
789
+ image_dict['course_material_type'] = doc_page['course_material_type']
790
+ image_dict['course_material_week'] = doc_page['course_material_week']
791
+
792
+ except Exception as e:
793
+ print("Some errors happened in the doc_page of the doc_df.")
794
+ image_dict['doc_id'] = None
795
+ image_dict['file_name'] = None
796
+ image_dict['file_path'] = None
797
+ image_dict['file_type'] = None
798
+ image_dict['course_material_type'] = None
799
+ image_dict['course_material_week'] = None
800
+
801
+ try:
802
+ image_dict['image_description'] = response.text
803
+ except Exception as e:
804
+ print("Some errors happened in the response from Gemini.")
805
+
806
+ image_dict['image_description'] = None
807
+
808
+ dict_list.append(image_dict)
809
+
810
+ time.sleep(2)
811
+
812
+ return pd.DataFrame(dict_list)
813
+
814
+
815
+ # Add colors to the print
816
+ class Color:
817
+ """
818
+ This class defines a set of color codes that can be used to print text in different colors.
819
+ This will be used later to print citations and results to make outputs more readable.
820
+ """
821
+
822
+ PURPLE: str = "\033[95m"
823
+ CYAN: str = "\033[96m"
824
+ DARKCYAN: str = "\033[36m"
825
+ BLUE: str = "\033[94m"
826
+ GREEN: str = "\033[92m"
827
+ YELLOW: str = "\033[93m"
828
+ RED: str = "\033[91m"
829
+ BOLD: str = "\033[1m"
830
+ UNDERLINE: str = "\033[4m"
831
+ END: str = "\033[0m"
832
+
833
+ def print_text_to_image_citation(
834
+ final_images: Dict[int, Dict[str, Any]], print_top: bool = True
835
+ ) -> None:
836
+ """
837
+ Prints a formatted citation for each matched image in a dictionary.
838
+
839
+ Args:
840
+ final_images: A dictionary containing information about matched images,
841
+ with keys as image number and values as dictionaries containing
842
+ image path, page number, page text, cosine similarity score, and image description.
843
+ print_top: A boolean flag indicating whether to only print the first citation (True) or all citations (False).
844
+
845
+ Returns:
846
+ None (prints formatted citations to the console).
847
+ """
848
+
849
+ color = Color()
850
+
851
+ # Iterate through the matched image citations
852
+ for imageno, image_dict in final_images.items():
853
+ # Print the citation header
854
+ print(
855
+ color.RED + f"Citation {imageno + 1}:",
856
+ "Matched image path, page number and page text: \n" + color.END,
857
+ )
858
+
859
+ # Print the cosine similarity score
860
+ print(color.BLUE + f"score: " + color.END, image_dict["cosine_score"])
861
+
862
+ # Print the image path
863
+ print(color.BLUE + f"path: " + color.END, image_dict["image_path"])
864
+
865
+ # Print the file name
866
+ print(color.BLUE + f"file name: " + color.END, image_dict["file_name"])
867
+
868
+ # Print the image description
869
+ print(
870
+ color.BLUE + f"image description: " + color.END,
871
+ image_dict["image_description"],
872
+ )
873
+
874
+ # Display image
875
+ display_images([image_dict["image_object"]])
876
+
877
+ # Only print the first citation if print_top is True
878
+ if print_top and imageno == 0:
879
+ break
880
+
881
+
882
+ def print_text_to_text_citation(
883
+ final_text: Dict[int, Dict[str, Any]],
884
+ print_top: bool = True,
885
+ ) -> None:
886
+ """
887
+ Prints a formatted citation for each matched text in a dictionary.
888
+
889
+ Args:
890
+ final_text: A dictionary containing information about matched text passages,
891
+ with keys as text number and values as dictionaries containing
892
+ page number, cosine similarity score, chunk number (optional),
893
+ chunk text (optional), and page text (optional).
894
+ print_top: A boolean flag indicating whether to only print the first citation (True) or all citations (False).
896
+
897
+ Returns:
898
+ None (prints formatted citations to the console).
899
+ """
900
+
901
+ color = Color()
902
+
903
+ # Iterate through the matched text citations
904
+ for textno, text_dict in final_text.items():
905
+ # Print the citation header
906
+ print(color.RED + f"Citation {textno + 1}:", "Matched text:" + color.END)
907
+
908
+ # Print the cosine similarity score
909
+ print(color.BLUE + f"score: " + color.END, text_dict["cosine_score"])
910
+
911
+ # Print the page number
912
+ print(color.BLUE + f"page_number: " + color.END, text_dict["page_num"])
913
+
914
+ # Print chunk number and chunk text
915
+ print(color.BLUE + f"node_id: " + color.END, text_dict["node_id"])
916
+ print(color.BLUE + f"node_text: " + color.END, text_dict["node_text"])
917
+ print()
918
+
919
+ # Only print the first citation if print_top is True
920
+ if print_top and textno == 0:
921
+ break
922
+
923
+
924
+ def sentence_df_triples_df(sentence_df: pd.DataFrame) -> pd.DataFrame:
925
+ """
926
+ Extract (subject, predicate, object) triples from the input sentence DataFrame
927
+ INPUT: sentence_df: a DataFrame ('sent_id', 'node_id', 'course_material_type',
928
+ 'course_material_week', 'sent_text')
929
+ OUTPUT: triple_df: a DataFrame (triple_id, sent_id, course_material_type, course_material_week,
930
+ triples_to_process)
931
+ """
932
+
933
+ model = genai.GenerativeModel('gemini-pro')
934
+
935
+ count = 0
936
+
937
+ dict_list = []
938
+
939
+ for idx, row in tqdm(sentence_df.iterrows(), total=len(sentence_df)):
940
+ if count < len(sentence_df) + 1:
941
+ count += 1
942
+ dict_list.append(sentence_triple_dict_list(row, model))
943
+ else:
944
+ break
945
+
946
+ return pd.DataFrame(dict_list)
947
+
948
+
949
+ def sentence_triple_dict_list(row: pd.Series, model) -> dict:
950
+ """
951
+ Extract (subject, predicate, object) triples from a row of a sentence dataframe
952
+ INPUT: row: a row with the following columns: ('sent_id', 'node_id', 'course_material_type',
953
+ 'course_material_week', 'sent_text')
954
+ model: llm model
955
+ OUTPUT: a list of dictionaries each of which has the following keys: triple_id, sent_id,
956
+ course_material_type, course_material_week, triples_to_process
957
+ """
958
+
959
+ triple_extraction_prompt = '''
960
+ Please perform structured triple extraction on the given text enclosed within the
961
+ three backticks.
962
+ Convert the text into a set of (subject, predicate, object) triples.
963
+ Treat a math expression or a block of programming statements as a single concept.
964
+ Use the previous extraction text and results as context.
965
+ Correct misspellings and syntactic errors.
966
+ Don't summarize. Don't rewrite the original text. Don't decode the original text.
967
+ Output the results as a set of ("subject":extracted subject, "predicate":extracted predicate,
968
+ "object":extracted object). Don't add extra explanation to the results.
969
+ TEXT: ```{}```
970
+ '''
971
+
972
+ asent = row['sent_text']
973
+ #print(asent)
974
+
975
+ msg = triple_extraction_prompt.format(asent)
976
+
977
+
978
+ response = model.generate_content(
979
+ msg
980
+ )
981
+
982
+ #print(response.text)
983
+
984
+ pattern = r'\{([^}]+)\}|\(([^)]+)\)'
985
+
986
+ #response_text = response.text.encode("ascii", "ignore").decode(
987
+ # "utf-8", "ignore"
988
+ # )
989
+
990
+ response_text = response.text
991
+
992
+ matches = re.findall(pattern, response_text)
993
+
994
+ # Flatten the list of tuples and filter out empty matches
995
+ text_to_process = [ "{" + match[0].strip() + "}" if match[0]
996
+ else "{" + match[1].strip() + "}" for match in matches if match[0] or match[1]]
997
+
998
+ #print(text_to_process)
999
+
1000
+ tri_dict = {}
1001
+
1002
+ tri_dict['triple_id'] = row['sent_id'] + "_triples"
1003
+ tri_dict['sent_id'] = row['sent_id']
1004
+ tri_dict['course_material_type'] = row['course_material_type']
1005
+ tri_dict['course_material_week'] = row['course_material_week']
1006
+
1007
+ tri_dict['triples_to_process'] = text_to_process
1008
+
1009
+ return tri_dict
1010
+
1011
+
1012
+
1013
+ def split_nodes_sentences_df(nodes: List[llama_index.schema.TextNode]) -> pd.DataFrame:
1014
+ """
1015
+ split the text of each node into sentences by spacy
1016
+ """
1017
+
1018
+ recs = []
1019
+
1020
+ import spacy # imported lazily; spacy and its en_core_web_sm model are not pinned in requirements.txt
+ nlp = spacy.load('en_core_web_sm')
1021
+
1022
+ for node in nodes:
1023
+ dict_list = split_nodeText_sentences_dict_list(nlp, node)
1024
+
1025
+ recs.extend(dict_list)
1026
+
1027
+ return pd.DataFrame(recs)
1028
+
1029
+
1030
+ def split_nodeText_sentences_dict_list(nlp: Any, node: llama_index.schema.TextNode) -> list:
1031
+ """
1032
+ split the text of the given TextNode into sentences
1033
+ INPUT: nlp: the spacy model
1034
+ node: a TextNode
1035
+ OUTPUT: a list of dictionaries each of which contains the information for a sentence.
1036
+ """
1037
+
1038
+ dict_list = []
1039
+
1040
+ node_text = node.text
1041
+ text_doc = nlp(node_text)
1042
+ text_sentences = list(text_doc.sents)
1043
+
1044
+ for idx, sent in enumerate(text_sentences):
1045
+
1046
+ order = idx + 1 # the order of the sentence in the node
1047
+
1048
+ sent_dict = {}
1049
+ sent_dict['sent_id'] = node.node_id + "_sent" + str(order)
1050
+
1051
+ sent_dict['node_id'] = node.node_id
1052
+
1053
+ mdict = node.metadata
1054
+
1055
+ sent_dict['course_material_type'] = mdict['course_material_type']
1056
+ sent_dict['course_material_week'] = mdict['course_material_week']
1057
+
1058
+ sent_dict['sent_text'] = sent
1059
+
1060
+ dict_list.append(sent_dict)
1061
+
1062
+ return dict_list
1063
+
1064
+
1065
+ def text_keyconcepts(text:str, model:genai.GenerativeModel) -> str:
1066
+ """
1067
+ use gemini to generate a set of key learning concepts from the input text
1068
+ """
1069
+
1070
+ prompt = '''
1071
+ You are an expert AI assistant trained on extracting key concepts from the text.
1072
+ Please analyze the following material.
1073
+ Extract the key concepts that can be used to find related materials.
1074
+ Output the results as a list of key concepts only. Only keywords in the output list.
1075
+ No definitions. Separate the keywords by comma.
1076
+ TEXT: ```{}```
1077
+ '''
1078
+
1079
+ msg = prompt.format(text)
1080
+
1081
+ response = model.generate_content(
1082
+ msg
1083
+ )
1084
+
1085
+ input_string = response.text
1086
+
1087
+ items_list = [item.strip('-').strip() for item in re.split(r'[\n,]', input_string) if item]
1088
+
1089
+ return items_list
1090
+
1091
+ def text_query_embedding(query:str):
1092
+
1093
+ """
1094
+ Use Gemini to Embed the given query by the type of retrieval_query
1095
+ INPUT: query: str
1096
+ OUTPUT: embedding as a list of numbers
1097
+ """
1098
+ embedding = genai.embed_content(model="models/embedding-001",
1099
+ content=query,
1100
+ task_type="retrieval_query")
1101
+
1102
+ return embedding['embedding']
1103
+
1104
+
1105
+ def text_questions_answered(text:str, model:genai.GenerativeModel) -> str:
1106
+ """
1107
+ use gemini to extract a set of questions that can be answered by the input text
1108
+ """
1109
+
1110
+ prompt = '''
1111
+ You are an expert AI assistant trained on creating a list of specific,
1112
+ answerable questions that can be extracted from input text enclosed within the three backticks.
1113
+ Identify the most pertinent questions that could be asked based on its content.
1114
+ Compose these questions in a clear and concise manner, ensuring they directly
1115
+ align with the information presented in the text. Output the results in JSON format.
1116
+ TEXT: ```{}```
1117
+ '''
1118
+
1119
+ msg = prompt.format(text)
1120
+
1121
+ response = model.generate_content(
1122
+ msg
1123
+ )
1124
+
1125
+ return response.text
1126
+
1127
+
1128
+
1129
+ def text_retrieval_document_embedding(text:str, title:str):
1130
+
1131
+ """
1132
+ Use Gemini to Embed the given text and title by the type of retrieval_document
1133
+ INPUT: text: str
1134
+ title: str
1135
+ OUTPUT: embedding as a list of numbers
1136
+ """
1137
+ embedding = genai.embed_content(model="models/embedding-001",
1138
+ content=text,
1139
+ task_type="retrieval_document",
1140
+ title=title)
1141
+
1142
+ return embedding['embedding']
1143
+
1144
+
1145
+ def text_semantic_triples(text:str, model:genai.GenerativeModel) -> str:
1146
+ """
1147
+ use gemini to extract a set of semantic triples from the input text
1148
+ """
1149
+
1150
+ prompt = '''
1151
+ You are an expert AI assistant trained on extracting semantic triples from the given
1152
+ text enclosed within the three backticks.
1153
+ Generate a set of (subject, predicate, object) triples for the identified relationships.
1154
+ Correct misspellings and syntactic errors.
1155
+ Don't summarize. Don't rewrite the original text. Don't decode the original text.
1156
+ Output the results as JSON format. Don't add extra explanation to the results.
1157
+ TEXT: ```{}```
1158
+ '''
1159
+
1160
+ msg = prompt.format(text)
1161
+
1162
+ response = model.generate_content(
1163
+ msg
1164
+ )
1165
+
1166
+ return response.text
1167
+
1168
+
1169
+
1170
+ def text_summary(text:str, model:genai.GenerativeModel) -> str:
1171
+ """
1172
+ use gemini to generate a summary from the input text
1173
+ """
1174
+
1175
+ prompt = '''
1176
+ You are an expert AI summarization assistant and ready to condense any text into a
1177
+ clear and concise overview. Please help me summarize the text within the backticks below.
1178
+ Please extract the key topics and concepts. Plus, please ensure there are no typos or
1179
+ grammatical errors in the summary. The summary will be used as surrounding context of additional
1180
+ content to answer specific questions.
1181
+ TEXT: ```{}```
1182
+ '''
1183
+ msg = prompt.format(text)
1184
+
1185
+ response = model.generate_content(
1186
+ msg
1187
+ )
1188
+
1189
+ return response.text
1190
+
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ google-generativeai==0.3.1
2
+ python-dotenv==1.0.0
3
+ llama-index==0.9.25.post1
4
+ PyMuPDF==1.23.8
5
+ PyMuPDFb==1.23.7
6
+ networkx==3.2.1
7
+ ipykernel==6.27.1
8
+ ipython==8.18.1
9
+ ipywidgets==8.1.1
10
+ Pillow==10.1.0
11
+ tqdm==4.66.1
12
+ seaborn==0.13.1
13
+
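The pins above cover the Gemini SDK, LlamaIndex, and PyMuPDF versions the code was written against; pandas, numpy, and matplotlib arrive transitively through seaborn, gradio (imported by app.py) is presumably supplied by the Spaces Gradio runtime, and spacy (used only by split_nodes_sentences_df) is not pinned at all. A typical local setup might therefore look like this (the second line is an assumption for local runs, not part of the commit):

pip install -r requirements.txt
pip install gradio spacy && python -m spacy download en_core_web_sm  # only needed outside Spaces / for sentence splitting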