anyuanay committed on
Commit
f13eeb8
·
verified ·
1 Parent(s): d0f6105

upload 3 main files

Files changed (3)
  1. app.py +108 -0
  2. knowledge_triples_utils.py +1190 -0
  3. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,108 @@
1
+ import pandas as pd
2
+
3
+ import os, sys
4
+
5
+ import ast
6
+
7
+ import gradio as gr
8
+
9
+ import google.generativeai as genai
10
+
11
+ GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
12
+
13
+ genai.configure(api_key=GOOGLE_API_KEY)
14
+
15
+ gemini_pro = genai.GenerativeModel(model_name="models/gemini-pro")
16
+ gemini_pro_vision = genai.GenerativeModel(model_name="models/gemini-pro-vision")
17
+
18
+
19
+ import knowledge_triples_utils as kutils
20
+
21
+
22
+ all_nodes_csv_path = "AHU_17_All_Nodes_embedding.csv"
23
+
24
+ all_nodes_df = pd.read_csv(all_nodes_csv_path)
25
+
26
+ all_nodes_df['node_embedding'] = all_nodes_df['node_embedding'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
27
+
28
+
29
+ all_images_csv_path = "AHU_17_All_Images_embeddings_hf.csv"
30
+
31
+ all_images_df = pd.read_csv(all_images_csv_path)
32
+
33
+ all_images_df['desc_title_embedding'] = all_images_df['desc_title_embedding'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
34
+
35
+
36
+ # answer query by gemini
37
+ def answer_query(query):
38
+
39
+ # Matching user text query with "node_embedding" to find relevant chunks.
40
+ matching_results_text = kutils.get_similar_text_from_query(
41
+ query,
42
+ all_nodes_df,
43
+ column_name="node_embedding",
44
+ top_n=3,
45
+ print_citation=False,
46
+ )
47
+
48
+ # Matching user text query with "desc_title_embedding" to find relevant images.
49
+ matching_results_images = kutils.get_relevant_images_from_query(
50
+ query,
51
+ all_images_df,
52
+ column_name="desc_title_embedding",
53
+ top_n=3,
54
+ )
55
+
56
+ # combine all the selected relevant text chunks
57
+ context_text = []
58
+ for key, value in matching_results_text.items():
59
+ context_text.append(value["node_text"])
60
+ final_context_text = "\n".join(context_text)
61
+
62
+ # combine all the relevant images and their description generated by Gemini
63
+ context_images = []
64
+ for key, value in matching_results_images.items():
65
+ context_images.extend(
66
+ ["Image: ", value["image_object"], "Caption: ", value["image_description"]]
67
+ )
68
+
69
+ instructions = '''
70
+ You will answer the query based on the text context given in "text_context" and Image context given
71
+ in "image_context" along with its Caption:\n
72
+ Base your response on "text_context" and "image_context". Do not use any numbers or percentages that are
73
+ not present in the "image_context".
74
+ Context:
75
+ '''
76
+
77
+ final_prompt = [
78
+ "QUERY: " + query + " ANSWER: ",
79
+ instructions,
80
+ "text_context:",
81
+ "\n".join(context_text),
82
+ "image_context:",
83
+ ]
84
+ final_prompt.extend(context_images)
85
+
86
+ response = gemini_pro_vision.generate_content(
87
+ final_prompt,
88
+ stream=True,
89
+ )
90
+
91
+ response_list = []
92
+ for chunk in response:
93
+ response_list.append(chunk.text)
94
+ response = "".join(response_list)
95
+
96
+
97
+ return response, matching_results_images[0]["image_object"]
98
+
99
+
100
+ demo = gr.Interface(
101
+ fn=answer_query,
102
+ inputs="textbox",
103
+ outputs=["textbox", "image"]
104
+ )
105
+
106
+ if __name__ == "__main__":
107
+ demo.launch()
108
+
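app.py wires the retrieval helpers to Gemini: it embeds the user query, pulls the top-3 matching text chunks and images from the pre-embedded CSVs, and streams a gemini-pro-vision answer grounded in that context, returning the answer text plus the best-matching image. A minimal sketch of exercising it outside the Gradio UI, assuming GOOGLE_API_KEY is exported and both CSV files sit next to app.py (the query string is only an illustration):

import app  # module import loads the two CSVs and configures google.generativeai

answer, top_image = app.answer_query("Which rooms does AHU 17 serve?")  # illustrative query
print(answer)      # text answer grounded in the retrieved chunks and image captions
top_image.show()   # best-matching image (a PIL.Image) returned alongside the answer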
knowledge_triples_utils.py ADDED
@@ -0,0 +1,1190 @@
1
+ import os, random, re
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+
8
+ import llama_index
9
+ from llama_index import Document
10
+
11
+ import google.generativeai as genai
12
+
13
+ from llama_index.schema import MetadataMode, NodeRelationship
14
+ from llama_index.text_splitter import TokenTextSplitter
15
+ from llama_index import SimpleDirectoryReader
16
+
17
+ from copy import deepcopy
18
+
19
+ import time
20
+ import fitz
21
+ import errno
22
+ import typing
23
+ import requests
24
+
25
+ import networkx as nx
26
+ from base64 import b64encode
27
+
28
+ from typing import Optional
29
+ from typing import Tuple, List
30
+ from typing import Dict, List, Union, Any, Iterable
31
+
32
+ from IPython.display import Markdown, display
33
+
34
+ import PIL
35
+ from PIL import Image
36
+
37
+ from tqdm import tqdm
38
+
39
+ import io # needed for Image.open(io.BytesIO(...)) in the image helpers below
+ import json
40
+
41
+ # llama_index Documents in info213_docs
42
+ # fitz_docs which is opened by fitz.open(path_input)
43
+ # both list of docs should have the same page numbers
44
+
45
+
46
+ def classify_image(image_path:str, model:genai.GenerativeModel) -> str:
47
+ """
48
+ Given an image path, classify the image as floor plan, equipment, etc...
49
+ INPUT: image_path: the path to the image
50
+ model: LLM model
51
+ OUTPUT: the type of the image in a string
52
+ """
53
+
54
+ image_for_gemini = Image.open(image_path)
55
+
56
+
57
+ # Specify the image description prompt.
58
+ image_description_prompt = """
59
+ Analyze and classify the image into one of the following categories:
60
+ floor plan, flow chart, HVAC equipment, sign, and other. Output one and
61
+ only one category name.
62
+ """
63
+
64
+ model_input = [image_description_prompt, image_for_gemini]
65
+
66
+ response = model.generate_content(
67
+ model_input
68
+ )
69
+
70
+ return response.text
71
+
72
+ # Combine node's keywords, triples, questions, and text from a row
73
+ def combine_node_fields(row):
74
+ result = ""
75
+ result = result + "KEYWORDS: " + row['node_keywords'] + ";\n"
76
+
77
+ result = result + "TRIPLES: " + row['node_triples'] + ";\n"
78
+
79
+ result = result + "ANSWERABLE_QUESTIONS: " + row['node_answerable_questions'] + ";\n"
80
+
81
+ result = result + "TEXT: " + row['node_text'] +".\n"
82
+
83
+ return result
84
+
85
+ def display_images(
86
+ images: Iterable[Union[str, PIL.Image.Image]], resize_ratio: float = 0.5
87
+ ) -> None:
88
+ """
89
+ Displays a series of images provided as paths or PIL Image objects.
90
+
91
+ Args:
92
+ images: An iterable of image paths or PIL Image objects.
93
+ resize_ratio: The factor by which to resize each image (default 0.5).
94
+
95
+ Returns:
96
+ None (displays images using IPython or Jupyter notebook).
97
+ """
98
+
99
+ # Convert paths to PIL images if necessary
100
+ pil_images = []
101
+ for image in images:
102
+ if isinstance(image, str):
103
+ pil_images.append(PIL.Image.open(image))
104
+ else:
105
+ pil_images.append(image)
106
+
107
+ # Resize and display each image
108
+ for img in pil_images:
109
+ original_width, original_height = img.size
110
+ new_width = int(original_width * resize_ratio)
111
+ new_height = int(original_height * resize_ratio)
112
+ resized_img = img.resize((new_width, new_height))
113
+ display(resized_img)
114
+ print("\n")
115
+
116
+
117
+ def doc_images_description_dict(fdocs:fitz.Document, fpage: fitz.Page, lpage:
118
+ llama_index.Document, image_save_dir:str,
119
+ image_description_prompt:str, model:genai.GenerativeModel) -> List[dict]:
120
+
121
+ file_name = lpage.metadata['file_name']
122
+ page_label = lpage.metadata['page_label']
123
+
124
+ images = fpage.get_images()
125
+
126
+ dict_list = []
127
+
128
+ for image_no, image in enumerate(images):
129
+
130
+ image_dict = {}
131
+
132
+ xref = image[0]
133
+ pix = fitz.Pixmap(fdocs, xref)  # fdocs is the open fitz.Document passed in
134
+
135
+ # Create the image file name
136
+ image_name = f"{image_save_dir}/{file_name}_image_{page_label}_{image_no}_{xref}.jpeg"
137
+
138
+ # Save the image to the specified location
139
+ pix.save(image_name)
140
+
141
+ # Load the saved image as a Gemini Image Object
142
+ image_for_gemini = Image.open(io.BytesIO(pix.tobytes("jpeg")))
143
+
144
+ model_input = [image_description_prompt, image_for_gemini]
145
+
146
+ response = model.generate_content(
147
+ model_input
148
+ )
149
+
150
+ image_dict['doc_id'] = lpage.doc_id
151
+
152
+ image_dict['image_id'] = image_no
153
+
154
+ image_dict['image_name'] = image_name
155
+
156
+ mdict = lpage.metadata
157
+
158
+ image_dict['page_label'] = mdict['page_label']
159
+ image_dict['file_name'] = mdict['file_name']
160
+ image_dict['file_path'] = mdict['file_path']
161
+ image_dict['file_type'] = mdict['file_type']
162
+
163
+ image_dict['course_material_type'] = mdict['course_material_type']
164
+ image_dict['course_material_week'] = mdict['course_material_week']
165
+
166
+ image_dict['description'] = response.text
167
+
168
+ dict_list.append(image_dict)
169
+
170
+ return dict_list
171
+
172
+
173
+ def docs_to_df(docs:llama_index.schema.Document, gemini_pro:genai.GenerativeModel) -> pd.DataFrame:
174
+ """
175
+ extract titles for docs, embed the documents and titles, and convert it to dataframe
176
+ INPUT: docs: the documents extracted from a file
177
+ gemini_pro: genai gemini pro model
178
+ OUTPUT: docs_df: a dataframe containing the information of the docs extracted from the input file
179
+ """
180
+
181
+ docs_df = llamaindex_docs_df(docs)
182
+
183
+ tqdm.pandas(desc="Processing rows for extracting document titles...")
184
+
185
+ docs_df['doc_title'] = docs_df.progress_apply(lambda row: node_text_title(row['text'], gemini_pro), axis=1)
186
+
187
+ #tqdm.pandas(desc="Processing rows for summiarizing documents...")
188
+
189
+ #try:
190
+ # docs_df['doc_summary'] = docs_df.progress_apply(lambda row: text_summary(row['text'], gemini_pro), axis=1)
191
+ #except:
192
+ # docs_df['doc_summary'] = None
193
+
194
+ doc_summary_list = []
195
+ for _, row in tqdm(docs_df.iterrows(), total=len(docs_df)):
196
+ try:
197
+ doc_summary_list.append(text_summary(row['text'], gemini_pro))
198
+ except:
199
+ #print(row['page_label'], row['text'])
200
+ doc_summary_list.append(None)
201
+
202
+ docs_df['doc_summary'] = doc_summary_list
203
+
204
+ tqdm.pandas(desc="Processing rows for embedding documents and titles...")
205
+
206
+ docs_df['doc_embedding'] = docs_df.progress_apply(lambda row: text_retrieval_document_embedding(row['text'], row['doc_title']), axis=1)
207
+
208
+ return docs_df
209
+
210
+
211
+ def extract_image_description_df(image_path:str, category:str, model:genai.GenerativeModel) -> pd.DataFrame:
212
+ """
213
+ Extract description of the given image in the given category
214
+ INPUT: image_path: the path to the image
215
+ category: a string containing the category of the image
216
+ model: a generative model
217
+ OUTPUT: a DataFrame containing the metadata of the extracted images
218
+ """
219
+
220
+ image_for_gemini = Image.open(image_path)
221
+
222
+
223
+ # Specify the image description prompt.
224
+ image_description_prompt = """Explain what is going on in the image.
225
+ If it's a table, extract all elements of the table.
226
+ If it's a graph, explain the findings in the graph.
227
+ Do not include any numbers that are not mentioned in the image:
228
+ """
229
+
230
+ if "floor plan" in category.lower():
231
+ image_description_prompt = '''
232
+ Please analyze the provided floor plan image and extract the following information
233
+ related to rooms, locations, connections, HVAC equipment, and sensors:
234
+ 1. Room Labels/Names: Identify and list all room labels or names shown on the floor plan.
235
+ 2. Room Connectivity: Indicate how different rooms are connected (doors, hallways, openings, etc.).
236
+ 3. HVAC Equipment: Locate and list all HVAC equipment depicted on the floor plan (e.g., air handling units, ductwork, vents, thermostats, etc.).
237
+ 4. Sensor Locations: Note the locations of any sensors or control devices related to the HVAC system (e.g., temperature sensors, occupancy sensors, etc.).
238
+ 5. Zoning/Partitions: If the floor plan shows any zoning or partitions related to HVAC control, please describe them.
239
+ 6. Special Areas: Highlight any areas that may have unique HVAC requirements (e.g., server rooms, laboratories, etc.).
240
+ Please provide the extracted information in a structured format, separating the different categories as needed. Let me know if you need any clarification or have additional requirements for the information to be extracted from the floor plan.
241
+ '''
242
+ elif "flow chart" in category.lower():
243
+ image_description_prompt = '''
244
+ Please analyze the provided HVAC flow chart image and extract the following information:
245
+
246
+ 1. System Components: Identify and list all the major HVAC components shown in the flow chart (e.g., air handling units, chillers, boilers, pumps, cooling towers, etc.).
247
+ 2. Component Connections: Describe how the different HVAC components are connected, indicating the direction of airflow, water flow, refrigerant flow, etc.
248
+ 3. System Inputs/Outputs: Note any system inputs (e.g., outside air intake) or outputs (e.g., exhaust air) shown in the flow chart.
249
+ 4. Control Points: Locate any control points, sensors, or valves that regulate the flow or operation of the system components.
250
+ 5. Subsystems/Zones: If the flow chart illustrates subsystems or zones within the overall HVAC system, please describe them and their components.
251
+ 6. Operational Modes: Identify any operational modes or sequences depicted in the flow chart (e.g., heating mode, cooling mode, economizer mode, etc.).
252
+
253
+ Please provide the extracted information in a clear and structured format, separating the different categories as needed. If any abbreviations or symbols are used in the flow chart, please include a legend or clarify their meanings. Let me know if you need any clarification or have additional requirements for the information to be extracted.
254
+ '''
255
+ elif "hvac equipment" in category.lower():
256
+ image_description_prompt = '''
257
+ Please analyze the image I will provide, which contains HVAC (heating, ventilation, and
258
+ air conditioning) equipment. Describe the different components you can identify, such
259
+ as the type of equipment (furnace, air conditioner, ductwork, etc.), the apparent
260
+ condition of the equipment, and any other relevant details you can discern from the
261
+ image. Your analysis should help someone understand what is depicted in the HVAC system
262
+ shown in the picture.
263
+ '''
264
+ else:
265
+ image_description_prompt = '''Explain what is going on in the image.
266
+ If it's a table, extract all elements of the table.
267
+ If it's a graph, explain the findings in the graph.
268
+ Do not include any numbers that are not mentioned in the image:
269
+ '''
270
+
271
+ dict_list = []
272
+
273
+ path_last_sep_idx = image_path.rfind("/")
274
+ file_name = image_path[path_last_sep_idx+1:]
275
+ print("Processing the image: {}".format(file_name))
276
+
277
+ model_input = [image_description_prompt, image_for_gemini]
278
+
279
+ response = model.generate_content(
280
+ model_input
281
+ )
282
+
283
+ image_dict = {}
284
+
285
+ image_dict['image_path'] = image_path
286
+ image_dict['file_name'] = file_name
287
+
288
+ try:
289
+ image_dict['image_description'] = response.text
290
+ except Exception as e:
291
+ print("Some errors happened in the response from Gemini.")
292
+ image_dict['image_description'] = None
293
+
294
+ dict_list.append(image_dict)
295
+
296
+ return pd.DataFrame(dict_list)
297
+
298
+
299
+ def get_cosine_score(
300
+ dataframe: pd.DataFrame, column_name: str, input_text_embd: np.ndarray
301
+ ) -> float:
302
+ """
303
+ Calculates the cosine similarity between the user query embedding and the
304
+ dataframe embedding for a specific column.
305
+
306
+ Args:
307
+ dataframe: The pandas DataFrame containing the data to compare against.
308
+ column_name: The name of the column containing the embeddings to compare with.
309
+ input_text_embd: The NumPy array representing the user query embedding.
310
+
311
+ Returns:
312
+ The cosine similarity score (rounded to two decimal places) between the user query embedding and the dataframe embedding.
313
+ """
314
+
315
+ text_cosine_score = round(np.dot(dataframe[column_name], input_text_embd), 2)
316
+
317
+ return text_cosine_score
318
+
319
+ def get_cosine_score_lists(
320
+ dataframe: pd.DataFrame, column_name: str, query_embs: list
321
+ ) -> float:
322
+ """
323
+ Calculates the cosine similarity between the user query embedding and the dataframe embedding for a specific column. Both embeddings are in lists
324
+
325
+ Args:
326
+ dataframe: The pandas DataFrame containing the data to compare against.
327
+ column_name: The name of the column containing the embeddings to compare with.
328
+ query_embs: The query embedding as a list of numbers
329
+
330
+ Returns:
331
+ The cosine similarity score (rounded to two decimal places) between the user query embedding and the dataframe embedding.
332
+ """
333
+
334
+ text_cosine_score = round(np.dot(np.array(dataframe[column_name]), np.array(query_embs)), 2)
335
+ return text_cosine_score
336
+
337
+
338
+ def get_relevant_images_from_query(
339
+ query: str,
340
+ images_df: pd.DataFrame,
341
+ column_name: str = "",
342
+ top_n: int = 3,
343
+ embedding_size: int = 768,
344
+ print_citation: bool = True,
345
+ ) -> Dict[int, Dict[str, Any]]:
346
+ """
347
+ Finds the top N most similar images from a metadata DataFrame based on a text query.
348
+
349
+ Args:
350
+ query: The text query used for finding similar passages.
351
+ images_df: A Pandas DataFrame containing the image metadata to search.
352
+ column_name: The column name in the text_metadata_df containing the text embeddings or
353
+ text itself.
354
+ top_n: The number of most similar text passages to return.
355
+ embedding_size: The dimensionality of the text embeddings (only used if text embeddings
356
+ are stored in the column specified by `column_name`).
357
+ print_citation: Whether to immediately print formatted citations for the matched text
358
+ passages (True) or just return the dictionary (False).
359
+
360
+ Returns:
361
+ A dictionary containing information about the top N most similar images,
362
+ including cosine scores, image_path, file_name, and description text.
363
+
364
+ Raises:
365
+ KeyError: If the specified `column_name` is not present in the `text_metadata_df`.
366
+ """
367
+
368
+ if column_name not in images_df.columns:
369
+ raise KeyError(f"Column '{column_name}' not found in the 'images_df'")
370
+
371
+ query_embs = text_query_embedding(query)
372
+
373
+ # Calculate cosine similarity between query text and metadata text
374
+ cosine_scores = images_df.apply(
375
+ lambda row: get_cosine_score_lists(
376
+ row,
377
+ column_name,
378
+ query_embs,
379
+ ),
380
+ axis=1,
381
+ )
382
+
383
+ # Get top N cosine scores and their indices
384
+ top_n_indices = cosine_scores.nlargest(top_n).index.tolist()
385
+ top_n_scores = cosine_scores.nlargest(top_n).values.tolist()
386
+
387
+ # Create a dictionary to store matched images and their information
388
+ final_images = {}
389
+
390
+ for matched_no, index in enumerate(top_n_indices):
391
+ # Create a sub-dictionary for each matched image
392
+ final_images[matched_no] = {}
393
+
394
+ # Store image path
395
+ final_images[matched_no]["image_path"] = images_df.iloc[index][
396
+ "image_path"
397
+ ]
398
+
399
+ # Store cosine score
400
+ final_images[matched_no]["cosine_score"] = top_n_scores[matched_no]
401
+
402
+
403
+ # Store image file name
404
+ final_images[matched_no]["file_name"] = images_df.iloc[index]["file_name"]
405
+
406
+ # Store image description
407
+ final_images[matched_no]["image_description"] = images_df["image_description"][index]
408
+
409
+ # Store image object
410
+ final_images[matched_no]["image_object"] = Image.open(images_df.iloc[index]['image_path'])
411
+
412
+ # Optionally print citations immediately
413
+ if print_citation:
414
+ print_text_to_image_citation(final_images)
415
+
416
+ return final_images
417
+
418
+
419
+ def get_similar_text_from_query(
420
+ query: str,
421
+ nodes_df: pd.DataFrame,
422
+ column_name: str = "",
423
+ top_n: int = 3,
424
+ embedding_size: int = 768,
425
+ print_citation: bool = True,
426
+ ) -> Dict[int, Dict[str, Any]]:
427
+ """
428
+ Finds the top N most similar text passages from a metadata DataFrame based on a text query.
429
+
430
+ Args:
431
+ query: The text query used for finding similar passages.
432
+ nodes_df: A Pandas DataFrame containing the text metadata to search.
433
+ column_name: The column name in the text_metadata_df containing the text embeddings or
434
+ text itself.
435
+ top_n: The number of most similar text passages to return.
436
+ embedding_size: The dimensionality of the text embeddings (only used if text embeddings
437
+ are stored in the column specified by `column_name`).
438
+ print_citation: Whether to immediately print formatted citations for the matched text
439
+ passages (True) or just return the dictionary (False).
440
+
441
+ Returns:
442
+ A dictionary containing information about the top N most similar text passages,
443
+ including cosine scores, page numbers, node ids, and node text.
445
+
446
+ Raises:
447
+ KeyError: If the specified `column_name` is not present in the `text_metadata_df`.
448
+ """
449
+
450
+ if column_name not in nodes_df.columns:
451
+ raise KeyError(f"Column '{column_name}' not found in the 'nodes_df'")
452
+
453
+ query_embs = text_query_embedding(query)
454
+
455
+ # Calculate cosine similarity between query text and metadata text
456
+ cosine_scores = nodes_df.apply(
457
+ lambda row: get_cosine_score_lists(
458
+ row,
459
+ column_name,
460
+ query_embs,
461
+ ),
462
+ axis=1,
463
+ )
464
+
465
+ # Get top N cosine scores and their indices
466
+ top_n_indices = cosine_scores.nlargest(top_n).index.tolist()
467
+ top_n_scores = cosine_scores.nlargest(top_n).values.tolist()
468
+
469
+ # Create a dictionary to store matched text and their information
470
+ final_text = {}
471
+
472
+ for matched_textno, index in enumerate(top_n_indices):
473
+ # Create a sub-dictionary for each matched text
474
+ final_text[matched_textno] = {}
475
+
476
+ # Store page number
477
+ final_text[matched_textno]["page_num"] = nodes_df.iloc[index][
478
+ "page_label"
479
+ ]
480
+
481
+ # Store cosine score
482
+ final_text[matched_textno]["cosine_score"] = top_n_scores[matched_textno]
483
+
484
+
485
+ # Store node id
486
+ final_text[matched_textno]["node_id"] = nodes_df.iloc[index]["node_id"]
487
+
488
+ # Store node text
489
+ final_text[matched_textno]["node_text"] = nodes_df["node_text"][index]
490
+
491
+ # Optionally print citations immediately
492
+ if print_citation:
493
+ print_text_to_text_citation(final_text)
494
+
495
+ return final_text
496
+
497
+
498
+ def llamaindex_doc_dict(doc: llama_index.schema.Document) -> dict:
499
+ """
500
+ convert a LlamaIndex Document object to a dictionary
501
+ """
502
+
503
+ doc_dict = {}
504
+
505
+ doc_dict['doc_id'] = doc.doc_id
506
+
507
+ mdict = doc.metadata
508
+
509
+ doc_dict['page_label'] = mdict['page_label']
510
+ doc_dict['file_name'] = mdict['file_name']
511
+ doc_dict['file_path'] = mdict['file_path']
512
+ doc_dict['file_type'] = mdict['file_type']
513
+
514
+ doc_dict['file_title'] = mdict['file_title']
515
+ doc_dict['file_date'] = mdict['file_date']
516
+ doc_dict['file_subtitle'] = mdict['file_subtitle']
517
+ doc_dict['table_of_content'] = mdict['table_of_content']
518
+
519
+ doc_dict['text'] = doc.text
520
+
521
+ return doc_dict
522
+
523
+
524
+ def llamaindex_docs_df(docs: List[llama_index.schema.Document]) -> pd.DataFrame:
525
+ """
526
+ convert a list of LlamaIndex Document object to a Pandas DataFrame with columns
527
+ """
528
+
529
+ recs = []
530
+ for doc in docs:
531
+ recs.append(llamaindex_doc_dict(doc))
532
+
533
+ return pd.DataFrame(recs)
534
+
535
+
536
+ def llamaindex_docs_from_path(path_input:str,
537
+ gemini_pro:genai.GenerativeModel) -> llama_index.schema.Document:
538
+
539
+ """
540
+ extract llama_index Document from the file given the path_input
541
+ INPUT: path_input: the path pointing to the file in the disk
542
+ gemini_pro: the gemini pro model for extracting course metadata
543
+ OUTPUT: docs: llama_index Document extracted from the file by the path_input
544
+ """
545
+
546
+ docs = SimpleDirectoryReader(input_files=[path_input]).load_data()
547
+
548
+ first2pages = docs[0].text + " " + docs[1].text
549
+
550
+ metadata_extraction_sys_content = '''
551
+ You are a helpful assistant focusing on extracting the metadata describing the input document.
552
+ '''
553
+
554
+ metadata_extraction_prompt = '''
555
+ {}\n
556
+ Please perform metadata extraction on the given text.
557
+ Focus on the following metadata fields:
558
+ title: what the document is about;
559
+ date: when the document was created;
560
+ subtitle: what specific content the document is about;
561
+ table of content: section titles and their page numbers.
562
+ Output NA if there is no value for a metadata field.
563
+ Output the results in a dictionary.
564
+ TEXT: ```{}```
565
+ '''
566
+
567
+ msg = metadata_extraction_prompt.format(metadata_extraction_sys_content, first2pages)
568
+
569
+ response = gemini_pro.generate_content(
570
+ msg
571
+ )
572
+
573
+ response_string = response.text.strip('`')
574
+
575
+ extracted_meta_dict = {}
576
+
577
+ try:
578
+ extracted_meta_dict = json.loads(response_string)
579
+ except json.decoder.JSONDecodeError as e:
580
+ # Handling the JSON decoding error
581
+ extracted_meta_dict = {}
582
+
583
+ for doc in tqdm(docs, total=len(docs), desc="Adding metadata to docs..."):
584
+ if 'title' in extracted_meta_dict:
585
+ doc.metadata['file_title'] = extracted_meta_dict['title']
586
+ else:
587
+ doc.metadata['file_title'] = None
588
+
589
+ if 'date' in extracted_meta_dict:
590
+ doc.metadata['file_date'] = extracted_meta_dict['date']
591
+ else:
592
+ doc.metadata['file_date'] = None
593
+
594
+ if 'subtitle' in extracted_meta_dict:
595
+ doc.metadata['file_subtitle'] = extracted_meta_dict['subtitle']
596
+ else:
597
+ doc.metadata['file_subtitle'] = None
598
+
599
+ if 'table of content' in extracted_meta_dict:
600
+ doc.metadata['table_of_content'] = extracted_meta_dict['table of content']
601
+ else:
602
+ doc.metadata['table_of_content'] = None
603
+
604
+ return docs
605
+
606
+ def llamaindex_node_dict(node: llama_index.schema.TextNode) -> dict:
607
+ """
608
+ convert a LlamaIndex TextNode object to a dictionary
609
+ INPUT: node: a TextNode extracted from the parent document
612
+ OUTPUT: dictionary for the node's information
613
+ """
614
+
615
+ node_dict = {}
616
+
617
+ node_dict['node_id'] = node.node_id
618
+
619
+ mdict = node.metadata
620
+
621
+ node_dict['page_label'] = mdict['page_label']
622
+ node_dict['file_name'] = mdict['file_name']
623
+ node_dict['file_path'] = mdict['file_path']
624
+ node_dict['file_type'] = mdict['file_type']
625
+ #node_dict['document_title'] = mdict['document_title']
626
+ #node_dict['questions_this_excerpt_can_answer'] = mdict['questions_this_excerpt_can_answer']
627
+ #node_dict['section_summary'] = mdict['section_summary']
628
+
629
+ node_dict['file_title'] = mdict['file_title']
630
+ node_dict['file_date'] = mdict['file_date']
631
+ node_dict['file_subtitle'] = mdict['file_subtitle']
632
+
633
+ node_dict['node_text'] = node.text
634
+
635
+ node_dict['start_char_idx'] = node.start_char_idx
636
+ node_dict['end_char_idx'] = node.end_char_idx
637
+
638
+ rdict = node.relationships
639
+
640
+ if NodeRelationship.SOURCE in rdict.keys():
641
+ node_dict['doc_id'] = rdict[NodeRelationship.SOURCE].node_id
642
+ else:
643
+ node_dict['doc_id'] = None
644
+
645
+ if NodeRelationship.PREVIOUS in rdict.keys():
646
+ node_dict['previous_node'] = rdict[NodeRelationship.PREVIOUS].node_id
647
+ else:
648
+ node_dict['previous_node'] = None
649
+
650
+ if NodeRelationship.NEXT in rdict.keys():
651
+ node_dict['next_node'] = rdict[NodeRelationship.NEXT].node_id
652
+ else:
653
+ node_dict['next_node'] = None
654
+
655
+
656
+ return node_dict
657
+
658
+
659
+ def llamaindex_nodes_df(nodes: List[llama_index.schema.TextNode]) -> pd.DataFrame:
660
+ """
661
+ convert a list of LlamaIndex TextNode object to a Pandas DataFrame with columns
662
+ """
663
+
664
+ recs = []
665
+ for node in nodes:
666
+ recs.append(llamaindex_node_dict(node))
667
+
668
+ return pd.DataFrame(recs)
669
+
670
+
671
+ def node_text_title(text:str, model:genai.GenerativeModel) -> str:
672
+ """
673
+ use gemini to generate a title for the input text
674
+ """
675
+
676
+ prompt = '''
677
+ Please summarize the given input text
678
+ enclosed within the three backticks. Generate a short
679
+ title for the text. Correct misspellings and syntactic errors.
680
+ Output a short title string only.
681
+ TEXT: ```{}```
682
+ '''
683
+ msg = prompt.format(text)
684
+
685
+ response = model.generate_content(
686
+ msg
687
+ )
688
+
689
+ return response.text
690
+
691
+ def pdf_extract_images(pdf_path:str, image_save_dir:str):
692
+ """
693
+ Given a PDF path, extract images from the PDF file and save them to disk
694
+ INPUT: pdf_path: the path to the PDF file
695
+ image_save_dir: the directory for storing the extracted images
696
+ OUTPUT: None
697
+ """
698
+
699
+ fitz_docs = fitz.open(pdf_path)
700
+
701
+ path_last_sep_idx = pdf_path.rfind("/")
702
+ file_name = pdf_path[path_last_sep_idx+1:]
703
+ print("Processing the images from the pages of {}".format(file_name))
704
+
705
+ for idx, fpage in tqdm(enumerate(fitz_docs), total=len(fitz_docs)):
706
+
707
+ images = fpage.get_images()
708
+
709
+ page_label = idx + 1 # llamaindex document pages indexing start from 1
710
+
711
+ for image_no, image in enumerate(images):
712
+
713
+ xref = image[0]
714
+ pix = fitz.Pixmap(fitz_docs, xref)
715
+
716
+ # Create the image file name
717
+ image_name = f"{image_save_dir}/extracted_from_{file_name}_{page_label}_{image_no}_{xref}.jpeg"
718
+
719
+ # Save the image to the specified location
720
+ pix.save(image_name)
721
+
722
+
723
+
724
+ def pdf_images_description_df(pdf_path:str, docs_df_path:str, image_save_dir:str) -> pd.DataFrame:
725
+ """
726
+ Given a PDF path and the path to the DataFrame containing the metadata of the pages extracted from the PDF file, extract the metadata of images from the PDF file as a DataFrame
727
+ INPUT: pdf_path: the path to the PDF file
728
+ docs_df_path: the path to the DataFrame containing page metadata extracted from the PDF file
729
+ image_save_dir: the directory for storing the extracted images
730
+ OUTPUT: a DataFrame containing the metadata of the extracted images
731
+ """
732
+
733
+ fitz_docs = fitz.open(pdf_path)
734
+
735
+ doc_df = pd.read_csv(docs_df_path)
+
+ # the gemini-pro-vision model used below to describe each extracted image
+ gemini_pro_vision = genai.GenerativeModel(model_name="models/gemini-pro-vision")
736
+
737
+ # Specify the image description prompt.
738
+ image_description_prompt = """Explain what is going on in the image.
739
+ If it's a table, extract all elements of the table.
740
+ If it's a graph, explain the findings in the graph.
741
+ Do not include any numbers that are not mentioned in the image:
742
+ """
743
+
744
+ dict_list = []
745
+
746
+ path_last_sep_idx = pdf_path.rfind("/")
747
+ file_name = pdf_path[path_last_sep_idx+1:]
748
+ print("Processing the images from the pages of {}".format(file_name))
749
+
750
+ for idx, fpage in tqdm(enumerate(fitz_docs), total=len(fitz_docs)):
751
+
752
+ images = fpage.get_images()
753
+
754
+ page_label = idx + 1 # llamaindex document pages indexing start from 1
755
+
756
+ for image_no, image in enumerate(images):
757
+
758
+ image_dict = {}
759
+
760
+ xref = image[0]
761
+ pix = fitz.Pixmap(fitz_docs, xref)
762
+
763
+ # Create the image file name
764
+ image_name = f"{image_save_dir}/{file_name}_image_{page_label}_{image_no}_{xref}.jpeg"
765
+
766
+ # Save the image to the specified location
767
+ pix.save(image_name)
768
+
769
+ # Load the saved image as a Gemini Image Object
770
+ image_for_gemini = Image.open(io.BytesIO(pix.tobytes("jpeg")))
771
+
772
+ model_input = [image_description_prompt, image_for_gemini]
773
+
774
+ response = gemini_pro_vision.generate_content(
775
+ model_input
776
+ )
777
+
778
+ image_dict['image_id'] = image_no
779
+ image_dict['image_name'] = image_name
780
+ image_dict['page_label'] = page_label
781
+
782
+ try:
783
+ doc_page = doc_df[doc_df.page_label == page_label].iloc[0]
784
+
785
+ image_dict['doc_id'] = doc_page['doc_id']
786
+ image_dict['file_name'] = doc_page['file_name']
787
+ image_dict['file_path'] = doc_page['file_path']
788
+ image_dict['file_type'] = doc_page['file_type']
789
+ image_dict['course_material_type'] = doc_page['course_material_type']
790
+ image_dict['course_material_week'] = doc_page['course_material_week']
791
+
792
+ except Exception as e:
793
+ print("Some errors happened in the doc_page of the doc_df.")
794
+ image_dict['doc_id'] = None
795
+ image_dict['file_name'] = None
796
+ image_dict['file_path'] = None
797
+ image_dict['file_type'] = None
798
+ image_dict['course_material_type'] = None
799
+ image_dict['course_material_week'] = None
800
+
801
+ try:
802
+ image_dict['image_description'] = response.text
803
+ except Exception as e:
804
+ print("Some errors happened in the response from Gemini.")
805
+
806
+ image_dict['image_description'] = None
807
+
808
+ dict_list.append(image_dict)
809
+
810
+ time.sleep(2)
811
+
812
+ return pd.DataFrame(dict_list)
813
+
814
+
815
+ # Add colors to the print
816
+ class Color:
817
+ """
818
+ This class defines a set of color codes that can be used to print text in different colors.
819
+ This will be used later to print citations and results to make outputs more readable.
820
+ """
821
+
822
+ PURPLE: str = "\033[95m"
823
+ CYAN: str = "\033[96m"
824
+ DARKCYAN: str = "\033[36m"
825
+ BLUE: str = "\033[94m"
826
+ GREEN: str = "\033[92m"
827
+ YELLOW: str = "\033[93m"
828
+ RED: str = "\033[91m"
829
+ BOLD: str = "\033[1m"
830
+ UNDERLINE: str = "\033[4m"
831
+ END: str = "\033[0m"
832
+
833
+ def print_text_to_image_citation(
834
+ final_images: Dict[int, Dict[str, Any]], print_top: bool = True
835
+ ) -> None:
836
+ """
837
+ Prints a formatted citation for each matched image in a dictionary.
838
+
839
+ Args:
840
+ final_images: A dictionary containing information about matched images,
841
+ with keys as image number and values as dictionaries containing
842
+ image path, page number, page text, cosine similarity score, and image description.
843
+ print_top: A boolean flag indicating whether to only print the first citation (True) or all citations (False).
844
+
845
+ Returns:
846
+ None (prints formatted citations to the console).
847
+ """
848
+
849
+ color = Color()
850
+
851
+ # Iterate through the matched image citations
852
+ for imageno, image_dict in final_images.items():
853
+ # Print the citation header
854
+ print(
855
+ color.RED + f"Citation {imageno + 1}:",
856
+ "Matched image path, page number and page text: \n" + color.END,
857
+ )
858
+
859
+ # Print the cosine similarity score
860
+ print(color.BLUE + f"score: " + color.END, image_dict["cosine_score"])
861
+
862
+ # Print the image path
863
+ print(color.BLUE + f"path: " + color.END, image_dict["image_path"])
864
+
865
+ # Print the file name
866
+ print(color.BLUE + f"file name: " + color.END, image_dict["file_name"])
867
+
868
+ # Print the image description
869
+ print(
870
+ color.BLUE + f"image description: " + color.END,
871
+ image_dict["image_description"],
872
+ )
873
+
874
+ # Display image
875
+ display_images([image_dict["image_object"]])
876
+
877
+ # Only print the first citation if print_top is True
878
+ if print_top and imageno == 0:
879
+ break
880
+
881
+
882
+ def print_text_to_text_citation(
883
+ final_text: Dict[int, Dict[str, Any]],
884
+ print_top: bool = True,
885
+ ) -> None:
886
+ """
887
+ Prints a formatted citation for each matched text in a dictionary.
888
+
889
+ Args:
890
+ final_text: A dictionary containing information about matched text passages,
891
+ with keys as text number and values as dictionaries containing
892
+ page number, cosine similarity score, chunk number (optional),
893
+ chunk text (optional), and page text (optional).
894
+ print_top: A boolean flag indicating whether to only print the first citation (True) or all citations (False).
896
+
897
+ Returns:
898
+ None (prints formatted citations to the console).
899
+ """
900
+
901
+ color = Color()
902
+
903
+ # Iterate through the matched text citations
904
+ for textno, text_dict in final_text.items():
905
+ # Print the citation header
906
+ print(color.RED + f"Citation {textno + 1}:", "Matched text:" + color.END)
907
+
908
+ # Print the cosine similarity score
909
+ print(color.BLUE + f"score: " + color.END, text_dict["cosine_score"])
910
+
911
+ # Print the page number
912
+ print(color.BLUE + f"page_number: " + color.END, text_dict["page_num"])
913
+
914
+ # Print chunk number and chunk text
915
+ print(color.BLUE + f"node_id: " + color.END, text_dict["node_id"])
916
+ print(color.BLUE + f"node_text: " + color.END, text_dict["node_text"])
917
+ print()
918
+
919
+ # Only print the first citation if print_top is True
920
+ if print_top and textno == 0:
921
+ break
922
+
923
+
924
+ def sentence_df_triples_df(sentence_df: pd.DataFrame) -> pd.DataFrame:
925
+ """
926
+ Extract (subject, predicate, object) triples from the input sentence DataFrame
927
+ INPUT: sentence_df: a DataFrame ('sent_id', 'node_id', 'course_material_type',
928
+ 'course_material_week', 'sent_text')
929
+ OUTPUT: triple_df: a DataFrame (triple_id, sent_id, course_material_type, course_material_week,
930
+ triples_to_process)
931
+ """
932
+
933
+ model = genai.GenerativeModel('gemini-pro')
934
+
935
+ count = 0
936
+
937
+ dict_list = []
938
+
939
+ for idx, row in tqdm(sentence_df.iterrows(), total=len(sentence_df)):
940
+ if count < len(sentence_df) + 1:
941
+ count += 1
942
+ dict_list.append(sentence_triple_dict_list(row, model))
943
+ else:
944
+ break
945
+
946
+ return pd.DataFrame(dict_list)
947
+
948
+
949
+ def sentence_triple_dict_list(row: pd.Series, model) -> dict:
950
+ """
951
+ Extract (subject, predicate, object) triples from a row of a sentence dataframe
952
+ INPUT: row: a row with the following columns: ('sent_id', 'node_id', 'course_material_type',
953
+ 'course_material_week', 'sent_text')
954
+ model: llm model
955
+ OUTPUT: a list of dictionaries each of which has the following keys: triple_id, sent_id,
956
+ course_material_type, course_material_week, triples_to_process
957
+ """
958
+
959
+ triple_extraction_prompt = '''
960
+ Please perform structured triple extraction on the given text enclosed within the
961
+ three backticks.
962
+ Convert the text into a set of (subject, predicate, object) triples.
963
+ Treat a math expression or a block of programming statements as a single concept.
964
+ Use the previous extraction text and results as context.
965
+ Correct misspellings and syntactic errors.
966
+ Don't summarize. Don't rewrite the original text. Don't decode the original text.
967
+ Output the results as a set of ("subject":extracted subject, "predicate":extracted predicate,
968
+ "object":extracted object). Don't add extra explanation to the results.
969
+ TEXT: ```{}```
970
+ '''
971
+
972
+ asent = row['sent_text']
973
+ #print(asent)
974
+
975
+ msg = triple_extraction_prompt.format(asent)
976
+
977
+
978
+ response = model.generate_content(
979
+ msg
980
+ )
981
+
982
+ #print(response.text)
983
+
984
+ pattern = r'\{([^}]+)\}|\(([^)]+)\)'
985
+
986
+ #response_text = response.text.encode("ascii", "ignore").decode(
987
+ # "utf-8", "ignore"
988
+ # )
989
+
990
+ response_text = response.text
991
+
992
+ matches = re.findall(pattern, response_text)
993
+
994
+ # Flatten the list of tuples and filter out empty matches
995
+ text_to_process = [ "{" + match[0].strip() + "}" if match[0]
996
+ else "{" + match[1].strip() + "}" for match in matches if match[0] or match[1]]
997
+
998
+ #print(text_to_process)
999
+
1000
+ tri_dict = {}
1001
+
1002
+ tri_dict['triple_id'] = row['sent_id'] + "_triples"
1003
+ tri_dict['sent_id'] = row['sent_id']
1004
+ tri_dict['course_material_type'] = row['course_material_type']
1005
+ tri_dict['course_material_week'] = row['course_material_week']
1006
+
1007
+ tri_dict['triples_to_process'] = text_to_process
1008
+
1009
+ return tri_dict
1010
+
1011
+
1012
+
1013
+ def split_nodes_sentences_df(nodes: List[llama_index.schema.TextNode]) -> pd.DataFrame:
1014
+ """
1015
+ split the text of each node into sentences by spacy
1016
+ """
1017
+
1018
+ recs = []
1019
+
1020
+ import spacy # imported lazily; spacy and its en_core_web_sm model are not pinned in requirements.txt
+ nlp = spacy.load('en_core_web_sm')
1021
+
1022
+ for node in nodes:
1023
+ dict_list = split_nodeText_sentences_dict_list(nlp, node)
1024
+
1025
+ recs.extend(dict_list)
1026
+
1027
+ return pd.DataFrame(recs)
1028
+
1029
+
1030
+ def split_nodeText_sentences_dict_list(nlp: Any, node: llama_index.schema.TextNode) -> list:
1031
+ """
1032
+ split the text of the given TextNode into sentences
1033
+ INPUT: nlp: the spacy model
1034
+ node: a TextNode
1035
+ OUTPUT: a list of dictionaries each of which contains the information for a sentence.
1036
+ """
1037
+
1038
+ dict_list = []
1039
+
1040
+ node_text = node.text
1041
+ text_doc = nlp(node_text)
1042
+ text_sentences = list(text_doc.sents)
1043
+
1044
+ for idx, sent in enumerate(text_sentences):
1045
+
1046
+ order = idx + 1 # the order of the sentence in the node
1047
+
1048
+ sent_dict = {}
1049
+ sent_dict['sent_id'] = node.node_id + "_sent" + str(order)
1050
+
1051
+ sent_dict['node_id'] = node.node_id
1052
+
1053
+ mdict = node.metadata
1054
+
1055
+ sent_dict['course_material_type'] = mdict['course_material_type']
1056
+ sent_dict['course_material_week'] = mdict['course_material_week']
1057
+
1058
+ sent_dict['sent_text'] = sent
1059
+
1060
+ dict_list.append(sent_dict)
1061
+
1062
+ return dict_list
1063
+
1064
+
1065
+ def text_keyconcepts(text:str, model:genai.GenerativeModel) -> str:
1066
+ """
1067
+ use gemini to generate a set of key learning concepts from the input text
1068
+ """
1069
+
1070
+ prompt = '''
1071
+ You are an expert AI assistant trained on extracting key concepts from the text.
1072
+ Please analyze the following material.
1073
+ Extract the key concepts that can be used to find related materials.
1074
+ Output the results as a list of key concepts only. Only keywords in the output list.
1075
+ No definitions. Separate the keywords by comma.
1076
+ TEXT: ```{}```
1077
+ '''
1078
+
1079
+ msg = prompt.format(text)
1080
+
1081
+ response = model.generate_content(
1082
+ msg
1083
+ )
1084
+
1085
+ input_string = response.text
1086
+
1087
+ items_list = [item.strip('-').strip() for item in re.split(r'[\n,]', input_string) if item]
1088
+
1089
+ return items_list
1090
+
1091
+ def text_query_embedding(query:str):
1092
+
1093
+ """
1094
+ Use Gemini to Embed the given query by the type of retrieval_query
1095
+ INPUT: query: str
1096
+ OUTPUT: embedding as a list of numbers
1097
+ """
1098
+ embedding = genai.embed_content(model="models/embedding-001",
1099
+ content=query,
1100
+ task_type="retrieval_query")
1101
+
1102
+ return embedding['embedding']
1103
+
1104
+
1105
+ def text_questions_answered(text:str, model:genai.GenerativeModel) -> str:
1106
+ """
1107
+ use gemini to extract a set of questions that can be answered by the input text
1108
+ """
1109
+
1110
+ prompt = '''
1111
+ You are an expert AI assistant trained on creating a list of specific,
1112
+ answerable questions that can be extracted from input text enclosed within the three backticks.
1113
+ Identify the most pertinent questions that could be asked based on its content.
1114
+ Compose these questions in a clear and concise manner, ensuring they directly
1115
+ align with the information presented in the text. Output the results in JSON format.
1116
+ TEXT: ```{}```
1117
+ '''
1118
+
1119
+ msg = prompt.format(text)
1120
+
1121
+ response = model.generate_content(
1122
+ msg
1123
+ )
1124
+
1125
+ return response.text
1126
+
1127
+
1128
+
1129
+ def text_retrieval_document_embedding(text:str, title:str):
1130
+
1131
+ """
1132
+ Use Gemini to Embed the given text and title by the type of retrieval_document
1133
+ INPUT: text: str
1134
+ title: str
1135
+ OUTPUT: embedding as a list of numbers
1136
+ """
1137
+ embedding = genai.embed_content(model="models/embedding-001",
1138
+ content=text,
1139
+ task_type="retrieval_document",
1140
+ title=title)
1141
+
1142
+ return embedding['embedding']
1143
+
1144
+
1145
+ def text_semantic_triples(text:str, model:genai.GenerativeModel) -> str:
1146
+ """
1147
+ use gemini to extract a set of semantic triples from the input text
1148
+ """
1149
+
1150
+ prompt = '''
1151
+ You are an expert AI assistant trained on extracting semantic triples from the given
1152
+ text enclosed within the three backticks.
1153
+ Generate a set of (subject, predicate, object) triples for the identified relationships.
1154
+ Correct misspellings and syntactic errors.
1155
+ Don't summarize. Don't rewrite the original text. Don't decode the original text.
1156
+ Output the results as JSON format. Don't add extra explanation to the results.
1157
+ TEXT: ```{}```
1158
+ '''
1159
+
1160
+ msg = prompt.format(text)
1161
+
1162
+ response = model.generate_content(
1163
+ msg
1164
+ )
1165
+
1166
+ return response.text
1167
+
1168
+
1169
+
1170
+ def text_summary(text:str, model:genai.GenerativeModel) -> str:
1171
+ """
1172
+ use gemini to generate a summary from the input text
1173
+ """
1174
+
1175
+ prompt = '''
1176
+ You are an expert AI summarization assistant and ready to condense any text into a
1177
+ clear and concise overview. Please help me summarize the text within the backticks below.
1178
+ Please extract the key topics and concepts. Plus, please ensure there are no typos or
1179
+ grammatical errors in the summary. The summary will be used as surrounding context of additional
1180
+ content to answer specific questions.
1181
+ TEXT: ```{}```
1182
+ '''
1183
+ msg = prompt.format(text)
1184
+
1185
+ response = model.generate_content(
1186
+ msg
1187
+ )
1188
+
1189
+ return response.text
1190
+
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ google-generativeai==0.3.1
2
+ python-dotenv==1.0.0
3
+ llama-index==0.9.25.post1
4
+ PyMuPDF==1.23.8
5
+ PyMuPDFb==1.23.7
6
+ networkx==3.2.1
7
+ ipykernel==6.27.1
8
+ ipython==8.18.1
9
+ ipywidgets==8.1.1
10
+ Pillow==10.1.0
11
+ tqdm==4.66.1
12
+ seaborn==0.13.1
13
+
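The pins above cover the Gemini SDK, LlamaIndex, and PyMuPDF versions the code was written against; pandas, numpy, and matplotlib arrive transitively through seaborn, gradio (imported by app.py) is presumably supplied by the Spaces Gradio runtime, and spacy (used only by split_nodes_sentences_df) is not pinned at all. A typical local setup might therefore look like this (the second line is an assumption for local runs, not part of the commit):

pip install -r requirements.txt
pip install gradio spacy && python -m spacy download en_core_web_sm  # only needed outside Spaces / for sentence splitting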