Spaces:
Sleeping
Sleeping
add csv export
Browse files- app.py +153 -79
- config.py +2 -0
- rag/rag_pipeline.py +1 -0
- utils/helpers.py +15 -13
- utils/zotero_manager.py +3 -1
app.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
import json
|
2 |
from typing import List, Tuple
|
3 |
import os
|
@@ -8,7 +10,12 @@ from dotenv import load_dotenv
|
|
8 |
from slugify import slugify
|
9 |
|
10 |
from rag.rag_pipeline import RAGPipeline
|
11 |
-
from utils.helpers import
|
|
|
|
|
|
|
|
|
|
|
12 |
from utils.prompts import (
|
13 |
highlight_prompt,
|
14 |
evidence_based_prompt,
|
@@ -19,6 +26,11 @@ import openai
|
|
19 |
from config import STUDY_FILES, OPENAI_API_KEY
|
20 |
from utils.zotero_manager import ZoteroManager
|
21 |
|
|
|
|
|
|
|
|
|
|
|
22 |
load_dotenv()
|
23 |
logging.basicConfig(level=logging.INFO)
|
24 |
|
@@ -30,7 +42,10 @@ add_study_files_to_chromadb("study_files.json", "study_files_collection")
|
|
30 |
# Cache for RAG pipelines
|
31 |
rag_cache = {}
|
32 |
|
33 |
-
|
|
|
|
|
|
|
34 |
if not zotero_library_id or not zotero_api_access_key:
|
35 |
return "Please enter your zotero library Id and API Access Key"
|
36 |
|
@@ -46,9 +61,13 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
|
|
46 |
)
|
47 |
|
48 |
zotero_collections = zotero_manager.get_collections()
|
49 |
-
zotero_collection_lists = zotero_manager.list_zotero_collections(
|
|
|
|
|
50 |
filtered_zotero_collection_lists = (
|
51 |
-
zotero_manager.filter_and_return_collections_with_items(
|
|
|
|
|
52 |
)
|
53 |
|
54 |
study_files_data = {} # Dictionary to collect items for ChromaDB
|
@@ -62,12 +81,16 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
|
|
62 |
zotero_manager.get_collection_zotero_items_by_key(collection_key)
|
63 |
)
|
64 |
#### Export zotero collection items to json ####
|
65 |
-
zotero_items_json = zotero_manager.zotero_items_to_json(
|
|
|
|
|
66 |
export_file = f"{slugify(collection_name)}_zotero_items.json"
|
67 |
zotero_manager.write_zotero_items_to_json_file(
|
68 |
zotero_items_json, f"data/{export_file}"
|
69 |
)
|
70 |
-
append_to_study_files(
|
|
|
|
|
71 |
|
72 |
# Collect for ChromaDB
|
73 |
study_files_data[collection_name] = f"data/{export_file}"
|
@@ -75,13 +98,13 @@ def process_zotero_library_items(zotero_library_id: str, zotero_api_access_key:
|
|
75 |
# Update in-memory STUDY_FILES for reference in current session
|
76 |
STUDY_FILES.update({collection_name: f"data/{export_file}"})
|
77 |
logging.info(f"STUDY_FILES: {STUDY_FILES}")
|
78 |
-
|
79 |
# After loop, add all collected data to ChromaDB
|
80 |
add_study_files_to_chromadb("study_files.json", "study_files_collection")
|
81 |
message = "Successfully processed items in your zotero library"
|
82 |
except Exception as e:
|
83 |
message = f"Error process your zotero library: {str(e)}"
|
84 |
-
|
85 |
return message
|
86 |
|
87 |
|
@@ -93,11 +116,11 @@ def get_rag_pipeline(study_name: str) -> RAGPipeline:
|
|
93 |
result = collection.get(ids=[study_name]) # Retrieve document by ID
|
94 |
|
95 |
# Check if the result contains the requested document
|
96 |
-
if not result or len(result[
|
97 |
raise ValueError(f"Invalid study name: {study_name}")
|
98 |
|
99 |
# Extract the file path from the document metadata
|
100 |
-
study_file = result[
|
101 |
if not study_file:
|
102 |
raise ValueError(f"File path not found for study name: {study_name}")
|
103 |
|
@@ -107,9 +130,7 @@ def get_rag_pipeline(study_name: str) -> RAGPipeline:
|
|
107 |
return rag_cache[study_name]
|
108 |
|
109 |
|
110 |
-
def chat_function(
|
111 |
-
message: str, study_name: str, prompt_type: str
|
112 |
-
) -> str:
|
113 |
"""Process a chat message and generate a response using the RAG pipeline."""
|
114 |
|
115 |
if not message.strip():
|
@@ -134,11 +155,11 @@ def get_study_info(study_name: str) -> str:
|
|
134 |
logging.info(f"Result: ======> {result}")
|
135 |
|
136 |
# Check if the document exists in the result
|
137 |
-
if not result or len(result[
|
138 |
raise ValueError(f"Invalid study name: {study_name}")
|
139 |
|
140 |
# Extract the file path from the document metadata
|
141 |
-
study_file = result[
|
142 |
logging.info(f"study_file: =======> {study_file}")
|
143 |
if not study_file:
|
144 |
raise ValueError(f"File path not found for study name: {study_name}")
|
@@ -148,6 +169,34 @@ def get_study_info(study_name: str) -> str:
|
|
148 |
return f"### Number of documents: {len(data)}"
|
149 |
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
def update_interface(study_name: str) -> Tuple[str, gr.update, gr.update, gr.update]:
|
152 |
"""Update the interface based on the selected study."""
|
153 |
|
@@ -163,13 +212,14 @@ def update_interface(study_name: str) -> Tuple[str, gr.update, gr.update, gr.upd
|
|
163 |
def set_question(question: str) -> str:
|
164 |
return question.lstrip("✨ ")
|
165 |
|
|
|
166 |
def process_multi_input(text, study_name, prompt_type):
|
167 |
# Split input based on commas and strip any extra spaces
|
168 |
-
variable_list = [word.strip().upper() for word in text.split(
|
169 |
-
user_message =f"Extract and present in a tabular format the following variables for each {study_name} study: {', '.join(variable_list)}"
|
170 |
logging.info(f"User message: ==> {user_message}")
|
171 |
response = chat_function(user_message, study_name, prompt_type)
|
172 |
-
return response
|
173 |
|
174 |
|
175 |
def create_gr_interface() -> gr.Blocks:
|
@@ -189,32 +239,46 @@ def create_gr_interface() -> gr.Blocks:
|
|
189 |
|
190 |
with gr.Blocks() as demo:
|
191 |
gr.Markdown("# ACRES RAG Platform")
|
192 |
-
|
193 |
with gr.Row():
|
194 |
with gr.Column(scale=1):
|
195 |
gr.Markdown("### Zotero Credentials")
|
196 |
-
zotero_library_id = gr.Textbox(
|
197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
process_zotero_btn = gr.Button("Process your Zotero Library")
|
199 |
zotero_output = gr.Markdown(label="Zotero")
|
200 |
|
201 |
gr.Markdown("### Study Information")
|
202 |
|
203 |
# Query ChromaDB for all document IDs in the "study_files_collection" collection
|
204 |
-
collection = chromadb_client.get_or_create_collection(
|
|
|
|
|
205 |
# Retrieve all documents by querying with an empty string and specifying a high n_results
|
206 |
all_documents = collection.query(query_texts=[""], n_results=1000)
|
207 |
logging.info(f"all_documents: =========> {all_documents}")
|
208 |
# Extract document IDs as study names
|
209 |
document_ids = all_documents.get("ids")
|
210 |
-
study_choices = [
|
|
|
|
|
211 |
logging.info(f"study_choices: ======> {study_choices}")
|
212 |
|
213 |
# Update the Dropdown with choices from ChromaDB
|
214 |
study_dropdown = gr.Dropdown(
|
215 |
choices=study_choices,
|
216 |
label="Select Study",
|
217 |
-
value=
|
|
|
|
|
218 |
)
|
219 |
|
220 |
study_info = gr.Markdown(label="Study Details")
|
@@ -226,7 +290,7 @@ def create_gr_interface() -> gr.Blocks:
|
|
226 |
value="Default",
|
227 |
)
|
228 |
# clear = gr.Button("Clear Chat")
|
229 |
-
|
230 |
with gr.Column(scale=3):
|
231 |
gr.Markdown("### Study Variables")
|
232 |
with gr.Row():
|
@@ -239,59 +303,52 @@ def create_gr_interface() -> gr.Blocks:
|
|
239 |
)
|
240 |
submit_btn = gr.Button("Submit", scale=1)
|
241 |
answer_output = gr.Markdown(label="Answer")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
|
243 |
-
def
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
)
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
""
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
# msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
|
282 |
-
# bot,
|
283 |
-
# [chatbot, study_dropdown, prompt_type],
|
284 |
-
# [chatbot, *follow_up_btns],
|
285 |
-
# )
|
286 |
-
# send_btn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
|
287 |
-
# bot,
|
288 |
-
# [chatbot, study_dropdown, prompt_type],
|
289 |
-
# [chatbot, *follow_up_btns],
|
290 |
-
# )
|
291 |
-
# for btn in follow_up_btns + sample_btns:
|
292 |
-
# btn.click(set_question, inputs=[btn], outputs=[msg])
|
293 |
-
|
294 |
-
# clear.click(lambda: None, None, chatbot, queue=False)
|
295 |
|
296 |
study_dropdown.change(
|
297 |
fn=get_study_info,
|
@@ -299,8 +356,25 @@ def create_gr_interface() -> gr.Blocks:
|
|
299 |
outputs=[study_info],
|
300 |
)
|
301 |
|
302 |
-
process_zotero_btn.click(
|
303 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
304 |
|
305 |
return demo
|
306 |
|
|
|
1 |
+
# app.py
|
2 |
+
|
3 |
import json
|
4 |
from typing import List, Tuple
|
5 |
import os
|
|
|
10 |
from slugify import slugify
|
11 |
|
12 |
from rag.rag_pipeline import RAGPipeline
|
13 |
+
from utils.helpers import (
|
14 |
+
generate_follow_up_questions,
|
15 |
+
append_to_study_files,
|
16 |
+
add_study_files_to_chromadb,
|
17 |
+
chromadb_client,
|
18 |
+
)
|
19 |
from utils.prompts import (
|
20 |
highlight_prompt,
|
21 |
evidence_based_prompt,
|
|
|
26 |
from config import STUDY_FILES, OPENAI_API_KEY
|
27 |
from utils.zotero_manager import ZoteroManager
|
28 |
|
29 |
+
import csv
|
30 |
+
import io
|
31 |
+
|
32 |
+
import datetime
|
33 |
+
|
34 |
load_dotenv()
|
35 |
logging.basicConfig(level=logging.INFO)
|
36 |
|
|
|
42 |
# Cache for RAG pipelines
|
43 |
rag_cache = {}
|
44 |
|
45 |
+
|
46 |
+
def process_zotero_library_items(
|
47 |
+
zotero_library_id: str, zotero_api_access_key: str
|
48 |
+
) -> str:
|
49 |
if not zotero_library_id or not zotero_api_access_key:
|
50 |
return "Please enter your zotero library Id and API Access Key"
|
51 |
|
|
|
61 |
)
|
62 |
|
63 |
zotero_collections = zotero_manager.get_collections()
|
64 |
+
zotero_collection_lists = zotero_manager.list_zotero_collections(
|
65 |
+
zotero_collections
|
66 |
+
)
|
67 |
filtered_zotero_collection_lists = (
|
68 |
+
zotero_manager.filter_and_return_collections_with_items(
|
69 |
+
zotero_collection_lists
|
70 |
+
)
|
71 |
)
|
72 |
|
73 |
study_files_data = {} # Dictionary to collect items for ChromaDB
|
|
|
81 |
zotero_manager.get_collection_zotero_items_by_key(collection_key)
|
82 |
)
|
83 |
#### Export zotero collection items to json ####
|
84 |
+
zotero_items_json = zotero_manager.zotero_items_to_json(
|
85 |
+
zotero_collection_items
|
86 |
+
)
|
87 |
export_file = f"{slugify(collection_name)}_zotero_items.json"
|
88 |
zotero_manager.write_zotero_items_to_json_file(
|
89 |
zotero_items_json, f"data/{export_file}"
|
90 |
)
|
91 |
+
append_to_study_files(
|
92 |
+
"study_files.json", collection_name, f"data/{export_file}"
|
93 |
+
)
|
94 |
|
95 |
# Collect for ChromaDB
|
96 |
study_files_data[collection_name] = f"data/{export_file}"
|
|
|
98 |
# Update in-memory STUDY_FILES for reference in current session
|
99 |
STUDY_FILES.update({collection_name: f"data/{export_file}"})
|
100 |
logging.info(f"STUDY_FILES: {STUDY_FILES}")
|
101 |
+
|
102 |
# After loop, add all collected data to ChromaDB
|
103 |
add_study_files_to_chromadb("study_files.json", "study_files_collection")
|
104 |
message = "Successfully processed items in your zotero library"
|
105 |
except Exception as e:
|
106 |
message = f"Error process your zotero library: {str(e)}"
|
107 |
+
|
108 |
return message
|
109 |
|
110 |
|
|
|
116 |
result = collection.get(ids=[study_name]) # Retrieve document by ID
|
117 |
|
118 |
# Check if the result contains the requested document
|
119 |
+
if not result or len(result["metadatas"]) == 0:
|
120 |
raise ValueError(f"Invalid study name: {study_name}")
|
121 |
|
122 |
# Extract the file path from the document metadata
|
123 |
+
study_file = result["metadatas"][0].get("file_path")
|
124 |
if not study_file:
|
125 |
raise ValueError(f"File path not found for study name: {study_name}")
|
126 |
|
|
|
130 |
return rag_cache[study_name]
|
131 |
|
132 |
|
133 |
+
def chat_function(message: str, study_name: str, prompt_type: str) -> str:
|
|
|
|
|
134 |
"""Process a chat message and generate a response using the RAG pipeline."""
|
135 |
|
136 |
if not message.strip():
|
|
|
155 |
logging.info(f"Result: ======> {result}")
|
156 |
|
157 |
# Check if the document exists in the result
|
158 |
+
if not result or len(result["metadatas"]) == 0:
|
159 |
raise ValueError(f"Invalid study name: {study_name}")
|
160 |
|
161 |
# Extract the file path from the document metadata
|
162 |
+
study_file = result["metadatas"][0].get("file_path")
|
163 |
logging.info(f"study_file: =======> {study_file}")
|
164 |
if not study_file:
|
165 |
raise ValueError(f"File path not found for study name: {study_name}")
|
|
|
169 |
return f"### Number of documents: {len(data)}"
|
170 |
|
171 |
|
172 |
+
def markdown_table_to_csv(markdown_text: str) -> str:
    """Convert the markdown table rows found in *markdown_text* to CSV text.

    Only lines that start with ``|`` (after stripping whitespace) are treated
    as table rows; any other text is ignored. Header/body delimiter rows such
    as ``|---|:---:|`` are skipped.

    Empty cells are preserved as empty CSV fields so that the remaining values
    stay in their original columns.

    Args:
        markdown_text: Text that may contain a pipe-delimited markdown table.

    Returns:
        The table serialized by ``csv.writer`` (default dialect), or an empty
        string when no table rows are found.
    """
    rows = []
    for raw_line in markdown_text.split("\n"):
        line = raw_line.strip()
        if not line.startswith("|"):
            continue

        # Drop only the outer pipes; splitting the raw line and filtering out
        # empty strings would also drop legitimately empty cells.
        inner = line[1:]
        if inner.endswith("|"):
            inner = inner[:-1]
        cells = [cell.strip() for cell in inner.split("|")]

        if _is_separator_row(cells):
            continue
        rows.append(cells)

    if not rows:
        return ""

    output = io.StringIO()
    csv.writer(output).writerows(rows)
    return output.getvalue()


def _is_separator_row(cells: list) -> bool:
    """Return True when every cell is a delimiter such as ``---`` or ``:---:``."""
    if not cells:
        return False
    for cell in cells:
        # Alignment colons may wrap the dashes on either side.
        core = cell[1:] if cell.startswith(":") else cell
        core = core[:-1] if core.endswith(":") else core
        if len(core) < 3 or set(core) != {"-"}:
            return False
    return True
|
198 |
+
|
199 |
+
|
200 |
def update_interface(study_name: str) -> Tuple[str, gr.update, gr.update, gr.update]:
|
201 |
"""Update the interface based on the selected study."""
|
202 |
|
|
|
212 |
def set_question(question: str) -> str:
|
213 |
return question.lstrip("✨ ")
|
214 |
|
215 |
+
|
216 |
def process_multi_input(text, study_name, prompt_type):
    """Ask the RAG chat for a table of the comma-separated variables in *text*.

    Args:
        text: Comma-separated variable names typed by the user.
        study_name: Name of the selected study, forwarded to ``chat_function``.
        prompt_type: Prompt variant, forwarded to ``chat_function``.

    Returns:
        A two-item list: the chat response and a ``gr.update(visible=True)``.
        The submit handler maps these to the answer output and the download
        button, so the button becomes visible once a response exists.
    """
    # Skip blank entries so stray or trailing commas ("age, , sex,") do not
    # put empty variable names into the prompt.
    variable_list = [
        word.strip().upper() for word in text.split(",") if word.strip()
    ]
    user_message = f"Extract and present in a tabular format the following variables for each {study_name} study: {', '.join(variable_list)}"
    logging.info(f"User message: ==> {user_message}")
    response = chat_function(user_message, study_name, prompt_type)
    return [response, gr.update(visible=True)]
|
223 |
|
224 |
|
225 |
def create_gr_interface() -> gr.Blocks:
|
|
|
239 |
|
240 |
with gr.Blocks() as demo:
|
241 |
gr.Markdown("# ACRES RAG Platform")
|
242 |
+
|
243 |
with gr.Row():
|
244 |
with gr.Column(scale=1):
|
245 |
gr.Markdown("### Zotero Credentials")
|
246 |
+
zotero_library_id = gr.Textbox(
|
247 |
+
label="Zotero Library ID",
|
248 |
+
type="password",
|
249 |
+
placeholder="Enter Your Zotero Library ID here...",
|
250 |
+
)
|
251 |
+
zotero_api_access_key = gr.Textbox(
|
252 |
+
label="Zotero API Access Key",
|
253 |
+
type="password",
|
254 |
+
placeholder="Enter Your Zotero API Access Key...",
|
255 |
+
)
|
256 |
process_zotero_btn = gr.Button("Process your Zotero Library")
|
257 |
zotero_output = gr.Markdown(label="Zotero")
|
258 |
|
259 |
gr.Markdown("### Study Information")
|
260 |
|
261 |
# Query ChromaDB for all document IDs in the "study_files_collection" collection
|
262 |
+
collection = chromadb_client.get_or_create_collection(
|
263 |
+
"study_files_collection"
|
264 |
+
)
|
265 |
# Retrieve all documents by querying with an empty string and specifying a high n_results
|
266 |
all_documents = collection.query(query_texts=[""], n_results=1000)
|
267 |
logging.info(f"all_documents: =========> {all_documents}")
|
268 |
# Extract document IDs as study names
|
269 |
document_ids = all_documents.get("ids")
|
270 |
+
study_choices = [
|
271 |
+
doc_id for doc_id in document_ids[0] if document_ids
|
272 |
+
] # Get list of document IDs
|
273 |
logging.info(f"study_choices: ======> {study_choices}")
|
274 |
|
275 |
# Update the Dropdown with choices from ChromaDB
|
276 |
study_dropdown = gr.Dropdown(
|
277 |
choices=study_choices,
|
278 |
label="Select Study",
|
279 |
+
value=(
|
280 |
+
study_choices[0] if study_choices else None
|
281 |
+
), # Set first choice as default, if available
|
282 |
)
|
283 |
|
284 |
study_info = gr.Markdown(label="Study Details")
|
|
|
290 |
value="Default",
|
291 |
)
|
292 |
# clear = gr.Button("Clear Chat")
|
293 |
+
|
294 |
with gr.Column(scale=3):
|
295 |
gr.Markdown("### Study Variables")
|
296 |
with gr.Row():
|
|
|
303 |
)
|
304 |
submit_btn = gr.Button("Submit", scale=1)
|
305 |
answer_output = gr.Markdown(label="Answer")
|
306 |
+
# button to download_csv
|
307 |
+
download_btn = gr.DownloadButton(
|
308 |
+
"Download as CSV",
|
309 |
+
variant="primary",
|
310 |
+
size="sm",
|
311 |
+
scale=1,
|
312 |
+
visible=False,
|
313 |
+
)
|
314 |
|
315 |
+
def download_as_csv(markdown_content):
    """Convert a markdown table to CSV, write it to disk and return its path.

    Args:
        markdown_content: Markdown text expected to contain a table.

    Returns:
        The path of the CSV file that was written, or ``None`` when there is
        no content or no table could be extracted from it.
    """
    if not markdown_content:
        return None

    csv_content = markdown_table_to_csv(markdown_content)
    if not csv_content:
        return None

    # Microsecond resolution keeps two exports made within the same second
    # from overwriting each other. The name still matches the
    # "study_export_*.csv" pattern that cleanup_temp_files looks for.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    temp_path = f"study_export_{timestamp}.csv"

    # newline="" so the line endings already present in csv_content are
    # written unchanged.
    with open(temp_path, "w", newline="", encoding="utf-8") as f:
        f.write(csv_content)

    return temp_path
|
332 |
+
|
333 |
+
def cleanup_temp_files(max_age_seconds: float = 300, directory: str = "."):
    """Delete stale ``study_export_*.csv`` files on a best-effort basis.

    Args:
        max_age_seconds: Files whose modification time is older than this many
            seconds are removed. Defaults to 300 (5 minutes).
        directory: Directory to scan. Defaults to the current working
            directory.

    Failures are logged as warnings and never raised, so a cleanup problem
    cannot break the handler chain that calls this function.
    """
    try:
        names = os.listdir(directory)
    except OSError as e:
        logging.warning(f"Error during cleanup: {e}")
        return

    now = datetime.datetime.now(datetime.timezone.utc).timestamp()
    for file in names:
        if not (file.startswith("study_export_") and file.endswith(".csv")):
            continue

        path = os.path.join(directory, file)
        try:
            # Compare the full elapsed time. timedelta.seconds would wrap
            # around every 24 hours and let files older than a day survive.
            if now - os.path.getmtime(path) > max_age_seconds:
                os.remove(path)
        except OSError as e:
            # Handled per file so one failure does not stop the whole sweep.
            logging.warning(f"Failed to remove temp file {file}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
352 |
|
353 |
study_dropdown.change(
|
354 |
fn=get_study_info,
|
|
|
356 |
outputs=[study_info],
|
357 |
)
|
358 |
|
359 |
+
process_zotero_btn.click(
|
360 |
+
process_zotero_library_items,
|
361 |
+
inputs=[zotero_library_id, zotero_api_access_key],
|
362 |
+
outputs=[zotero_output],
|
363 |
+
queue=False,
|
364 |
+
)
|
365 |
+
submit_btn.click(
|
366 |
+
process_multi_input,
|
367 |
+
inputs=[study_variables, study_dropdown, prompt_type],
|
368 |
+
outputs=[answer_output, download_btn],
|
369 |
+
queue=False,
|
370 |
+
)
|
371 |
+
download_btn.click(
|
372 |
+
fn=download_as_csv,
|
373 |
+
inputs=[answer_output],
|
374 |
+
outputs=[download_btn],
|
375 |
+
).then(
|
376 |
+
fn=cleanup_temp_files, inputs=None, outputs=None # Clean up after download
|
377 |
+
)
|
378 |
|
379 |
return demo
|
380 |
|
config.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
import os
|
2 |
|
3 |
from dotenv import load_dotenv
|
|
|
1 |
+
# config.py
|
2 |
+
|
3 |
import os
|
4 |
|
5 |
from dotenv import load_dotenv
|
rag/rag_pipeline.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import json
|
2 |
import logging
|
3 |
from typing import Dict, Any, List
|
|
|
1 |
+
# rag/rag_pipeline.py
|
2 |
import json
|
3 |
import logging
|
4 |
from typing import Dict, Any, List
|
utils/helpers.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
from typing import Dict, Any
|
2 |
from llama_index.core import Response
|
3 |
from typing import List
|
@@ -29,7 +31,7 @@ def read_study_files(file_path):
|
|
29 |
Raises:
|
30 |
FileNotFoundError: If the file is not found at the provided path.
|
31 |
json.JSONDecodeError: If the file contents are not valid JSON.
|
32 |
-
|
33 |
Example:
|
34 |
Given a JSON file 'study_files.json' with content like:
|
35 |
{
|
@@ -46,13 +48,15 @@ def read_study_files(file_path):
|
|
46 |
}
|
47 |
"""
|
48 |
try:
|
49 |
-
with open(file_path,
|
50 |
data = json.load(file)
|
51 |
return data
|
52 |
except FileNotFoundError as e:
|
53 |
raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
|
54 |
except json.JSONDecodeError as e:
|
55 |
-
raise ValueError(
|
|
|
|
|
56 |
|
57 |
|
58 |
def append_to_study_files(file_path, new_key, new_value):
|
@@ -86,20 +90,22 @@ def append_to_study_files(file_path, new_key, new_value):
|
|
86 |
"""
|
87 |
try:
|
88 |
# Read the existing data from the file
|
89 |
-
with open(file_path,
|
90 |
data = json.load(file)
|
91 |
-
|
92 |
# Append the new key-value pair to the dictionary
|
93 |
data[new_key] = new_value
|
94 |
|
95 |
# Write the updated data back to the file
|
96 |
-
with open(file_path,
|
97 |
json.dump(data, file, indent=4) # indent for pretty printing
|
98 |
|
99 |
except FileNotFoundError as e:
|
100 |
raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
|
101 |
except json.JSONDecodeError as e:
|
102 |
-
raise ValueError(
|
|
|
|
|
103 |
except IOError as e:
|
104 |
raise IOError(f"Failed to write to the file at {file_path}.") from e
|
105 |
|
@@ -204,12 +210,8 @@ def add_study_files_to_chromadb(file_path: str, collection_name: str):
|
|
204 |
metadatas.append({"file_path": file_path}) # Metadata with file path
|
205 |
|
206 |
# Add documents to the collection in batch
|
207 |
-
collection.add(
|
208 |
-
|
209 |
-
documents=documents,
|
210 |
-
metadatas=metadatas
|
211 |
-
)
|
212 |
-
|
213 |
print("All study files have been successfully added to ChromaDB.")
|
214 |
|
215 |
|
|
|
1 |
+
# utils/helpers.py
|
2 |
+
|
3 |
from typing import Dict, Any
|
4 |
from llama_index.core import Response
|
5 |
from typing import List
|
|
|
31 |
Raises:
|
32 |
FileNotFoundError: If the file is not found at the provided path.
|
33 |
json.JSONDecodeError: If the file contents are not valid JSON.
|
34 |
+
|
35 |
Example:
|
36 |
Given a JSON file 'study_files.json' with content like:
|
37 |
{
|
|
|
48 |
}
|
49 |
"""
|
50 |
try:
|
51 |
+
with open(file_path, "r") as file:
|
52 |
data = json.load(file)
|
53 |
return data
|
54 |
except FileNotFoundError as e:
|
55 |
raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
|
56 |
except json.JSONDecodeError as e:
|
57 |
+
raise ValueError(
|
58 |
+
f"The file at path {file_path} does not contain valid JSON."
|
59 |
+
) from e
|
60 |
|
61 |
|
62 |
def append_to_study_files(file_path, new_key, new_value):
|
|
|
90 |
"""
|
91 |
try:
|
92 |
# Read the existing data from the file
|
93 |
+
with open(file_path, "r") as file:
|
94 |
data = json.load(file)
|
95 |
+
|
96 |
# Append the new key-value pair to the dictionary
|
97 |
data[new_key] = new_value
|
98 |
|
99 |
# Write the updated data back to the file
|
100 |
+
with open(file_path, "w") as file:
|
101 |
json.dump(data, file, indent=4) # indent for pretty printing
|
102 |
|
103 |
except FileNotFoundError as e:
|
104 |
raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
|
105 |
except json.JSONDecodeError as e:
|
106 |
+
raise ValueError(
|
107 |
+
f"The file at path {file_path} does not contain valid JSON."
|
108 |
+
) from e
|
109 |
except IOError as e:
|
110 |
raise IOError(f"Failed to write to the file at {file_path}.") from e
|
111 |
|
|
|
210 |
metadatas.append({"file_path": file_path}) # Metadata with file path
|
211 |
|
212 |
# Add documents to the collection in batch
|
213 |
+
collection.add(ids=ids, documents=documents, metadatas=metadatas)
|
214 |
+
|
|
|
|
|
|
|
|
|
215 |
print("All study files have been successfully added to ChromaDB.")
|
216 |
|
217 |
|
utils/zotero_manager.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
from typing import Any, Dict, List, Optional
|
@@ -641,4 +643,4 @@ if __name__ == "__main__":
|
|
641 |
## Save to disc
|
642 |
zotero_manager.write_zotero_items_to_json_file(
|
643 |
ebora_virus_zotero_items_json, "zotero_data/ebora_virus_zotero_items.json"
|
644 |
-
)
|
|
|
1 |
+
# utils/zotero_manager.py
|
2 |
+
|
3 |
import json
|
4 |
import os
|
5 |
from typing import Any, Dict, List, Optional
|
|
|
643 |
## Save to disc
|
644 |
zotero_manager.write_zotero_items_to_json_file(
|
645 |
ebora_virus_zotero_items_json, "zotero_data/ebora_virus_zotero_items.json"
|
646 |
+
)
|