# Streamlit page "Encoding and Storage".
#
# Flow of the page, top to bottom:
#   1. make sure the output directory exists,
#   2. obtain the document collection (session state or an uploaded JSONL file),
#   3. let the user pick a device and encode the documents into a FAISS index,
#   4. offer the index as a zip archive (saved under OUTPUT_DIR and downloadable),
#   5. offer navigation to the Q&A testing page.
#
# NOTE: this header is written as comments rather than a module docstring so
# that nothing at the top level can be picked up as page output.
import json
import os
import tempfile
import zipfile
from datetime import datetime
from typing import Iterable

import streamlit as st
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS


def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
    """Write documents to ``file_path`` as JSON Lines (one document per line).

    Args:
        array: Documents to serialize; each is written via ``doc.json()``.
        file_path: Destination path. An existing file is overwritten.
    """
    # Explicit UTF-8 so the output matches what load_docs_from_jsonl decodes,
    # regardless of the platform's default text encoding.
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')


def load_docs_from_jsonl(file) -> Iterable[Document]:
    """Read documents from an iterable of JSON Lines.

    Args:
        file: Any iterable yielding lines, either as UTF-8 encoded ``bytes``
            (e.g. a binary file object) or as ``str``.

    Returns:
        A list with one ``Document`` per non-blank line, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        UnicodeDecodeError: If a bytes line is not valid UTF-8.
    """
    documents = []
    for raw_line in file:
        line = raw_line.decode('utf-8') if isinstance(raw_line, bytes) else raw_line
        # Blank lines (typically a trailing newline at end of file) carry no
        # document and would otherwise make json.loads raise.
        if not line.strip():
            continue
        documents.append(Document(**json.loads(line)))
    return documents


st.title('Encoding and Storage')

# Create output directory
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
OUTPUT_DIR = "./out"

# Check if the directory exists, and if not, create it
if not os.path.exists(OUTPUT_DIR):
    # exist_ok guards against the directory appearing between check and create.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    st.write(f"Directory '{OUTPUT_DIR}' was created.")
else:
    st.write(f"Directory '{OUTPUT_DIR}' already exists.")

# Allow the user to upload the JSONL file if missing
if 'docs' not in st.session_state:
    st.write("Document collection not found in session state.")
    uploaded_file = st.file_uploader("Upload JSONL file", type=["jsonl"])
    if uploaded_file is not None:
        try:
            docs = load_docs_from_jsonl(uploaded_file)
            st.session_state['docs'] = docs
            st.write(f"Loaded {len(docs)} documents from the uploaded file.")
        except Exception as e:
            st.error(f"Error loading JSONL file: {str(e)}")
else:
    docs = st.session_state['docs']
    st.write(f"Loaded {len(docs)} documents from the session state.")

# Show the embedding model
EMBEDDING_MODEL_NAME = st.session_state.get('selected_embedding_model', "thenlper/gte-small")
st.write(f"Selected Embedding Model: {EMBEDDING_MODEL_NAME}")

# Allow the user to select the device (GPU or CPU)
device_form = st.form(key='device_form')
device = device_form.radio("Select Device", ("CUDA", "CPU"))
submit_device = device_form.form_submit_button(label='Submit Device')

if submit_device:
    # Set up the embedding model
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": device.lower()},
        encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
    )

    # Show the configuration
    st.write("Embedding Model Configuration:")
    st.write(embedding_model)

    # Start the encoding
    if 'docs' in st.session_state:
        docs = st.session_state['docs']
        if not docs:
            # Nothing to embed; an index cannot be built from zero documents.
            st.warning("The document collection is empty; nothing to encode.")
        else:
            progress_bar = st.progress(0)
            collection_vectorstore = FAISS.from_documents(docs, embedding=embedding_model)
            st.session_state['collection_vectorstore'] = collection_vectorstore
            # from_documents is a single blocking call, so there is no
            # per-document progress to report; mark the bar complete once
            # instead of sending one UI update per document after the fact.
            progress_bar.progress(1.0)
            st.write("Encoding completed.")
    else:
        st.write("No documents found in the session state.")

# Allow saving and downloading the configuration
if st.button("Save and Download Configuration"):
    if 'collection_vectorstore' in st.session_state:
        collection_vectorstore = st.session_state['collection_vectorstore']
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        zip_filename = f"docs_vectors_{timestamp}.zip"

        # Keep the saved archive in the output directory rather than in the
        # process's working directory.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        zip_path = os.path.join(OUTPUT_DIR, zip_filename)

        # The index is first written to a throwaway directory, then every file
        # under it is added to the archive with a path relative to that directory.
        with tempfile.TemporaryDirectory() as temp_dir:
            collection_vectorstore.save_local(os.path.join(temp_dir, "docs_vectors"))
            with zipfile.ZipFile(zip_path, "w") as zip_file:
                for root, _, files in os.walk(temp_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        zip_file.write(file_path, os.path.relpath(file_path, temp_dir))

        with open(zip_path, "rb") as zip_file:
            zip_bytes = zip_file.read()

        st.download_button(
            label="Download Configuration",
            data=zip_bytes,
            file_name=zip_filename,
            mime="application/zip",
        )
        st.success("Configuration saved and downloaded.")
    else:
        st.warning("No vector store found. Please make sure the encoding is completed.")

if st.button('Proceed to Q&A Testing'):
    st.switch_page('pages/05_testing_qa.py')