File size: 4,503 Bytes
9831243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import streamlit as st
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain.schema import Document
import json
from typing import Iterable
import os
from datetime import datetime
import zipfile
import tempfile

def save_docs_to_jsonl(array: "Iterable[Document]", file_path: str) -> None:
    """Write documents to ``file_path`` as JSON Lines, one document per line.

    Each document is serialized by calling its own ``json()`` method and the
    result is followed by a newline. The file is always written as UTF-8 so
    that it round-trips with ``load_docs_from_jsonl`` (which decodes every
    line as UTF-8) regardless of the platform's default encoding.

    Args:
        array: Documents to serialize; each must provide a ``json()`` method
            returning a string.
        file_path: Destination path. An existing file is overwritten.
    """
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        jsonl_file.writelines(doc.json() + '\n' for doc in array)

def load_docs_from_jsonl(file) -> "Iterable[Document]":
    """Read documents from a JSON Lines source.

    Args:
        file: Iterable of lines, e.g. a binary file-like object. ``bytes``
            lines are decoded as UTF-8; ``str`` lines are used as-is.

    Returns:
        A list with one ``Document`` per non-blank line, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        UnicodeDecodeError: If a ``bytes`` line is not valid UTF-8.
    """
    documents = []
    for line in file:
        if isinstance(line, (bytes, bytearray)):
            line = line.decode('utf-8')
        # Blank lines (commonly a trailing newline at the end of the file)
        # carry no record; passing them to json.loads would abort the load.
        if not line.strip():
            continue
        documents.append(Document(**json.loads(line)))
    return documents

st.title('Encoding and Storage')

# Timestamp of this script run and the directory reserved for output files.
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
OUTPUT_DIR = "./out"

# Create the directory and report the outcome. Attempting the creation and
# handling FileExistsError avoids the check-then-create race that occurs when
# two sessions run this script at the same time.
try:
    os.makedirs(OUTPUT_DIR)
except FileExistsError:
    st.write(f"Directory '{OUTPUT_DIR}' already exists.")
else:
    st.write(f"Directory '{OUTPUT_DIR}' was created.")

# Obtain the document collection: reuse the one kept in session state, or
# offer a JSONL upload when no collection is there yet.
if 'docs' in st.session_state:
    docs = st.session_state['docs']
    st.write(f"Loaded {len(docs)} documents from the session state.")
else:
    st.write("Document collection not found in session state.")
    uploaded_file = st.file_uploader("Upload JSONL file", type=["jsonl"])
    if uploaded_file is not None:
        try:
            docs = load_docs_from_jsonl(uploaded_file)
            # Keep the parsed documents so later reruns skip the upload.
            st.session_state['docs'] = docs
            st.write(f"Loaded {len(docs)} documents from the uploaded file.")
        except Exception as exc:
            # Show the failure in the page instead of raising.
            st.error(f"Error loading JSONL file: {exc!s}")
# Show the embedding model (falls back to a default when none was selected)
EMBEDDING_MODEL_NAME = st.session_state.get('selected_embedding_model', "thenlper/gte-small")
st.write(f"Selected Embedding Model: {EMBEDDING_MODEL_NAME}")

# Allow the user to select the device (GPU or CPU)
device_form = st.form(key='device_form')
device = device_form.radio("Select Device", ("CUDA", "CPU"))
submit_device = device_form.form_submit_button(label='Submit Device')

if submit_device:
    # Set up the embedding model
    # NOTE(review): with multi_process=True the embedding backend may choose
    # its own worker devices instead of the one selected here - confirm.
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": device.lower()},
        encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
    )

    # Show the configuration
    st.write("Embedding Model Configuration:")
    st.write(embedding_model)

    # Read the collection from session state (the same place the check looks)
    # and treat an empty collection like a missing one: there is nothing to
    # embed, so the vector store is not built.
    docs_to_encode = st.session_state.get('docs')
    if docs_to_encode:
        progress_bar = st.progress(0)

        # All documents are encoded in this single call.
        collection_vectorstore = FAISS.from_documents(docs_to_encode, embedding=embedding_model)
        st.session_state['collection_vectorstore'] = collection_vectorstore

        # The call above reports no intermediate progress, so mark the bar
        # complete once rather than sending one UI update per document.
        progress_bar.progress(1.0)

        st.write("Encoding completed.")
    else:
        st.write("No documents found in the session state.")
        
# Allow saving and downloading the configuration
if st.button("Save and Download Configuration"):
    # Serialize the vector store held in session state, pack it into a
    # timestamped zip archive, and offer that archive for download.
    if 'collection_vectorstore' in st.session_state:
        collection_vectorstore = st.session_state['collection_vectorstore']
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        zip_filename = f"docs_vectors_{timestamp}.zip"
        # Keep the saved archive in the output directory instead of leaving
        # it in the current working directory.
        zip_path = os.path.join(OUTPUT_DIR, zip_filename)

        with tempfile.TemporaryDirectory() as temp_dir:
            # The store is written to a scratch directory first; only the
            # archive outlives this block.
            collection_vectorstore.save_local(os.path.join(temp_dir, "docs_vectors"))

            with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zip_file:
                for root, _, files in os.walk(temp_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        # Store paths relative to the scratch directory so the
                        # archive unpacks to a "docs_vectors" folder.
                        zip_file.write(file_path, os.path.relpath(file_path, temp_dir))

        with open(zip_path, "rb") as saved_zip:
            zip_bytes = saved_zip.read()

        st.download_button(
            label="Download Configuration",
            data=zip_bytes,
            file_name=zip_filename,
            mime="application/zip",
        )

        st.success("Configuration saved and downloaded.")
    else:
        st.warning("No vector store found. Please make sure the encoding is completed.")

# Navigation: switch to the Q&A testing page when the button is pressed.
proceed_to_testing = st.button('Proceed to Q&A Testing')
if proceed_to_testing:
    st.switch_page('pages/05_testing_qa.py')