|
import streamlit as st |
|
import os |
|
import subprocess |
|
import sys |
|
import shutil |
|
from Crypto.PublicKey import RSA |
|
from datasets import load_dataset |
|
import pandas as pd |
|
import oci |
|
from oci import object_storage |
|
from oci.object_storage.models import CreateBucketDetails |
|
from oci.object_storage.models import CreatePreauthenticatedRequestDetails |
|
import pickle |
|
|
|
# Page chrome: the same title is used for the browser tab and the on-page
# heading, followed by a one-line description of what the app does.
_PAGE_TITLE = "Oracle"

st.set_page_config(page_title=_PAGE_TITLE)
st.title(_PAGE_TITLE)
st.caption("Upload HF Dataset to OCI Object Storage!")
|
|
|
|
|
# Module-level defaults.
# File locations are relative to the working directory of the app.
config_location, private_key_location = ".oci/config", ".oci/private_key.pem"

# Single-letter answer kept as a string so it can be joined into text input.
gen_api_key = "n"

# Empty placeholders; all four start out as the empty string.
user_ocid = tenancy_ocid = region = hf_dataset = ""
|
|
|
|
|
|
|
|
|
|
|
# OCI settings form. Widgets created inside the `with` block are attached to
# the form; `oracle_form` itself stays bound so later code can write into it.
oracle_form = st.form("configuration")
with oracle_form:
    st.write("OCI Settings")
    user_ocid = st.text_input(
        "Enter the User OCID",
        value="ocid1.user.oc1..aaaaaaaakhekqfxefo2a3sveid67qqlfgtrmpk5cym5oqkcgtgkhbi3elova",
    )
    tenancy_ocid = st.text_input(
        "Enter the tenancy ocid",
        value="ocid1.tenancy.oc1..aaaaaaaahzy3x4boh7ipxyft2rowu2xeglvanlfewudbnueugsieyuojkldq",
    )
    region = st.text_input("Enter the region", value="us-ashburn-1")
    existing_checkbox = st.checkbox(
        "Check this if you want to put the dataset into an existing bucket"
    )
    # True only on the rerun triggered by pressing this button.
    oracle_submitted = st.form_submit_button("Generate API Key")
|
|
|
|
|
|
|
# Dataset settings form. Widgets created inside the `with` block are attached
# to the form.
dataset_form = st.form("dataset")
with dataset_form:
    st.write("Dataset Settings")
    dataset_name = st.text_area(
        "Enter the name of the huggingface Dataset:",
        value="biosses",
    )
    # The default here is a single space, not an empty string.
    dataset_name_2 = st.text_area(
        "Enter the name of the config for the dataset if it has one",
        value=" ",
    )
    split_name = st.text_area(
        "Enter the name of the split of the dataset that you want to use",
        value="train",
    )
    pd_checkbox = st.checkbox(
        "Check this if you want this to be a pandas dataframe instead of a HF Dataset Object"
    )
    # True only on the rerun triggered by pressing this button.
    dataset_submitted = st.form_submit_button("Pull Dataset")
|
|
|
|
|
def load_and_process_data(path, name, streaming, split_name):
    """Load a Hugging Face dataset and return one of its splits.

    Args:
        path: Dataset identifier, forwarded to ``load_dataset`` as ``path``.
        name: Dataset config name, forwarded as ``name``. ``None`` or a
            blank / whitespace-only string is treated as "no config".
        streaming: Forwarded to ``load_dataset`` as ``streaming``.
        split_name: Key of the split to pick out of the loaded dataset.

    Returns:
        The object stored under ``split_name`` in the loaded dataset.

    Raises:
        Whatever ``load_dataset`` or the split lookup raises is propagated
        unchanged.
    """
    # Free-text widgets hand over raw user input, so surrounding whitespace
    # (including a trailing newline) must not become part of an identifier.
    if isinstance(path, str):
        path = path.strip()
    if isinstance(split_name, str):
        split_name = split_name.strip()
    # A whitespace-only config name means "none given"; pass None rather than
    # asking the loader for a config literally called " ".
    if isinstance(name, str):
        name = name.strip() or None

    dataset = load_dataset(
        path=path,
        name=name,
        streaming=streaming,
        keep_in_memory=True,
    )
    return dataset[split_name]
|
|
|
|
|
|
|
# Runs on the rerun triggered by the "Generate API Key" button: rebuilds the
# local .oci directory, generates an RSA key pair, feeds the form values to
# `oci setup config`, and shows the public key and resulting config file.
if oracle_submitted:
    # Newline-separated answers piped to the interactive `oci setup config`
    # command on stdin (no trailing newline after the last answer).
    # NOTE(review): order presumably mirrors the CLI's prompts — confirm
    # against the installed CLI version.
    setup_answers = "\n".join(
        [
            config_location,
            "Y",
            "USER",
            user_ocid,
            tenancy_ocid,
            region,
            gen_api_key,
            private_key_location,
        ]
    )

    # Start from a clean directory. Removal is best-effort: a missing
    # directory (first run) is not an error.
    shutil.rmtree(".oci", ignore_errors=True)
    os.makedirs(".oci", exist_ok=True)

    # Make sure the config file exists before the CLI is pointed at it.
    with open(".oci/config", "a"):
        pass

    key = RSA.generate(2048)

    with open(".oci/private_key.pem", "wb") as private_file:
        private_file.write(key.export_key())
    # Keep the private key readable by the owner only.
    os.chmod(".oci/private_key.pem", 0o600)

    public_key_bytes = key.publickey().export_key()
    with open(".oci/public_key.pem", "wb") as public_file:
        public_file.write(public_key_bytes)

    setup_result = subprocess.run(
        ["oci", "setup", "config"],
        text=True,
        input=setup_answers,
    )
    if setup_result.returncode != 0:
        # Surface the failure instead of silently showing a stale/empty config.
        oracle_form.error(
            "`oci setup config` exited with status "
            f"{setup_result.returncode}; the generated config may be incomplete."
        )

    # Read the config directly instead of spawning `cat`, which is not
    # available on every platform.
    with open(".oci/config", encoding="utf-8") as config_file:
        config_text = config_file.read()

    # The public key is shown so it can be copied out of the app.
    oracle_form.text(public_key_bytes.decode("ascii"))
    with oracle_form.expander("Open to see the generated OCI config file"):
        # st.text targets the active `with` container, so the config lands
        # inside the expander rather than directly in the form.
        st.text(config_text)
|
|
|
|
|
|
|
# Runs on the rerun triggered by the "Pull Dataset" button: loads the chosen
# dataset split, optionally converts it to a DataFrame, makes sure a bucket
# named after the dataset exists, and uploads the pickled dataset into it.
if dataset_submitted:
    hf_dataset = load_and_process_data(dataset_name, dataset_name_2, False, split_name)
    if pd_checkbox:
        hf_dataset = pd.DataFrame.from_dict(hf_dataset)
    st.write(hf_dataset)
    st.write("Dataset Pulled Succesfully!")

    oci_config = oci.config.from_file(".oci/config", profile_name="USER")
    # Use a distinct name for the client so the imported `object_storage`
    # module is not rebound to an instance.
    storage_client = object_storage.ObjectStorageClient(oci_config)
    st.write("Object Storage Connected Succesfully")

    namespace = storage_client.get_namespace().data
    # The bucket is created in the compartment identified by the tenancy OCID.
    compartment_id = oci_config["tenancy"]
    st.write(namespace)

    # Both the bucket and the uploaded object are named after the dataset,
    # with "/" replaced by "-".
    bucket_name = dataset_name.replace("/", "-")

    try:
        bucket = storage_client.create_bucket(
            namespace,
            oci.object_storage.models.CreateBucketDetails(
                name=bucket_name,
                compartment_id=compartment_id,
                storage_tier='Archive',
                public_access_type='ObjectRead',
            ),
        )
    except oci.exceptions.ServiceError as err:
        # Only a 409 conflict means the bucket is already there. Any other
        # service error (auth, quota, bad name, ...) is a real failure and
        # must not be reported as "Bucket Exists".
        if err.status != 409:
            st.error(f"Could not create bucket {bucket_name!r}: {err.message}")
            st.stop()
        st.write("Bucket Exists, Writing Dataset to Bucket")
    else:
        st.write("Bucket Written:")
        st.write(bucket)

    st.write("Uploading new object if it doesn't already exist {!r}".format(hf_dataset))

    # NOTE(security): the payload is a pickle and the bucket is created with
    # public object read access. Unpickling executes arbitrary code, so
    # consumers must only load this object from a bucket they trust.
    hf_bytes = pickle.dumps(hf_dataset)
    storage_client.put_object(
        namespace,
        bucket_name,
        bucket_name,
        hf_bytes,
    )
    st.write("Object Pushed Succesfully!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|