Delete loaders

- loaders/__init__.py +0 -0
- loaders/__pycache__/__init__.cpython-310.pyc +0 -0
- loaders/__pycache__/audio.cpython-310.pyc +0 -0
- loaders/__pycache__/common.cpython-310.pyc +0 -0
- loaders/__pycache__/csv.cpython-310.pyc +0 -0
- loaders/__pycache__/docx.cpython-310.pyc +0 -0
- loaders/__pycache__/html.cpython-310.pyc +0 -0
- loaders/__pycache__/markdown.cpython-310.pyc +0 -0
- loaders/__pycache__/pdf.cpython-310.pyc +0 -0
- loaders/__pycache__/powerpoint.cpython-310.pyc +0 -0
- loaders/__pycache__/txt.cpython-310.pyc +0 -0
- loaders/audio.py +0 -65
- loaders/common.py +0 -46
- loaders/csv.py +0 -5
- loaders/docx.py +0 -5
- loaders/html.py +0 -47
- loaders/markdown.py +0 -5
- loaders/pdf.py +0 -6
- loaders/powerpoint.py +0 -5
- loaders/txt.py +0 -5
loaders/__init__.py
DELETED
File without changes

loaders/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (144 Bytes)

loaders/__pycache__/audio.cpython-310.pyc
DELETED
Binary file (2.39 kB)

loaders/__pycache__/common.cpython-310.pyc
DELETED
Binary file (1.69 kB)

loaders/__pycache__/csv.cpython-310.pyc
DELETED
Binary file (425 Bytes)

loaders/__pycache__/docx.cpython-310.pyc
DELETED
Binary file (422 Bytes)

loaders/__pycache__/html.cpython-310.pyc
DELETED
Binary file (1.97 kB)

loaders/__pycache__/markdown.cpython-310.pyc
DELETED
Binary file (440 Bytes)

loaders/__pycache__/pdf.cpython-310.pyc
DELETED
Binary file (416 Bytes)

loaders/__pycache__/powerpoint.cpython-310.pyc
DELETED
Binary file (448 Bytes)

loaders/__pycache__/txt.cpython-310.pyc
DELETED
Binary file (415 Bytes)

loaders/audio.py
DELETED
@@ -1,65 +0,0 @@
import os
import tempfile
from io import BytesIO
import time
import openai
import streamlit as st
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import compute_sha1_from_content
from langchain.schema import Document
from stats import add_usage


# Create a function to transcribe audio using Whisper
def _transcribe_audio(api_key, audio_file, stats_db):
    openai.api_key = api_key
    transcript = ""

    with BytesIO(audio_file.read()) as audio_bytes:
        # Get the extension of the uploaded file
        file_extension = os.path.splitext(audio_file.name)[-1]

        # Create a temporary file with the uploaded audio data and the correct extension
        with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_audio_file:
            temp_audio_file.write(audio_bytes.read())
            temp_audio_file.seek(0)  # Move the file pointer to the beginning of the file

            # Transcribe the temporary audio file
            if st.secrets.self_hosted == "false":
                add_usage(stats_db, "embedding", "audio", metadata={"file_name": audio_file.name, "file_type": file_extension})

            transcript = openai.Audio.translate("whisper-1", temp_audio_file)

    return transcript

def process_audio(vector_store, file_name, stats_db):
    if st.secrets.self_hosted == "false":
        if file_name.size > 10000000:
            st.error("File size is too large. Please upload a file smaller than 1MB.")
            return
    file_sha = ""
    dateshort = time.strftime("%Y%m%d-%H%M%S")
    file_meta_name = f"audiotranscript_{dateshort}.txt"
    openai_api_key = st.secrets["openai_api_key"]
    transcript = _transcribe_audio(openai_api_key, file_name, stats_db)
    file_sha = compute_sha1_from_content(transcript.text.encode("utf-8"))
    ## file size computed from transcript
    file_size = len(transcript.text.encode("utf-8"))

    ## Load chunk size and overlap from sidebar
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_text(transcript.text)

    docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha, "file_size": file_size, "file_name": file_meta_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for text in texts]

    if st.secrets.self_hosted == "false":
        add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name, "file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
    vector_store.add_documents(docs_with_metadata)
    return vector_store
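
For context, a minimal sketch of how this deleted process_audio helper was likely invoked from the app's Streamlit upload flow; the uploader widget, the wrapper function name, and the vector_store/stats_db parameters are assumptions about the calling code, not part of this commit.

# Hypothetical caller (not part of this commit): wire a Streamlit upload
# into the deleted process_audio() helper.
import streamlit as st
from loaders.audio import process_audio

def ingest_uploaded_audio(vector_store, stats_db):
    # The widget label and accepted extensions are illustrative only.
    uploaded = st.file_uploader("Upload an audio file", type=["mp3", "wav", "m4a"])
    if uploaded is not None:
        # process_audio transcribes with Whisper, splits the transcript into
        # chunks, and stores them in the vector store (see the file above).
        return process_audio(vector_store, uploaded, stats_db)
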
loaders/common.py
DELETED
@@ -1,46 +0,0 @@
import tempfile
import time
import os
from utils import compute_sha1_from_file
from langchain.schema import Document
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from stats import add_usage

def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
    documents = []
    file_name = file.name
    file_size = file.size
    if st.secrets.self_hosted == "false":
        if file_size > 1000000:
            st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
            return

    dateshort = time.strftime("%Y%m%d")
    with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file.flush()

        loader = loader_class(tmp_file.name)
        documents = loader.load()
        file_sha1 = compute_sha1_from_file(tmp_file.name)

    os.remove(tmp_file.name)

    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    documents = text_splitter.split_documents(documents)

    # Add the document sha1 as metadata to each document
    docs_with_metadata = [Document(page_content=doc.page_content, metadata={"file_sha1": file_sha1, "file_size": file_size, "file_name": file_name,
                                                                            "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort,
                                                                            "user": st.session_state["username"]})
                          for doc in documents]

    vector_store.add_documents(docs_with_metadata)
    if stats_db:
        add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name, "file_type": file_suffix,
                                                           "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
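
This process_file helper was the shared path that every per-format module below delegated to. As a point of reference, a hypothetical additional wrapper would have followed the same pattern as the sketch below; the EPUB example and the UnstructuredEPubLoader import are illustrative assumptions, not part of this commit.

# Hypothetical loaders/epub.py, mirroring the per-format modules below.
from .common import process_file
from langchain.document_loaders import UnstructuredEPubLoader  # assumed available in this langchain version

def process_epub(vector_store, file, stats_db):
    # Delegate temp-file handling, splitting, and metadata to process_file.
    return process_file(vector_store, file, UnstructuredEPubLoader, ".epub", stats_db=stats_db)
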
loaders/csv.py
DELETED
@@ -1,5 +0,0 @@
from .common import process_file
from langchain.document_loaders.csv_loader import CSVLoader

def process_csv(vector_store, file, stats_db):
    return process_file(vector_store, file, CSVLoader, ".csv", stats_db=stats_db)

loaders/docx.py
DELETED
@@ -1,5 +0,0 @@
from .common import process_file
from langchain.document_loaders import Docx2txtLoader

def process_docx(vector_store, file, stats_db):
    return process_file(vector_store, file, Docx2txtLoader, ".docx", stats_db=stats_db)

loaders/html.py
DELETED
@@ -1,47 +0,0 @@
from .common import process_file
from langchain.document_loaders import UnstructuredHTMLLoader
import requests
import re
import unicodedata
import tempfile
import os
import streamlit as st
from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile

def process_html(vector_store, file, stats_db):
    return process_file(vector_store, file, UnstructuredHTMLLoader, ".html", stats_db=stats_db)


def get_html(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        return None

def create_html_file(url, content):
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
    with open(temp_file_path, 'w') as temp_file:
        temp_file.write(content)

    record = UploadedFileRec(id=None, name=file_name, type='text/html', data=open(temp_file_path, 'rb').read())
    uploaded_file = UploadedFile(record)

    return uploaded_file, temp_file_path

def delete_tempfile(temp_file_path, url, ret):
    try:
        os.remove(temp_file_path)
        if ret:
            st.write(f"✅ Content saved... {url} ")
    except OSError as e:
        print(f"Error while deleting the temporary file: {str(e)}")
        if ret:
            st.write(f"❌ Error while saving content... {url} ")

def slugify(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    text = re.sub(r'[^\w\s-]', '', text).strip().lower()
    text = re.sub(r'[-\s]+', '-', text)
    return text
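
Taken together, the helpers in this deleted module formed a small URL-ingestion pipeline: fetch the page, stage it as an uploaded file, index it through process_html, then clean up the temporary copy. A minimal sketch of how they chained is shown below; the wrapper function name and the vector_store/stats_db parameters are assumptions about the calling code, not part of this diff.

# Hypothetical caller (not part of this commit) chaining the helpers above.
from loaders.html import get_html, create_html_file, process_html, delete_tempfile

def ingest_url(vector_store, url, stats_db):
    content = get_html(url)
    if content is None:
        return None
    uploaded_file, temp_file_path = create_html_file(url, content)
    ret = process_html(vector_store, uploaded_file, stats_db)
    delete_tempfile(temp_file_path, url, ret)
    return ret
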
loaders/markdown.py
DELETED
@@ -1,5 +0,0 @@
from .common import process_file
from langchain.document_loaders import UnstructuredMarkdownLoader

def process_markdown(vector_store, file, stats_db):
    return process_file(vector_store, file, UnstructuredMarkdownLoader, ".md", stats_db=stats_db)

loaders/pdf.py
DELETED
@@ -1,6 +0,0 @@
from .common import process_file
from langchain.document_loaders import PyPDFLoader


def process_pdf(vector_store, file, stats_db):
    return process_file(vector_store, file, PyPDFLoader, ".pdf", stats_db=stats_db)

loaders/powerpoint.py
DELETED
@@ -1,5 +0,0 @@
from .common import process_file
from langchain.document_loaders import UnstructuredPowerPointLoader

def process_powerpoint(vector_store, file, stats_db):
    return process_file(vector_store, file, UnstructuredPowerPointLoader, ".pptx", stats_db=stats_db)

loaders/txt.py
DELETED
@@ -1,5 +0,0 @@
from .common import process_file
from langchain.document_loaders import TextLoader

def process_txt(vector_store, file, stats_db):
    return process_file(vector_store, file, TextLoader, ".txt", stats_db=stats_db)