codelion committed on
Commit c23d6d1
1 Parent(s): 6eaf7a2

Delete loaders

loaders/__init__.py DELETED
File without changes
loaders/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (144 Bytes)
 
loaders/__pycache__/audio.cpython-310.pyc DELETED
Binary file (2.39 kB)
 
loaders/__pycache__/common.cpython-310.pyc DELETED
Binary file (1.69 kB)
 
loaders/__pycache__/csv.cpython-310.pyc DELETED
Binary file (425 Bytes)
 
loaders/__pycache__/docx.cpython-310.pyc DELETED
Binary file (422 Bytes)
 
loaders/__pycache__/html.cpython-310.pyc DELETED
Binary file (1.97 kB)
 
loaders/__pycache__/markdown.cpython-310.pyc DELETED
Binary file (440 Bytes)
 
loaders/__pycache__/pdf.cpython-310.pyc DELETED
Binary file (416 Bytes)
 
loaders/__pycache__/powerpoint.cpython-310.pyc DELETED
Binary file (448 Bytes)
 
loaders/__pycache__/txt.cpython-310.pyc DELETED
Binary file (415 Bytes)
 
loaders/audio.py DELETED
@@ -1,65 +0,0 @@
- import os
- import tempfile
- from io import BytesIO
- import time
- import openai
- import streamlit as st
- from langchain.document_loaders import TextLoader
- from langchain.embeddings.openai import OpenAIEmbeddings
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from utils import compute_sha1_from_content
- from langchain.schema import Document
- from stats import add_usage
-
-
-
- # Create a function to transcribe audio using Whisper
- def _transcribe_audio(api_key, audio_file, stats_db):
-     openai.api_key = api_key
-     transcript = ""
-
-     with BytesIO(audio_file.read()) as audio_bytes:
-         # Get the extension of the uploaded file
-         file_extension = os.path.splitext(audio_file.name)[-1]
-
-         # Create a temporary file with the uploaded audio data and the correct extension
-         with tempfile.NamedTemporaryFile(delete=True, suffix=file_extension) as temp_audio_file:
-             temp_audio_file.write(audio_bytes.read())
-             temp_audio_file.seek(0)  # Move the file pointer to the beginning of the file
-
-             # Transcribe the temporary audio file
-             if st.secrets.self_hosted == "false":
-                 add_usage(stats_db, "embedding", "audio", metadata={"file_name": audio_file.name, "file_type": file_extension})
-
-             transcript = openai.Audio.translate("whisper-1", temp_audio_file)
-
-     return transcript
-
- def process_audio(vector_store, file_name, stats_db):
-     if st.secrets.self_hosted == "false":
-         if file_name.size > 10000000:
-             st.error("File size is too large. Please upload a file smaller than 1MB.")
-             return
-     file_sha = ""
-     dateshort = time.strftime("%Y%m%d-%H%M%S")
-     file_meta_name = f"audiotranscript_{dateshort}.txt"
-     openai_api_key = st.secrets["openai_api_key"]
-     transcript = _transcribe_audio(openai_api_key, file_name, stats_db)
-     file_sha = compute_sha1_from_content(transcript.text.encode("utf-8"))
-     ## file size computed from transcript
-     file_size = len(transcript.text.encode("utf-8"))
-
-
-     ## Load chunk size and overlap from sidebar
-     chunk_size = st.session_state['chunk_size']
-     chunk_overlap = st.session_state['chunk_overlap']
-
-     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-     texts = text_splitter.split_text(transcript.text)
-
-     docs_with_metadata = [Document(page_content=text, metadata={"file_sha1": file_sha, "file_size": file_size, "file_name": file_meta_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort}) for text in texts]
-
-     if st.secrets.self_hosted == "false":
-         add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name, "file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
-     vector_store.add_documents(docs_with_metadata)
-     return vector_store
 
loaders/common.py DELETED
@@ -1,46 +0,0 @@
- import tempfile
- import time
- import os
- from utils import compute_sha1_from_file
- from langchain.schema import Document
- import streamlit as st
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from stats import add_usage
-
- def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
-     documents = []
-     file_name = file.name
-     file_size = file.size
-     if st.secrets.self_hosted == "false":
-         if file_size > 1000000:
-             st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
-             return
-
-     dateshort = time.strftime("%Y%m%d")
-     with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
-         tmp_file.write(file.getvalue())
-         tmp_file.flush()
-
-         loader = loader_class(tmp_file.name)
-         documents = loader.load()
-         file_sha1 = compute_sha1_from_file(tmp_file.name)
-
-     os.remove(tmp_file.name)
-
-     chunk_size = st.session_state['chunk_size']
-     chunk_overlap = st.session_state['chunk_overlap']
-
-     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-
-     documents = text_splitter.split_documents(documents)
-
-     # Add the document sha1 as metadata to each document
-     docs_with_metadata = [Document(page_content=doc.page_content, metadata={"file_sha1": file_sha1, "file_size": file_size, "file_name": file_name,
-                                                                             "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort,
-                                                                             "user": st.session_state["username"]})
-                           for doc in documents]
-
-     vector_store.add_documents(docs_with_metadata)
-     if stats_db:
-         add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name, "file_type": file_suffix,
-                                                            "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
 
loaders/csv.py DELETED
@@ -1,5 +0,0 @@
- from .common import process_file
- from langchain.document_loaders.csv_loader import CSVLoader
-
- def process_csv(vector_store, file, stats_db):
-     return process_file(vector_store, file, CSVLoader, ".csv", stats_db=stats_db)
 
loaders/docx.py DELETED
@@ -1,5 +0,0 @@
- from .common import process_file
- from langchain.document_loaders import Docx2txtLoader
-
- def process_docx(vector_store, file, stats_db):
-     return process_file(vector_store, file, Docx2txtLoader, ".docx", stats_db=stats_db)
 
loaders/html.py DELETED
@@ -1,47 +0,0 @@
- from .common import process_file
- from langchain.document_loaders import UnstructuredHTMLLoader
- import requests
- import re
- import unicodedata
- import tempfile
- import os
- import streamlit as st
- from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile
-
- def process_html(vector_store, file, stats_db):
-     return process_file(vector_store, file, UnstructuredHTMLLoader, ".html", stats_db=stats_db)
-
-
- def get_html(url):
-     response = requests.get(url)
-     if response.status_code == 200:
-         return response.text
-     else:
-         return None
-
- def create_html_file(url, content):
-     file_name = slugify(url) + ".html"
-     temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
-     with open(temp_file_path, 'w') as temp_file:
-         temp_file.write(content)
-
-     record = UploadedFileRec(id=None, name=file_name, type='text/html', data=open(temp_file_path, 'rb').read())
-     uploaded_file = UploadedFile(record)
-
-     return uploaded_file, temp_file_path
-
- def delete_tempfile(temp_file_path, url, ret):
-     try:
-         os.remove(temp_file_path)
-         if ret:
-             st.write(f"✅ Content saved... {url} ")
-     except OSError as e:
-         print(f"Error while deleting the temporary file: {str(e)}")
-         if ret:
-             st.write(f"❌ Error while saving content... {url} ")
-
- def slugify(text):
-     text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
-     text = re.sub(r'[^\w\s-]', '', text).strip().lower()
-     text = re.sub(r'[-\s]+', '-', text)
-     return text
 
loaders/markdown.py DELETED
@@ -1,5 +0,0 @@
- from .common import process_file
- from langchain.document_loaders import UnstructuredMarkdownLoader
-
- def process_markdown(vector_store, file, stats_db):
-     return process_file(vector_store, file, UnstructuredMarkdownLoader, ".md", stats_db=stats_db)
 
loaders/pdf.py DELETED
@@ -1,6 +0,0 @@
- from .common import process_file
- from langchain.document_loaders import PyPDFLoader
-
-
- def process_pdf(vector_store, file, stats_db):
-     return process_file(vector_store, file, PyPDFLoader, ".pdf", stats_db=stats_db)
 
loaders/powerpoint.py DELETED
@@ -1,5 +0,0 @@
- from .common import process_file
- from langchain.document_loaders import UnstructuredPowerPointLoader
-
- def process_powerpoint(vector_store, file, stats_db):
-     return process_file(vector_store, file, UnstructuredPowerPointLoader, ".pptx", stats_db=stats_db)
 
loaders/txt.py DELETED
@@ -1,5 +0,0 @@
- from .common import process_file
- from langchain.document_loaders import TextLoader
-
- def process_txt(vector_store, file, stats_db):
-     return process_file(vector_store, file, TextLoader, ".txt", stats_db=stats_db)