|
from .common import process_file |
|
from langchain.document_loaders import UnstructuredHTMLLoader |
|
import requests |
|
import re |
|
import unicodedata |
|
import tempfile |
|
import os |
|
import streamlit as st |
|
from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile |
|
|
|
def process_html(vector_store, file, stats_db):
    """Process an uploaded HTML file by delegating to ``process_file``.

    The arguments are forwarded together with ``UnstructuredHTMLLoader`` as
    the loader class and ``".html"`` as the file suffix; whatever
    ``process_file`` returns is returned unchanged.
    """
    loader_class = UnstructuredHTMLLoader
    suffix = ".html"
    return process_file(
        vector_store,
        file,
        loader_class,
        suffix,
        stats_db=stats_db,
    )
|
|
|
|
|
def get_html(url, timeout=10):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without it a stalled server
            would block the caller forever.

    Returns:
        The response text when the status code is exactly 200, otherwise
        ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection errors or when *timeout* expires.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
|
|
|
def create_html_file(url, content):
    """Write *content* to a temporary ``.html`` file and wrap it as an upload.

    The file name is ``slugify(url) + ".html"`` and the file is created in
    the directory returned by ``tempfile.gettempdir()``.

    Args:
        url: URL the content was fetched from; used only to derive the name.
        content: HTML text to store.

    Returns:
        A ``(uploaded_file, temp_file_path)`` tuple: a Streamlit
        ``UploadedFile`` built from the bytes on disk, and the path of the
        temporary file. This function does not delete the file.
    """
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)

    # Explicit UTF-8: the platform default encoding may be unable to
    # represent characters found in arbitrary web pages.
    with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
        temp_file.write(content)

    # Read the bytes back inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(temp_file_path, 'rb') as temp_file:
        data = temp_file.read()

    record = UploadedFileRec(id=None, name=file_name, type='text/html', data=data)
    uploaded_file = UploadedFile(record)

    return uploaded_file, temp_file_path
|
|
|
def delete_tempfile(temp_file_path, url, ret):
    """Delete the temporary file and optionally report the outcome.

    Args:
        temp_file_path: Path of the file to remove.
        url: URL shown in the status message.
        ret: When truthy, a status line is written to the Streamlit page
            via ``st.write``; when falsy nothing is shown.
            NOTE(review): presumably the result of the preceding
            processing step - confirm against the caller.

    Returns:
        None. A failure to delete is printed and reported, never raised.
    """
    try:
        # Keep only the call that can raise OSError inside the try body.
        os.remove(temp_file_path)
    except OSError as e:
        print(f"Error while deleting the temporary file: {e}")
        if ret:
            st.write(f"❌ Error while saving content... {url} ")
    else:
        if ret:
            st.write(f"✅ Content saved... {url} ")
|
|
|
def slugify(text):
    """Turn *text* into a lowercase, hyphen-separated ASCII slug.

    Steps, in order:
      1. NFKD-normalise and drop every character that cannot be encoded
         as ASCII.
      2. Remove everything that is not a word character, whitespace or a
         hyphen.
      3. Strip surrounding whitespace and lowercase the result.
      4. Collapse each run of hyphens and/or whitespace into one hyphen.
    """
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_bytes.decode('utf-8'))
    lowered = cleaned.strip().lower()
    return re.sub(r'[-\s]+', '-', lowered)