meraKB / loaders /html.py
Asankhaya Sharma
initial commit
4e00df7
raw
history blame
1.55 kB
from .common import process_file
from langchain.document_loaders import UnstructuredHTMLLoader
import requests
import re
import unicodedata
import tempfile
import os
import streamlit as st
from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile
def process_html(vector_store, file, stats_db):
    """Run an HTML file through the shared ``process_file`` pipeline.

    The call is forwarded unchanged, selecting ``UnstructuredHTMLLoader`` as
    the loader class and ``".html"`` as the file suffix.

    Args:
        vector_store: Forwarded as the first argument of ``process_file``.
        file: Forwarded as the second argument of ``process_file``.
        stats_db: Forwarded as the ``stats_db`` keyword argument.

    Returns:
        Whatever ``process_file`` returns.
    """
    outcome = process_file(
        vector_store,
        file,
        UnstructuredHTMLLoader,
        ".html",
        stats_db=stats_db,
    )
    return outcome
def get_html(url, timeout=10):
    """Download the body of ``url`` as text.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server would
            block the caller indefinitely.

    Returns:
        The response body as text when the status code is exactly 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection failures, including ``Timeout`` once ``timeout``
            elapses.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
def create_html_file(url, content):
    """Write ``content`` to a temporary ``.html`` file and wrap it for upload.

    The file name is ``slugify(url) + ".html"`` and the file is placed in the
    system temporary directory. The caller is responsible for removing the
    file afterwards.

    Args:
        url: Source URL; only used to derive the file name.
        content: HTML text to store.

    Returns:
        A ``(uploaded_file, temp_file_path)`` tuple: a Streamlit
        ``UploadedFile`` built from the bytes written to disk, and the path
        of the temporary file.
    """
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)

    # Write with an explicit encoding: the platform default may be a legacy
    # code page that cannot represent arbitrary web content.
    with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
        temp_file.write(content)

    # Read the bytes back inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(temp_file_path, 'rb') as temp_file:
        data = temp_file.read()

    record = UploadedFileRec(id=None, name=file_name, type='text/html', data=data)
    uploaded_file = UploadedFile(record)
    return uploaded_file, temp_file_path
def delete_tempfile(temp_file_path, url, ret):
    """Delete the temporary file and optionally report the outcome.

    Args:
        temp_file_path: Path of the file to remove.
        url: URL interpolated into the status message.
        ret: When truthy, a status line is written with ``st.write``; when
            falsy, nothing is shown in the UI. (Presumably the result of the
            preceding processing step - confirm against the caller.)

    Returns:
        None. A failure to delete is printed and, if ``ret`` is truthy,
        reported in the UI; it is never raised.
    """
    try:
        # Keep only the deletion inside the try so that an OSError raised
        # while reporting success is not mistaken for a deletion failure.
        os.remove(temp_file_path)
    except OSError as e:
        print(f"Error while deleting the temporary file: {e}")
        if ret:
            st.write(f"❌ Error while saving content... {url} ")
    else:
        if ret:
            st.write(f"✅ Content saved... {url} ")
def slugify(text):
    """Turn ``text`` into a lowercase, hyphen-separated ASCII slug.

    Steps, in order:
      1. NFKD-normalize and drop every character that has no ASCII form.
      2. Remove anything that is not a word character, whitespace or hyphen.
      3. Trim surrounding whitespace and lowercase.
      4. Collapse each run of hyphens/whitespace into a single hyphen.

    Args:
        text: The string to convert (e.g. a URL).

    Returns:
        The slug; may be empty if nothing survives the filtering.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')
    kept = re.sub(r'[^\w\s-]', '', ascii_only)
    lowered = kept.strip().lower()
    return re.sub(r'[-\s]+', '-', lowered)