### Notebook for cleaning, creating and updating the web documents

First, import the documents from the uncleaned URLs and store the text of each one in a separate file. Then manually clean the files and load the cleaned text back into the documents. After that, get all already-cleaned documents and store two sets: only the new documents, and all documents (old cleaned + new cleaned).

In [None]:
from ipynb.fs.defs.preprocess_data import get_web_documents
from ipynb.fs.defs.preprocess_data import store_documents
from langchain.docstore.document import Document
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from typing import List

def get_web_documents_for_cleaning() -> List[Document]:
    """
    Load the web pages listed in the uncleaned-URLs file and convert them to text.

    Reads one URL per line from ``uncleaned_urls.txt``, loads the pages with
    ``AsyncHtmlLoader`` and passes the result through
    ``Html2TextTransformer.transform_documents``. Prints the number of
    resulting documents.

    Returns:
        The documents produced by ``Html2TextTransformer.transform_documents``.
    """
    directory_path_web = "./../input_data/Web/URLs/uncleaned_urls.txt"

    # Strip surrounding whitespace and skip blank lines, so that an empty
    # line (e.g. a trailing newline at the end of the file) is not handed
    # to the loader as an empty URL.
    with open(directory_path_web, "r") as file:
        stripped_lines = (line.strip() for line in file)
        imported_urls = [url for url in stripped_lines if url]

    loader_web = AsyncHtmlLoader(imported_urls)
    documents_web = loader_web.load()

    html2text = Html2TextTransformer()
    documents_web_transformed = html2text.transform_documents(documents_web)
    print("Number of documents: " + str(len(documents_web_transformed)) + "\n")

    return documents_web_transformed

# Load the pages listed in uncleaned_urls.txt as text documents.
documents = get_web_documents_for_cleaning()
# Load documents through the project helper from preprocess_data.
# NOTE(review): presumably `True` selects the already-cleaned documents, as
# the variable name suggests — confirm against get_web_documents.
already_cleaned_documents = get_web_documents(True)

In [None]:
# Write each document's text to its own file in the current working
# directory so it can be cleaned by hand. Files are numbered from 1 in
# document order: file_1.txt, file_2.txt, ...
for counter, doc in enumerate(documents, start=1):
    file_name = f"file_{counter}.txt"

    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(doc.page_content)

    print(f'The string has been successfully stored in {file_name}.')

In [None]:
# NOW MANUALLY CLEAN

In [None]:
# Read the manually cleaned texts back in. The files are visited in the same
# order and with the same file_<n>.txt names (numbered from 1) that were used
# when the documents were written out, so cleaned_texts[i] belongs to
# documents[i].
cleaned_texts = []

for counter in range(1, len(documents) + 1):
    file_name = f"file_{counter}.txt"

    # Open the file in read mode and keep its full content
    with open(file_name, 'r', encoding='utf-8') as file:
        cleaned_texts.append(file.read())

In [None]:
# Set the new cleaned texts
# Refuse to continue when the counts differ: the texts are matched to the
# documents purely by position, so a mismatch would assign wrong content.
if len(documents) != len(cleaned_texts):
    raise ValueError(
        f"Number of documents ({len(documents)}) does not match the number "
        f"of cleaned texts ({len(cleaned_texts)})."
    )

for doc, cleaned_text in zip(documents, cleaned_texts):
    doc.page_content = cleaned_text

In [None]:
# Store only the new documents and all documents
store_documents(documents, "./../input_data/Web/documents/new_documents")

# Build the combined collection as a fresh list instead of extending
# already_cleaned_documents in place; otherwise re-running this cell would
# append the new documents a second time and store duplicates.
all_documents = [*already_cleaned_documents, *documents]
store_documents(all_documents, "./../input_data/Web/documents/all_documents")

In [None]:
# Update the URLs list for cleaned and uncleaned:
# move every URL from uncleaned_urls.txt to the end of cleaned_urls.txt.
uncleaned_url_file_path = "./../input_data/Web/URLs/uncleaned_urls.txt"
cleaned_url_file_path = "./../input_data/Web/URLs/cleaned_urls.txt"

# Read the URLs from the source file, dropping surrounding whitespace and
# blank lines so only real URLs are moved.
with open(uncleaned_url_file_path, "r") as source_file:
    stripped_lines = (line.strip() for line in source_file)
    urls = [url for url in stripped_lines if url]

# Append the URLs to the destination file. Each URL is written with its own
# trailing newline: copying the raw lines would leave the last URL without
# one when the source file does not end in a newline, and the next append
# would then fuse two URLs onto a single line.
with open(cleaned_url_file_path, "a") as destination_file:
    destination_file.writelines(f"{url}\n" for url in urls)

# Remove the URLs from the source file (opening in write mode truncates it)
with open(uncleaned_url_file_path, "w") as source_file:
    source_file.write("")

# Print moved urls
for url in urls:
    print("Moved URL:", url)