{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notebook for cleaning, creating and updating the web documents\n",
    "\n",
    "First import the documents from the uncleaned URLs and store the text into separate files. Then manually clean the files and update them. After that, get all already cleaned documents and store both the new documents alone and all documents (old cleaned + new cleaned)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ipynb.fs.defs.preprocess_data import get_web_documents\n",
    "from ipynb.fs.defs.preprocess_data import store_documents\n",
    "from langchain.docstore.document import Document\n",
    "from langchain.document_loaders import AsyncHtmlLoader\n",
    "from langchain.document_transformers import Html2TextTransformer\n",
    "from typing import List\n",
    "\n",
    "def get_web_documents_for_cleaning() -> List[Document]:\n",
    "    \"\"\"\n",
    "    Return documents fetched from the uncleaned URLs.\n",
    "\n",
    "    Reads the .txt file with all uncleaned URLs (one per line) and uses\n",
    "    AsyncHtmlLoader and Html2TextTransformer to download the pages and\n",
    "    convert their HTML to plain text.\n",
    "    \"\"\"\n",
    "    directory_path_web = \"./../input_data/Web/URLs/uncleaned_urls.txt\"\n",
    "\n",
    "    # Skip blank lines so a trailing empty line cannot produce an empty URL\n",
    "    with open(directory_path_web, \"r\") as file:\n",
    "        imported_urls = [line.strip() for line in file if line.strip()]\n",
    "\n",
    "    loader_web = AsyncHtmlLoader(imported_urls)\n",
    "    documents_web = loader_web.load()\n",
    "\n",
    "    html2text = Html2TextTransformer()\n",
    "    documents_web_transformed = html2text.transform_documents(documents_web)\n",
    "    print(f\"Number of documents: {len(documents_web_transformed)}\\n\")\n",
    "\n",
    "    return documents_web_transformed\n",
    "\n",
    "documents = get_web_documents_for_cleaning()\n",
    "already_cleaned_documents = get_web_documents(True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store each document's text in its own numbered txt file\n",
    "# (file_1.txt, file_2.txt, ...) for manual cleaning\n",
    "for counter, doc in enumerate(documents, start=1):\n",
    "    file_name = f\"file_{counter}.txt\"\n",
    "\n",
    "    # Open the file in write mode and write the document text to it\n",
    "    with open(file_name, 'w', encoding='utf-8') as file:\n",
    "        file.write(doc.page_content)\n",
    "\n",
    "    print(f'The string has been successfully stored in {file_name}.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOW MANUALLY CLEAN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the manually cleaned files back in, in the same order they were written\n",
    "cleaned_texts = []\n",
    "\n",
    "for counter in range(1, len(documents) + 1):\n",
    "    file_name = f\"file_{counter}.txt\"\n",
    "\n",
    "    # Open the file in read mode and collect its cleaned text\n",
    "    with open(file_name, 'r', encoding='utf-8') as file:\n",
    "        cleaned_texts.append(file.read())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the new cleaned texts on the documents, keeping their metadata\n",
    "if len(documents) != len(cleaned_texts):\n",
    "    raise Exception(\n",
    "        f\"Expected {len(documents)} cleaned texts but found {len(cleaned_texts)}.\"\n",
    "    )\n",
    "\n",
    "for document, cleaned_text in zip(documents, cleaned_texts):\n",
    "    document.page_content = cleaned_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store only the new documents and all documents\n",
    "store_documents(documents, \"./../input_data/Web/documents/new_documents\")\n",
    "\n",
    "already_cleaned_documents.extend(documents)\n",
    "store_documents(already_cleaned_documents, \"./../input_data/Web/documents/all_documents\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Update the URL lists: move all uncleaned URLs over to the cleaned file\n",
    "uncleaned_url_file_path = \"./../input_data/Web/URLs/uncleaned_urls.txt\"\n",
    "cleaned_url_file_path = \"./../input_data/Web/URLs/cleaned_urls.txt\"\n",
    "\n",
    "# Read URLs from the source file, dropping blank lines\n",
    "with open(uncleaned_url_file_path, \"r\") as source_file:\n",
    "    urls = [line.strip() for line in source_file if line.strip()]\n",
    "\n",
    "# Append the URLs to the destination file, one per line\n",
    "# (normalizing the trailing newline so later appends cannot merge two URLs)\n",
    "with open(cleaned_url_file_path, \"a\") as destination_file:\n",
    "    destination_file.writelines(url + \"\\n\" for url in urls)\n",
    "\n",
    "# Remove the URLs from the source file\n",
    "with open(uncleaned_url_file_path, \"w\") as source_file:\n",
    "    source_file.write(\"\")\n",
    "\n",
    "# Print moved urls\n",
    "for url in urls:\n",
    "    print(\"Moved URL:\", url)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}