Spaces:

myn0908
/

Own-Knowledge-GPT

Build error

File size: 2,249 Bytes

d97a6fa

import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from fpdf import FPDF


def content_crawler(url, file_format='txt', output_file='privacy_policy', timeout=30):
    """Fetch a web page and save its text content to a file.

    The text of every ``<h2>``, ``<p>``, ``<i>`` and ``<ul>`` element found in
    the page is extracted and written to
    ``../learning_documents/<output_file>.<file_format>`` (resolved against
    the current working directory; the folder is created when missing).
    Outcomes are reported with ``print``; the function returns ``None``.

    Args:
        url: Address of the page to download.
        file_format: One of ``'txt'``, ``'pdf'``, ``'csv'`` or ``'xml'``.
        output_file: Output file name without extension.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            (connection failure, timeout, invalid URL, ...).
    """
    supported_formats = ('txt', 'pdf', 'csv', 'xml')

    # Reject an unsupported format before doing any network or disk work.
    if file_format not in supported_formats:
        print("Invalid file format. Supported formats: txt, pdf, csv, xml")
        return

    # Without a timeout a stalled server would block this call forever.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve content from the URL.")
        return

    # Parse the HTML and keep only the text of the elements of interest.
    soup = BeautifulSoup(response.text, "html.parser")
    texts = [element.text for element in soup.find_all(['h2', 'p', 'i', 'ul'])]

    # exist_ok avoids the check-then-create race of a separate exists() test.
    output_dir = '../learning_documents'
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_file)

    if file_format == 'txt':
        with open(f"{output_path}.txt", "w", encoding="utf-8") as file:
            file.writelines(f'{line}\n' for line in texts)
    elif file_format == 'pdf':
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", "B", 8)
        for line in texts:
            # The built-in (core) fonts only cover Latin-1; replace anything
            # else (curly quotes, em dashes, ...) instead of failing.
            printable = line.encode('latin-1', 'replace').decode('latin-1')
            # NOTE(review): cell() does not wrap, so a long paragraph can run
            # past the right margin; kept as in the original layout.
            pdf.cell(0, 10, printable, ln=True)
        pdf.output(f"{output_path}.pdf")
    elif file_format == 'csv':
        df = pd.DataFrame({'Content': texts})
        df.to_csv(f"{output_path}.csv", index=False)
    else:  # 'xml'
        # Local import: only this branch needs it.
        from xml.sax.saxutils import escape

        # Escape &, < and > so page text cannot break the document structure.
        xml_content = ''.join(f'<item>{escape(line)}</item>' for line in texts)
        with open(f"{output_path}.xml", "w", encoding="utf-8") as file:
            file.write(f'<root>{xml_content}</root>')

    print(f"Content saved to {output_path}.{file_format}")


if __name__ == "__main__":
    # Running this module directly does nothing by default.
    # To try the crawler, uncomment the call below:
    # content_crawler("https://www.presight.io/privacy-policy.html", file_format='pdf', output_file='privacy_policy')
    pass