File size: 1,612 Bytes
5273d83
 
 
 
 
 
 
 
 
f0af1c3
5273d83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import re
import pandas as pd
from urllib.parse import urlparse

import logging
# Configure the root logger at import time: INFO level, with each record
# rendered as "<timestamp> <level> [<logger name>] <message>".
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)
# Module-level logger named after this module (shown in the [%(name)s] field).
logger = logging.getLogger(__name__)


class UTILS:
    """Stateless helpers for text clean-up, DataFrame tidying and URL checks."""

    def __init__(self):
        """Create a helper instance; no state is stored."""
        pass


    def split_text(
        self,
        text
    ):
        """Split a comma-separated string into whitespace-stripped pieces.

        Args:
            text: The string to split on ``','``.

        Returns:
            A list with one stripped entry per comma-separated field. Empty
            fields are kept, so ``''`` yields ``['']``.
        """
        return [piece.strip() for piece in text.split(',')]


    def replace_newlines_and_spaces(
        self,
        text
    ):
        """Collapse every run of whitespace (newlines included) to one space.

        Args:
            text: The string to normalise.

        Returns:
            The string with each whitespace run replaced by a single space.
            Leading/trailing whitespace is collapsed, not stripped.
        """
        # ``\s`` already matches newlines, so one substitution covers both
        # the newline replacement and the space squeezing.
        return re.sub(r'\s+', ' ', text)


    def clean_df(
        self,
        df,
        dropna=True,
        fillna=False
    ):
        """Return a cleaned copy of ``df`` without modifying the input.

        Args:
            df: The pandas DataFrame to clean.
            dropna: When true, drop every row containing a missing value.
            fillna: When true, replace missing values with ``''``. This runs
                before ``dropna``, so enabling both leaves nothing to drop.

        Returns:
            A new DataFrame with duplicates removed and a fresh 0..n-1 index.
        """
        # Non-inplace operations: the caller's frame must not be left in a
        # half-cleaned state (NaNs dropped but duplicates still present), and
        # inplace edits on a slice trigger SettingWithCopy warnings.
        if fillna:
            df = df.fillna('')
        if dropna:
            df = df.dropna()
        return df.drop_duplicates().reset_index(drop=True)


    def validate_url_format(
        self,
        urls,
        url_type='urls'
    ):
        """Keep only the entries of ``urls`` that are well-formed URLs.

        An entry is kept when it is a string that parses with both a scheme
        and a network location. With ``url_type='online_pdf'`` it must also
        point at a ``.pdf`` resource (case-insensitive; a trailing query
        string or fragment after the path is tolerated).

        Args:
            urls: Iterable of candidate URLs.
            url_type: ``'online_pdf'`` enables the PDF extension check; any
                other value performs only the structural check.

        Returns:
            The accepted URLs, in input order.
        """
        valid_urls = []
        for url in urls:
            # None / NaN / numbers (e.g. from a DataFrame column) are never
            # valid URLs and would make urlparse fail.
            if not isinstance(url, str):
                continue
            try:
                result = urlparse(url)
            except ValueError:
                # urlparse raises on malformed input such as an unbalanced
                # IPv6 bracket; a validator should reject it, not crash.
                continue
            if not (result.scheme and result.netloc):
                continue
            if url_type == 'online_pdf':
                is_pdf = (
                    url.lower().endswith('.pdf')
                    or result.path.lower().endswith('.pdf')
                )
                if not is_pdf:
                    continue
            valid_urls.append(url)
        # Log through the module's named logger rather than the root logger,
        # so the record carries this module's name.
        logging.getLogger(__name__).info('Valid URLs are: %s', valid_urls)
        return valid_urls