File size: 5,415 Bytes
2756ab2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

import requests
import insert_data
from bs4 import BeautifulSoup
#from dotenv import load_dotenv
import os
import streamlit as st

#load_dotenv()
# Indian Kanoon API key comes from Streamlit secrets (the dotenv path above
# is commented out for deployed environments).
api_key = st.secrets["IK_API_KEY"]
# Shared auth header for every Indian Kanoon API request in this module.
headers = {
    'authorization': f"Token {api_key}"
}


def get_text_for_new_docs(list_of_documents_not_present, searchusr, lst):
    """Fetch and clean the text of documents not yet present in storage.

    Args:
        list_of_documents_not_present: iterable of document ids to download.
        searchusr: the user's search string, forwarded to get_text() so it
            can anchor blockquote extraction.
        lst: dict of doc id -> {'title': ..., 'size': ...} metadata, as
            produced by get_docs().

    Returns:
        dict mapping each id to {'id', 'title', 'cleantext', 'blocktext',
        'size'}; any field whose retrieval fails is left as ''.
    """
    lst_new_data = {}
    for doc_id in list_of_documents_not_present:
        # Build the entry up-front so later assignments can never hit a
        # missing key (the old code created it inside a try block).
        entry = {'id': doc_id, 'title': '', 'cleantext': '',
                 'blocktext': '', 'size': ''}
        lst_new_data[doc_id] = entry
        try:
            entry['id'] = int(doc_id)
            entry['title'] = lst[doc_id]['title']
            entry['size'] = lst[doc_id]['size']
        except (KeyError, TypeError, ValueError):
            # Metadata missing/malformed for this id; keep '' defaults.
            print("Error in get_text_for_new_docs")

        try:
            cleantext, blocktext_lst = get_text(doc_id, searchusr)
            blocktext = str(blocktext_lst)
        except Exception:
            # Network or parse failure: store empty text rather than abort
            # the whole batch.
            cleantext = ''
            blocktext = ''

        entry['cleantext'] = cleantext
        entry['blocktext'] = blocktext

    return lst_new_data

def _extract_blockquotes(html_string, searchusr):
    """Return the text of <blockquote>s that closely follow a <p> mentioning
    clause-like phrases or the user's search string."""
    search_strings = ["clause", "agreement", " which reads as",
                      " mutually agreed", " states the following",
                      str(searchusr)]
    soup = BeautifulSoup(html_string, 'html.parser')

    filtered_paragraphs = []
    elements = soup.find_all()
    for i, element in enumerate(elements):
        # Anchor on paragraphs that contain any of the trigger phrases.
        if element.name == 'p' and any(
                needle in element.get_text() for needle in search_strings):
            # Scan at most the next three elements for blockquotes.
            for candidate in elements[i + 1:i + 4]:
                if candidate.name == 'blockquote':
                    filtered_paragraphs.append(candidate.get_text())
    return filtered_paragraphs


def get_text(id, searchusr):
    """Download one Indian Kanoon document and extract its plain text plus
    nearby quoted clauses.

    Args:
        id: document id (int or str); shadows the builtin, kept for
            backward compatibility with existing callers.
        searchusr: the user's search string, used as an extra trigger phrase
            when collecting blockquotes.

    Returns:
        tuple (cleantext, blockquotes): cleantext is the document's plain
        text ('' on failure); blockquotes is a list of blockquote texts, or
        '' when the document HTML could not be retrieved/parsed.
    """
    idd = str(id)
    url = f'https://api.indiankanoon.org/doc/{idd}/'
    # NOTE: the Indian Kanoon API expects POST for document retrieval.
    res = requests.post(url, headers=headers).json()
    print("Request for doc with id", idd, "sent")

    cleantext = ''
    html_string = ''  # fixed: was undefined when res['doc'] lookup failed
    try:
        html_string = res['doc']
        # The payload may contain backslash escapes; undo them before parsing.
        escaped_string = bytes(html_string, 'utf-8').decode('unicode-escape')
        soup = BeautifulSoup(escaped_string, "html.parser")
        cleantext = soup.get_text()
    except Exception:
        cleantext = ''

    if html_string:
        try:
            filtered_paragraphs = _extract_blockquotes(html_string, searchusr)
        except Exception:
            filtered_paragraphs = ''
    else:
        # Preserve the original failure sentinel ('' rather than []).
        filtered_paragraphs = ''

    return cleantext, filtered_paragraphs

def get_docs(search):
    """Query the Indian Kanoon search API for documents matching *search*.

    Runs one search per clause-related suffix phrase and merges the results.

    Args:
        search: the user's search phrase (unquoted, with spaces).

    Returns:
        dict of doc id (int) -> {'id', 'title', 'size'}; missing fields
        default to ''.
    """
    session = requests.Session()
    session.headers = headers
    # Suffixes appended to the quoted search term to bias results toward
    # judgments that quote contract clauses.
    suffixes = ["clause which reads as", " mutually agreed",
                "clause states the following"]
    lst_data = {}
    for qry in suffixes:
        # BUG FIX: build each query from the ORIGINAL search term. The old
        # code reassigned `search`, so each iteration re-quoted and
        # re-escaped the previous iteration's full query string.
        form_input = ('"' + search + '"' + qry).replace(' ', '+')
        for page_num in range(0, 1):  # first results page only
            url = (f"https://api.indiankanoon.org/search/"
                   f"?formInput={form_input}&pagenum={page_num}")
            res = session.post(url).json()
            print("Res printed is", res)
            for doc in res.get('docs', []):  # safe access to 'docs'
                tid = doc.get('tid')
                if not tid:
                    # Skip malformed entries; int('') used to raise here.
                    continue
                doc_id = int(tid)
                entry = lst_data.setdefault(
                    doc_id, {'id': doc_id, 'title': '', 'size': ''})
                entry['title'] = doc.get('title', '')
                entry['size'] = doc.get('docsize', '')

    return lst_data




def main(shortcode):
    """End-to-end pipeline: search Indian Kanoon, skip already-stored docs,
    download the new ones, and hand everything to insert_data.

    Args:
        shortcode: the user's search phrase.

    Returns:
        ("Error: No shortcode provided", 400) when *shortcode* is falsy;
        otherwise the results from insert_data.main() when non-empty,
        or '' when there are no results.
    """
    if not shortcode:
        return "Error: No shortcode provided", 400

    # Search Indian Kanoon for candidate documents.
    lst = get_docs(shortcode)

    # Ids that still need downloading vs ids already in the database.
    list_of_docs_not_present = insert_data.check_for_already_present(lst)
    list_of_docs_already_present = [
        doc_id for doc_id in lst if doc_id not in list_of_docs_not_present
    ]

    # Fetch and clean text only for the documents we don't already have.
    lst_new_data = get_text_for_new_docs(list_of_docs_not_present,
                                         shortcode, lst)

    results = insert_data.main(list_of_docs_already_present, lst_new_data,
                               shortcode)

    # Preserve the original contract: '' when results is None or empty.
    return results if results else ''