File size: 5,415 Bytes
2756ab2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

import requests
import insert_data
from bs4 import BeautifulSoup
#from dotenv import load_dotenv
import os
import streamlit as st

#load_dotenv()
# Indian Kanoon API key comes from Streamlit secrets (the dotenv path above
# is commented out for deployed environments).
api_key = st.secrets["IK_API_KEY"]
# Shared auth header for every Indian Kanoon API request in this module.
headers = {
    'authorization': f"Token {api_key}"
}


def get_text_for_new_docs(list_of_documents_not_present, searchusr, lst):
    """Fetch and clean the text of documents not yet present in storage.

    Args:
        list_of_documents_not_present: iterable of document ids to download.
        searchusr: the user's search string, forwarded to get_text() so it
            can anchor blockquote extraction.
        lst: dict of doc id -> {'title': ..., 'size': ...} metadata, as
            produced by get_docs().

    Returns:
        dict mapping each id to {'id', 'title', 'cleantext', 'blocktext',
        'size'}; any field whose retrieval fails is left as ''.
    """
    lst_new_data = {}
    for doc_id in list_of_documents_not_present:
        # Build the entry up-front so later assignments can never hit a
        # missing key (the old code created it inside a try block).
        entry = {'id': doc_id, 'title': '', 'cleantext': '',
                 'blocktext': '', 'size': ''}
        lst_new_data[doc_id] = entry
        try:
            entry['id'] = int(doc_id)
            entry['title'] = lst[doc_id]['title']
            entry['size'] = lst[doc_id]['size']
        except (KeyError, TypeError, ValueError):
            # Metadata missing/malformed for this id; keep '' defaults.
            print("Error in get_text_for_new_docs")

        try:
            cleantext, blocktext_lst = get_text(doc_id, searchusr)
            blocktext = str(blocktext_lst)
        except Exception:
            # Network or parse failure: store empty text rather than abort
            # the whole batch.
            cleantext = ''
            blocktext = ''

        entry['cleantext'] = cleantext
        entry['blocktext'] = blocktext

    return lst_new_data

def _extract_blockquotes(html_string, searchusr):
    """Return the text of <blockquote>s that closely follow a <p> mentioning
    clause-like phrases or the user's search string."""
    search_strings = ["clause", "agreement", " which reads as",
                      " mutually agreed", " states the following",
                      str(searchusr)]
    soup = BeautifulSoup(html_string, 'html.parser')

    filtered_paragraphs = []
    elements = soup.find_all()
    for i, element in enumerate(elements):
        # Anchor on paragraphs that contain any of the trigger phrases.
        if element.name == 'p' and any(
                needle in element.get_text() for needle in search_strings):
            # Scan at most the next three elements for blockquotes.
            for candidate in elements[i + 1:i + 4]:
                if candidate.name == 'blockquote':
                    filtered_paragraphs.append(candidate.get_text())
    return filtered_paragraphs


def get_text(id, searchusr):
    """Download one Indian Kanoon document and extract its plain text plus
    nearby quoted clauses.

    Args:
        id: document id (int or str); shadows the builtin, kept for
            backward compatibility with existing callers.
        searchusr: the user's search string, used as an extra trigger phrase
            when collecting blockquotes.

    Returns:
        tuple (cleantext, blockquotes): cleantext is the document's plain
        text ('' on failure); blockquotes is a list of blockquote texts, or
        '' when the document HTML could not be retrieved/parsed.
    """
    idd = str(id)
    url = f'https://api.indiankanoon.org/doc/{idd}/'
    # NOTE: the Indian Kanoon API expects POST for document retrieval.
    res = requests.post(url, headers=headers).json()
    print("Request for doc with id", idd, "sent")

    cleantext = ''
    html_string = ''  # fixed: was undefined when res['doc'] lookup failed
    try:
        html_string = res['doc']
        # The payload may contain backslash escapes; undo them before parsing.
        escaped_string = bytes(html_string, 'utf-8').decode('unicode-escape')
        soup = BeautifulSoup(escaped_string, "html.parser")
        cleantext = soup.get_text()
    except Exception:
        cleantext = ''

    if html_string:
        try:
            filtered_paragraphs = _extract_blockquotes(html_string, searchusr)
        except Exception:
            filtered_paragraphs = ''
    else:
        # Preserve the original failure sentinel ('' rather than []).
        filtered_paragraphs = ''

    return cleantext, filtered_paragraphs

def get_docs(search):
    """Query the Indian Kanoon search API for documents matching *search*.

    Runs one search per clause-related suffix phrase and merges the results.

    Args:
        search: the user's search phrase (unquoted, with spaces).

    Returns:
        dict of doc id (int) -> {'id', 'title', 'size'}; missing fields
        default to ''.
    """
    session = requests.Session()
    session.headers = headers
    # Suffixes appended to the quoted search term to bias results toward
    # judgments that quote contract clauses.
    suffixes = ["clause which reads as", " mutually agreed",
                "clause states the following"]
    lst_data = {}
    for qry in suffixes:
        # BUG FIX: build each query from the ORIGINAL search term. The old
        # code reassigned `search`, so each iteration re-quoted and
        # re-escaped the previous iteration's full query string.
        form_input = ('"' + search + '"' + qry).replace(' ', '+')
        for page_num in range(0, 1):  # first results page only
            url = (f"https://api.indiankanoon.org/search/"
                   f"?formInput={form_input}&pagenum={page_num}")
            res = session.post(url).json()
            print("Res printed is", res)
            for doc in res.get('docs', []):  # safe access to 'docs'
                tid = doc.get('tid')
                if not tid:
                    # Skip malformed entries; int('') used to raise here.
                    continue
                doc_id = int(tid)
                entry = lst_data.setdefault(
                    doc_id, {'id': doc_id, 'title': '', 'size': ''})
                entry['title'] = doc.get('title', '')
                entry['size'] = doc.get('docsize', '')

    return lst_data




def main(shortcode):
    """End-to-end pipeline: search Indian Kanoon, skip already-stored docs,
    download the new ones, and hand everything to insert_data.

    Args:
        shortcode: the user's search phrase.

    Returns:
        ("Error: No shortcode provided", 400) when *shortcode* is falsy;
        otherwise the results from insert_data.main() when non-empty,
        or '' when there are no results.
    """
    if not shortcode:
        return "Error: No shortcode provided", 400

    # Search Indian Kanoon for candidate documents.
    lst = get_docs(shortcode)

    # Ids that still need downloading vs ids already in the database.
    list_of_docs_not_present = insert_data.check_for_already_present(lst)
    list_of_docs_already_present = [
        doc_id for doc_id in lst if doc_id not in list_of_docs_not_present
    ]

    # Fetch and clean text only for the documents we don't already have.
    lst_new_data = get_text_for_new_docs(list_of_docs_not_present,
                                         shortcode, lst)

    results = insert_data.main(list_of_docs_already_present, lst_new_data,
                               shortcode)

    # Preserve the original contract: '' when results is None or empty.
    return results if results else ''