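"""Query the Indian Kanoon API for judgments matching a search text, extract
each document's plain text and clause-related blockquotes, and hand the
results to insert_data for storage and lookup."""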
import requests
import insert_data
from bs4 import BeautifulSoup
#from dotenv import load_dotenv
import os
import streamlit as st
#load_dotenv()
api_key = st.secrets["IK_API_KEY"]
headers = {
    'authorization': f"Token {api_key}"
}

def get_text_for_new_docs(list_of_documents_not_present, searchusr, lst):
    """Fetch clean text and blockquotes for each document id that is not
    already stored, reusing title/size metadata from `lst`."""
    lst_new_data = {}
    for doc_id in list_of_documents_not_present:
        # Create the entry first so the text fields below always have a home
        lst_new_data[doc_id] = {'id': int(doc_id), 'title': '', 'cleantext': '', 'blocktext': '', 'size': ''}
        try:
            lst_new_data[doc_id]['title'] = lst[doc_id]['title']
            lst_new_data[doc_id]['size'] = lst[doc_id]['size']
        except KeyError:
            print("Error in get_text_for_new_docs")
        try:
            cleantext, blocktext_lst = get_text(doc_id, searchusr)
            blocktext = str(blocktext_lst)
        except Exception:
            cleantext = ''
            blocktext = ''
        lst_new_data[doc_id]['cleantext'] = cleantext
        lst_new_data[doc_id]['blocktext'] = blocktext
    return lst_new_data
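
# Shape of one entry in the mapping returned above (illustrative values only):
#   {'id': 1234567, 'title': 'Example v. Example', 'cleantext': '...',
#    'blocktext': "['quoted clause text']", 'size': 41000}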

def get_text(doc_id, searchusr):
    """Fetch one document from the Indian Kanoon API and return its plain
    text together with blockquotes that follow clause-related paragraphs."""
    idd = str(doc_id)
    url = f'https://api.indiankanoon.org/doc/{idd}/'
    res = requests.post(url, headers=headers).json()
    print("Request for doc with id", idd, "sent")
    html_string = ''
    try:
        html_string = res['doc']
        # The payload arrives with literal escape sequences; undo them before parsing
        escaped_string = bytes(html_string, 'utf-8').decode('unicode-escape')
        soup = BeautifulSoup(escaped_string, "html.parser")
        cleantext = soup.get_text()
    except Exception:
        cleantext = ''
    try:
        def get_blockquotes():
            search_strings = ["clause", "agreement", " which reads as", " mutually agreed", " states the following"]
            search_strings.append(str(searchusr))
            soup2 = BeautifulSoup(html_string, 'html.parser')
            filtered_paragraphs = []
            # Find all elements and process them
            elements = soup2.find_all()
            for i, element in enumerate(elements):
                # Check if the element is a paragraph containing any of the search strings
                if element.name == 'p' and any(
                        search_string in element.get_text() for search_string in search_strings):
                    # Collect <blockquote> elements among the next three elements
                    j = i + 1
                    while j < len(elements) and j <= i + 3:
                        next_element = elements[j]
                        if next_element.name == 'blockquote':
                            filtered_paragraphs.append(next_element.get_text())
                        j += 1
            return filtered_paragraphs

        # A list of quote strings; join with '\n' if a single string is wanted
        filtered_paragraphs = get_blockquotes()
    except Exception:
        filtered_paragraphs = []
    return cleantext, filtered_paragraphs
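
# A minimal sketch of calling get_text directly; the document id and search
# term below are placeholders, not values from this project:
#   cleantext, quotes = get_text(1234567, "arbitration clause")
#   print(cleantext[:200], quotes)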

def get_docs(search):
    """Search the Indian Kanoon API for the search text combined with a few
    clause-related phrases, collecting document ids, titles, and sizes."""
    S = requests.Session()
    S.headers.update(headers)  # merge the auth header into the session defaults
    # lst = ["clause which reads as"]
    lst = ["clause which reads as", " mutually agreed", "clause states the following"]
    # lst += ["clause", "agreement"]
    lst_data = {}
    for qry in lst:
        # Build the query fresh each pass; reassigning `search` itself would
        # keep wrapping the previous iteration's query string
        form_input = ('"' + search + '"' + qry).replace(' ', '+')
        for page_num in range(0, 1):  # only the first results page is fetched
            url = f"https://api.indiankanoon.org/search/?formInput={form_input}&pagenum={page_num}"
            res = S.post(url).json()
            print("Res printed is", res)
            for doc in res.get('docs', []):  # safe access to 'docs'
                tid = doc.get('tid')
                if tid:
                    doc_id = int(tid)
                    # Initialize a sub-dictionary if not already present
                    if doc_id not in lst_data:
                        lst_data[doc_id] = {'id': doc_id, 'title': '', 'size': ''}
                    # Safely assign title and size with default values
                    lst_data[doc_id]['title'] = doc.get('title', '')
                    lst_data[doc_id]['size'] = doc.get('docsize', '')
    return lst_data
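
# Shape of the mapping get_docs returns (illustrative values only):
#   {1234567: {'id': 1234567, 'title': 'Example v. Example', 'size': 41000}, ...}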

def main(shortcode):
    if not shortcode:
        return "Error: No shortcode provided", 400
    # Retrieve documents matching the shortcode
    lst = get_docs(shortcode)
    # Ids of documents that are not yet stored
    list_of_docs_not_present = insert_data.check_for_already_present(lst)
    # The remaining ids are already present
    list_of_docs_already_present = [docid for docid in lst.keys() if docid not in list_of_docs_not_present]
    # Fetch text only for the documents that are new
    lst_new_data = get_text_for_new_docs(list_of_docs_not_present, shortcode, lst)
    # Debug helper: dump the new data to a file
    # with open("new_data_output.txt", "w") as file:
    #     file.write(json.dumps(lst_new_data))
    results = insert_data.main(list_of_docs_already_present, lst_new_data,
                               shortcode)  # lst and shortcode to be passed
    # Debug helper: dump the results to a file
    # with open("Results3.txt", "w") as file:
    #     file.write(str(results))
    # Return an empty string when there are no results
    return results if results else ''
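

if __name__ == "__main__":
    # Minimal usage sketch, assuming insert_data and the IK_API_KEY secret are
    # configured; "non-compete clause" is an example query, not from this file.
    print(main("non-compete clause"))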