Spaces:

schematise
/

ICAT-v1-Query

Sleeping

App Files Files Community

ICAT-v1-Query / get_results.py

sankalps

made new text for streamlit spaces

2756ab2 12 months ago

raw

history blame

5.42 kB


	import requests
	import insert_data
	from bs4 import BeautifulSoup
	#from dotenv import load_dotenv
	import os
	import streamlit as st

	#load_dotenv()
	# Indian Kanoon API token, read from the Streamlit secrets store.
	api_key = st.secrets["IK_API_KEY"]
	# Authorization header shared by every Indian Kanoon request made in this module.
	headers = {'authorization': f"Token {api_key}"}


	def get_text_for_new_docs(list_of_documents_not_present, searchusr, lst):
	    """Build a record (metadata plus extracted text) for each document id given.

	    Args:
	        list_of_documents_not_present: iterable of document ids to fetch.
	        searchusr: the user's search phrase, forwarded unchanged to ``get_text``.
	        lst: mapping of document id -> dict providing 'title' and 'size'.

	    Returns:
	        dict keyed by the ids exactly as they were passed in. Each value is a
	        dict with the keys 'id', 'title', 'cleantext', 'blocktext' and 'size'.
	        Ids that cannot be converted with ``int()`` are skipped. Missing
	        metadata or a failed text fetch leaves the affected fields as ''.
	    """
	    lst_new_data = {}
	    for doc_id in list_of_documents_not_present:
	        try:
	            numeric_id = int(doc_id)
	        except (TypeError, ValueError):
	            # Without a numeric id no record can be built; skip this id
	            # instead of failing later with a KeyError on the missing record.
	            print("Error in get_text_for_new_docs")
	            continue

	        record = {'id': numeric_id, 'title': '', 'cleantext': '', 'blocktext': '', 'size': ''}
	        try:
	            metadata = lst[doc_id]
	            record['title'] = metadata['title']
	            record['size'] = metadata['size']
	        except (KeyError, IndexError, TypeError):
	            # Metadata is optional: keep the empty defaults and carry on.
	            print("Error in get_text_for_new_docs")

	        try:
	            cleantext, blocktext_lst = get_text(doc_id, searchusr)
	            blocktext = str(blocktext_lst)
	        except Exception as exc:
	            # Best effort: one failed fetch must not abort the whole batch.
	            print("Error fetching text for doc", doc_id, ":", exc)
	            cleantext = ''
	            blocktext = ''

	        record['cleantext'] = cleantext
	        record['blocktext'] = blocktext
	        lst_new_data[doc_id] = record

	    return lst_new_data

	def _blockquotes_near_matches(html_string, search_strings):
	    """Return the text of each <blockquote> found among the three elements after a matching <p>."""
	    elements = BeautifulSoup(html_string, 'html.parser').find_all()
	    found = []
	    for i, element in enumerate(elements):
	        if element.name != 'p':
	            continue
	        paragraph_text = element.get_text()
	        if not any(term in paragraph_text for term in search_strings):
	            continue
	        # Look at (up to) the next three elements in document order.
	        for follower in elements[i + 1:i + 4]:
	            if follower.name == 'blockquote':
	                found.append(follower.get_text())
	    return found


	def get_text(id, searchusr):
	    """Fetch one document from the Indian Kanoon API and extract its text.

	    Args:
	        id: document id; converted with ``str()`` to build the request URL.
	        searchusr: user search phrase, added to the fixed list of phrases
	            used to pick the paragraphs whose following blockquotes are kept.

	    Returns:
	        A ``(cleantext, blockquotes)`` tuple. ``cleantext`` is the document
	        text with markup removed, or '' if it could not be parsed.
	        ``blockquotes`` is a list of blockquote texts, or '' if the response
	        has no 'doc' entry or the blockquote scan failed.

	    Raises:
	        Whatever ``requests.post`` / ``Response.json`` raise (network errors,
	        timeouts, invalid JSON); only the parsing steps are best-effort.
	    """
	    idd = str(id)
	    url = f'https://api.indiankanoon.org/doc/{idd}/'
	    # A timeout keeps a stalled connection from blocking the caller forever.
	    res = requests.post(url, headers=headers, timeout=30).json()
	    print("Request for doc with id", idd, "sent")

	    try:
	        html_string = res['doc']
	    except (KeyError, IndexError, TypeError):
	        # No document body in the response: nothing to extract.
	        return '', ''

	    try:
	        # NOTE(review): the 'unicode-escape' round-trip presumably targets
	        # literal backslash escapes in the payload, but it also re-decodes
	        # UTF-8 bytes as Latin-1, which likely mangles non-ASCII text.
	        # Kept as-is pending confirmation against real API responses.
	        escaped_string = bytes(html_string, 'utf-8').decode('unicode-escape')
	        clean_text = BeautifulSoup(escaped_string, "html.parser").get_text()
	    except Exception:
	        # Best effort: an unparsable document yields empty text.
	        clean_text = ''

	    search_strings = ["clause", "agreement", " which reads as", " mutually agreed", " states the following"]
	    search_strings.append(str(searchusr))
	    try:
	        filtered_paragraphs = _blockquotes_near_matches(html_string, search_strings)
	    except Exception:
	        # Callers stringify this value, so the historical '' fallback is kept.
	        filtered_paragraphs = ''

	    return clean_text, filtered_paragraphs

	def get_docs(search):
	    """Search the Indian Kanoon API for documents matching the user's term.

	    The quoted term is combined with each of three fixed phrases and the
	    first result page of every query is collected.

	    Args:
	        search: the user's search term (a string).

	    Returns:
	        dict mapping integer document id -> {'id', 'title', 'size'}, merged
	        across all queries. Results without a usable 'tid' are skipped.

	    Raises:
	        Whatever ``requests`` raises for network errors, timeouts or a
	        response body that is not valid JSON.
	    """
	    query_suffixes = ["clause which reads as", " mutually agreed", "clause states the following"]
	    lst_data = {}
	    with requests.Session() as session:
	        session.headers.update(headers)
	        for suffix in query_suffixes:
	            # Build every query from the original term. Reassigning `search`
	            # here would make each later query embed the earlier ones.
	            query = '"' + search + '"' + suffix
	            for page_num in range(0, 1):
	                # `params` lets requests URL-encode the query (spaces, quotes,
	                # '&', ...) instead of hand-replacing spaces with '+'.
	                res = session.post(
	                    "https://api.indiankanoon.org/search/",
	                    params={'formInput': query, 'pagenum': page_num},
	                    timeout=30,
	                ).json()
	                print("Res printed is", res)

	                docs = res.get('docs') if isinstance(res, dict) else None
	                for doc in docs or []:
	                    try:
	                        doc_id = int(doc.get('tid', ''))
	                    except (TypeError, ValueError):
	                        # Missing or non-numeric id: cannot index this result.
	                        continue
	                    if not doc_id:
	                        continue
	                    entry = lst_data.setdefault(doc_id, {'id': doc_id, 'title': '', 'size': ''})
	                    entry['title'] = doc.get('title', '')
	                    entry['size'] = doc.get('docsize', '')

	    return lst_data




	def main(shortcode):
	    """Run the search pipeline for ``shortcode`` and return its results.

	    Steps: search for matching documents, ask ``insert_data`` which of them
	    are not yet present, fetch text for those, then hand both groups to
	    ``insert_data.main``.

	    Returns:
	        ``("Error: No shortcode provided", 400)`` when ``shortcode`` is falsy;
	        otherwise the value returned by ``insert_data.main``, or '' when that
	        value is None or has length 0.
	    """
	    if not shortcode:
	        return "Error: No shortcode provided", 400

	    found_docs = get_docs(shortcode)
	    missing_ids = insert_data.check_for_already_present(found_docs)
	    # Everything the search returned that is not reported as missing.
	    known_ids = [doc_id for doc_id in found_docs if doc_id not in missing_ids]
	    fresh_records = get_text_for_new_docs(missing_ids, shortcode, found_docs)

	    results = insert_data.main(known_ids, fresh_records, shortcode)

	    # len() rather than truthiness: only the length decides emptiness here.
	    if results is None or len(results) == 0:
	        return ''
	    return results