# Import the necessary packages.
import json
import os

import bs4
import openai
import streamlit as st
from bs4 import BeautifulSoup
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.document_loaders import WebBaseLoader, PyPDFLoader, TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from streamlit import session_state

# The OpenAI key is read from the OPENAI_API_KEY environment variable; set it
# there (or assign it here) before running the app.
openai.api_key = os.getenv("OPENAI_API_KEY")
model = ChatOpenAI(model='gpt-4', max_tokens=100, temperature=0)
st.set_page_config(page_title="jury_records", page_icon="📈")


# Extract the content of a URL. LangChain's WebBaseLoader is used here, but
# any web-scraping function could be substituted.
def extract(link):
    """Return the text found at ``link`` as a single string.

    Args:
        link: URL handed to ``WebBaseLoader``.

    Returns:
        The text of every loaded page joined by single spaces, or the
        sentinel string ``'error'`` when no text was found.
    """
    pages = WebBaseLoader(link).load()
    # Collapse each run of whitespace to one space. Deleting newlines outright
    # would fuse words that were separated only by a line break.
    normalized = (" ".join(page.page_content.split()) for page in pages)
    content = " ".join(text for text in normalized if text)
    # Whitespace-only pages carry nothing to summarize, so they are reported
    # as a failure instead of being sent to the model.
    return content or 'error'


# Summarize the content with GPT-4 through prompting.
def summarize(link):
    """Summarize the page at ``link`` with a GPT-4 chat completion.

    The extracted page text is supplied as the system message and a fixed
    list of summarization instructions as the user message.

    Args:
        link: URL of the record to summarize.

    Returns:
        The stripped text of the first completion choice, or the sentinel
        string ``'error'`` when ``extract`` found no content.
    """
    context = extract(link)
    if context == 'error':
        return 'error'

    # NOTE(review): the "<>" in the prompt looks like a leftover placeholder;
    # it is kept verbatim so the prompt sent to the model is unchanged.
    instructions = (
        "Summarize the content in detail."
        " Follow these instructions while summarizing.\n"
        " Include case no.\n"
        " Include all Plaintiff. \n"
        " Include the court name. \n"
        " Alias name should be included.\n"
        " Include case no. \n"
        " Include all defendants.\n"
        " If place is mentioned then include it, otherwise don't include it. \n"
        " Date format should be dd/mm/yyyy.\n"
        " If case is settled for an amount then try to include the amount."
        " If amount is not mentioned don't mentioned anything about the same."
        " only include this line if case is setteled otherwise include the"
        " status of case.\n"
        "\n<>\n\n"
        " Please try to include all the details. \n"
        "Don't leave out any information."
    )
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": f"Following context is given.{context}",
            },
            {
                "role": "user",
                "content": instructions,
            },
        ],
        temperature=0,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content.strip()


# Questions used for Q&A, keyed by the field each one extracts. This is the
# final set arrived at after many iterations; change the dictionary to match
# whichever fields need to be extracted from the URL.
info_detail = {
    'case_type': 'provide case type or court system like "Criminal", "Family Law", "labour law"',
    'name_of_court': 'provide name of court or jail or court record.',
    'case_number': 'provide case number or country case number or bankrupty case number',
    'date_filed': 'what is the date when the case was filed or the date when case first formally/officially submitted?',
    'plaintiff': 'Names of the Petitioner or plaintiff or applicant? ',
    'defendants': "Names of all defendants, respondent and alias. Name entity under 'Defendants'",
    'nature_of_action': 'Summarize the reason behind the case within 20 words in detail',
    'status': 'what is the status of case?',
}

# LangChain function for Q&A over the summary produced by GPT-4, using a
# vector store index.
def lang(context):
    """Answer every question in ``info_detail`` against ``context``.

    ``context`` is wrapped in a single ``Document`` and indexed with
    ``VectorstoreIndexCreator``; each question is then asked through the
    index using the module-level ``model``.

    Args:
        context: Text to query (the summary returned by ``summarize``).

    Returns:
        A dict with the same keys as ``info_detail``, mapping each key to
        the answer returned by the index query.
    """
    answer_dict = {}
    docs = Document(page_content=context)
    index2 = VectorstoreIndexCreator().from_documents([docs])
    try:
        for key, ques in info_detail.items():
            answer_dict[key] = index2.query(llm=model, question=ques)
    finally:
        # Run the cleanup even when a query raises; presumably this removes
        # the indexed document so it cannot leak into a later request --
        # TODO confirm against the vector store in use.
        index2.vectorstore.delete()
    return answer_dict


def process(url):
    """Summarize ``url`` and extract the structured case details from it.

    Args:
        url: Link to the record page.

    Returns:
        The answer dict produced by ``lang``, or
        ``{"details": "", "status": False}`` when the URL is blank or no
        summary could be produced.
    """
    url = (url or "").strip()
    if not url:
        # A blank link cannot be fetched; report it the same way as a page
        # with no content instead of letting the loader raise.
        return {"details": "", "status": False}

    summary = summarize(url)
    if summary == 'error':
        return {"details": "", "status": False}
    return lang(summary)


# Keep the last result in session state so it survives Streamlit reruns.
if 'jury_records_dict' not in session_state:
    session_state['jury_records_dict'] = ""


def Jury(jury_url):
    """Button callback: process ``jury_url`` and store the result in session state."""
    session_state['jury_records_dict'] = process(jury_url)


st.title("Jury Records")
jury_url = st.text_area(label="Please enter the jury records link", placeholder="Jury records Link")
st.text_area("result", value=session_state['jury_records_dict'])
st.button("Get answer dictionary", on_click=Jury, args=[jury_url])