harshil1973 committed on
Commit
33ba695
·
1 Parent(s): 5b34e25
Files changed (3) hide show
  1. app.py +69 -27
  2. docstore.json +0 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -11,13 +11,18 @@ from llama_index.core import VectorStoreIndex
11
  from llama_index.vector_stores.qdrant import QdrantVectorStore
12
  from llama_index.core import Settings
13
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 
14
  nest_asyncio.apply()
15
  os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
 
 
 
 
16
 
17
  # default llamaindex llm and embedding model selection
18
  @st.cache_resource(show_spinner=False)
19
  def llamaindex_default():
20
- Settings.llm = Groq(model="llama3-8b-8192", api_key="")
21
  Settings.embed_model = HuggingFaceEmbedding(
22
  model_name="law-ai/InLegalBERT", trust_remote_code=True
23
  )
@@ -34,8 +39,66 @@ def load_index():
34
  )
35
  return VectorStoreIndex.from_vector_store(vector_store=vector_store)
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  index = load_index()
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # reranker selection in the sidebar
40
  with st.sidebar:
41
  selected_reranker = st.selectbox(
@@ -60,31 +123,6 @@ with st.sidebar:
60
  value=10
61
  )
62
 
63
- @st.cache_resource(show_spinner=False)
64
- def load_retriver():
65
- dense_retriever = VectorIndexRetriever(
66
- index=index,
67
- similarity_top_k=num_k
68
- )
69
- sparse_retriever = BM25Retriever.from_persist_dir("./sparse_retriever")
70
- sparse_retriever.similarity_top_k = num_k
71
-
72
- retriever = QueryFusionRetriever(
73
- [
74
- dense_retriever,
75
- sparse_retriever,
76
- ],
77
- num_queries=1,
78
- use_async=False,
79
- retriever_weights=[dense_weightage, sparse_weightage],
80
- similarity_top_k=num_k,
81
- mode="relative_score",
82
- verbose=True,
83
- )
84
- return retriever
85
-
86
- retriever = load_retriver()
87
-
88
  st.title("Legal Documents Hybrid+Reranker Search")
89
 
90
  query = st.text_input("Search through documents by keyword", value="")
@@ -108,5 +146,9 @@ if search_btn and query:
108
  st.write("File Name: ", node["meta"].get("file_name"))
109
  st.write("reranking score: ", node["score"])
110
  st.write("node id", node["id"])
111
- st.write(node["text"])
 
 
 
 
112
  st.write("---")
 
11
  from llama_index.vector_stores.qdrant import QdrantVectorStore
12
  from llama_index.core import Settings
13
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
14
+ import PyPDF2
15
  nest_asyncio.apply()
16
  os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
17
+ groq_token = st.secrets["groq_token"]
18
+ st.set_page_config(
19
+ layout="wide"
20
+ )
21
 
22
  # default llamaindex llm and embedding model selection
23
  @st.cache_resource(show_spinner=False)
24
  def llamaindex_default():
25
+ Settings.llm = Groq(model="llama-3.1-8b-instant", api_key=groq_token)
26
  Settings.embed_model = HuggingFaceEmbedding(
27
  model_name="law-ai/InLegalBERT", trust_remote_code=True
28
  )
 
39
  )
40
  return VectorStoreIndex.from_vector_store(vector_store=vector_store)
41
 
42
+ @st.cache_resource(show_spinner=False)
43
+ def load_retriver():
44
+ dense_retriever = VectorIndexRetriever(
45
+ index=index,
46
+ similarity_top_k=num_k
47
+ )
48
+ sparse_retriever = BM25Retriever.from_persist_dir("./sparse_retriever")
49
+ sparse_retriever.similarity_top_k = num_k
50
+
51
+ retriever = QueryFusionRetriever(
52
+ [
53
+ dense_retriever,
54
+ sparse_retriever,
55
+ ],
56
+ num_queries=1,
57
+ use_async=False,
58
+ retriever_weights=[dense_weightage, sparse_weightage],
59
+ similarity_top_k=num_k,
60
+ mode="relative_score",
61
+ verbose=True,
62
+ )
63
+ return retriever
64
+
65
+ retriever = load_retriver()
66
  index = load_index()
67
 
68
+ def extract_pdf_content(pdf_file_path):
69
+ with open(pdf_file_path, 'rb') as pdf_file:
70
+ pdf_reader = PyPDF2.PdfReader(pdf_file)
71
+ text = ""
72
+ for page_num in range(len(pdf_reader.pages)):
73
+ page = pdf_reader.pages[page_num]  
74
+
75
+ text += page.extract_text()
76
+ return text
77
+
78
+ # prompt template for summarization
79
+ template = """
80
+ Please summarize the following legal document and provide the summary in the specified format. The output should directly follow the format without any introductory text.
81
+ **Document:**
82
+ {document_content}
83
+
84
+ **Format:**
85
+
86
+ **Case:** [Case Number]
87
+
88
+ **Petitioner:** [Petitioner's Name]
89
+
90
+ **Respondent:** [Respondent's Name]
91
+
92
+ **Judge:** [Judge's Name]
93
+
94
+ **Order Date:** [Order Date]
95
+
96
+ **Summary:**
97
+ - **Background:** [Brief description of the case background]
98
+ - **Allegations:** [Summary of the allegations made in the case]
99
+ - **Investigation:** [Key findings from the investigation]
100
+ - **Court's Decision:** [Summary of the court's decision and any conditions imposed]
101
+ """
102
  # reranker selection in the sidebar
103
  with st.sidebar:
104
  selected_reranker = st.selectbox(
 
123
  value=10
124
  )
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  st.title("Legal Documents Hybrid+Reranker Search")
127
 
128
  query = st.text_input("Search through documents by keyword", value="")
 
146
  st.write("File Name: ", node["meta"].get("file_name"))
147
  st.write("reranking score: ", node["score"])
148
  st.write("node id", node["id"])
149
+ with st.expander("See Summary"):
150
+ text = extract_pdf_content(node["meta"].get("file_name"))
151
+ formatted_template = template.format(document_content=text)
152
+ summary = Settings.llm.complete(formatted_template)
153
+ st.write(summary)
154
  st.write("---")
docstore.json DELETED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -7,4 +7,5 @@ xformers
7
  ipython
8
  matplotlib
9
  fastembed
10
- flashrank[listwise]
 
 
7
  ipython
8
  matplotlib
9
  fastembed
10
+ flashrank[listwise]
11
+ PyPDF2