Spaces:
Sleeping
Sleeping
Commit
·
33ba695
1
Parent(s):
5b34e25
let's see
Browse files- app.py +69 -27
- docstore.json +0 -0
- requirements.txt +2 -1
app.py
CHANGED
@@ -11,13 +11,18 @@ from llama_index.core import VectorStoreIndex
|
|
11 |
from llama_index.vector_stores.qdrant import QdrantVectorStore
|
12 |
from llama_index.core import Settings
|
13 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
|
14 |
nest_asyncio.apply()
|
15 |
os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
|
|
|
|
|
|
|
|
|
16 |
|
17 |
# default llamaindex llm and embedding model selection
|
18 |
@st.cache_resource(show_spinner=False)
|
19 |
def llamaindex_default():
|
20 |
-
Settings.llm = Groq(model="
|
21 |
Settings.embed_model = HuggingFaceEmbedding(
|
22 |
model_name="law-ai/InLegalBERT", trust_remote_code=True
|
23 |
)
|
@@ -34,8 +39,66 @@ def load_index():
|
|
34 |
)
|
35 |
return VectorStoreIndex.from_vector_store(vector_store=vector_store)
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
index = load_index()
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
# reranker selection in the sidebar
|
40 |
with st.sidebar:
|
41 |
selected_reranker = st.selectbox(
|
@@ -60,31 +123,6 @@ with st.sidebar:
|
|
60 |
value=10
|
61 |
)
|
62 |
|
63 |
-
@st.cache_resource(show_spinner=False)
|
64 |
-
def load_retriver():
|
65 |
-
dense_retriever = VectorIndexRetriever(
|
66 |
-
index=index,
|
67 |
-
similarity_top_k=num_k
|
68 |
-
)
|
69 |
-
sparse_retriever = BM25Retriever.from_persist_dir("./sparse_retriever")
|
70 |
-
sparse_retriever.similarity_top_k = num_k
|
71 |
-
|
72 |
-
retriever = QueryFusionRetriever(
|
73 |
-
[
|
74 |
-
dense_retriever,
|
75 |
-
sparse_retriever,
|
76 |
-
],
|
77 |
-
num_queries=1,
|
78 |
-
use_async=False,
|
79 |
-
retriever_weights=[dense_weightage, sparse_weightage],
|
80 |
-
similarity_top_k=num_k,
|
81 |
-
mode="relative_score",
|
82 |
-
verbose=True,
|
83 |
-
)
|
84 |
-
return retriever
|
85 |
-
|
86 |
-
retriever = load_retriver()
|
87 |
-
|
88 |
st.title("Legal Documents Hybrid+Reranker Search")
|
89 |
|
90 |
query = st.text_input("Search through documents by keyword", value="")
|
@@ -108,5 +146,9 @@ if search_btn and query:
|
|
108 |
st.write("File Name: ", node["meta"].get("file_name"))
|
109 |
st.write("reranking score: ", node["score"])
|
110 |
st.write("node id", node["id"])
|
111 |
-
st.
|
|
|
|
|
|
|
|
|
112 |
st.write("---")
|
|
|
11 |
from llama_index.vector_stores.qdrant import QdrantVectorStore
|
12 |
from llama_index.core import Settings
|
13 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
14 |
+
import PyPDF2
|
15 |
nest_asyncio.apply()
|
16 |
os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]
|
17 |
+
groq_token = st.secrets["groq_token"]
|
18 |
+
st.set_page_config(
|
19 |
+
layout="wide"
|
20 |
+
)
|
21 |
|
22 |
# default llamaindex llm and embedding model selection
|
23 |
@st.cache_resource(show_spinner=False)
|
24 |
def llamaindex_default():
|
25 |
+
Settings.llm = Groq(model="llama-3.1-8b-instant", api_key=groq_token)
|
26 |
Settings.embed_model = HuggingFaceEmbedding(
|
27 |
model_name="law-ai/InLegalBERT", trust_remote_code=True
|
28 |
)
|
|
|
39 |
)
|
40 |
return VectorStoreIndex.from_vector_store(vector_store=vector_store)
|
41 |
|
42 |
+
@st.cache_resource(show_spinner=False)
|
43 |
+
def load_retriver():
|
44 |
+
dense_retriever = VectorIndexRetriever(
|
45 |
+
index=index,
|
46 |
+
similarity_top_k=num_k
|
47 |
+
)
|
48 |
+
sparse_retriever = BM25Retriever.from_persist_dir("./sparse_retriever")
|
49 |
+
sparse_retriever.similarity_top_k = num_k
|
50 |
+
|
51 |
+
retriever = QueryFusionRetriever(
|
52 |
+
[
|
53 |
+
dense_retriever,
|
54 |
+
sparse_retriever,
|
55 |
+
],
|
56 |
+
num_queries=1,
|
57 |
+
use_async=False,
|
58 |
+
retriever_weights=[dense_weightage, sparse_weightage],
|
59 |
+
similarity_top_k=num_k,
|
60 |
+
mode="relative_score",
|
61 |
+
verbose=True,
|
62 |
+
)
|
63 |
+
return retriever
|
64 |
+
|
65 |
+
retriever = load_retriver()
|
66 |
index = load_index()
|
67 |
|
68 |
+
def extract_pdf_content(pdf_file_path):
|
69 |
+
with open(pdf_file_path, 'rb') as pdf_file:
|
70 |
+
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
71 |
+
text = ""
|
72 |
+
for page_num in range(len(pdf_reader.pages)):
|
73 |
+
page = pdf_reader.pages[page_num]
|
74 |
+
|
75 |
+
text += page.extract_text()
|
76 |
+
return text
|
77 |
+
|
78 |
+
#prompt template for summarization
|
79 |
+
template = """
|
80 |
+
Please summarize the following legal document and provide the summary in the specified format. The output should directly follow the format without any introductory text.
|
81 |
+
**Document:**
|
82 |
+
{document_content}
|
83 |
+
|
84 |
+
**Format:**
|
85 |
+
|
86 |
+
**Case:** [Case Number]
|
87 |
+
|
88 |
+
**Petitioner:** [Petitioner's Name]
|
89 |
+
|
90 |
+
**Respondent:** [Respondent's Name]
|
91 |
+
|
92 |
+
**Judge:** [Judge's Name]
|
93 |
+
|
94 |
+
**Order Date:** [Order Date]
|
95 |
+
|
96 |
+
**Summary:**
|
97 |
+
- **Background:** [Brief description of the case background]
|
98 |
+
- **Allegations:** [Summary of the allegations made in the case]
|
99 |
+
- **Investigation:** [Key findings from the investigation]
|
100 |
+
- **Court's Decision:** [Summary of the court's decision and any conditions imposed]
|
101 |
+
"""
|
102 |
# reranker selection in the sidebar
|
103 |
with st.sidebar:
|
104 |
selected_reranker = st.selectbox(
|
|
|
123 |
value=10
|
124 |
)
|
125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
st.title("Legal Documents Hybrid+Reranker Search")
|
127 |
|
128 |
query = st.text_input("Search through documents by keyword", value="")
|
|
|
146 |
st.write("File Name: ", node["meta"].get("file_name"))
|
147 |
st.write("reranking score: ", node["score"])
|
148 |
st.write("node id", node["id"])
|
149 |
+
with st.expander("See Summary"):
|
150 |
+
text = extract_pdf_content(node["meta"].get("file_name"))
|
151 |
+
formatted_template = template.format(document_content=text)
|
152 |
+
summary = Settings.llm.complete(formatted_template)
|
153 |
+
st.write(summary)
|
154 |
st.write("---")
|
docstore.json
DELETED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ xformers
|
|
7 |
ipython
|
8 |
matplotlib
|
9 |
fastembed
|
10 |
-
flashrank[listwise]
|
|
|
|
7 |
ipython
|
8 |
matplotlib
|
9 |
fastembed
|
10 |
+
flashrank[listwise]
|
11 |
+
PyPDF2
|