amalsp committed on
Commit
4241674
·
verified ·
1 Parent(s): 5bf4b6c

Add semantic search with sentence-transformers and improved UI

Browse files
Files changed (1) hide show
  1. app.py +145 -62
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
2
  from datasets import load_dataset
3
  import pandas as pd
4
  import random
 
 
5
 
6
  # Load the Indian Law dataset
7
  print("Loading Indian Law Dataset...")
@@ -9,49 +11,60 @@ ds = load_dataset("viber1/indian-law-dataset")
9
 
10
  # Convert to pandas for easier manipulation
11
  df = pd.DataFrame(ds['train'])
12
-
13
  print(f"Dataset loaded successfully with {len(df)} entries")
14
  print(f"Dataset columns: {df.columns.tolist()}")
15
 
16
- # Preview first few entries
17
- print("\nFirst 3 entries:")
18
- for i in range(min(3, len(df))):
19
- print(f"\nEntry {i+1}:")
20
- for col in df.columns:
21
- print(f" {col}: {df.iloc[i][col][:100] if isinstance(df.iloc[i][col], str) else df.iloc[i][col]}...")
 
 
 
22
 
23
  def search_legal_info(question):
24
- """Search the dataset for relevant legal information based on user question"""
25
  if not question or len(question.strip()) == 0:
26
- return "Please enter a legal question."
27
-
28
- question_lower = question.lower()
29
- results = []
30
-
31
- # Search through the dataset
32
- for idx, row in df.iterrows():
33
- # Check all text columns for matches
34
- for col in df.columns:
35
- if isinstance(row[col], str) and any(word in row[col].lower() for word in question_lower.split()):
36
- results.append(row.to_dict())
37
- break
38
-
39
- if len(results) >= 5: # Limit to top 5 results
40
- break
41
 
42
- if not results:
43
- return "No relevant information found in the dataset. Try rephrasing your question or use different keywords."
44
 
45
- # Format the response
46
- response = "📋 **Legal Information Found:**\n\n"
47
- for i, result in enumerate(results, 1):
48
- response += f"**Result {i}:**\n"
 
 
 
 
 
 
 
 
 
 
 
 
49
  for key, value in result.items():
50
- if value and isinstance(value, str):
51
- # Truncate long text
52
- display_value = value[:500] + "..." if len(value) > 500 else value
53
- response += f"- **{key}**: {display_value}\n"
54
- response += "\n---\n\n"
 
 
 
 
 
 
 
 
55
 
56
  return response
57
 
@@ -60,62 +73,132 @@ def get_random_sample():
60
  random_idx = random.randint(0, len(df) - 1)
61
  sample = df.iloc[random_idx]
62
 
63
- response = "πŸ“ **Random Dataset Entry:**\n\n"
 
64
  for key, value in sample.items():
65
- if value and isinstance(value, str):
66
- display_value = value[:500] + "..." if len(value) > 500 else value
67
- response += f"**{key}**: {display_value}\n\n"
 
 
 
 
68
 
69
  return response
70
 
71
- # Create Gradio interface
72
- with gr.Blocks(title="Indian Law Q&A Assistant") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  gr.Markdown("""
74
  # 🏛️ Indian Law Q&A Assistant
75
 
76
- Welcome to the Indian Law Question-Answer Assistant powered by the `viber1/indian-law-dataset`.
77
-
78
- ### How to use:
79
- 1. Enter your legal question in the text box below
80
- 2. Click "Search" to find relevant information from the dataset
81
- 3. Or click "Random Sample" to explore a random entry from the dataset
82
 
83
  ---
84
 
85
- ⚠️ **DISCLAIMER**: This application is for **informational purposes only**. The information provided
86
- is based on a dataset and should NOT be considered as legal advice. Always consult with a qualified
87
- legal professional for specific legal matters and guidance.
88
-
89
- ---
90
  """)
91
 
92
  with gr.Row():
93
- with gr.Column():
94
  question_input = gr.Textbox(
95
- label="Your Legal Question",
96
- placeholder="E.g., What are the provisions related to property rights?",
97
  lines=3
98
  )
99
 
100
  with gr.Row():
101
- search_btn = gr.Button("🔍 Search", variant="primary")
102
- random_btn = gr.Button("🎲 Random Sample")
103
 
104
- output_box = gr.Markdown(label="Response")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  # Button actions
107
  search_btn.click(fn=search_legal_info, inputs=question_input, outputs=output_box)
108
  random_btn.click(fn=get_random_sample, inputs=None, outputs=output_box)
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  gr.Markdown("""
111
  ---
112
 
113
- ### Dataset Information:
114
- - **Dataset**: viber1/indian-law-dataset
115
  - **Total Entries**: """ + str(len(df)) + """
116
- - **Columns**: """ + ", ".join(df.columns.tolist()) + """
117
-
118
- *Built with 💙 using Gradio and Hugging Face Datasets*
 
 
 
 
 
 
 
 
 
119
  """)
120
 
121
  if __name__ == "__main__":
 
2
  from datasets import load_dataset
3
  import pandas as pd
4
  import random
5
+ from sentence_transformers import SentenceTransformer, util
6
+ import torch
7
 
8
  # Load the Indian Law dataset
9
  print("Loading Indian Law Dataset...")
 
11
 
12
  # Convert to pandas for easier manipulation
13
  df = pd.DataFrame(ds['train'])
 
14
  print(f"Dataset loaded successfully with {len(df)} entries")
15
  print(f"Dataset columns: {df.columns.tolist()}")
16
 
17
+ # Load semantic search model
18
+ print("Loading sentence-transformers model for semantic search...")
19
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
20
+
21
+ # Precompute embeddings for the dataset
22
+ print("Computing embeddings for dataset...")
23
+ df['combined_text'] = df.apply(lambda row: ' '.join([str(val) for val in row.values if pd.notna(val) and isinstance(val, str)]), axis=1)
24
+ corpus_embeddings = model.encode(df['combined_text'].tolist(), convert_to_tensor=True, show_progress_bar=True)
25
+ print("Embeddings computed successfully!")
26
 
27
  def search_legal_info(question):
28
+ """Search the dataset for relevant legal information using semantic search"""
29
  if not question or len(question.strip()) == 0:
30
+ return "⚠️ Please enter a legal question to search."
31
+
32
+ # Encode the query
33
+ query_embedding = model.encode(question, convert_to_tensor=True)
 
 
 
 
 
 
 
 
 
 
 
34
 
35
+ # Compute cosine similarity scores
36
+ cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
37
 
38
+ # Get top 5 results
39
+ top_results = torch.topk(cos_scores, k=min(5, len(df)))
40
+
41
+ # Format the response with best match first
42
+ response = "# 🔍 Search Results\n\n"
43
+
44
+ for i, (score, idx) in enumerate(zip(top_results.values, top_results.indices)):
45
+ result = df.iloc[idx.item()].to_dict()
46
+ similarity = score.item()
47
+
48
+ # Skip low relevance results
49
+ if similarity < 0.2:
50
+ continue
51
+
52
+ response += f"## 📌 Result {i+1} (Relevance: {similarity*100:.1f}%)\n\n"
53
+
54
  for key, value in result.items():
55
+ if key == 'combined_text': # Skip internal field
56
+ continue
57
+ if value and isinstance(value, str) and len(value.strip()) > 0:
58
+ # Clean and format the text
59
+ display_value = value.strip()
60
+ if len(display_value) > 800:
61
+ display_value = display_value[:800] + "..."
62
+ response += f"**{key.replace('_', ' ').title()}:**\n\n{display_value}\n\n"
63
+
64
+ response += "---\n\n"
65
+
66
+ if "Result 1" not in response:
67
+ return "❌ No relevant information found in the dataset. Please try rephrasing your question or use different keywords."
68
 
69
  return response
70
 
 
73
  random_idx = random.randint(0, len(df) - 1)
74
  sample = df.iloc[random_idx]
75
 
76
+ response = "# πŸ“ Random Legal Information\n\n"
77
+
78
  for key, value in sample.items():
79
+ if key == 'combined_text': # Skip internal field
80
+ continue
81
+ if value and isinstance(value, str) and len(value.strip()) > 0:
82
+ display_value = value.strip()
83
+ if len(display_value) > 800:
84
+ display_value = display_value[:800] + "..."
85
+ response += f"**{key.replace('_', ' ').title()}:**\n\n{display_value}\n\n"
86
 
87
  return response
88
 
89
+ def handle_feedback(question, feedback_type):
90
+ """Handle user feedback"""
91
+ return f"✅ Thank you for your {feedback_type}! Your input helps us improve the system."
92
+
93
+ # Example questions
94
+ EXAMPLE_QUESTIONS = [
95
+ "Can a plaint be amended after it has been filed in a civil case in India?",
96
+ "What are the provisions for bail under Indian law?",
97
+ "What are the rights of an accused person in India?",
98
+ "How can property rights be transferred in India?",
99
+ "What is the procedure for filing a divorce petition?",
100
+ "What are the provisions related to consumer protection?",
101
+ "What are the penalties for copyright infringement in India?",
102
+ ]
103
+
104
+ # Create Gradio interface with improved UI
105
+ with gr.Blocks(title="Indian Law Q&A Assistant", theme=gr.themes.Soft()) as demo:
106
  gr.Markdown("""
107
  # 🏛️ Indian Law Q&A Assistant
108
 
109
+ ### ⚠️ IMPORTANT DISCLAIMER
110
+ **This application is for informational purposes only and does NOT constitute legal advice.**
111
+ The information provided is based on a dataset and should not be relied upon for legal decisions.
112
+ Always consult with a qualified legal professional for specific legal matters.
 
 
113
 
114
  ---
115
 
116
+ Welcome to the Indian Law Question-Answer Assistant powered by semantic search technology
117
+ and the `viber1/indian-law-dataset`. Ask questions and get relevant legal information instantly!
 
 
 
118
  """)
119
 
120
  with gr.Row():
121
+ with gr.Column(scale=2):
122
  question_input = gr.Textbox(
123
+ label="💬 Your Legal Question",
124
+ placeholder="Type your legal question here...",
125
  lines=3
126
  )
127
 
128
  with gr.Row():
129
+ search_btn = gr.Button("🔍 Search", variant="primary", size="lg")
130
+ random_btn = gr.Button("🎲 Random Sample", size="lg")
131
 
132
+ gr.Markdown("### 📋 Example Questions (Click to use):")
133
+
134
+ with gr.Row():
135
+ example_btns = []
136
+ for example in EXAMPLE_QUESTIONS[:4]:
137
+ btn = gr.Button(example, size="sm")
138
+ example_btns.append(btn)
139
+
140
+ with gr.Row():
141
+ for example in EXAMPLE_QUESTIONS[4:]:
142
+ btn = gr.Button(example, size="sm")
143
+ example_btns.append(btn)
144
+
145
+ output_box = gr.Markdown(label="📄 Response", value="Enter a question above and click Search to begin.")
146
+
147
+ with gr.Row():
148
+ gr.Markdown("""
149
+ ### πŸ“’ Feedback
150
+ Found this helpful? Have suggestions? Click below:
151
+ """)
152
+
153
+ with gr.Row():
154
+ helpful_btn = gr.Button("👍 Helpful", size="sm")
155
+ report_btn = gr.Button("πŸ“ Report Issue", size="sm")
156
+
157
+ feedback_output = gr.Markdown(visible=False)
158
 
159
  # Button actions
160
  search_btn.click(fn=search_legal_info, inputs=question_input, outputs=output_box)
161
  random_btn.click(fn=get_random_sample, inputs=None, outputs=output_box)
162
 
163
+ # Example button actions
164
+ for i, btn in enumerate(example_btns):
165
+ btn.click(
166
+ fn=lambda ex=EXAMPLE_QUESTIONS[i]: ex,
167
+ inputs=None,
168
+ outputs=question_input
169
+ )
170
+
171
+ # Feedback actions
172
+ helpful_btn.click(
173
+ fn=lambda q: handle_feedback(q, "positive feedback"),
174
+ inputs=question_input,
175
+ outputs=feedback_output
176
+ ).then(lambda: gr.update(visible=True), outputs=feedback_output)
177
+
178
+ report_btn.click(
179
+ fn=lambda q: handle_feedback(q, "report"),
180
+ inputs=question_input,
181
+ outputs=feedback_output
182
+ ).then(lambda: gr.update(visible=True), outputs=feedback_output)
183
+
184
  gr.Markdown("""
185
  ---
186
 
187
+ ### 📊 Dataset Information
188
+ - **Source**: viber1/indian-law-dataset on Hugging Face
189
  - **Total Entries**: """ + str(len(df)) + """
190
+ - **Search Method**: Semantic search using sentence-transformers
191
+ - **Model**: sentence-transformers/all-MiniLM-L6-v2
192
+
193
+ ### 🔧 Features
194
+ - ✅ Semantic search for better relevance
195
+ - ✅ Results ranked by similarity score
196
+ - ✅ Clean, readable Markdown formatting
197
+ - ✅ Example questions for quick start
198
+ - ✅ Random exploration of dataset
199
+ - ✅ User feedback mechanism
200
+
201
+ *Built with ❀️ using Gradio, Hugging Face Datasets, and Sentence Transformers*
202
  """)
203
 
204
  if __name__ == "__main__":