mohitmayank committed on
Commit 5469918
1 Parent(s): 80eb70b

initial version

.DS_Store ADDED
Binary file (6.15 kB).
IPC_Semantic_Search.ipynb ADDED
The diff for this file is too large to render.
README.md CHANGED
@@ -1,13 +1,16 @@
- ---
- title: Law Finder Ipc
- emoji: 📚
- colorFrom: pink
- colorTo: green
- sdk: streamlit
- sdk_version: 1.9.0
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
+ # IPC Semantic Search
+
+ **What:**
+ - The Indian Penal Code (IPC) is the official criminal code of India. [Wikipedia](https://en.wikipedia.org/wiki/Indian_Penal_Code)
+ - The intention behind this project is to provide an interface for the common layperson to search IPC sections.
+
+ **Steps:**
+ - Scrape the `devgan.in` website to get the different sections of the IPC
+ - Use `LegalBERT` to get embeddings of the descriptions of the IPC sections
+ - Perform cosine similarity to find the sections closest to the query *(a minimal sketch follows this diff)*
+
+ **Code:**
+ - `devganscrap`: Scrapy code to crawl and extract IPC sections and their descriptions from `devgan.in`
+
+ **Data:**
+ - `devganscrap/sections_desc.csv`: IPC sections and descriptions *(Data credits: http://devgan.in)*
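For reference, here is a minimal sketch of the embed-and-rank step described in the **Steps** above, assuming `sentence-transformers` is available; the deployed `app.py` (next file) instead uses `txtai`'s `Embeddings` wrapper, which performs an equivalent nearest-neighbour search over the same model's vectors. Model and column names mirror the app; nothing here is part of the committed code.

```python
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# load the scraped sections and embed every description once
df = pd.read_csv("devganscrap/sections_desc.csv")
model = SentenceTransformer("sentence-transformers/nli-mpnet-base-v2")
corpus_emb = model.encode(df["description"].astype(str).tolist(), convert_to_tensor=True)

# embed the query and rank sections by cosine similarity
query_emb = model.encode("animal cruelty", convert_to_tensor=True)
scores = util.cos_sim(query_emb, corpus_emb)[0]
top = scores.topk(5)
for score, idx in zip(top.values.tolist(), top.indices.tolist()):
    print(f"{score:.3f}  {df.loc[idx, 'section']}")
```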
app.py ADDED
@@ -0,0 +1,42 @@
+ # imports
+ import pandas as pd
+ import streamlit as st
+ from txtai.embeddings import Embeddings
+
+ # set page config
+ st.set_page_config(layout="wide", page_title="⚖️ Law Finder - IPC")
+
+ # load the embedding model and data (cached for faster reloads)
+ @st.cache(allow_output_mutation=True)
+ def load_model_embeddings_data():
+     embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})
+     # embeddings.load("embedding")
+     df = pd.read_csv("devganscrap/sections_desc.csv")
+     # index the section descriptions so they can be searched semantically
+     embeddings.index([(uid, str(text), None) for uid, text in enumerate(df['description'].tolist())])
+     return embeddings, df
+
+ # load the model and data
+ embeddings, df = load_model_embeddings_data()
+
+ # APP
+ # set title and subtitle
+ st.title("⚖️ Law Finder - IPC")
+ st.markdown("Search the [Indian Penal Code](https://en.wikipedia.org/wiki/Indian_Penal_Code) sections in simple English.")
+ st.markdown("The data scraping procedure is explained in detail on [my website](http://mohitmayank.com/a_lazy_data_science_guide/python/scraping_websites/)")
+ st.markdown("The complete code is on [Github](https://github.com/imohitmayank/ipc_semantic_search)")
+
+ # create the input text box and the search button
+ query = st.text_area("Input your search phrase here!", "animal cruelty")
+ button = st.button("Find sections...")
+
+ # if the button is clicked, run the semantic search
+ if button:
+     with st.spinner("Finding the most similar sections...."):
+         # find and display the top matching sections
+         st.markdown("**Sections:**")
+         for uid, score in embeddings.search(query, limit=5):
+             st.write({
+                 'section': df.loc[uid, 'section'],
+                 'description': df.loc[uid, 'description']
+             })
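To run the Space locally, install the pinned dependencies with `pip install -r requirements.txt` and start the UI from the repository root with `streamlit run app.py` (the CSV path above is relative to it). The commented-out `embeddings.load("embedding")` hints at a prebuilt index; a sketch of building and saving one offline with txtai's `save`/`load` follows (the `"embedding"` path is illustrative, not part of this commit):

```python
import pandas as pd
from txtai.embeddings import Embeddings

# build the index once, offline, exactly as load_model_embeddings_data() does
df = pd.read_csv("devganscrap/sections_desc.csv")
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})
embeddings.index([(uid, str(text), None) for uid, text in enumerate(df["description"].tolist())])

# persist it so the app could call embeddings.load("embedding") instead of re-indexing
embeddings.save("embedding")
```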
devganscrap/.DS_Store ADDED
Binary file (6.15 kB).
devganscrap/devganscrap/__init__.py ADDED
File without changes
devganscrap/devganscrap/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (174 Bytes).
devganscrap/devganscrap/__pycache__/settings.cpython-38.pyc ADDED
Binary file (290 Bytes).
devganscrap/devganscrap/items.py ADDED
@@ -0,0 +1,12 @@
+ # Define here the models for your scraped items
+ #
+ # See documentation in:
+ # https://docs.scrapy.org/en/latest/topics/items.html
+
+ import scrapy
+
+
+ class DevganscrapItem(scrapy.Item):
+     # define the fields for your item here like:
+     # name = scrapy.Field()
+     pass
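`DevganscrapItem` is left as the empty `startproject` template; the spider (`devgan.py`, further down) yields plain dicts instead. Purely for illustration, an equivalent typed item covering the fields the spider produces would look like this (hypothetical, not used by the committed code):

```python
import scrapy

class SectionItem(scrapy.Item):
    # mirrors the dict keys yielded by the spider in devgan.py
    title = scrapy.Field()
    link = scrapy.Field()
    section = scrapy.Field()
    description = scrapy.Field()
```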
devganscrap/devganscrap/middlewares.py ADDED
@@ -0,0 +1,103 @@
+ # Define here the models for your spider middleware
+ #
+ # See documentation in:
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+ from scrapy import signals
+
+ # useful for handling different item types with a single interface
+ from itemadapter import is_item, ItemAdapter
+
+
+ class DevganscrapSpiderMiddleware:
+     # Not all methods need to be defined. If a method is not defined,
+     # scrapy acts as if the spider middleware does not modify the
+     # passed objects.
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         # This method is used by Scrapy to create your spiders.
+         s = cls()
+         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+         return s
+
+     def process_spider_input(self, response, spider):
+         # Called for each response that goes through the spider
+         # middleware and into the spider.
+
+         # Should return None or raise an exception.
+         return None
+
+     def process_spider_output(self, response, result, spider):
+         # Called with the results returned from the Spider, after
+         # it has processed the response.
+
+         # Must return an iterable of Request, or item objects.
+         for i in result:
+             yield i
+
+     def process_spider_exception(self, response, exception, spider):
+         # Called when a spider or process_spider_input() method
+         # (from other spider middleware) raises an exception.
+
+         # Should return either None or an iterable of Request or item objects.
+         pass
+
+     def process_start_requests(self, start_requests, spider):
+         # Called with the start requests of the spider, and works
+         # similarly to the process_spider_output() method, except
+         # that it doesn't have a response associated.
+
+         # Must return only requests (not items).
+         for r in start_requests:
+             yield r
+
+     def spider_opened(self, spider):
+         spider.logger.info('Spider opened: %s' % spider.name)
+
+
+ class DevganscrapDownloaderMiddleware:
+     # Not all methods need to be defined. If a method is not defined,
+     # scrapy acts as if the downloader middleware does not modify the
+     # passed objects.
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         # This method is used by Scrapy to create your spiders.
+         s = cls()
+         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+         return s
+
+     def process_request(self, request, spider):
+         # Called for each request that goes through the downloader
+         # middleware.
+
+         # Must either:
+         # - return None: continue processing this request
+         # - or return a Response object
+         # - or return a Request object
+         # - or raise IgnoreRequest: process_exception() methods of
+         #   installed downloader middleware will be called
+         return None
+
+     def process_response(self, request, response, spider):
+         # Called with the response returned from the downloader.
+
+         # Must either:
+         # - return a Response object
+         # - return a Request object
+         # - or raise IgnoreRequest
+         return response
+
+     def process_exception(self, request, exception, spider):
+         # Called when a download handler or a process_request()
+         # (from other downloader middleware) raises an exception.
+
+         # Must either:
+         # - return None: continue processing this exception
+         # - return a Response object: stops process_exception() chain
+         # - return a Request object: stops process_exception() chain
+         pass
+
+     def spider_opened(self, spider):
+         spider.logger.info('Spider opened: %s' % spider.name)
devganscrap/devganscrap/pipelines.py ADDED
@@ -0,0 +1,13 @@
+ # Define your item pipelines here
+ #
+ # Don't forget to add your pipeline to the ITEM_PIPELINES setting
+ # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+ # useful for handling different item types with a single interface
+ from itemadapter import ItemAdapter
+
+
+ class DevganscrapPipeline:
+     def process_item(self, item, spider):
+         return item
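`DevganscrapPipeline` is the stock pass-through. If post-processing were needed, a pipeline could, for example, collapse whitespace in the scraped descriptions; a hypothetical sketch (not enabled here, and it would also need an `ITEM_PIPELINES` entry in `settings.py`):

```python
from itemadapter import ItemAdapter

class DescriptionCleanupPipeline:
    # hypothetical: normalise whitespace in the 'description' field of each item
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter.get("description"):
            adapter["description"] = " ".join(str(adapter["description"]).split())
        return item
```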
devganscrap/devganscrap/settings.py ADDED
@@ -0,0 +1,88 @@
+ # Scrapy settings for devganscrap project
+ #
+ # For simplicity, this file contains only settings considered important or
+ # commonly used. You can find more settings consulting the documentation:
+ #
+ #     https://docs.scrapy.org/en/latest/topics/settings.html
+ #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+ #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+ BOT_NAME = 'devganscrap'
+
+ SPIDER_MODULES = ['devganscrap.spiders']
+ NEWSPIDER_MODULE = 'devganscrap.spiders'
+
+
+ # Crawl responsibly by identifying yourself (and your website) on the user-agent
+ #USER_AGENT = 'devganscrap (+http://www.yourdomain.com)'
+
+ # Obey robots.txt rules
+ ROBOTSTXT_OBEY = True
+
+ # Configure maximum concurrent requests performed by Scrapy (default: 16)
+ #CONCURRENT_REQUESTS = 32
+
+ # Configure a delay for requests for the same website (default: 0)
+ # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+ # See also autothrottle settings and docs
+ #DOWNLOAD_DELAY = 3
+ # The download delay setting will honor only one of:
+ #CONCURRENT_REQUESTS_PER_DOMAIN = 16
+ #CONCURRENT_REQUESTS_PER_IP = 16
+
+ # Disable cookies (enabled by default)
+ #COOKIES_ENABLED = False
+
+ # Disable Telnet Console (enabled by default)
+ #TELNETCONSOLE_ENABLED = False
+
+ # Override the default request headers:
+ #DEFAULT_REQUEST_HEADERS = {
+ #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ #   'Accept-Language': 'en',
+ #}
+
+ # Enable or disable spider middlewares
+ # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+ #SPIDER_MIDDLEWARES = {
+ #    'devganscrap.middlewares.DevganscrapSpiderMiddleware': 543,
+ #}
+
+ # Enable or disable downloader middlewares
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+ #DOWNLOADER_MIDDLEWARES = {
+ #    'devganscrap.middlewares.DevganscrapDownloaderMiddleware': 543,
+ #}
+
+ # Enable or disable extensions
+ # See https://docs.scrapy.org/en/latest/topics/extensions.html
+ #EXTENSIONS = {
+ #    'scrapy.extensions.telnet.TelnetConsole': None,
+ #}
+
+ # Configure item pipelines
+ # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ #ITEM_PIPELINES = {
+ #    'devganscrap.pipelines.DevganscrapPipeline': 300,
+ #}
+
+ # Enable and configure the AutoThrottle extension (disabled by default)
+ # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+ #AUTOTHROTTLE_ENABLED = True
+ # The initial download delay
+ #AUTOTHROTTLE_START_DELAY = 5
+ # The maximum download delay to be set in case of high latencies
+ #AUTOTHROTTLE_MAX_DELAY = 60
+ # The average number of requests Scrapy should be sending in parallel to
+ # each remote server
+ #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+ # Enable showing throttling stats for every response received:
+ #AUTOTHROTTLE_DEBUG = False
+
+ # Enable and configure HTTP caching (disabled by default)
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+ #HTTPCACHE_ENABLED = True
+ #HTTPCACHE_EXPIRATION_SECS = 0
+ #HTTPCACHE_DIR = 'httpcache'
+ #HTTPCACHE_IGNORE_HTTP_CODES = []
+ #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
devganscrap/devganscrap/spiders/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # This package will contain the spiders of your Scrapy project
+ #
+ # Please refer to the documentation for information on how to create and manage
+ # your spiders.
devganscrap/devganscrap/spiders/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (182 Bytes).
devganscrap/devganscrap/spiders/__pycache__/devgan.cpython-38.pyc ADDED
Binary file (1.37 kB).
devganscrap/devganscrap/spiders/devgan.py ADDED
@@ -0,0 +1,35 @@
+ import scrapy
+
+ class QuotesSpider(scrapy.Spider):
+     name = "devgan"
+     allowed_domains = ["devgan.in"]
+
+     def start_requests(self):
+         urls = [
+             'http://devgan.in/all_sections_ipc.php',
+         ]
+         for url in urls:
+             yield scrapy.Request(url=url, callback=self.parse_mainpage)
+
+     def parse_mainpage(self, response):
+         # identify the links to the individual section pages
+         sections = response.css('div#content').css('a')
+         # for each section, collect its title, link and section number
+         for section in sections:
+             loc = {
+                 'title': section.xpath('@title').extract(),
+                 'link': 'http://devgan.in' + section.xpath('@href').extract()[0],
+                 'section': section.css('span.sectionlink::text').extract(),
+             }
+             # follow the link and extract the description from the section page
+             yield scrapy.Request(loc['link'], callback=self.parse_section,
+                                  cb_kwargs=dict(meta=loc))
+
+     def parse_section(self, response, meta):
+         # extract the description text and add it to the collected fields
+         meta['description'] = " ".join(response.css('tr.mys-desc').css('::text').extract())
+         return meta
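The spider works in two stages: `parse_mainpage` collects the per-section links from the index page, and `parse_section` follows each link to pull the description. It was presumably run with `scrapy crawl devgan -o sections_desc.csv` from the `devganscrap/` project directory; an equivalent programmatic run would look like this (a sketch; the feed path is an assumption, and it must be executed from inside the Scrapy project so the settings are found):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from devganscrap.spiders.devgan import QuotesSpider

# load the project settings and export the scraped items to CSV
settings = get_project_settings()
settings.set("FEEDS", {"sections_desc.csv": {"format": "csv"}})

process = CrawlerProcess(settings)
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl is finished
```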
devganscrap/scrapy.cfg ADDED
@@ -0,0 +1,11 @@
+ # Automatically created by: scrapy startproject
+ #
+ # For more information about the [deploy] section see:
+ # https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+ [settings]
+ default = devganscrap.settings
+
+ [deploy]
+ #url = http://localhost:6800/
+ project = devganscrap
devganscrap/sections_desc.csv ADDED
The diff for this file is too large to render.
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ scrapy==2.6.1
+ streamlit
+ txtai
+ pandas