Chintan Donda committed
Commit 5273d83
1 Parent(s): 22a1464

KKMS Kisan Smart Search Demo App and its scripts
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: KKMS Smart Search Demo
- emoji: 🐠
- colorFrom: purple
- colorTo: blue
+ title: KKMS KSSW
+ emoji: 🔥
+ colorFrom: blue
+ colorTo: pink
  sdk: gradio
- sdk_version: 3.27.0
+ sdk_version: 3.24.1
  app_file: app.py
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,537 @@
1
+ import gradio as gr
2
+ import os
3
+ import datetime
4
+
5
+ import src.constants as constants_utils
6
+ import src.kkms_kssw as kkms_kssw
7
+ import src.weather as weather_utils
8
+
9
+ os.environ["CURL_CA_BUNDLE"] = ""
10
+
11
+ import warnings
12
+ warnings.filterwarnings('ignore')
13
+
14
+
15
+ class DomState:
16
+ def __init__(
17
+ self,
18
+ index_type,
19
+ load_from_existing_index_file
20
+ ):
21
+ self.index_type = index_type
22
+ self.load_from_existing_index_file = load_from_existing_index_file
23
+
24
+ self.relevant_paragraphs = ''
25
+ self.sources_relevant_paragraphs = ''
26
+ self.answer = ''
27
+ self.summary = ''
28
+ self.mandi_price = ''
29
+ self.mandi_from_date = (datetime.datetime.now() - datetime.timedelta(days=5)).strftime('%Y-%m-%d')
30
+ self.mandi_to_date = datetime.datetime.now().strftime('%Y-%m-%d')
31
+ self.weather_info = ''
32
+ self.weather_forecast = ''
33
+ self.weather_forecast_summary = ''
34
+ self.indic_translation = ''
35
+
36
+ # Initialize index (vector store) - This will create a new index from scratch if load_from_existing_index_file == False
37
+ self.kkms_kssw_obj = kkms_kssw.KKMS_KSSW()
38
+ self.kkms_kssw_obj.load_create_index()
39
+
40
+
41
+ def click_handler_for_get_relevant_paragraphs(
42
+ self,
43
+ question_category,
44
+ question
45
+ ):
46
+ self.relevant_paragraphs = self.kkms_kssw_obj.query(
47
+ question=question,
48
+ question_category=question_category
49
+ )
50
+ if self.index_type in ['FAISS', 'Chroma']:
51
+ self.sources_relevant_paragraphs = [doc.metadata for doc in self.relevant_paragraphs]
52
+ self.relevant_paragraphs = [doc.page_content.replace('\n', '').replace('\t', ' ') for doc in self.relevant_paragraphs]
53
+ return self.relevant_paragraphs
54
+
55
+
56
+ def click_handler_for_relevant_paragraphs_source(
57
+ self,
58
+ relevant_paragraphs
59
+ ):
60
+ return self.sources_relevant_paragraphs
61
+
62
+
63
+ def click_handler_for_summary(
64
+ self,
65
+ answer
66
+ ):
67
+ self.summary = self.kkms_kssw_obj.langchain_utils_obj.get_textual_summary(answer)
68
+ return self.summary
69
+
70
+
71
+ def click_handler_for_get_answer(
72
+ self,
73
+ relevant_paragraphs,
74
+ question
75
+ ):
76
+ self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(
77
+ relevant_paragraphs,
78
+ question
79
+ )
80
+ return self.answer
81
+
82
+
83
+ def click_handler_for_mandi_price(
84
+ self,
85
+ state_name,
86
+ apmc_name,
87
+ commodity_name,
88
+ from_date,
89
+ to_date
90
+ ):
91
+ if state_name and apmc_name and commodity_name and from_date and to_date:
92
+ self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
93
+ return self.mandi_price
94
+
95
+
96
+ def click_handler_for_get_weather(
97
+ self,
98
+ city
99
+ ):
100
+ time, info, temperature = self.kkms_kssw_obj.weather_utils_obj.get_weather(city)
101
+ self.weather_info = f'Weather in {city.capitalize()} on {time} is {temperature} with {info}.'
102
+ return self.weather_info
103
+
104
+
105
+ def click_handler_for_get_weather_forecast(
106
+ self,
107
+ state,
108
+ district
109
+ ):
110
+ self.weather_forecast = self.kkms_kssw_obj.weather_utils_obj.get_weather_forecast(state, district)
111
+ return self.weather_forecast
112
+
113
+
114
+ def click_handler_for_weather_forecast_summary(
115
+ self,
116
+ weather_forecast
117
+ ):
118
+ self.weather_forecast_summary = self.kkms_kssw_obj.langchain_utils_obj.get_weather_forecast_summary(weather_forecast)
119
+ return self.weather_forecast_summary
120
+
121
+
122
+ def click_handler_for_load_files_urls(
123
+ self,
124
+ doc_type,
125
+ files_or_urls,
126
+ question_category
127
+ ):
128
+ self.kkms_kssw_obj.upload_data(
129
+ doc_type=constants_utils.DATA_SOURCES[doc_type],
130
+ files_or_urls=files_or_urls,
131
+ index_category=question_category
132
+ )
133
+
134
+
135
+ def click_handler_for_get_indic_translation(
136
+ self,
137
+ eng_ans,
138
+ language='Hindi'
139
+ ):
140
+ self.indic_translation = self.kkms_kssw_obj.translator_utils_obj.get_indic_google_translate(eng_ans, language)
141
+ return self.indic_translation
142
+
143
+
144
+ def click_handler_for_weather_forecast_districts_dropdown_list_update(
145
+ self,
146
+ state,
147
+ district
148
+ ):
149
+ return gr.update(
150
+ choices=self.kkms_kssw_obj.weather_utils_obj.get_district_names(state)
151
+ )
152
+
153
+
154
+ def click_handler_for_weather_forecast_district(
155
+ self,
156
+ state,
157
+ district,
158
+ weather
159
+ ):
160
+ return self.kkms_kssw_obj.weather_utils_obj.get_weather_forecast(state, district)
161
+
162
+
163
+ def _upload_file(self, files):
164
+ file_paths = [file.name for file in files]
165
+ return file_paths
166
+
167
+
168
+ def select_widget(
169
+ self,
170
+ choice
171
+ ):
172
+ if choice == "Custom Query":
173
+ return [
174
+ gr.update(visible=True),
175
+ gr.update(visible=False),
176
+ gr.update(visible=False),
177
+ gr.update(visible=False),
178
+ gr.update(visible=False),
179
+ ]
180
+
181
+ elif choice == "General (AgGPT)":
182
+ return [
183
+ gr.update(visible=False),
184
+ gr.update(visible=True),
185
+ gr.update(visible=False),
186
+ gr.update(visible=False),
187
+ gr.update(visible=False),
188
+ ]
189
+
190
+ elif choice == "Mandi Price":
191
+ return [
192
+ gr.update(visible=False),
193
+ gr.update(visible=False),
194
+ gr.update(visible=True),
195
+ gr.update(visible=False),
196
+ gr.update(visible=False),
197
+ ]
198
+
199
+ elif choice == "Weather":
200
+ return [
201
+ gr.update(visible=False),
202
+ gr.update(visible=False),
203
+ gr.update(visible=False),
204
+ gr.update(visible=True),
205
+ gr.update(visible=False),
206
+ ]
207
+
208
+ elif choice == "Load Custom Data":
209
+ return [
210
+ gr.update(visible=False),
211
+ gr.update(visible=False),
212
+ gr.update(visible=False),
213
+ gr.update(visible=False),
214
+ gr.update(visible=True)
215
+ ]
216
+
217
+ else:
218
+ return gr.update(visible=False)
219
+
220
+
221
+ def select_files_urls(
222
+ self,
223
+ choice
224
+ ):
225
+ if choice == "PDF":
226
+ return [
227
+ gr.update(visible=True),
228
+ gr.update(visible=False),
229
+ gr.update(visible=False),
230
+ gr.update(visible=False),
231
+ ]
232
+
233
+ elif choice == "Online PDF":
234
+ return [
235
+ gr.update(visible=False),
236
+ gr.update(visible=True),
237
+ gr.update(visible=False),
238
+ gr.update(visible=False),
239
+ ]
240
+
241
+ elif choice == "Text File":
242
+ return [
243
+ gr.update(visible=False),
244
+ gr.update(visible=False),
245
+ gr.update(visible=True),
246
+ gr.update(visible=False),
247
+ ]
248
+
249
+ elif choice == "URLs":
250
+ return [
251
+ gr.update(visible=False),
252
+ gr.update(visible=False),
253
+ gr.update(visible=False),
254
+ gr.update(visible=True),
255
+ ]
256
+
257
+ else:
258
+ return [
259
+ gr.update(visible=True),
260
+ gr.update(visible=False),
261
+ gr.update(visible=False),
262
+ gr.update(visible=False),
263
+ ]
264
+
265
+
266
+
267
+ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
268
+ dom = DomState(
269
+ index_type=constants_utils.INDEX_TYPE,
270
+ load_from_existing_index_file=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
271
+ )
272
+
273
+ widgets = gr.Radio(
274
+ [
275
+ "Custom Query",
276
+ "General (AgGPT)",
277
+ "Mandi Price",
278
+ "Weather",
279
+ "Load Custom Data"
280
+ ],
281
+ label="Query related to",
282
+ value="Custom Query"
283
+ )
284
+
285
+ #############################################################################
286
+ # Widget for Custom Queries
287
+ with gr.Row(visible=True) as rowCustomQuery:
288
+ with gr.Column(scale=1, min_width=600):
289
+ with gr.Tab(label='Relevant paragraphs'):
290
+ question_category = gr.Dropdown(
291
+ constants_utils.INDEX_CATEGORY,
292
+ label="Select Question Category")
293
+ question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
294
+ # Get the Relevant paragraphs for the question asked
295
+ relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
296
+ b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
297
+ b_relevant_paragraphs.click(
298
+ fn=dom.click_handler_for_get_relevant_paragraphs,
299
+ inputs=[question_category, question],
300
+ outputs=[relevant_paragraphs]
301
+ )
302
+
303
+ with gr.Column(scale=1):
304
+ with gr.Tab(label='Sources of relevant paragraphs'):
305
+ # Get the Sources of relevant paragraphs
306
+ sources_relevant_paragraphs = gr.Textbox(label="Sources of relevant paragraphs are:", interactive=False)
307
+ relevant_paragraphs.change(
308
+ dom.click_handler_for_relevant_paragraphs_source,
309
+ relevant_paragraphs,
310
+ sources_relevant_paragraphs
311
+ )
312
+
313
+ # Get the exact answer for the question asked from the retrieved Relevant paragraphs
314
+ with gr.Column(scale=1, min_width=600):
315
+ with gr.Tab(label='Answer'):
316
+ answer = gr.Textbox(label="Answer is:", value=dom.answer, interactive=False)
317
+ relevant_paragraphs.change(
318
+ dom.click_handler_for_get_answer,
319
+ [relevant_paragraphs, question],
320
+ answer
321
+ )
322
+
323
+ # Convert the answer to the selected Indian language
324
+ with gr.Column(scale=1, min_width=600):
325
+ with gr.Tab(label='Answer in selected language'):
326
+ # Select the language
327
+ language = gr.Dropdown(
328
+ list(constants_utils.INDIC_LANGUAGE.keys()),
329
+ label="Select language")
330
+ indic_lang_answer = gr.Textbox(label="Answer in the selected language is:", interactive=False)
331
+ answer.change(
332
+ dom.click_handler_for_get_indic_translation,
333
+ answer,
334
+ indic_lang_answer
335
+ )
336
+ b_indic_lang_answer = gr.Button("Get answer in selected language").style(size='sm')
337
+ b_indic_lang_answer.click(fn=dom.click_handler_for_get_indic_translation, inputs=[answer, language], outputs=[indic_lang_answer])
338
+
339
+
340
+ #############################################################################
341
+ # Widget for General Query using AgGPT
342
+ with gr.Row(visible=False) as rowGeneral:
343
+ with gr.Column(scale=1, min_width=600):
344
+ chatbot = gr.Chatbot()
345
+ msg = gr.Textbox()
346
+ submit = gr.Button("Submit")
347
+ clear = gr.Button("Clear")
348
+ submit.click(
349
+ dom.kkms_kssw_obj.langchain_utils_obj.user, [msg, chatbot], [msg, chatbot]
350
+ ).then(dom.kkms_kssw_obj.langchain_utils_obj.bot, chatbot, chatbot)
351
+ clear.click(
352
+ dom.kkms_kssw_obj.langchain_utils_obj.clear_history, None, chatbot, queue=False)
353
+
354
+
355
+ #############################################################################
356
+ # Widget for Mandi Price
357
+ with gr.Row(visible=False) as rowMandiPrice:
358
+ with gr.Column(scale=1, min_width=600):
359
+ # Select State
360
+ state_name = gr.Dropdown(constants_utils.MANDI_PRICE_STATES, label="Select state")
361
+ # APMC name
362
+ apmc_name = gr.Textbox(label="Enter APMC name", placeholder='Type the APMC name here')
363
+ # Commodity name
364
+ commodity_name = gr.Textbox(label="Enter Commodity name", placeholder='Type the Commodity name here')
365
+
366
+ # From/To date in yyyy-mm-dd format
367
+ from_date = gr.Textbox(label="From date?", value=dom.mandi_from_date, placeholder='Please enter the From date here in yyyy-mm-dd format')
368
+ to_date = gr.Textbox(label="To date?", value=dom.mandi_to_date, placeholder='Please enter the To date here in yyyy-mm-dd format')
369
+
370
+ with gr.Column(scale=1, min_width=600):
371
+ mandi_price = gr.Textbox(label=f"Mandi Price is:", value=dom.mandi_price, interactive=False)
372
+ b_summary = gr.Button("Get Mandi Price").style(size='sm')
373
+ b_summary.click(fn=dom.click_handler_for_mandi_price, inputs=[state_name, apmc_name, commodity_name, from_date, to_date], outputs=[mandi_price])
374
+
375
+
376
+ #############################################################################
377
+ # Widget for Weather Info
378
+ with gr.Row(visible=False) as rowWeather:
379
+ ########### Weather Forecast ###########
380
+ with gr.Column(scale=1, min_width=600):
381
+ with gr.Tab(label='Weather Forecast for next 5 days'):
382
+ # Select the State
383
+ state = gr.Dropdown(
384
+ list(constants_utils.WEATHER_FORECAST_STATE_CODES.keys()),
385
+ label="Select state"
386
+ )
387
+
388
+ # Select District
389
+ district = gr.Dropdown(
390
+ choices=[],
391
+ label="Select District"
392
+ )
393
+
394
+ # Get districts of the selected state
395
+ state.change(
396
+ dom.click_handler_for_weather_forecast_districts_dropdown_list_update,
397
+ state,
398
+ district
399
+ )
400
+
401
+ # Get weather forecast on district selection event
402
+ district_weather = gr.Textbox(label=f"Weather forecast is:", interactive=False)
403
+ district.change(
404
+ dom.click_handler_for_weather_forecast_district,
405
+ [state, district],
406
+ district_weather
407
+ )
408
+
409
+ with gr.Column(scale=1, min_width=600):
410
+ with gr.Tab(label='Weather Forecast Summary'):
411
+ # Get the summary of the weather forecast
412
+ weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary is:", interactive=False)
413
+ district.change(
414
+ dom.click_handler_for_weather_forecast_summary,
415
+ district_weather,
416
+ weather_forecast_summary
417
+ )
418
+
419
+ # Convert the weather forecast summary into the selected Indian language
420
+ with gr.Column(scale=1, min_width=600):
421
+ with gr.Tab(label='Weather Forecast Summary in selected language'):
422
+ # Select the language
423
+ language = gr.Dropdown(
424
+ list(constants_utils.INDIC_LANGUAGE.keys()),
425
+ label="Select language")
426
+ indic_weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary in the selected language is:", interactive=False)
427
+
428
+ # By default display weather forecast summary in Hindi. User can change it later on.
429
+ weather_forecast_summary.change(
430
+ dom.click_handler_for_get_indic_translation,
431
+ weather_forecast_summary,
432
+ indic_weather_forecast_summary
433
+ )
434
+
435
+ # User can get the weather forecast summary in their preferred language as well
436
+ b_indic_weather_forecast_summary = gr.Button("Get weather forecast summary in selected language").style(size='sm')
437
+ b_indic_weather_forecast_summary.click(fn=dom.click_handler_for_get_indic_translation, inputs=[weather_forecast_summary, language], outputs=[indic_weather_forecast_summary])
438
+
439
+ with gr.Column(scale=1, min_width=600):
440
+ with gr.Tab(label='Weather Info'):
441
+ weather = gr.Textbox(label=f"Current weather is:", interactive=False)
442
+ district.change(
443
+ dom.click_handler_for_get_weather,
444
+ district,
445
+ weather
446
+ )
447
+
448
+
449
+ #############################################################################
450
+ # Widget to load and process from the custom data source
451
+ with gr.Row(visible=False) as rowLoadCustomData:
452
+ with gr.Column(scale=1, min_width=600):
453
+ with gr.Tab(label='Load Custom Data (Do not upload data from the same file/url again. Once it is uploaded, it gets stored forever.)'):
454
+ question_category = gr.Dropdown(
455
+ constants_utils.INDEX_CATEGORY,
456
+ label="Select Query Type")
457
+
458
+ doc_type = gr.Radio(
459
+ list(constants_utils.DATA_SOURCES.keys()),
460
+ label="Select data source (Supports uploading multiple Files/URLs)",
461
+ value="PDF"
462
+ )
463
+
464
+ with gr.Row(visible=True) as rowUploadPdf:
465
+ with gr.Column(scale=1, min_width=600):
466
+ file_output = gr.File()
467
+ upload_button = gr.UploadButton(
468
+ "Click to Upload PDF Files",
469
+ file_types=['.pdf'],
470
+ file_count="multiple"
471
+ )
472
+ upload_button.upload(dom._upload_file, upload_button, file_output)
473
+ b_files = gr.Button("Load PDF Files").style(size='sm')
474
+ b_files.click(
475
+ fn=dom.click_handler_for_load_files_urls,
476
+ inputs=[doc_type, upload_button, question_category]
477
+ )
478
+
479
+ with gr.Row(visible=False) as rowUploadOnlinePdf:
480
+ with gr.Column(scale=1, min_width=600):
481
+ urls = gr.Textbox(label="Enter URLs for Online PDF (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format.)", placeholder='Type the URLs here')
482
+ b_urls = gr.Button("Load Online PDFs").style(size='sm')
483
+ b_urls.click(
484
+ fn=dom.click_handler_for_load_files_urls,
485
+ inputs=[doc_type, urls, question_category]
486
+ )
487
+
488
+ with gr.Row(visible=False) as rowUploadTextFile:
489
+ with gr.Column(scale=1, min_width=600):
490
+ file_output = gr.File()
491
+ upload_button = gr.UploadButton(
492
+ "Click to Upload Text Files",
493
+ file_types=['.txt'],
494
+ file_count="multiple"
495
+ )
496
+ upload_button.upload(dom._upload_file, upload_button, file_output)
497
+ b_files = gr.Button("Load Text Files").style(size='sm')
498
+ b_files.click(
499
+ fn=dom.click_handler_for_load_files_urls,
500
+ inputs=[doc_type, file_output, question_category]
501
+ )
502
+
503
+ with gr.Row(visible=False) as rowUploadUrls:
504
+ with gr.Column(scale=1, min_width=600):
505
+ urls = gr.Textbox(label="Enter URLs (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format.)", placeholder='Type the URLs here')
506
+ b_urls = gr.Button("Load URLs").style(size='sm')
507
+ b_urls.click(
508
+ fn=dom.click_handler_for_load_files_urls,
509
+ inputs=[doc_type, urls, question_category]
510
+ )
511
+
512
+ doc_type.change(
513
+ fn=dom.select_files_urls,
514
+ inputs=doc_type,
515
+ outputs=[
516
+ rowUploadPdf,
517
+ rowUploadOnlinePdf,
518
+ rowUploadTextFile,
519
+ rowUploadUrls,
520
+ ],
521
+ )
522
+
523
+
524
+ widgets.change(
525
+ fn=dom.select_widget,
526
+ inputs=widgets,
527
+ outputs=[
528
+ rowCustomQuery,
529
+ rowGeneral,
530
+ rowMandiPrice,
531
+ rowWeather,
532
+ rowLoadCustomData,
533
+ ],
534
+ )
535
+
536
+
537
+ demo.launch(share=False)
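The widget switching in select_widget and select_files_urls above follows the usual Gradio pattern of returning one gr.update(visible=...) per output block. A minimal, self-contained sketch of that pattern (component names here are illustrative, not from this commit):

import gradio as gr

def toggle(choice):
    # Show only the row matching the selected option; hide the rest
    return [
        gr.update(visible=(choice == "Custom Query")),
        gr.update(visible=(choice == "Mandi Price")),
    ]

with gr.Blocks() as sketch:
    selector = gr.Radio(["Custom Query", "Mandi Price"], value="Custom Query", label="Query related to")
    with gr.Row(visible=True) as row_query:
        gr.Textbox(label="Enter your question")
    with gr.Row(visible=False) as row_mandi:
        gr.Textbox(label="Enter Commodity name")
    selector.change(fn=toggle, inputs=selector, outputs=[row_query, row_mandi])

sketch.launch()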
requirements.txt ADDED
@@ -0,0 +1,22 @@
1
+ openai
2
+ llama-index
3
+ langchain
4
+ chromadb
5
+ torch
6
+ transformers
7
+ gradio
8
+ scikit-learn
9
+ scipy
10
+ matplotlib
11
+ openpyxl
12
+ mosestokenizer
13
+ indic-nlp-library
14
+ sentence_transformers
15
+ playwright~=1.30
16
+ faiss-cpu
17
+ tiktoken
18
+ googletrans==3.1.0a0
19
+ BeautifulSoup4
20
+ pypdf
21
+ PyPDF2
22
+ html2text
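These packages assume an OpenAI API key is available in the environment; src/langchain_utils.py reads OPENAI_API_KEY via os.getenv at import time. A minimal pre-launch check (a sketch, not part of the app) could be:

import os

# src/langchain_utils.py expects OPENAI_API_KEY to be set in the environment
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError('Set OPENAI_API_KEY before launching app.py')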
src/constants.py ADDED
@@ -0,0 +1,143 @@
1
+ import os
2
+ import src.web_crawler as web_crawler_utils
3
+ import src.weather as weather_utils
4
+
5
+ LOAD_FROM_EXISTING_INDEX_STORE = False
6
+ INDEX_TYPE = 'FAISS'
7
+
8
+ # Path from where to load the data (from the local directory)
9
+ DATA_PATH = './data/'
10
+
11
+ # Path to store the index/vector db
12
+ OUTPUT_PATH = os.path.join('./output/', INDEX_TYPE)
13
+ # Create OUTPUT_PATH directory if not present
14
+ if not os.path.exists(OUTPUT_PATH):
15
+ os.makedirs(OUTPUT_PATH)
16
+
17
+ # Index categories (one index per category; at query time, the app searches for relevant docs/information only in the index of the selected category)
18
+ INDEX_CATEGORY = [
19
+ 'crops',
20
+ 'fruits',
21
+ 'pest_management',
22
+ 'govt_policy',
23
+ 'insurance',
24
+ 'soil',
25
+ 'general',
26
+ 'vegetables'
27
+ ]
28
+
29
+ # Doctype of the master index of each index category. Master index for each index category would be stored under this key.
30
+ INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE = 'master'
31
+
32
+ # List of data sources/types & from where to load the data and create the index/vector store
33
+ # The dict value is the doc_type identifier used when loading the data. Currently data can come either from files or from URLs.
34
+ DATA_SOURCES = {
35
+ 'PDF': 'pdf',
36
+ 'Text File': 'textfile',
37
+ 'Online PDF': 'online_pdf', # web_crawler_utils.get_ipm_packages_pdfs_urls()[:1]
38
+ 'URLs': 'urls',
39
+ }
40
+
41
+ # LangChain related constants
42
+ SIMILARITY_TOP_K = 1
43
+ MODE = 'embedding'
44
+ RESPONSE_MODE = 'default'
45
+ TEXT_SPLITTER_CHUNK_SIZE = 1000
46
+ TEXT_SPLITTER_CHUNK_OVERLAP = 0
47
+ TEXT_SPLITTER_SEPARATOR = '\n\n'
48
+
49
+
50
+ URLS = [
51
+ # Govt. Schemes
52
+ 'https://agricoop.nic.in/en/Major#gsc.tab=0',
53
+ 'https://agricoop.nic.in/#gsc.tab=0',
54
+
55
+ 'https://dmi.gov.in/Documents/GrantCAGrapes.pdf',
56
+ 'https://dmi.gov.in/Documents/organicfaq.pdf',
57
+ 'https://dmi.gov.in/Documents/CAGMOrganic-III.pdf',
58
+ 'https://dmi.gov.in/GradesStandard.aspx',
59
+ 'https://www.india.gov.in/topics/agriculture',
60
+ 'https://www.india.gov.in/farmers-portal',
61
+
62
+ # Pest Management related
63
+ 'https://niphm.gov.in/IPMPackages/Maize.pdf',
64
+
65
+ # Banned Pesticides
66
+ 'https://ppqs.gov.in/divisions/cib-rc/registered-products', # Online PDF links on the page
67
+
68
+ # Mandi Price related
69
+ 'https://agmarknet.gov.in/',
70
+
71
+ # General information related: Information of interests are present on the 2nd level url
72
+ 'https://www.manage.gov.in/nf/nf.asp',
73
+
74
+ # Weather forecast related
75
+ 'https://nwp.imd.gov.in/blf/blf_temp/', # need to select state -> district (on the new page) -> displays detailed table -> can get info at the block level as well from the same page on selection
76
+ 'https://nwp.imd.gov.in/blf/blf_temp/dis.php?value=12gujarat', # to get weather forecast for the given state
77
+ 'https://nwp.imd.gov.in/blf/blf_temp/block.php?dis=12BHAVNAGAR', # to get the weather forecast for the given district
78
+ ]
79
+
80
+
81
+ # Supported Indian languages for translating English text into an Indian language
82
+ INDIC_LANGUAGE = {
83
+ 'Hindi': 'hi',
84
+ 'Gujarati': 'gu',
85
+ 'Kannada': 'kn',
86
+ 'Marathi': 'mr',
87
+ 'Panjabi': 'pa',
88
+ 'Bengali': "bn",
89
+ 'Telugu': 'te',
90
+ 'Tamil': 'ta',
91
+ 'Malayalam': 'ml',
92
+ }
93
+
94
+ # State list used in the Mandi Price widget dropdown list
95
+ MANDI_PRICE_STATES = [
96
+ 'ANDAMAN AND NICOBAR ISLANDS',
97
+ 'ANDHRA PRADESH',
98
+ 'ASSAM',
99
+ 'BIHAR',
100
+ 'CHANDIGARH',
101
+ 'CHHATTISGARH',
102
+ 'GOA',
103
+ 'GUJARAT',
104
+ 'HARYANA',
105
+ 'HIMACHAL PRADESH',
106
+ 'JAMMU AND KASHMIR',
107
+ 'JHARKHAND',
108
+ 'KARNATAKA',
109
+ 'KERALA',
110
+ 'MADHYA PRADESH',
111
+ 'MAHARASHTRA',
112
+ 'NAGALAND',
113
+ 'ODISHA',
114
+ 'PUDUCHERRY',
115
+ 'PUNJAB',
116
+ 'RAJASTHAN',
117
+ 'TAMIL NADU',
118
+ 'TELANGANA',
119
+ 'TRIPURA',
120
+ 'UTTAR PRADESH',
121
+ 'UTTARAKHAND',
122
+ 'WEST BENGAL'
123
+ ]
124
+
125
+ # State list used in the Weather forecast widget dropdown list
126
+ weather_utils_obj = weather_utils.WEATHER()
127
+ WEATHER_FORECAST_STATE_CODES = weather_utils_obj.get_state_names_codes()
128
+
129
+ # LIST OF PESTICIDES WHICH ARE BANNED AND RESTRICTED USE (List created from: https://pib.gov.in/PressReleaseIframePage.aspx?PRID=1896140)
130
+ BANNED_PESTICIDES_FORMULATIONS = [
131
+ 'Alachlor',
132
+ 'Aldicarb',
133
+ 'Aldrin',
134
+ 'Benzene Hexachloride',
135
+ 'Benomyl',
136
+ 'Calcium Cyanide',
137
+ 'Carbaryl',
138
+ 'Chlorbenzilate',
139
+ 'Chlordane',
140
+ 'Chlorofenvinphos',
141
+ 'Copper Acetoarsenite',
142
+ ]
143
+
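The INDIC_LANGUAGE values above are the ISO 639-1 codes accepted by googletrans (pinned to 3.1.0a0 in requirements.txt). Assuming the translator helper wraps googletrans, a minimal translation sketch looks like:

from googletrans import Translator  # googletrans==3.1.0a0, as pinned in requirements.txt

def translate_to_indic(text, language='Hindi'):
    # Map the display name to its ISO 639-1 code and translate from English
    indic_language = {'Hindi': 'hi', 'Gujarati': 'gu', 'Marathi': 'mr'}
    return Translator().translate(text, src='en', dest=indic_language[language]).text

print(translate_to_indic('Weather will remain cloudy for the next two days.'))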
src/data_loader.py ADDED
@@ -0,0 +1,230 @@
1
+ import os
2
+ import re
3
+ import pandas as pd
4
+ from pathlib import Path
5
+ import glob
6
+
7
+ from llama_index import GPTSimpleVectorIndex, download_loader, SimpleDirectoryReader, SimpleWebPageReader
8
+ from langchain.document_loaders import PyPDFLoader, TextLoader
9
+ from langchain.agents import initialize_agent, Tool
10
+ from langchain.llms import OpenAI
11
+ from langchain.chains.conversation.memory import ConversationBufferMemory
12
+ from langchain.docstore.document import Document
13
+
14
+ import src.utils as utils
15
+
16
+ import logging
17
+ logger = logging.getLogger(__name__)
18
+ logging.basicConfig(
19
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
20
+ )
21
+
22
+ import warnings
23
+ warnings.filterwarnings('ignore')
24
+
25
+
26
+
27
+ class DATA_LOADER:
28
+ def __init__(self):
29
+ # Instantiate UTILS class object
30
+ self.utils_obj = utils.UTILS()
31
+
32
+
33
+ def load_documents_from_urls(self, urls=[], doc_type='urls'):
34
+ url_documents = self.load_document(doc_type=doc_type, urls=urls)
35
+ return url_documents
36
+
37
+
38
+ def load_documents_from_pdf(self, doc_filepath='', urls=[], doc_type='pdf'):
39
+ if doc_type == 'pdf':
40
+ pdf_documents = self.load_document(doc_type=doc_type, doc_filepath=doc_filepath)
41
+ elif doc_type == 'online_pdf':
42
+ pdf_documents = self.load_document(doc_type=doc_type, urls=urls)
43
+ return pdf_documents
44
+
45
+
46
+ def load_documents_from_directory(self, doc_filepath='', doc_type='directory'):
47
+ doc_documents = self.load_document(doc_type=doc_type, doc_filepath=doc_filepath)
48
+ return doc_documents
49
+
50
+
51
+ def load_documents_from_text(self, doc_filepath='', doc_type='textfile'):
52
+ text_documents = self.load_document(doc_type=doc_type, doc_filepath=doc_filepath)
53
+ return text_documents
54
+
55
+
56
+ def pdf_loader(self, filepath):
57
+ loader = PyPDFLoader(filepath)
58
+ return loader.load_and_split()
59
+
60
+
61
+ def text_loader(self, filepath):
62
+ loader = TextLoader(filepath)
63
+ return loader.load()
64
+
65
+
66
+ def load_document(self,
67
+ doc_type='pdf',
68
+ doc_filepath='',
69
+ urls=[]
70
+ ):
71
+ logger.info(f'Loading {doc_type} in raw format from: {doc_filepath}')
72
+
73
+ documents = []
74
+
75
+ # Validation checks
76
+ if doc_type in ['directory', 'pdf', 'textfile']:
77
+ if not os.path.exists(doc_filepath):
78
+ logger.warning(f"{doc_filepath} does not exist, nothing can be loaded!")
79
+ return documents
80
+
81
+ elif doc_type in ['online_pdf', 'urls']:
82
+ if len(urls) == 0:
83
+ logger.warning(f"URLs list empty, nothing can be loaded!")
84
+ return documents
85
+
86
+
87
+ ######### Load documents #########
88
+ # Load PDF
89
+ if doc_type == 'pdf':
90
+ # Load multiple PDFs from directory
91
+ if os.path.isdir(doc_filepath):
92
+ pdfs = glob.glob(f"{doc_filepath}/*.pdf")
93
+ logger.info(f'Total PDF files to load: {len(pdfs)}')
94
+ for pdf in pdfs:
95
+ documents.extend(self.pdf_loader(pdf))
96
+
97
+ # Loading from a single PDF file
98
+ elif os.path.isfile(doc_filepath) and doc_filepath.endswith('.pdf'):
99
+ documents.extend(self.pdf_loader(doc_filepath))
100
+
101
+ # Load PDFs from online (urls). Can read multiple PDFs from multiple URLs in one-shot
102
+ elif doc_type == 'online_pdf':
103
+ logger.info(f'URLs to load Online PDFs are from: {urls}')
104
+ valid_urls = self.utils_obj.validate_url_format(
105
+ urls=urls,
106
+ url_type=doc_type
107
+ )
108
+ for url in valid_urls:
109
+ # Load and split PDF pages per document
110
+ documents.extend(self.pdf_loader(url))
111
+
112
+ # Load data from URLs (can load data from multiple URLs)
113
+ elif doc_type == 'urls':
114
+ logger.info(f'URLs to load data from are: {urls}')
115
+ valid_urls = self.utils_obj.validate_url_format(
116
+ urls=urls,
117
+ url_type=doc_type
118
+ )
119
+ # Load data from URLs
120
+ docs = SimpleWebPageReader(html_to_text=True).load_data(valid_urls)
121
+ docs = [Document(page_content=doc.text) for doc in docs]
122
+ documents.extend(docs)
123
+
124
+ # Load data from text file(s)
125
+ elif doc_type == 'textfile':
126
+ # Load multiple text files from directory
127
+ if os.path.isdir(doc_filepath):
128
+ text_files = glob.glob(f"{doc_filepath}/*.txt")
129
+ logger.info(f'Total text files to load: {len(text_files)}')
130
+ for tf in text_files:
131
+ documents.extend(self.text_loader(tf))
132
+
133
+ # Loading from a single text file
134
+ elif os.path.isfile(doc_filepath) and doc_filepath.endswith('.txt'):
135
+ documents.extend(self.text_loader(doc_filepath))
136
+
137
+ # Load data from files on the local directory (files may be of type .pdf, .txt, .doc, etc.)
138
+ elif doc_type == 'directory':
139
+ # Load multiple PDFs from directory
140
+ if os.path.isdir(doc_filepath):
141
+ documents = SimpleDirectoryReader(
142
+ input_dir=doc_filepath
143
+ ).load_data()
144
+
145
+ # Loading from a file
146
+ elif os.path.isfile(doc_filepath):
147
+ documents.extend(SimpleDirectoryReader(
148
+ input_files=[doc_filepath]
149
+ ).load_data())
150
+
151
+ # Load data from URLs in Knowledge Base format
152
+ elif doc_type == 'url-kb':
153
+ KnowledgeBaseWebReader = download_loader("KnowledgeBaseWebReader")
154
+ loader = KnowledgeBaseWebReader()
155
+ for url in urls:
156
+ doc = loader.load_data(
157
+ root_url=url,
158
+ link_selectors=['.article-list a', '.article-list a'],
159
+ article_path='/articles',
160
+ body_selector='.article-body',
161
+ title_selector='.article-title',
162
+ subtitle_selector='.article-subtitle',
163
+ )
164
+ documents.extend(doc)
165
+
166
+ # Load data from URLs and create an agent chain using ChatGPT
167
+ elif doc_type == 'url-chatgpt':
168
+ BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
169
+ loader = BeautifulSoupWebReader()
170
+ # Load data from URLs
171
+ documents = loader.load_data(urls=urls)
172
+ # Build the Vector database
173
+ index = GPTSimpleVectorIndex(documents)
174
+ tools = [
175
+ Tool(
176
+ name="Website Index",
177
+ func=lambda q: index.query(q),
178
+ description=f"Useful when you want to answer questions about the text retrieved from websites.",
179
+ ),
180
+ ]
181
+
182
+ # Call ChatGPT API
183
+ llm = OpenAI(temperature=0) # Keep temperature=0 to search from the given urls only
184
+ memory = ConversationBufferMemory(memory_key="chat_history")
185
+ agent_chain = initialize_agent(
186
+ tools, llm, agent="zero-shot-react-description", memory=memory
187
+ )
188
+
189
+ output = agent_chain.run(input="What language is on this website?")
190
+
191
+
192
+ # Clean documents
193
+ documents = self.clean_documents(documents)
194
+ logger.info(f'{doc_type} in raw format from: {doc_filepath} loaded successfully!')
195
+ return documents
196
+
197
+
198
+ def clean_documents(
199
+ self,
200
+ documents
201
+ ):
202
+ cleaned_documents = []
203
+ for document in documents:
204
+ if hasattr(document, 'page_content'):
205
+ document.page_content = self.utils_obj.replace_newlines_and_spaces(document.page_content)
206
+ elif hasattr(document, 'text'):
207
+ document.text = self.utils_obj.replace_newlines_and_spaces(document.text)
208
+ else:
209
+ document = self.utils_obj.replace_newlines_and_spaces(document)
210
+ cleaned_documents.append(document)
211
+ return cleaned_documents
212
+
213
+
214
+ def load_external_links_used_by_FTAs(self,
215
+ sheet_filepath='./data/urls_used_by_ftas/external_links_used_by_FTAs.xlsx'
216
+ ):
217
+ xls = pd.ExcelFile(sheet_filepath)
218
+ df = pd.DataFrame(columns=['S.No.', 'Link used for', 'Link type', 'Link'])
219
+ for sheet_name in xls.sheet_names:
220
+ sheet = pd.read_excel(xls, sheet_name)
221
+ if sheet.shape[0] > 0:
222
+ df = pd.concat([df, sheet])
223
+ else:
224
+ logger.info(f'{sheet_name} has no content.')
225
+
226
+ df = df[['Link used for', 'Link type', 'Link']]
227
+ # Clean df
228
+ df = self.utils_obj.clean_df(df)
229
+ logger.info(f'Total links available across all cities: {df.shape[0]}')
230
+ return df
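A short usage sketch of the loader defined above (the file path and URL are illustrative; the methods are the ones defined in this class):

import src.data_loader as data_loader_utils

loader = data_loader_utils.DATA_LOADER()
# Load every PDF under a local directory into langchain Document objects
pdf_docs = loader.load_documents_from_pdf(doc_filepath='./data/crops', doc_type='pdf')
# Load and clean plain web pages
url_docs = loader.load_documents_from_urls(urls=['https://agmarknet.gov.in/'], doc_type='urls')
print(len(pdf_docs), len(url_docs))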
src/kkms_kssw.py ADDED
@@ -0,0 +1,77 @@
1
+ import os
2
+
3
+ import src.constants as constants_utils
4
+ import src.langchain_utils as langchain_utils
5
+ import src.weather as weather_utils
6
+ import src.mandi_price as mandi_utils
7
+ import src.translator as translator_utils
8
+ import src.web_crawler as web_crawler_utils
9
+
10
+ import logging
11
+ logger = logging.getLogger(__name__)
12
+ logging.basicConfig(
13
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
14
+ )
15
+
16
+ import warnings
17
+ warnings.filterwarnings('ignore')
18
+
19
+
20
+
21
+ class KKMS_KSSW:
22
+ def __init__(self):
23
+ self.index_type = constants_utils.INDEX_TYPE
24
+ self.load_from_existing_index_store = constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
25
+
26
+ # Instantiate langchain_utils class object
27
+ self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS(
28
+ index_type=self.index_type,
29
+ load_from_existing_index_store=self.load_from_existing_index_store
30
+ )
31
+ # Instantiate Mandi Price utils class object
32
+ self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
33
+ # Instantiate Weather class object
34
+ self.weather_utils_obj = weather_utils.WEATHER()
35
+ # Instantiate translator_utils class object
36
+ self.translator_utils_obj = translator_utils.TRANSLATOR()
37
+
38
+
39
+
40
+ # Initialize index (vector store)
41
+ def load_create_index(self):
42
+ logger.info(f"Load/Create index")
43
+ self.langchain_utils_obj.load_create_index()
44
+
45
+
46
+ # Upload data and update the index
47
+ def upload_data(
48
+ self,
49
+ doc_type,
50
+ files_or_urls,
51
+ index_category
52
+ ):
53
+ logger.info(f"Uploading data")
54
+ self.langchain_utils_obj.upload_data(
55
+ doc_type=doc_type,
56
+ files_or_urls=files_or_urls,
57
+ index_category=index_category
58
+ )
59
+
60
+
61
+ # Define query on index to retrieve the most relevant top K documents from the vector store
62
+ def query(
63
+ self,
64
+ question,
65
+ question_category
66
+ ):
67
+ '''
68
+ Args:
69
+ mode: can be any of [default, embedding]
70
+ response_mode: can be any of [default, compact, tree_summarize]
71
+ '''
72
+ logger.info(f"Querying from index/vector store")
73
+
74
+ return self.langchain_utils_obj.query(
75
+ question=question,
76
+ question_category=question_category
77
+ )
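app.py drives this facade with these same calls; a condensed sketch (the question string is illustrative):

import src.kkms_kssw as kkms_kssw

kkms_kssw_obj = kkms_kssw.KKMS_KSSW()
kkms_kssw_obj.load_create_index()  # load or build the per-category vector stores
relevant_docs = kkms_kssw_obj.query(
    question='Which crops are suitable for black soil?',  # illustrative question
    question_category='crops'
)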
src/langchain_utils.py ADDED
@@ -0,0 +1,891 @@
1
+ import src.constants as constants_utils
2
+ import src.data_loader as data_loader_utils
3
+ import src.utils as utils
4
+
5
+ from langchain.llms import OpenAI
6
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
7
+ from langchain.chains.summarize import load_summarize_chain
8
+ from langchain.docstore.document import Document
9
+ from langchain.embeddings.openai import OpenAIEmbeddings
10
+ import openai
11
+ from langchain.vectorstores import Chroma
12
+ import chromadb
13
+ from langchain.chains.question_answering import load_qa_chain
14
+ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
15
+ from langchain.prompts import PromptTemplate
16
+ from llama_index import GPTSimpleVectorIndex, GPTListIndex
17
+ from langchain.vectorstores import FAISS
18
+
19
+ import pickle
20
+ import shutil
21
+ from typing import Dict, List, Optional
22
+
23
+ import os
24
+ os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
25
+
26
+ import logging
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(
29
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
30
+ )
31
+
32
+ import warnings
33
+ warnings.filterwarnings('ignore')
34
+
35
+
36
+
37
+ class LANGCHAIN_UTILS:
38
+ def __init__(self,
39
+ index_type=constants_utils.INDEX_TYPE,
40
+ load_from_existing_index_store=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
41
+ ):
42
+ self.index_type = index_type
43
+ self.load_from_existing_index_store = load_from_existing_index_store
44
+
45
+ # Temporary index in the current context for the doc_type in consideration
46
+ self.index = None
47
+ # Master index containing data from multiple sources (PDF, Online PDF, text files, URLs, etc.). It is updated on demand when data from new files/URLs is uploaded, without application downtime.
48
+ self.master_index = None
49
+
50
+ # Data source wise index
51
+ self.index_category_doc_type_wise_index = dict(
52
+ (ic, dict(
53
+ (ds, None) for ds in list(constants_utils.DATA_SOURCES.values()))
54
+ ) for ic in constants_utils.INDEX_CATEGORY)
55
+ # Initialize master index for each INDEX_CATEGORY
56
+ for ic in constants_utils.INDEX_CATEGORY:
57
+ self.index_category_doc_type_wise_index[ic][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None
58
+
59
+ # Data loaded as a Document format in the current context for the doc_type in consideration
60
+ self.documents = []
61
+
62
+ # Instantiate data_loader_utils class object
63
+ self.data_loader_utils_obj = data_loader_utils.DATA_LOADER()
64
+ # Instantiate UTILS class object
65
+ self.utils_obj = utils.UTILS()
66
+
67
+ # Initialize embeddings (we can also use other embeddings)
68
+ self.embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
69
+
70
+ # Global history for AgGPT widget
71
+ self.global_history = [
72
+ {
73
+ "role": "assistant",
74
+ "content": "Hi, I am a chatbot. I can converse in English. I can answer your questions about farming in India. Ask me anything!"
75
+ }
76
+ ]
77
+
78
+
79
+ def generate_prompt_template(
80
+ self,
81
+ prompt_type='general'
82
+ ):
83
+ prompt_template = ''
84
+
85
+ if prompt_type == 'general':
86
+ prompt_template = """Write a concise summary of the following:
87
+
88
+ {text}
89
+
90
+ SUMMARIZE IN ENGLISH:"""
91
+
92
+ elif prompt_type == 'weather':
93
+ prompt_template = """
94
+ What would be the weather based on the below data:
95
+ {text}
96
+ """
97
+
98
+ return prompt_template
99
+
100
+
101
+ def user(
102
+ self,
103
+ user_message,
104
+ history
105
+ ):
106
+ history = history + [[user_message, None]]
107
+ self.global_history = self.global_history + [{"role": "user", "content": user_message}]
108
+ return "", history
109
+
110
+
111
+ def get_chatgpt_response(
112
+ self,
113
+ history
114
+ ):
115
+ output = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=history)
116
+ history.append({"role": "assistant", "content": output.choices[0].message.content})
117
+ return output.choices[0].message.content, history
118
+
119
+
120
+ def bot(
121
+ self,
122
+ history
123
+ ):
124
+ response, self.global_history = self.get_chatgpt_response(self.global_history)
125
+ history[-1][1] = response
126
+ return history
127
+
128
+
129
+ def clear_history(
130
+ self,
131
+ lang="English"
132
+ ):
133
+ self.global_history = [{"role": "assistant", "content": "Hi, I am a chatbot. I can converse in {}. I can answer your questions about farming in India. Ask me anything!".format(lang)}]
134
+ return None
135
+
136
+
137
+ def get_textual_summary(
138
+ self,
139
+ text,
140
+ chain_type="stuff",
141
+ custom_prompt=True,
142
+ prompt_type='general'
143
+ ):
144
+ texts = [text]
145
+ docs = [Document(page_content=t) for t in texts[:3]]
146
+
147
+ llm = OpenAI(temperature=0)
148
+ if custom_prompt:
149
+ prompt_template = self.generate_prompt_template(prompt_type)
150
+ PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
151
+ chain = load_summarize_chain(llm, chain_type=chain_type, prompt=PROMPT)
152
+ else:
153
+ chain = load_summarize_chain(llm, chain_type=chain_type)
154
+
155
+ text_summary = chain.run(docs)
156
+ return text_summary
157
+
158
+
159
+ def get_weather_forecast_summary(
160
+ self,
161
+ text,
162
+ chain_type="stuff"
163
+ ):
164
+ text = f"""
165
+ What would be the weather based on the below data:
166
+ {text}
167
+
168
+ Give simple response without technical numbers which can be explained to human.
169
+ """
170
+ texts = [text]
171
+ docs = [Document(page_content=t) for t in texts[:3]]
172
+
173
+ llm = OpenAI(temperature=0)
174
+ chain = load_summarize_chain(llm, chain_type=chain_type)
175
+ text_summary = chain.run(docs)
176
+
177
+ return text_summary
178
+
179
+
180
+ def get_answer_from_para(
181
+ self,
182
+ para,
183
+ question,
184
+ chain_type="stuff",
185
+ custom_prompt=True
186
+ ):
187
+ # Prepare data (Split paragraph into chunks of small documents)
188
+ text_splitter = CharacterTextSplitter(
189
+ chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
190
+ chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
191
+ separator=constants_utils.TEXT_SPLITTER_SEPARATOR
192
+ )
193
+ texts = text_splitter.split_text(para)
194
+
195
+ if self.index_type == 'FAISS':
196
+ # Find similar docs that are relevant to the question
197
+ docsearch = FAISS.from_texts(
198
+ texts, self.embeddings,
199
+ metadatas=[{"source": str(i)} for i in range(len(texts))]
200
+ )
201
+
202
+ elif self.index_type == 'Chroma':
203
+ # Find similar docs that are relevant to the question
204
+ docsearch = Chroma.from_texts(
205
+ texts, self.embeddings,
206
+ metadatas=[{"source": str(i)} for i in range(len(texts))]
207
+ )
208
+
209
+ # Search for the similar docs
210
+ docs = docsearch.similarity_search(question, k=1)
211
+
212
+ llm = OpenAI(temperature=0)
213
+ # Create a Chain for question answering
214
+ if custom_prompt:
215
+ prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
216
+
217
+ {context}
218
+
219
+ Question: {question}
220
+ Answer in English:"""
221
+
222
+ PROMPT = PromptTemplate(
223
+ template=prompt_template, input_variables=["context", "question"]
224
+ )
225
+ chain = load_qa_chain(llm, chain_type=chain_type, prompt=PROMPT)
226
+ else:
227
+ # chain = load_qa_with_sources_chain(llm, chain_type=chain_type)
228
+ chain = load_qa_chain(llm, chain_type=chain_type)
229
+ # chain.run(input_documents=docs, question=question)
230
+
231
+ out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
232
+ return out_dict['output_text']
233
+
234
+
235
+ def load_documents(
236
+ self,
237
+ doc_type,
238
+ doc_filepath='',
239
+ urls=[]
240
+ ):
241
+ """
242
+ Load data in Document format of the given doc_type from either doc_filepath or list of urls.
243
+ It can load multiple files/urls in one shot.
244
+
245
+ Args:
246
+ doc_type: can be any of [pdf, online_pdf, urls, textfile]
247
+ doc_filepath: can be a directory or a filepath
248
+ urls: list of urls
249
+ """
250
+
251
+ logger.info(f'Loading {doc_type} data into Documents format')
252
+
253
+ if doc_type == 'pdf':
254
+ # Load data from PDFs stored in local directory
255
+ self.documents.extend(
256
+ self.data_loader_utils_obj.load_documents_from_pdf(
257
+ doc_filepath=doc_filepath,
258
+ doc_type=doc_type
259
+ ))
260
+
261
+ elif doc_type == 'online_pdf':
262
+ # Load data from PDFs stored in local directory
263
+ self.documents.extend(
264
+ self.data_loader_utils_obj.load_documents_from_pdf(
265
+ urls=urls,
266
+ doc_type=doc_type
267
+ ))
268
+
269
+ elif doc_type == 'urls':
270
+ # Load data from URLs
271
+ self.documents.extend(
272
+ self.data_loader_utils_obj.load_documents_from_urls(
273
+ urls=urls,
274
+ doc_type=doc_type
275
+ ))
276
+
277
+ elif doc_type == 'textfile':
278
+ # Load data from text files & Convert texts into Document format
279
+ self.documents.extend(
280
+ self.data_loader_utils_obj.load_documents_from_text(
281
+ doc_filepath=doc_filepath,
282
+ doc_type=doc_type
283
+ ))
284
+
285
+ elif doc_type == 'directory':
286
+ # Load data from local directory
287
+ self.documents.extend(
288
+ self.data_loader_utils_obj.load_documents_from_directory(
289
+ doc_filepath=doc_filepath,
290
+ doc_type=doc_type
291
+ ))
292
+
293
+ logger.info(f'{doc_type} data into Documents format loaded successfully!')
294
+
295
+
296
+ def create_index(
297
+ self
298
+ ):
299
+ if not self.documents:
300
+ logger.warning(f'Empty documents. Index cannot be created!')
301
+ return None
302
+
303
+ logger.info(f'Creating index')
304
+
305
+ text_splitter = CharacterTextSplitter(
306
+ chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
307
+ chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
308
+ separator=constants_utils.TEXT_SPLITTER_SEPARATOR
309
+ )
310
+ self.documents = text_splitter.split_documents(self.documents)
311
+
312
+ ############## Build the Vector store for docs ##############
313
+ # Vector store using Facebook AI Similarity Search
314
+ if self.index_type == 'FAISS':
315
+ self.index = FAISS.from_documents(
316
+ self.documents,
317
+ self.embeddings
318
+ )
319
+
320
+ # Vector store using Chroma DB
321
+ elif self.index_type == 'Chroma':
322
+ if not os.path.exists(self.index_filepath):
323
+ os.makedirs(self.index_filepath)
324
+
325
+ self.index = Chroma.from_documents(
326
+ self.documents,
327
+ self.embeddings,
328
+ persist_directory=self.index_filepath
329
+ )
330
+
331
+ # Vector store using GPT vector index
332
+ elif self.index_type == 'GPTSimpleVectorIndex':
333
+ self.index = GPTSimpleVectorIndex.from_documents(self.documents)
334
+
335
+ logger.info(f'Index created successfully!')
336
+ return self.index
337
+
338
+
339
+ def get_index_filepath(
340
+ self,
341
+ index_category,
342
+ doc_type
343
+ ):
344
+ if doc_type == 'master':
345
+ self.index_filepath = os.path.join(
346
+ constants_utils.OUTPUT_PATH, f'index_{index_category}') if self.index_type in ['FAISS', 'Chroma'] else os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}.json')
347
+ else:
348
+ self.index_filepath = os.path.join(
349
+ constants_utils.OUTPUT_PATH, f'index_{index_category}', f'index_{doc_type}') if self.index_type in ['FAISS', 'Chroma'] else os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}', f'index_{doc_type}.json')
350
+
351
+ return self.index_filepath
352
+
353
+
354
+ def load_master_doctype_indices_for_index_category(
355
+ self,
356
+ index_category
357
+ ):
358
+ logger.info(f'Loading master and doc_type indices for: {index_category}')
359
+
360
+ # Set master index of index_category = None
361
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None
362
+
363
+ for doc_type in self.index_category_doc_type_wise_index[index_category].keys():
364
+ self.index = None
365
+ self.index_filepath = self.get_index_filepath(
366
+ index_category=index_category,
367
+ doc_type=doc_type
368
+ )
369
+ self.load_index()
370
+ # Set master/doc_type index
371
+ self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
372
+
373
+ logger.info(f'Master and doc_type indices for: {index_category} loaded successfully!')
374
+
375
+
376
+ def load_create_index(
377
+ self
378
+ ):
379
+ logger.info(f'Loading/Creating index for each index_category')
380
+
381
+ for index_category in constants_utils.INDEX_CATEGORY:
382
+ # Load master index_category index if self.load_from_existing_index_store == True
383
+ if self.load_from_existing_index_store:
384
+ self.load_master_doctype_indices_for_index_category(index_category)
385
+
386
+ # For any reason, if master index is not loaded then create the new index/vector store
387
+ if not self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]:
388
+ logger.info(f'Creating a new Vector/Index store for: {index_category}')
389
+
390
+ doc_filepath = os.path.join(constants_utils.DATA_PATH, index_category)
391
+ urls = []
392
+
393
+ # Build the Vector/Index store
394
+ for doc_type in list(constants_utils.DATA_SOURCES.values()):
395
+ logger.info(f'Creating a new Vector/Index store for: {index_category} from data source: {doc_type}')
396
+
397
+ index = None
398
+ if doc_type in ['pdf', 'textfile']:
399
+ index = self.create_store_index(
400
+ doc_type=doc_type,
401
+ doc_filepath=doc_filepath,
402
+ index_category=index_category
403
+ )
404
+ else:
405
+ # Build the Vector/Index store from web urls
406
+ index = self.create_store_index(
407
+ doc_type=doc_type,
408
+ urls=urls,
409
+ index_category=index_category
410
+ )
411
+
412
+ if index:
413
+ self.index_category_doc_type_wise_index[index_category][doc_type] = index
414
+
415
+ logger.info(f'New Vector/Index store for: {index_category} from data source: {doc_type} created successfully!')
416
+
417
+ logger.info(f'New Vector/Index store for: {index_category} created successfully!')
418
+
419
+ # Merge index of each doc_type into a single index_category
420
+ self.merge_store_master_index(
421
+ index_category=index_category
422
+ )
423
+
424
+ logger.info(f'Index for each index_category loaded successfully!')
425
+
426
+
427
+ def create_store_index(
428
+ self,
429
+ doc_type='pdf',
430
+ doc_filepath=constants_utils.DATA_PATH,
431
+ urls=[],
432
+ index_category=constants_utils.INDEX_CATEGORY[0]
433
+ ):
434
+ logger.info(f'Creating and storing {doc_type} index')
435
+
436
+ self.documents = []
437
+ self.index = None
438
+
439
+ self.index_filepath = self.get_index_filepath(
440
+ index_category=index_category,
441
+ doc_type=doc_type
442
+ )
443
+
444
+ # Delete the old index file
445
+ shutil.rmtree(self.index_filepath, ignore_errors=True)
446
+ logger.info(f'{self.index_filepath} deleted.')
447
+
448
+ # Load data in Documents format that can be consumed for index creation
449
+ self.load_documents(
450
+ doc_type,
451
+ doc_filepath,
452
+ urls
453
+ )
454
+
455
+ # Create the index from documents for search/retrieval
456
+ self.index = self.create_index()
457
+
458
+ # Store index
459
+ self.store_index(
460
+ index=self.index,
461
+ index_filepath=self.index_filepath
462
+ )
463
+
464
+ logger.info(f'{doc_type} index created and stored successfully!')
465
+ # Return the index of the given doc_type (this is an index for a single doc_type). Indices from multiple doc_types should be merged later on in the master index so that query could be made from a single index.
466
+ return self.index
467
+
468
+
469
+ def store_index(
470
+ self,
471
+ index,
472
+ index_filepath
473
+ ):
474
+ if not index:
475
+ logger.warning(f'Cannot write an empty index to: {index_filepath}!')
476
+ return
477
+
478
+ logger.info(f'Saving index to: {index_filepath}')
479
+
480
+ # Create the index directory if needed (FAISS/Chroma persist to a directory)
+ if not os.path.exists(index_filepath) and self.index_type in ['FAISS', 'Chroma']:
481
+ os.makedirs(index_filepath)
482
+
483
+ if self.index_type == 'FAISS':
484
+ index.save_local(index_filepath)
485
+
486
+ elif self.index_type == 'Chroma':
487
+ index.persist()
488
+
489
+ elif self.index_type == 'GPTSimpleVectorIndex':
490
+ index.save_to_disk(index_filepath)
491
+
492
+ elif self.index_type == 'pickle':
493
+ with open(index_filepath, "wb") as f:
494
+ pickle.dump(index, f)
495
+
496
+ logger.info(f'Index saved to: {index_filepath} successfully!')
497
+
498
+
499
+ def load_index(
500
+ self
501
+ ):
502
+ logger.info(f'Loading index from: {self.index_filepath}')
503
+
504
+ if not os.path.exists(self.index_filepath):
505
+ logger.warning(f"Cannot load index from {self.index_filepath} as the path does not exist!")
506
+ return
507
+
508
+ if self.index_type == 'FAISS':
509
+ self.index = FAISS.load_local(self.index_filepath, self.embeddings)
510
+
511
+ elif self.index_type == 'Chroma':
512
+ self.index = Chroma(
513
+ persist_directory=self.index_filepath,
514
+ embedding_function=self.embeddings
515
+ )
516
+
517
+ elif self.index_type == 'GPTSimpleVectorIndex':
518
+ self.index = GPTSimpleVectorIndex.load_from_disk(self.index_filepath)
519
+
520
+ elif self.index_type == 'pickle':
521
+ with open(self.index_filepath, "rb") as f:
522
+ self.index = pickle.load(f)
523
+
524
+ logger.info(f'Index loaded from: {self.index_filepath} successfully!')
525
+
526
+
527
+ def convert_text_to_documents(
528
+ self,
529
+ text_list=[]
530
+ ):
531
+ """
532
+ Converts the list of text data into the Document format that can be fed to the GPT API to build the Vector store
533
+ """
534
+
535
+ from llama_index import Document
536
+ documents = [Document(t) for t in text_list]
537
+ return documents
538
+
539
+
540
+ def merge_documents_from_different_sources(
541
+ self,
542
+ doc_documents,
543
+ url_documents
544
+ ):
545
+ # Build the Vector store for docs
546
+ doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
547
+ # Build the Vector store for URLs
548
+ url_index = GPTSimpleVectorIndex.from_documents(url_documents)
549
+
550
+ # Set summary of each index
551
+ doc_index.set_text("index_from_docs")
552
+ url_index.set_text("index_from_urls")
553
+
554
+ # Merge index of different data sources
555
+ index = GPTListIndex([doc_index, url_index])
556
+
557
+ return index
558
+
559
+
560
+ def merge_store_master_index(
561
+ self,
562
+ index_category
563
+ ):
564
+ """
565
+ Merge multiple doc_type indices into a single master index. Query/search would be performed on this merged index.
566
+
567
+ Args:
568
+ index_category: index_category (can be any of: [crops, fruits, pest_management, govt_policy, soil, etc.])
569
+ """
570
+ logger.info(f'Merging the doc_type indices of {index_category} into a single master index')
571
+
572
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None
573
+ doc_type_indices = self.index_category_doc_type_wise_index[index_category]
574
+
575
+ if self.index_type == 'FAISS':
576
+ for doc_type, index in doc_type_indices.items():
577
+ if doc_type == constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE:
578
+ # Only merge the non-master doc_type_indices
579
+ continue
580
+ if not index or not isinstance(index, FAISS):
581
+ logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.faiss.FAISS')
582
+ continue
583
+ if not self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]:
584
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = index
585
+ else:
586
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE].merge_from(index)
587
+
588
+ elif self.index_type == 'Chroma':
589
+ for doc_type, index in doc_type_indices.items():
590
+ if not index or not isinstance(index, Chroma):
591
+ logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.Chroma')
592
+ continue
593
+ raise NotImplementedError
594
+
595
+ elif self.index_type == 'GPTSimpleVectorIndex':
596
+ for doc_type, index in doc_type_indices.items():
597
+ if not index or not isinstance(index, GPTSimpleVectorIndex):
598
+ logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTSimpleVectorIndex')
599
+ continue
600
+ # NOTE: merging GPTSimpleVectorIndex indices is not supported yet
601
+ raise NotImplementedError
602
+
603
+ # Store index_category master index
604
+ self.store_index(
605
+ index=self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE],
606
+ index_filepath=self.get_index_filepath(
607
+ index_category=index_category,
608
+ doc_type=constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE
609
+ )
610
+ )
611
+
612
+ logger.info(f'doc_type indices of {index_category} merged into the master index successfully!')
613
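For the FAISS branch, merge_store_master_index() relies on FAISS.merge_from(); a small self-contained sketch of that merge (made-up texts, assumed OpenAIEmbeddings backend) is:

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
pdf_index = FAISS.from_texts(["text extracted from PDFs"], embeddings)
url_index = FAISS.from_texts(["text scraped from URLs"], embeddings)

master_index = pdf_index            # the first doc_type index becomes the master
master_index.merge_from(url_index)  # every other doc_type index is merged into it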
+
614
+
615
+ def init_chromadb(self):
616
+ logger.info('Initializing Chroma DB')
617
+
618
+ if not os.path.exists(self.index_filepath):
619
+ os.makedirs(self.index_filepath)
620
+
621
+ client_settings = chromadb.config.Settings(
622
+ chroma_db_impl="duckdb+parquet",
623
+ persist_directory=self.index_filepath,
624
+ anonymized_telemetry=False
625
+ )
626
+
627
+ self.index = Chroma(
628
+ collection_name="langchain_store",
629
+ embedding_function=self.embeddings,
630
+ client_settings=client_settings,
631
+ persist_directory=self.index_filepath,
632
+ )
633
+
634
+ logger.info('Chroma DB initialized successfully!')
635
+
636
+
637
+ def query_chromadb(
638
+ self,
639
+ question,
640
+ k=1
641
+ ):
642
+ return self.index.similarity_search(query=question, k=k)
643
+
644
+
645
+ def query(self,
646
+ question,
647
+ question_category,
648
+ mode=constants_utils.MODE,
649
+ response_mode=constants_utils.RESPONSE_MODE,
650
+ similarity_top_k=constants_utils.SIMILARITY_TOP_K,
651
+ required_keywords=[],
652
+ exclude_keywords=[],
653
+ verbose=False
654
+ ):
655
+ '''
656
+ Args:
657
+ mode: can be any of [default, embedding]
658
+ response_mode: can be any of [default, compact, tree_summarize]
659
+ '''
660
+ logger.info(f'question category: {question_category}; question: {question}')
661
+
662
+ response = None
663
+
664
+ # Get the index of the given question_category
665
+ index = self.index_category_doc_type_wise_index[question_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]
666
+
667
+ if self.index_type == 'FAISS':
668
+ response = index.similarity_search(
669
+ question,
670
+ k=similarity_top_k
671
+ )
672
+
673
+ elif self.index_type == 'Chroma':
674
+ response = index.similarity_search(
675
+ question,
676
+ k=similarity_top_k
677
+ )
678
+
679
+ elif self.index_type == 'GPTSimpleVectorIndex':
680
+ # Querying the index
681
+ response = index.query(
682
+ question,
683
+ mode=mode,
684
+ response_mode=response_mode,
685
+ similarity_top_k=similarity_top_k,
686
+ required_keywords=required_keywords,
687
+ exclude_keywords=exclude_keywords,
688
+ verbose=verbose
689
+ )
690
+
691
+ return response
692
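Assuming index_wrapper is an initialized instance of this class with the 'crops' master index already loaded (the instance name and question text are placeholders), a query would look like:

docs = index_wrapper.query(
    question="How do I control fall armyworm in maize?",
    question_category="crops",
)
# For the FAISS/Chroma index types the result is a list of Documents
for doc in docs:
    print(doc.page_content[:200])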
+
693
+
694
+ def load_uploaded_documents(
695
+ self,
696
+ doc_type,
697
+ files_or_urls
698
+ ):
699
+ logger.info(f'Loading uploaded documents from: {doc_type}')
700
+
701
+ if doc_type == 'pdf':
702
+ if not isinstance(files_or_urls, list):
703
+ files_or_urls = [files_or_urls]
704
+ for pdf in files_or_urls:
705
+ if not pdf.name.endswith('.pdf'):
706
+ logger.warning(f'Found a file other than .pdf format. Cannot load {pdf.name} file!')
707
+ continue
708
+ logger.info(f'Loading PDF from: {pdf.name}')
709
+ # Load PDF as documents
710
+ self.documents.extend(
711
+ self.data_loader_utils_obj.load_documents_from_pdf(
712
+ doc_filepath=pdf.name,
713
+ doc_type=doc_type
714
+ )
715
+ )
716
+
717
+ elif doc_type == 'textfile':
718
+ if not isinstance(files_or_urls, list):
719
+ files_or_urls = [files_or_urls]
720
+ for text_file in files_or_urls:
721
+ if not text_file.name.endswith('.txt'):
722
+ logger.warning(f'Found a file other than .txt format. Cannot load {text_file.name} file!')
723
+ continue
724
+ logger.info(f'Loading textfile from: {text_file.name}')
725
+ # Load textfile as documents
726
+ self.documents.extend(
727
+ self.data_loader_utils_obj.load_documents_from_text(
728
+ doc_filepath=text_file.name,
729
+ doc_type=doc_type
730
+ )
731
+ )
732
+
733
+ elif doc_type == 'online_pdf':
734
+ files_or_urls = self.utils_obj.split_text(files_or_urls)
735
+ # Load online_pdfs as documents
736
+ self.documents.extend(
737
+ self.data_loader_utils_obj.load_documents_from_pdf(
738
+ doc_type=doc_type,
739
+ urls=files_or_urls
740
+ )
741
+ )
742
+
743
+ elif doc_type == 'urls':
744
+ files_or_urls = self.utils_obj.split_text(files_or_urls)
745
+ # Load URLs as documents
746
+ self.documents.extend(
747
+ self.data_loader_utils_obj.load_documents_from_urls(
748
+ doc_type=doc_type,
749
+ urls=files_or_urls
750
+ )
751
+ )
752
+
753
+ logger.info(f'Uploaded documents from: {doc_type} loaded successfully!')
754
+
755
+
756
+ def upload_data(
757
+ self,
758
+ doc_type,
759
+ files_or_urls,
760
+ index_category
761
+ ):
762
+ logger.info(f'Uploading data for: {index_category}; from: {doc_type}')
763
+
764
+ self.documents = []
765
+ self.index = None
766
+
767
+ # Create documents of the uploaded files
768
+ self.load_uploaded_documents(
769
+ doc_type,
770
+ files_or_urls
771
+ )
772
+
773
+ # Create the index from documents for search/retrieval
774
+ self.index = self.create_index()
775
+
776
+ # Update the existing index with the newly added data
777
+ self.upsert_index(
778
+ doc_type=doc_type,
779
+ index_category=index_category
780
+ )
781
+
782
+ logger.info(f'{index_category}-{doc_type} data uploaded successfully!')
783
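A hypothetical call into upload_data() (the URL, doc_type, and index_category values are placeholders) runs through exactly these steps and ends with upsert_index() folding the new index into the per-category master:

# e.g. from a Gradio upload callback
index_wrapper.upload_data(
    doc_type="online_pdf",
    files_or_urls="https://example.org/ipm_package.pdf",
    index_category="pest_management",
)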
+
784
+
785
+ def upsert_index(
786
+ self,
787
+ doc_type,
788
+ index_category
789
+ ):
790
+ """
791
+ Updates the index of the given index_category-doc_type, if present.
792
+ Creates a new index if index_category-doc_type index is not present.
793
+ Also updates the master index for the given index_category.
794
+ """
795
+ if not self.index:
796
+ return
797
+
798
+ logger.info(f'Upserting index for: {index_category}-{doc_type}')
799
+
800
+ if not self.index_category_doc_type_wise_index.get(index_category, None):
801
+ """
802
+ If the index_category index does not exist
803
+ Steps:
804
+ - set index_category index
805
+ - set doc_type index
806
+ - Store new index_category index as master
807
+ - Store new doc_type index
808
+ """
809
+ logger.info(f'Master index does not exist for: {index_category}. A new {index_category} master index & {doc_type} index would be created.')
810
+ self.index_category_doc_type_wise_index.setdefault(index_category, {})
811
+ # Set a master index only if it doesn't exist. Else keep its value as-is.
812
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
813
+ # Set an index for the given doc_type only if it doesn't exist. Else keep its value as-is.
814
+ self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
815
+
816
+ elif not self.index_category_doc_type_wise_index[index_category].get(doc_type, None):
817
+ """
818
+ If the doc_type index does not exist
819
+ Steps:
820
+ - set doc_type index
821
+ - if master index does not exist for the index_category - set a master index
822
+ - if master index exists - update the master index to merge it with doc_type index
823
+ - Store new/updated index_category index as master
824
+ - Store new doc_type index
825
+ """
826
+ logger.info(f'{doc_type} index does not exist for: {index_category}-{doc_type}. A new {doc_type} index would be created.')
827
+ # create doc_type index
828
+ self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
829
+ # if master index does not exist for the index_category - create a master index
830
+ if not self.index_category_doc_type_wise_index[index_category].get(constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE, None):
831
+ logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
832
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
833
+
834
+ else:
835
+ """
836
+ If the new document belongs to an existing index_category & doc_type
837
+ Steps:
838
+ - if master index does not exist for the index_category - set a master index
839
+ - if master index exists - update the master index to merge it with doc_type index
840
+ - update the doc_type index
841
+ - Store updated index_category index as master
842
+ - Store updated doc_type index
843
+ """
844
+ # if master index does not exist for the index_category - create a master index
845
+ if not self.index_category_doc_type_wise_index[index_category].get(constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE, None):
846
+ logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
847
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
848
+ # Merge new self.index with existing doc_type index
849
+ self.index_category_doc_type_wise_index[index_category][doc_type].merge_from(self.index)
850
+ # Update self.index to store/overwrite the existing index with the updated index
851
+ self.index = self.index_category_doc_type_wise_index[index_category][doc_type]
852
+
853
+
854
+ # Store newly created/merged index
855
+ self.store_index(
856
+ index=self.index,
857
+ index_filepath=self.get_index_filepath(
858
+ index_category=index_category,
859
+ doc_type=doc_type
860
+ )
861
+ )
862
+
863
+ # Merge and store master index for index_category
864
+ self.merge_store_master_index(
865
+ index_category=index_category
866
+ )
867
+
868
+ logger.info(f'Index for: {index_category}-{doc_type} upserted successfully!')
869
+
870
+
871
+ def delete_index(
872
+ self,
873
+ ids: Optional[List[str]] = None,
874
+ # filter: Optional[DocumentMetadataFilter] = None,
875
+ delete_all: Optional[bool] = None,
876
+ ):
877
+ """
878
+ Removes vectors by ids, filter, or everything in the datastore.
879
+ Multiple parameters can be used at once.
880
+ Returns whether the operation was successful.
881
+ """
882
+ logger.info(f'Deleting index')
883
+
884
+ raise NotImplementedError
885
+
886
+ # NOTE: we can delete a specific collection
887
+ self.index.delete_collection()
888
+ self.index.persist()
889
+
890
+ # Or just nuke the persist directory
891
+ # !rm -rf self.index_filepath
src/mandi_price.py ADDED
@@ -0,0 +1,33 @@
1
+ import requests
2
+
3
+
4
+ class MANDI_PRICE:
5
+ def __init__(self):
6
+ self.base_url = "https://enam.gov.in/web/Ajax_ctrl/trade_data_list"
7
+ # "https://enam.gov.in/web/dashboard/trade-data",
8
+ # "https://enam.gov.in/web/dashboard/trade_data_list",
9
+
10
+
11
+ def get_mandi_price(self,
12
+ state_name,
13
+ apmc_name,
14
+ commodity_name,
15
+ from_date,
16
+ to_date
17
+ ):
18
+ # Prepare the payload for POST request
19
+ payload = f"language=en&stateName={state_name}&apmcName={apmc_name}&commodityName={commodity_name}&fromDate={from_date}&toDate={to_date}"
20
+
21
+ headers = {
22
+ "Content-type": "application/x-www-form-urlencoded; charset=UTF-8",
23
+ "Referer": "https://enam.gov.in/web/dashboard/trade-data",
24
+ "Accept": "application/json, text/javascript, */*; q=0.01",
25
+ }
26
+
27
+ response = requests.post(
28
+ self.base_url,
29
+ data=payload,  # send the form-encoded string as the request body to match the Content-type header
30
+ headers=headers,
31
+ )
32
+
33
+ return response.json()
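A minimal usage sketch for this class; the state, APMC, commodity, and date values are illustrative only and must match what the eNAM portal accepts:

from src.mandi_price import MANDI_PRICE

mandi = MANDI_PRICE()
prices = mandi.get_mandi_price(
    state_name="MAHARASHTRA",
    apmc_name="Nagpur",
    commodity_name="Orange",
    from_date="2023-04-01",
    to_date="2023-04-07",
)
print(prices)  # parsed JSON returned by the eNAM trade_data_list endpoint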
src/ner_detection.py ADDED
@@ -0,0 +1,58 @@
1
+ import gradio as gr
2
+ import openai
3
+ import os
4
+ import re
5
+ import ast
6
+
7
+ openai.api_key = "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"
8
+ SYSTEM_PROMPT = "You are a smart and intelligent Named Entity Recognition (NER) system. I will provide you the definition of the entities you need to extract, the sentence from where your extract the entities and the output format with examples."
9
+ USER_PROMPT_1 = "Are you clear about your role?"
10
+ ASSISTANT_PROMPT_1 = "Sure, I'm ready to help you with your NER task. Please provide me with the necessary information to get started."
11
+ GUIDELINES_PROMPT = (
12
+ """Entity Definition:\n"
13
+ "1. PEST NAME: Name of the pest which has attacked a particular crop which may lead to crop damage.\n"
14
+ "2. CROP DISEASE: Any kind of crop disease which occurs in agriculture land in india and nearby resgions.\n"
15
+ "3. WEATHER CONDITION: Severe climate conditions like heavy rainfall, hailstorm which has destroyed crops.\n"
16
+ "\n"
17
+ "Output Format:\n"
18
+ "{{'PEST NAME': [list of entities present], 'CROP DISEASE': [list of entities present], 'WEATHER CONDITION': [list of entities present]}}\n"
19
+ "If no entities are presented in any categories keep it None\n"
20
+ "\n"
21
+ "Examples:\n"
22
+ "\n"
23
+ "1. Sentence: Pest attack on maize crop in lower Kangra : The Tribune India. Farmers in lower Kangra are a harried lot as the fall armyworm pest has attacked their maize crop. 'Kolshi' continues to affect Vidarbha's Orange crop cultivation (Citrus Black Fly) | Krishak Jagat. A total of 1,50,000 hectares of land in the Vidarbha region is planted with oranges, and of them, 25% are seriously damaged by Kolshi, a citrus black fly disease. India's June tea output drops 17% as floods hit plucking | Mint. India's June tea production fell 17.4% from a year earlier to 141.31 million kilograms, the state-run Tea Board said, as floods and pest attack dented output in the main producing region\n"
24
+ "Output: {{'PEST NAME': ['fall armyworm'], 'CROP DISEASE': ['citrus black fly disease'], 'WEATHER CONDITION': ['floods']}}\n"
25
+ "\n"
26
+ "2. Sentence: ICAR issues pest alert in Leparada, W/Siang | The Arunachal Times. 70 percent prevalence of fall army worm in maize fields in Pagi, Gori and Bam villages in Leparada district and Darka, Kombo and Jirdin villages in West Siang district was observed. After maize, Kangra vegetable crops under white fly attack : The Tribune India. Vegetable crops are under attack by white fly in the lower hills of Kangra district. The pest attack comes after the recent damage caused by fall armyworm to the maize crop in the area. Pest attacks on paddy crop worry farmers in the integrated Karimnagar district | Hindudayashankar. Crops withering due to stem borer, leaf folder and rice blast; farmers have to incur huge expenditures to control menace. Cyclone Amphan damages crop, vegetable prices shoot up | Cities News,The Indian Express. Cyclone Amphan has damaged vegetables across South Bengal. Farmers lost 80 to 90 per cent of crop as fields were flooded.\n"
27
+ "Output: {{'PEST NAME': ['fall army worm', 'white fly attack', 'stem borer', 'leaf folder'], 'CROP DISEASE': ['rice blast'], 'WEATHER CONDITION': ['Cyclone Amphan']}}\n"
28
+ "\n"
29
+ "3. Sentence: {}\n"
30
+ "Output: """
31
+ )
32
+
33
+ def openai_chat_completion_response(news_article_text):
34
+ final_prompt = GUIDELINES_PROMPT.format(news_article_text)
35
+ response = openai.ChatCompletion.create(
36
+ model="gpt-3.5-turbo",
37
+ messages=[
38
+ {"role": "system", "content": SYSTEM_PROMPT},
39
+ {"role": "user", "content": USER_PROMPT_1},
40
+ {"role": "assistant", "content": ASSISTANT_PROMPT_1},
41
+ {"role": "user", "content": final_prompt}
42
+ ]
43
+ )
44
+ return response['choices'][0]['message']['content'].strip(" \n")
45
+
46
+ # def preprocess(prompt):
47
+ # return GUIDELINES_PROMPT.format(prompt)
48
+ # def main():
49
+ # my_sentence = "Hundreds of hectares of land under the cotton crop, once referred to as white gold, has come under attack of a wide range of insects like whitefly, pink bollworm and mealybug. This is likely to hit the cotton production this year."
50
+ # GUIDELINES_PROMPT = GUIDELINES_PROMPT.format(my_sentence)
51
+ # # print(GUIDELINES_PROMPT)
52
+ # ners = openai_chat_completion_response(GUIDELINES_PROMPT)
53
+ # print(ners)
54
+
55
+ import gradio as gra
56
+ # Define the Gradio interface and its parameters
57
+ app = gra.Interface(fn = openai_chat_completion_response, inputs="text", outputs="text")
58
+ app.launch(share=True)
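Assuming OPENAI_API_KEY is set in the environment, the helper can also be called directly; the sentence below is made up and the printed output is only the kind of string the prompt asks for:

sentence = "Whitefly attack damages the cotton crop after heavy rainfall in Punjab."
entities = openai_chat_completion_response(sentence)
print(entities)
# e.g. {'PEST NAME': ['whitefly'], 'CROP DISEASE': None, 'WEATHER CONDITION': ['heavy rainfall']}
# ast.literal_eval(entities) can then turn the returned string into a Python dict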
src/translator.py ADDED
@@ -0,0 +1,61 @@
1
+ import src.constants as constants_utils
2
+ import requests
3
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
+ from mosestokenizer import *
5
+ from indicnlp.tokenize import sentence_tokenize
6
+ from googletrans import Translator, constants
7
+
8
+
9
+ class TRANSLATOR:
10
+ def __init__(self):
11
+ pass
12
+
13
+
14
+ def split_sentences(self, paragraph, language):
15
+ if language == "en":
16
+ with MosesSentenceSplitter(language) as splitter:
17
+ return splitter([paragraph])
18
+ elif language in constants_utils.INDIC_LANGUAGE:
19
+ return sentence_tokenize.sentence_split(paragraph, lang=language)
20
+
21
+
22
+ def get_in_hindi(self, payload):
23
+ tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
24
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
25
+ article = self.split_sentences(payload['inputs'], 'en')
26
+ # inputs = tokenizer(payload['input'], return_tensors="pt")
27
+ out_text = ""
28
+ for a in article:
29
+ inputs = tokenizer(a, return_tensors="pt")
30
+ translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["hin_Deva"], max_length=100)
31
+ translated_sent = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
32
+ out_text += translated_sent + ' '  # append each translated sentence (str.join here would interleave characters, not concatenate)
33
+ return out_text.strip()
34
+
35
+
36
+ def get_in_indic(self, text, language='Hindi'):
37
+ tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
38
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
39
+ inputs = tokenizer(text, return_tensors="pt")
40
+
41
+ code = "eng_Latn"
42
+ if language == 'Hindi':
43
+ code= "hin_Deva"
44
+ elif language == 'Marathi':
45
+ code = "mar_Deva"
46
+
47
+ translated_tokens = model.generate(
48
+ **inputs,
49
+ forced_bos_token_id=tokenizer.lang_code_to_id[code],
50
+ max_length=1000
51
+ )
52
+
53
+ out_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
54
+ return out_text
55
+
56
+
57
+ def get_indic_google_translate(self, text, language='Hindi'):
58
+ # Init the Google API translator
59
+ translator = Translator()
60
+ translations = translator.translate(text, dest=constants_utils.INDIC_LANGUAGE.get(language, 'en'))
61
+ return str(translations.text)
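A quick usage sketch (the input sentence is arbitrary; note that the NLLB model is downloaded and loaded inside each call, which is slow):

from src.translator import TRANSLATOR

translator = TRANSLATOR()
print(translator.get_in_indic("The maize crop is affected by fall armyworm.", language='Hindi'))

# Or fall back to the Google Translate wrapper
print(translator.get_indic_google_translate("The maize crop is affected by fall armyworm.", language='Marathi'))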
src/utils.py ADDED
@@ -0,0 +1,68 @@
1
+ import os
2
+ import re
3
+ import pandas as pd
4
+ from urllib.parse import urlparse
5
+
6
+ import logging
7
+ logger = logging.getLogger(__name__)
8
+ logging.basicConfig(
9
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
10
+ )
11
+
12
+
13
+ class UTILS:
14
+ def __init__(self):
15
+ pass
16
+
17
+
18
+ def split_text(
19
+ self,
20
+ text
21
+ ):
22
+ text = text.split(',')
23
+ text = [t.strip() for t in text]
24
+ return text
25
+
26
+
27
+ def replace_newlines_and_spaces(
28
+ self,
29
+ text
30
+ ):
31
+ # Replace all newline characters with spaces
32
+ text = text.replace("\n", " ")
33
+ # Replace multiple spaces with a single space
34
+ text = re.sub(r'\s+', ' ', text)
35
+ return text
36
+
37
+
38
+ def clean_df(
39
+ self,
40
+ df,
41
+ dropna=True,
42
+ fillna=False
43
+ ):
44
+ if fillna:
45
+ df.fillna('', inplace=True)
46
+ if dropna:
47
+ df.dropna(inplace=True)
48
+ # df = df[~df.isna()]
49
+ df = df.drop_duplicates().reset_index(drop=True)
50
+ return df
51
+
52
+
53
+ def validate_url_format(
54
+ self,
55
+ urls,
56
+ url_type='urls'
57
+ ):
58
+ valid_urls = []
59
+ for url in urls:
60
+ result = urlparse(url)
61
+ # Check if the url is valid
62
+ if all([result.scheme, result.netloc]):
63
+ # Online PDF urls should end with .pdf extension
64
+ if url_type == 'online_pdf' and not url.endswith('.pdf'):
65
+ continue
66
+ valid_urls.append(url)
67
+ logger.info(f'Valid URLs are: {valid_urls}')
68
+ return valid_urls
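These helpers are small enough to illustrate in one go (the URLs are placeholders):

from src.utils import UTILS

utils = UTILS()
print(utils.split_text("wheat, rice , maize"))         # ['wheat', 'rice', 'maize']
print(utils.replace_newlines_and_spaces("a\n b   c"))  # 'a b c'
print(utils.validate_url_format(
    ["https://example.org/guide.pdf", "not-a-url"],
    url_type='online_pdf',
))                                                      # ['https://example.org/guide.pdf']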
src/weather.py ADDED
@@ -0,0 +1,87 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup as bs
3
+ import src.constants as constants_utils
4
+
5
+
6
+ class WEATHER:
7
+ def __init__(self):
8
+ self.base_url = 'https://nwp.imd.gov.in/blf/blf_temp'
9
+ self.headers = {
10
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
11
+ }
12
+
13
+ self.state_names_codes = {}
14
+ self.districts = []
15
+
16
+
17
+ def get_state_names_codes(
18
+ self
19
+ ):
20
+ response = requests.get(
21
+ self.base_url,
22
+ headers=self.headers,
23
+ )
24
+
25
+ soup = bs(response.text, 'html.parser')
26
+ for option in soup.find_all('option'):
27
+ if option.text.strip() == 'Select':
28
+ continue
29
+ self.state_names_codes[option.text.strip()] = str(option['value'].split('=')[-1][:2])
30
+
31
+ return self.state_names_codes
32
+
33
+
34
+ def get_district_names(
35
+ self,
36
+ state_name
37
+ ):
38
+ url = f"{self.base_url}/dis.php?value={constants_utils.WEATHER_FORECAST_STATE_CODES.get(state_name, '') + state_name}"
39
+ response = requests.get(
40
+ url,
41
+ headers=self.headers,
42
+ )
43
+
44
+ soup = bs(response.text, 'html.parser')
45
+ self.districts = soup.findAll('select', {'name': 'dis'}, limit=None)
46
+ self.districts = [district.strip() for district in self.districts[0].text.split('\n') if district and district != 'Select']
47
+ return self.districts
48
+
49
+
50
+ # Weather forecast from Govt. website
51
+ def get_weather_forecast(
52
+ self,
53
+ state,
54
+ district,
55
+ is_block_level=False
56
+ ):
57
+ self.district_url = f"{self.base_url}/block.php?dis={constants_utils.WEATHER_FORECAST_STATE_CODES.get(state, '') + district}"
58
+ self.block_url = f'{self.base_url}/table2.php'
59
+
60
+ response = requests.get(self.district_url if not is_block_level else self.block_url)
61
+ soup = bs(response.text, 'html.parser')
62
+ scripts = soup.findAll('font')[0]
63
+ return scripts.text
64
+
65
+
66
+ # Weather using Google weather API
67
+ def get_weather(
68
+ self,
69
+ city
70
+ ):
71
+ city = city + " weather"
72
+ city = city.replace(" ", "+")
73
+
74
+ headers = {
75
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
76
+ }
77
+ response = requests.get(
78
+ f'https://www.google.com/search?q={city}&oq={city}&aqs=chrome.0.35i39l2j0l4j46j69i60.6128j1j7&sourceid=chrome&ie=UTF-8', headers=headers)
79
+
80
+ soup = bs(response.text, 'html.parser')
81
+ location = soup.select('#wob_loc')[0].getText().strip()
82
+ time = soup.select('#wob_dts')[0].getText().strip()
83
+ info = soup.select('#wob_dc')[0].getText().strip()
84
+ temperature = soup.select('#wob_tm')[0].getText().strip()
85
+ temperature = temperature + "°C"
86
+
87
+ return time, info, temperature
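A hedged usage sketch; both methods scrape live pages, so the exact output depends on the IMD and Google responses at run time, and 'Delhi' is just an example city:

from src.weather import WEATHER

weather = WEATHER()
states = weather.get_state_names_codes()   # state name -> code mapping scraped from the IMD page
time, info, temperature = weather.get_weather("Delhi")
print(time, info, temperature)             # e.g. 'Monday 2:00 pm', 'Sunny', '34°C'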
src/web_crawler.py ADDED
@@ -0,0 +1,58 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup as bs
3
+
4
+
5
+ class LOAD_ONLINE_PDF_IPM_PACKAGES:
6
+ def __init__(self):
7
+ self.base_url = 'https://ppqs.gov.in/ipm-packages'
8
+
9
+ self.ipm_packages = []
10
+ self.pdfs_urls = []
11
+
12
+ self.headers = {
13
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
14
+ }
15
+
16
+
17
+ def _get_ipm_packages_name_list(self):
18
+ """
19
+ Parse HTML page to get the names of each IPM Package
20
+ """
21
+
22
+ response = requests.get(
23
+ self.base_url,
24
+ headers=self.headers,
25
+ )
26
+
27
+ soup = bs(response.text, 'html.parser')
28
+ packages = soup.findAll('span', {'class': 'field-content region-name'}, limit=None)
29
+ for package in packages:
30
+ self.ipm_packages.append(package.a['href'].split('/')[-1])
31
+
32
+
33
+ def get_ipm_packages_pdfs_list(self):
34
+ """
35
+ Parse HTML page to get the PDF URLs of each IPM Package
36
+ """
37
+ self._get_ipm_packages_name_list()
38
+
39
+ for ip in self.ipm_packages:
40
+ source_url = f'{self.base_url}/{ip}'
41
+ print(f'Loading PDFs from: {source_url}')
42
+
43
+ response = requests.get(
44
+ source_url,
45
+ headers=self.headers,
46
+ )
47
+
48
+ soup = bs(response.text, 'html.parser')
49
+ urls = soup.findAll('td', {'class': 'views-field views-field-php'}, limit=None)
50
+ for url in urls:
51
+ self.pdfs_urls.append(url.a['href'])
52
+
53
+
54
+ def get_ipm_packages_pdfs_urls():
55
+ pdf = LOAD_ONLINE_PDF_IPM_PACKAGES()
56
+ pdf.get_ipm_packages_pdfs_list()
57
+ print('Total pdfs:', len(pdf.pdfs_urls))
58
+ return pdf.pdfs_urls
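As written, get_ipm_packages_pdfs_urls takes no self argument, so it is presumably intended as a module-level helper; assuming that, a sketch of feeding its output to the online_pdf loader is:

from src.web_crawler import get_ipm_packages_pdfs_urls

pdf_urls = get_ipm_packages_pdfs_urls()   # crawls https://ppqs.gov.in/ipm-packages for PDF links
print(len(pdf_urls), 'IPM package PDFs found')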