Chintan Donda committed
Commit b16454e
1 Parent(s): 3cd2948

- Adding new widgets for Uploading Custom data
- Updating web crawler, LangChain utils, and other scripts
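
As a quick illustration of the upload flow the new widgets build on, here is a minimal, self-contained sketch assuming Gradio 3.x (the version this app targets). load_files is a hypothetical stand-in for DomState._upload_file / click_handler_for_load_files_urls in app.py below.

import gradio as gr

def load_files(files):
    # Hypothetical handler: the real app routes the uploaded files through
    # DomState.click_handler_for_load_files_urls() to update the index.
    return [f.name for f in files]

with gr.Blocks() as demo:
    file_output = gr.File()
    upload_button = gr.UploadButton("Click to Upload PDF Files", file_types=['.pdf'], file_count="multiple")
    # Show the uploaded file paths once the upload completes
    upload_button.upload(load_files, upload_button, file_output)

demo.launch()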

app.py CHANGED
@@ -1,224 +1,440 @@
1
  import gradio as gr
2
  import os
3
  import datetime
4
- import kkms_kssw as kkms_kssw
5
  import utils.constants as constants_utils
6
 
7
 
8
- # Create and launch Gradio Web UI
9
  class DomState:
10
- def __init__(self):
11
- self.relevant_paragraphs = ''
12
- self.answer = ''
13
- self.summary = ''
14
- self.mandi_price = ''
15
- self.mandi_from_date = (datetime.datetime.now() - datetime.timedelta(days=2)).strftime('%Y-%m-%d')
16
- self.mandi_to_date = datetime.datetime.now().strftime('%Y-%m-%d')
17
- self.weather_info = ''
18
- self.weather_forecast = ''
19
- self.weather_forecast_summary = ''
20
- self.indic_lang_answer = ''
21
-
22
-
23
- # Initialize index (vector store)
24
- self.kkms_kssw_obj = kkms_kssw.KKMS_KSSW()
25
- self.kkms_kssw_obj.initialize_index(constants_utils.INDEX_FILENAME, index_type='GPTSimpleVectorIndex')
26
-
27
-
28
- def click_handler_for_get_relevant_paragraphs(self,
29
- question,
30
- mode='default',
31
- response_mode='default',
32
- similarity_top_k=2,
33
- ):
34
- self.relevant_paragraphs = self.kkms_kssw_obj.query(question,
35
- mode=mode,
36
- response_mode=response_mode,
37
- similarity_top_k=similarity_top_k,
38
- # required_keywords=required_keywords_list,
39
- # exclude_keywords=exclude_keywords_list,
40
- )
41
- return self.relevant_paragraphs
42
-
43
-
44
- def click_handler_for_summary(self, answer):
45
- self.sumamry = self.kkms_kssw_obj.langchain_utils_obj.get_textual_summary(answer)
46
- return self.sumamry
47
-
48
-
49
- def click_handler_for_get_answer(self,
50
- relevant_paragraphs,
51
- question
52
- ):
53
- self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(relevant_paragraphs, question)
54
- return self.answer
55
-
56
-
57
- def click_handler_for_mandi_price(self,
58
- state_name,
59
- apmc_name,
60
- commodity_name,
61
- from_date,
62
- to_date
63
- ):
64
- if state_name and apmc_name and commodity_name and from_date and to_date:
65
- self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
66
- return self.mandi_price
67
-
68
-
69
- def click_handler_for_get_weather(self, city):
70
- time, info, temperature = self.kkms_kssw_obj.weather_utils_obj.get_weather(city)
71
- self.weather_info = f'Weather in {city.capitalize()} on {time} is {temperature} with {info}.'
72
- return self.weather_info
73
-
74
-
75
- def click_handler_for_get_weather_forecast(self, state, district):
76
- self.weather_forecast = self.kkms_kssw_obj.weather_utils_obj.get_weather_forecast(state, district)
77
- return self.weather_forecast
78
-
79
-
80
- def click_handler_for_weather_forecast_summary(self, weather_forecast):
81
- self.weather_forecast_summary = self.kkms_kssw_obj.langchain_utils_obj.get_weather_forecast_summary(weather_forecast)
82
- return self.weather_forecast_summary
83
-
84
-
85
- def click_handler_for_get_indic_answer(self, eng_ans, language='Hindi'):
86
- self.indic_lang_answer = self.kkms_kssw_obj.translator_utils_obj.get_indic_google_translate(eng_ans, language)
87
- return self.indic_lang_answer
88
-
89
-
90
- def select_widget(self, choice):
91
- if choice == "General":
92
- return [
93
- gr.update(visible=True),
94
- gr.update(visible=False),
95
- gr.update(visible=False)
96
- ]
97
-
98
- elif choice == "Mandi Price":
99
- return [
100
- gr.update(visible=False),
101
- gr.update(visible=True),
102
- gr.update(visible=False)
103
- ]
104
-
105
- elif choice == "Weather":
106
- return [
107
- gr.update(visible=False),
108
- gr.update(visible=False),
109
- gr.update(visible=True)
110
- ]
111
-
112
- else:
113
- return gr.update(visible=False)
 
114
 
115
 
116
  with gr.Blocks(title='KKMS-KSSW Demo') as demo:
117
- dom = DomState()
118
-
119
- radio = gr.Radio(
120
- ["General", "Mandi Price", "Weather"], label="Query related to"
121
- )
122
-
123
- ########################### Widget for Govt. Policy #################################################
124
- with gr.Row(visible=True) as rowGeneral:
125
- with gr.Column(scale=1, min_width=600):
126
- with gr.Tab(label='Relevant paragraphs'):
127
- question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
128
- # Get the Relevant paragraphs for the question asked
129
- relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
130
- b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
131
- b_relevant_paragraphs.click(fn=dom.click_handler_for_get_relevant_paragraphs, inputs=question, outputs=[relevant_paragraphs])
132
-
133
- with gr.Column(scale=1, min_width=600):
134
- with gr.Tab(label='Extractive Summary'):
135
- # Get the extractive text summary from the retrieved Relevant paragraphs
136
- summary = gr.Textbox(label="Extractive Summary is:", value=dom.summary, interactive=False)
137
- b_summary = gr.Button("Extract Summary").style(size='sm')
138
- b_summary.click(fn=dom.click_handler_for_summary, inputs=relevant_paragraphs, outputs=[summary])
139
-
140
- # Get the exact answer for the question asked from the retrieved Relevant paragraphs
141
- with gr.Row():
142
- with gr.Column(scale=1, min_width=600):
143
- with gr.Tab(label='Answer'):
144
- answer = gr.Textbox(label="Answer is:", value=dom.answer, interactive=False)
145
- b_answer = gr.Button("Get Answer").style(size='sm')
146
- b_answer.click(fn=dom.click_handler_for_get_answer, inputs=[relevant_paragraphs, question], outputs=[answer])
147
-
148
- # Covert the answer to Indian language
149
- with gr.Row():
150
- with gr.Column(scale=1, min_width=600):
151
- with gr.Tab(label='Answer in selected language'):
152
- # Select the language
153
- language = gr.Dropdown(
154
- ['English', 'Hindi', 'Gujarati', 'Marathi', 'Kannada', 'Bengali', 'Panjabi', 'Telugu', 'Tamil', 'Malayalam'],
155
- label="Select language")
156
- indic_lang_answer = gr.Textbox(label="Answer in the selected language is:", value=dom.indic_lang_answer, interactive=False)
157
- b_indic_lang_answer = gr.Button("Get answer in selected language").style(size='sm')
158
- b_indic_lang_answer.click(fn=dom.click_handler_for_get_indic_answer, inputs=[answer, language], outputs=[indic_lang_answer])
159
-
160
-
161
- ########################## Widget for Mandi Price ###################################################
162
- with gr.Row(visible=False) as rowMandiPrice:
163
- with gr.Column(scale=1, min_width=600):
164
- # Select State
165
- state_name = gr.Dropdown(['ANDAMAN AND NICOBAR ISLANDS', 'ANDHRA PRADESH', 'ASSAM', 'BIHAR', 'CHANDIGARH', 'CHHATTISGARH', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH', 'JAMMU AND KASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA', 'MADHYA PRADESH', 'MAHARASHTRA', 'NAGALAND', 'ODISHA', 'PUDUCHERRY', 'PUNJAB', 'RAJASTHAN', 'TAMIL NADU', 'TELANGANA', 'TRIPURA', 'UTTAR PRADESH', 'UTTARAKHAND', 'WEST BENGAL'], label="Select state")
166
-
167
- # APMC name
168
- apmc_name = gr.Textbox(label="Enter APMC name", placeholder='Type the APMC name here')
169
-
170
- # APMC name
171
- commodity_name = gr.Textbox(label="Enter Commodity name", placeholder='Type the Commodity name here')
172
-
173
- # From/To date in yyyy-mm-dd format
174
- from_date = gr.Textbox(label="From date?", value=dom.mandi_from_date, placeholder='Please enter the From date here in yyyy-mm-dd format')
175
- to_date = gr.Textbox(label="To date?", value=dom.mandi_to_date, placeholder='Please enter the To date here in yyyy-mm-dd format')
176
-
177
- with gr.Column(scale=1, min_width=600):
178
- mandi_price = gr.Textbox(label=f"Mandi Price is:", value=dom.mandi_price, interactive=False)
179
- b_summary = gr.Button("Get Mandi Price").style(size='sm')
180
- b_summary.click(fn=dom.click_handler_for_mandi_price, inputs=[state_name, apmc_name, commodity_name, from_date, to_date], outputs=[mandi_price])
181
-
182
-
183
- ########################## Widget for Weather Info ###################################################
184
- with gr.Row(visible=False) as rowWeather:
185
- with gr.Column(scale=1, min_width=600):
186
- with gr.Tab(label='Weather Info'):
187
- city = gr.Textbox(label="Enter city name", placeholder='Type the city name here')
188
- weather = gr.Textbox(label=f"Current weather is:", value=dom.weather_info, interactive=False)
189
- b_weather = gr.Button("Get weather info").style(size='sm')
190
- b_weather.click(fn=dom.click_handler_for_get_weather, inputs=city, outputs=[weather])
191
-
192
- ########### Weather Forecast ###########
193
- with gr.Column(scale=1, min_width=600):
194
- with gr.Tab(label='Weather Forecast for next 5 days'):
195
- # Select the State
196
- state = gr.Dropdown(
197
- ['Andaman-Nicobar', 'Andhra-Pradesh', 'Arunachal-Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh', 'Dadra-and-Nagar-Haveli', 'Daman-and-Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal-Pradesh', 'Jammu-Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Lakshadweep', 'Madhya-Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Pondicherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamilnadu', 'Telangana', 'Tripura', 'Uttar-Pradesh', 'Uttarakhand', 'West-Bengal'],
198
- label="Select state"
199
- )
200
-
201
- # # Select district
202
- # district = gr.Dropdown(
203
- # weather_utils.STATES.get(state, {}),
204
- # label="Select district"
205
- # )
206
-
207
- district = gr.Textbox(label="Enter district name", placeholder='Type the district name here')
208
- district_weather = gr.Textbox(label=f"Weather forecast is:", value=dom.weather_forecast, interactive=False)
209
- bd_weather = gr.Button("Get weather forecast").style(size='sm')
210
- bd_weather.click(fn=dom.click_handler_for_get_weather_forecast, inputs=[state, district], outputs=[district_weather])
211
-
212
-
213
- with gr.Column(scale=1, min_width=600):
214
- with gr.Tab(label='Weather Forecast Summary'):
215
- # Get the summary of the weather forecast
216
- weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary is:", value=dom.weather_forecast_summary, interactive=False)
217
- b_weather_forecast_summary = gr.Button("Get Weather Forecast Summary").style(size='sm')
218
- b_weather_forecast_summary.click(fn=dom.click_handler_for_weather_forecast_summary, inputs=district_weather, outputs=[weather_forecast_summary])
219
-
220
-
221
- radio.change(fn=dom.select_widget, inputs=radio, outputs=[rowGeneral, rowMandiPrice, rowWeather])
222
 
223
 
224
  demo.launch(share=False)
 
1
  import gradio as gr
2
  import os
3
  import datetime
4
+
5
  import utils.constants as constants_utils
6
+ import kkms_kssw as kkms_kssw
7
+
8
+ import warnings
9
+ warnings.filterwarnings('ignore')
10
 
11
 
 
12
  class DomState:
13
+ def __init__(
14
+ self,
15
+ index_type,
16
+ load_from_existing_index_file
17
+ ):
18
+ self.index_type = index_type
19
+ self.load_from_existing_index_file = load_from_existing_index_file
20
+
21
+ self.relevant_paragraphs = ''
22
+ self.sources_relevant_paragraphs = ''
23
+ self.answer = ''
24
+ self.summary = ''
25
+ self.mandi_price = ''
26
+ self.mandi_from_date = (datetime.datetime.now() - datetime.timedelta(days=5)).strftime('%Y-%m-%d')
27
+ self.mandi_to_date = datetime.datetime.now().strftime('%Y-%m-%d')
28
+ self.weather_info = ''
29
+ self.weather_forecast = ''
30
+ self.weather_forecast_summary = ''
31
+ self.indic_translation = ''
32
+
33
+ # Initialize index (vector store) - This will create a new index from scratch if load_from_existing_index_file == False
34
+ self.kkms_kssw_obj = kkms_kssw.KKMS_KSSW()
35
+ self.kkms_kssw_obj.load_create_index()
36
+
37
+
38
+ def click_handler_for_get_relevant_paragraphs(
39
+ self,
40
+ question,
41
+ question_category='general'
42
+ ):
43
+ self.relevant_paragraphs = self.kkms_kssw_obj.query(
44
+ question=question,
45
+ question_category=question_category
46
+ )
47
+ if self.index_type in ['FAISS', 'Chroma']:
48
+ self.sources_relevant_paragraphs = [doc.metadata for doc in self.relevant_paragraphs]
49
+ self.relevant_paragraphs = [doc.page_content.replace('\n', '').replace('\t', ' ') for doc in self.relevant_paragraphs]
50
+ return self.relevant_paragraphs
51
+
52
+
53
+ def click_handler_for_relevant_paragraphs_source(
54
+ self,
55
+ relevant_paragraphs
56
+ ):
57
+ return self.sources_relevant_paragraphs
58
+
59
+
60
+ def click_handler_for_summary(
61
+ self,
62
+ answer
63
+ ):
64
+ self.summary = self.kkms_kssw_obj.langchain_utils_obj.get_textual_summary(answer)
65
+ return self.summary
66
+
67
+
68
+ def click_handler_for_get_answer(
69
+ self,
70
+ relevant_paragraphs, question
71
+ ):
72
+ self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(relevant_paragraphs, question)
73
+ return self.answer
74
+
75
+
76
+ def click_handler_for_mandi_price(self,
77
+ state_name,
78
+ apmc_name,
79
+ commodity_name,
80
+ from_date,
81
+ to_date
82
+ ):
83
+ if state_name and apmc_name and commodity_name and from_date and to_date:
84
+ self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
85
+ return self.mandi_price
86
+
87
+
88
+ def click_handler_for_get_weather(
89
+ self,
90
+ city
91
+ ):
92
+ time, info, temperature = self.kkms_kssw_obj.weather_utils_obj.get_weather(city)
93
+ self.weather_info = f'Weather in {city.capitalize()} on {time} is {temperature} with {info}.'
94
+ return self.weather_info
95
+
96
+
97
+ def click_handler_for_get_weather_forecast(
98
+ self,
99
+ state,
100
+ district
101
+ ):
102
+ self.weather_forecast = self.kkms_kssw_obj.weather_utils_obj.get_weather_forecast(state, district)
103
+ return self.weather_forecast
104
+
105
+
106
+ def click_handler_for_weather_forecast_summary(
107
+ self,
108
+ weather_forecast
109
+ ):
110
+ self.weather_forecast_summary = self.kkms_kssw_obj.langchain_utils_obj.get_weather_forecast_summary(weather_forecast)
111
+ return self.weather_forecast_summary
112
+
113
+
114
+ def click_handler_for_load_files_urls(
115
+ self,
116
+ doc_type,
117
+ files_or_urls,
118
+ index_category='general'
119
+ ):
120
+ self.kkms_kssw_obj.upload_data(
121
+ doc_type=constants_utils.DATA_SOURCES[doc_type],
122
+ files_or_urls=files_or_urls,
123
+ index_category=index_category
124
+ )
125
+
126
+
127
+ def click_handler_for_get_indic_translation(
128
+ self,
129
+ eng_ans,
130
+ language='Hindi'
131
+ ):
132
+ self.indic_translation = self.kkms_kssw_obj.translator_utils_obj.get_indic_google_translate(eng_ans, language)
133
+ return self.indic_translation
134
+
135
+
136
+ def _upload_file(self, files):
137
+ file_paths = [file.name for file in files]
138
+ return file_paths
139
+
140
+
141
+ def select_widget(
142
+ self,
143
+ choice
144
+ ):
145
+ if choice == "General":
146
+ return [
147
+ gr.update(visible=True),
148
+ gr.update(visible=False),
149
+ gr.update(visible=False),
150
+ gr.update(visible=False),
151
+ ]
152
+
153
+ elif choice == "Mandi Price":
154
+ return [
155
+ gr.update(visible=False),
156
+ gr.update(visible=True),
157
+ gr.update(visible=False),
158
+ gr.update(visible=False),
159
+ ]
160
+
161
+ elif choice == "Weather":
162
+ return [
163
+ gr.update(visible=False),
164
+ gr.update(visible=False),
165
+ gr.update(visible=True),
166
+ gr.update(visible=False),
167
+ ]
168
+
169
+ elif choice == "Load Custom Data":
170
+ return [
171
+ gr.update(visible=False),
172
+ gr.update(visible=False),
173
+ gr.update(visible=False),
174
+ gr.update(visible=True)
175
+ ]
176
+
177
+ else:
178
+ return gr.update(visible=False)
179
+
180
+
181
+ def select_files_urls(
182
+ self,
183
+ choice
184
+ ):
185
+ if choice == "PDF":
186
+ return [
187
+ gr.update(visible=True),
188
+ gr.update(visible=False),
189
+ gr.update(visible=False),
190
+ gr.update(visible=False),
191
+ ]
192
+
193
+ elif choice == "Online PDF":
194
+ return [
195
+ gr.update(visible=False),
196
+ gr.update(visible=True),
197
+ gr.update(visible=False),
198
+ gr.update(visible=False),
199
+ ]
200
+
201
+ elif choice == "Text File":
202
+ return [
203
+ gr.update(visible=False),
204
+ gr.update(visible=False),
205
+ gr.update(visible=True),
206
+ gr.update(visible=False),
207
+ ]
208
+
209
+ elif choice == "URLs":
210
+ return [
211
+ gr.update(visible=False),
212
+ gr.update(visible=False),
213
+ gr.update(visible=False),
214
+ gr.update(visible=True),
215
+ ]
216
+
217
+ else:
218
+ return [
219
+ gr.update(visible=True),
220
+ gr.update(visible=False),
221
+ gr.update(visible=False),
222
+ gr.update(visible=False),
223
+ ]
224
+
225
 
226
 
227
  with gr.Blocks(title='KKMS-KSSW Demo') as demo:
228
+ dom = DomState(
229
+ index_type=constants_utils.INDEX_TYPE,
230
+ load_from_existing_index_file=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
231
+ )
232
+
233
+ widgets = gr.Radio(
234
+ [
235
+ "General",
236
+ "Mandi Price",
237
+ "Weather",
238
+ "Load Custom Data"
239
+ ],
240
+ label="Query related to",
241
+ value="General"
242
+ )
243
+
244
+ #############################################################################
245
+ # Widget for Govt. Policy
246
+ with gr.Row(visible=True) as rowGeneral:
247
+ with gr.Column(scale=1, min_width=600):
248
+ with gr.Tab(label='Relevant paragraphs'):
249
+ question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
250
+ # Get the Relevant paragraphs for the question asked
251
+ relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
252
+ b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
253
+ b_relevant_paragraphs.click(
254
+ fn=dom.click_handler_for_get_relevant_paragraphs,
255
+ inputs=question,
256
+ outputs=[relevant_paragraphs]
257
+ )
258
+
259
+ with gr.Column(scale=1):
260
+ with gr.Tab(label='Sources of relevant paragraphs'):
261
+ # Get the Sources of relevant paragraphs
262
+ sources_relevant_paragraphs = gr.Textbox(label="Sources of relevant paragraphs are:", interactive=False)
263
+ b_sources_relevant_paragraphs = gr.Button("Get Sources of relevant paragraphs").style(size='sm')
264
+ b_sources_relevant_paragraphs.click(fn=dom.click_handler_for_relevant_paragraphs_source, inputs=relevant_paragraphs, outputs=[sources_relevant_paragraphs])
265
+
266
+ # NOTE: Don't show extractive summary unless requested by FTA.
267
+ # with gr.Column(scale=1, min_width=600):
268
+ # with gr.Tab(label='Extractive Summary'):
269
+ # # Get the extractive text summary from the retrieved Relevant paragraphs
270
+ # summary = gr.Textbox(label="Extractive Summary is:", value=dom.summary, interactive=False)
271
+ # b_summary = gr.Button("Extract Summary").style(size='sm')
272
+ # b_summary.click(fn=dom.click_handler_for_summary, inputs=relevant_paragraphs, outputs=[summary])
273
+
274
+ # Get the exact answer for the question asked from the retrieved Relevant paragraphs
275
+ with gr.Column(scale=1, min_width=600):
276
+ with gr.Tab(label='Answer'):
277
+ answer = gr.Textbox(label="Answer is:", value=dom.answer, interactive=False)
278
+ b_answer = gr.Button("Get Answer").style(size='sm')
279
+ b_answer.click(fn=dom.click_handler_for_get_answer, inputs=[relevant_paragraphs, question], outputs=[answer])
280
+
281
+ # Convert the answer to the selected Indian language
282
+ with gr.Column(scale=1, min_width=600):
283
+ with gr.Tab(label='Answer in selected language'):
284
+ # Select the language
285
+ language = gr.Dropdown(
286
+ ['English', 'Hindi', 'Gujarati', 'Marathi', 'Kannada', 'Bengali', 'Panjabi', 'Telugu', 'Tamil', 'Malayalam'],
287
+ label="Select language")
288
+ indic_lang_answer = gr.Textbox(label="Answer in the selected language is:", value=dom.indic_translation, interactive=False)
289
+ b_indic_lang_answer = gr.Button("Get answer in selected language").style(size='sm')
290
+ b_indic_lang_answer.click(fn=dom.click_handler_for_get_indic_translation, inputs=[answer, language], outputs=[indic_lang_answer])
291
+
292
+
293
+ #############################################################################
294
+ # Widget for Mandi Price
295
+ with gr.Row(visible=False) as rowMandiPrice:
296
+ with gr.Column(scale=1, min_width=600):
297
+ # Select State
298
+ state_name = gr.Dropdown(constants_utils.MANDI_PRICE_STATES, label="Select state")
299
+ # APMC name
300
+ apmc_name = gr.Textbox(label="Enter APMC name", placeholder='Type the APMC name here')
301
+ # Commodity name
302
+ commodity_name = gr.Textbox(label="Enter Commodity name", placeholder='Type the Commodity name here')
303
+
304
+ # From/To date in yyyy-mm-dd format
305
+ from_date = gr.Textbox(label="From date?", value=dom.mandi_from_date, placeholder='Please enter the From date here in yyyy-mm-dd format')
306
+ to_date = gr.Textbox(label="To date?", value=dom.mandi_to_date, placeholder='Please enter the To date here in yyyy-mm-dd format')
307
+
308
+ with gr.Column(scale=1, min_width=600):
309
+ mandi_price = gr.Textbox(label=f"Mandi Price is:", value=dom.mandi_price, interactive=False)
310
+ b_summary = gr.Button("Get Mandi Price").style(size='sm')
311
+ b_summary.click(fn=dom.click_handler_for_mandi_price, inputs=[state_name, apmc_name, commodity_name, from_date, to_date], outputs=[mandi_price])
312
+
313
+
314
+ #############################################################################
315
+ # Widget for Weather Info
316
+ with gr.Row(visible=False) as rowWeather:
317
+ ########### Weather Forecast ###########
318
+ with gr.Column(scale=1, min_width=600):
319
+ with gr.Tab(label='Weather Forecast for next 5 days'):
320
+ # Select the State
321
+ state = gr.Dropdown(
322
+ constants_utils.WEATHER_FORECAST_STATES,
323
+ label="Select state"
324
+ )
325
+
326
+ # # Select district
327
+ # district = gr.Dropdown(
328
+ # weather_utils.STATES.get(state, {}),
329
+ # label="Select district"
330
+ # )
331
+
332
+ district = gr.Textbox(label="Enter district name", placeholder='Type the district name here')
333
+ district_weather = gr.Textbox(label=f"Weather forecast is:", value=dom.weather_forecast, interactive=False)
334
+ bd_weather = gr.Button("Get weather forecast").style(size='sm')
335
+ bd_weather.click(fn=dom.click_handler_for_get_weather_forecast, inputs=[state, district], outputs=[district_weather])
336
+
337
+ with gr.Column(scale=1, min_width=600):
338
+ with gr.Tab(label='Weather Forecast Summary'):
339
+ # Get the summary of the weather forecast
340
+ weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary is:", value=dom.weather_forecast_summary, interactive=False)
341
+ b_weather_forecast_summary = gr.Button("Get Weather Forecast Summary").style(size='sm')
342
+ b_weather_forecast_summary.click(fn=dom.click_handler_for_weather_forecast_summary, inputs=district_weather, outputs=[weather_forecast_summary])
343
+
344
+ # Convert the weather forecast summary to the selected Indian language
345
+ with gr.Column(scale=1, min_width=600):
346
+ with gr.Tab(label='Weather Forecast Summary in selected language'):
347
+ # Select the language
348
+ language = gr.Dropdown(
349
+ ['English', 'Hindi', 'Gujarati', 'Marathi', 'Kannada', 'Bengali', 'Panjabi', 'Telugu', 'Tamil', 'Malayalam'],
350
+ label="Select language")
351
+ indic_weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary in the selected language is:", value=dom.indic_translation, interactive=False)
352
+ b_indic_weather_forecast_summary = gr.Button("Get answer in selected language").style(size='sm')
353
+ b_indic_weather_forecast_summary.click(fn=dom.click_handler_for_get_indic_translation, inputs=[weather_forecast_summary, language], outputs=[indic_weather_forecast_summary])
354
+
355
+ with gr.Column(scale=1, min_width=600):
356
+ # with gr.Tab(label='Weather Info'):
357
+ city = gr.Textbox(label="Enter city name", placeholder='Type the city name here')
358
+ weather = gr.Textbox(label=f"Current weather is:", value=dom.weather_info, interactive=False)
359
+ b_weather = gr.Button("Get weather info").style(size='sm')
360
+ b_weather.click(fn=dom.click_handler_for_get_weather, inputs=city, outputs=[weather])
361
+
362
+
363
+ #############################################################################
364
+ # Widget to load and process data from custom data sources
365
+ with gr.Row(visible=False) as rowLoadCustomData:
366
+ with gr.Column(scale=1, min_width=600):
367
+ with gr.Tab(label='Load Custom Data'):
368
+ doc_type = gr.Radio(
369
+ list(constants_utils.DATA_SOURCES.keys()),
370
+ label="Select data source (Supports uploading multiple Files/URLs)",
371
+ value="PDF"
372
+ )
373
+
374
+ with gr.Row(visible=True) as rowUploadPdf:
375
+ with gr.Column(scale=1, min_width=600):
376
+ file_output = gr.File()
377
+ upload_button = gr.UploadButton(
378
+ "Click to Upload PDF Files",
379
+ file_types=['.pdf'],
380
+ file_count="multiple"
381
+ )
382
+ upload_button.upload(dom._upload_file, upload_button, file_output)
383
+ b_files = gr.Button("Load PDF Files").style(size='sm')
384
+ b_files.click(
385
+ fn=dom.click_handler_for_load_files_urls,
386
+ inputs=[doc_type, upload_button]
387
+ )
388
+
389
+ with gr.Row(visible=False) as rowUploadOnlinePdf:
390
+ with gr.Column(scale=1, min_width=600):
391
+ urls = gr.Textbox(label="Enter URLs for Online PDF (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
392
+ b_urls = gr.Button("Load Online PDFs").style(size='sm')
393
+ b_urls.click(fn=dom.click_handler_for_load_files_urls, inputs=[doc_type, urls])
394
+
395
+ with gr.Row(visible=False) as rowUploadTextFile:
396
+ with gr.Column(scale=1, min_width=600):
397
+ file_output = gr.File()
398
+ upload_button = gr.UploadButton(
399
+ "Click to Upload Text Files",
400
+ file_types=['.txt'],
401
+ file_count="multiple"
402
+ )
403
+ upload_button.upload(dom._upload_file, upload_button, file_output)
404
+ b_files = gr.Button("Load Text Files").style(size='sm')
405
+ b_files.click(
406
+ fn=dom.click_handler_for_load_files_urls,
407
+ inputs=[doc_type, file_output]
408
+ )
409
+
410
+ with gr.Row(visible=False) as rowUploadUrls:
411
+ with gr.Column(scale=1, min_width=600):
412
+ urls = gr.Textbox(label="Enter URLs (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
413
+ b_urls = gr.Button("Load URLs").style(size='sm')
414
+ b_urls.click(fn=dom.click_handler_for_load_files_urls, inputs=[doc_type, urls])
415
+
416
+ doc_type.change(
417
+ fn=dom.select_files_urls,
418
+ inputs=doc_type,
419
+ outputs=[
420
+ rowUploadPdf,
421
+ rowUploadOnlinePdf,
422
+ rowUploadTextFile,
423
+ rowUploadUrls,
424
+ ],
425
+ )
426
+
427
+
428
+ widgets.change(
429
+ fn=dom.select_widget,
430
+ inputs=widgets,
431
+ outputs=[
432
+ rowGeneral,
433
+ rowMandiPrice,
434
+ rowWeather,
435
+ rowLoadCustomData,
436
+ ],
437
+ )
438
 
439
 
440
  demo.launch(share=False)
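
The four widget groups above (rowGeneral, rowMandiPrice, rowWeather, rowLoadCustomData) are shown or hidden by returning one gr.update(visible=...) per output row from select_widget() / select_files_urls(). A minimal, self-contained sketch of that pattern, assuming Gradio 3.x and using hypothetical names row_a/row_b:

import gradio as gr

def toggle(choice):
    # Return one gr.update per output component, in the same order as the outputs list
    return [gr.update(visible=(choice == "A")), gr.update(visible=(choice == "B"))]

with gr.Blocks() as demo:
    choice = gr.Radio(["A", "B"], value="A", label="Section")
    with gr.Row(visible=True) as row_a:
        gr.Textbox(label="Widgets for section A")
    with gr.Row(visible=False) as row_b:
        gr.Textbox(label="Widgets for section B")
    choice.change(fn=toggle, inputs=choice, outputs=[row_a, row_b])

demo.launch()
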
kkms_kssw.py CHANGED
@@ -1,16 +1,17 @@
1
  import os
2
 
3
  import utils.constants as constants_utils
4
- import utils.data_loader as data_loader_utils
5
  import utils.langchain_utils as langchain_utils
6
  import utils.weather as weather_utils
7
  import utils.mandi_price as mandi_utils
8
  import utils.translator as translator_utils
 
9
 
10
- from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, GPTListIndex
11
- from langchain.indexes import VectorstoreIndexCreator
12
- from langchain.embeddings.openai import OpenAIEmbeddings
13
- from langchain.vectorstores import FAISS
 
14
 
15
  import warnings
16
  warnings.filterwarnings('ignore')
@@ -18,106 +19,59 @@ warnings.filterwarnings('ignore')
18
 
19
 
20
  class KKMS_KSSW:
21
- def __init__(self):
22
- self.index = None
23
- self.documents = []
24
- self.response = None
25
-
26
- # Instantiate langchain_utils class object
27
- self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS()
28
- # Instantiate Mandi Price utils class object
29
- self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
30
- # Instantiate Weather class object
31
- self.weather_utils_obj = weather_utils.WEATHER()
32
- # Instantiate translator_utils class object
33
- self.translator_utils_obj = translator_utils.TRANSLATOR()
34
-
35
- if not os.path.exists(constants_utils.DATA_PATH):
36
- os.makedirs(constants_utils.DATA_PATH)
37
- if not os.path.exists(constants_utils.OUTPUT_PATH):
38
- os.makedirs(constants_utils.OUTPUT_PATH)
39
-
40
-
41
- # Initialize index (vector store)
42
- def initialize_index(self, save_index_to_disk=True, index_type='GPTSimpleVectorIndex'):
43
- # Load the index from the saved index.json file
44
- if os.path.exists(constants_utils.INDEX_FILENAME):
45
- print(f'Loading pre-generated index from: {constants_utils.INDEX_FILENAME}')
46
- self.index = self.langchain_utils_obj.load_index(index_type='GPTSimpleVectorIndex', filepath=constants_utils.INDEX_FILENAME)
47
- else:
48
- # Load data from Docs
49
- if os.path.exists(constants_utils.DATA_PATH):
50
- doc_documents = SimpleDirectoryReader(constants_utils.DATA_PATH).load_data()
51
- self.documents = doc_documents[:]
52
-
53
- # Load data from PDFs only
54
- # pdf_documents = data_loader_utils.load_document(doc_type='pdf', doc_filepath=doc_filepath)
55
-
56
- # Load data from URLs & append it to the documents that we read from PDFs
57
- # url_documents = data_loader_utils.load_document(doc_type='url', urls=urls)
58
- # self.documents.extend(url_documents)
59
-
60
- # Build the Vector store for docs
61
- if index_type == 'GPTSimpleVectorIndex':
62
- self.index = GPTSimpleVectorIndex.from_documents(self.documents)
63
- elif index_type == 'FAISS':
64
- self.index = FAISS.from_documents(
65
- self.documents,
66
- OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
67
- )
68
-
69
-
70
- def merge_documents_from_different_sources(doc_documents, url_documents):
71
- # Build the Vector store for docs
72
- doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
73
- # Build the Vector store for URLs
74
- url_index = GPTSimpleVectorIndex.from_documents(url_documents)
75
-
76
- # Set summary of each index
77
- doc_index.set_text("index_from_docs")
78
- url_index.set_text("index_from_urls")
79
-
80
- # Merge index of different data sources
81
- self.index = GPTListIndex([doc_index])
82
- self.index.insert(url_index) # can also be passed directly as GPTListIndex([doc_index, url_index])
83
-
84
- return self.index
85
-
86
-
87
- if save_index_to_disk:
88
- # Save index to a index.json file
89
- print(f'Saving newly generated index: {constants_utils.INDEX_FILENAME}')
90
-
91
- if index_type == 'GPTSimpleVectorIndex':
92
- self.index.save_to_disk(constants_utils.INDEX_FILENAME)
93
- elif index_type == 'FAISS':
94
- self.index.save_local(constants_utils.INDEX_FILENAME)
95
-
96
-
97
-
98
- # Define query on index to retrieve the most relevant top K documents from the vector store
99
- def query(self,
100
- question,
101
- mode='default',
102
- response_mode="default",
103
- similarity_top_k=1,
104
- required_keywords=[],
105
- exclude_keywords=[],
106
- verbose=False
107
- ):
108
- '''
109
- Args:
110
- mode: can be any of [default, embedding]
111
- response_mode: can be any of [default, compact, tree_summarize]
112
- '''
113
-
114
- # Querying the index
115
- self.response = self.index.query(question,
116
- mode=mode,
117
- response_mode=response_mode,
118
- similarity_top_k=similarity_top_k,
119
- required_keywords=required_keywords,
120
- exclude_keywords=exclude_keywords,
121
- verbose=verbose)
122
-
123
- return self.response
 
1
  import os
2
 
3
  import utils.constants as constants_utils
 
4
  import utils.langchain_utils as langchain_utils
5
  import utils.weather as weather_utils
6
  import utils.mandi_price as mandi_utils
7
  import utils.translator as translator_utils
8
+ import utils.web_crawler as web_crawler_utils
9
 
10
+ import logging
11
+ logger = logging.getLogger(__name__)
12
+ logging.basicConfig(
13
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
14
+ )
15
 
16
  import warnings
17
  warnings.filterwarnings('ignore')
 
19
 
20
 
21
  class KKMS_KSSW:
22
+ def __init__(self):
23
+ self.index_type = constants_utils.INDEX_TYPE
24
+ self.load_from_existing_index_store = constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
25
+
26
+ # Instantiate langchain_utils class object
27
+ self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS(
28
+ index_type=self.index_type,
29
+ load_from_existing_index_store=self.load_from_existing_index_store
30
+ )
31
+ # Instantiate Mandi Price utils class object
32
+ self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
33
+ # Instantiate Weather class object
34
+ self.weather_utils_obj = weather_utils.WEATHER()
35
+ # Instantiate translator_utils class object
36
+ self.translator_utils_obj = translator_utils.TRANSLATOR()
37
+
38
+
39
+
40
+ # Initialize index (vector store)
41
+ def load_create_index(self):
42
+ logger.info(f"Load/Create index")
43
+ self.langchain_utils_obj.load_create_index()
44
+
45
+
46
+ # Upload data and update the index
47
+ def upload_data(
48
+ self,
49
+ doc_type,
50
+ files_or_urls,
51
+ index_category
52
+ ):
53
+ logger.info(f"Uploading data")
54
+ self.langchain_utils_obj.upload_data(
55
+ doc_type=doc_type,
56
+ files_or_urls=files_or_urls,
57
+ index_category=index_category
58
+ )
59
+
60
+
61
+ # Define query on index to retrieve the most relevant top K documents from the vector store
62
+ def query(
63
+ self,
64
+ question,
65
+ question_category
66
+ ):
67
+ '''
68
+ Args:
69
+ question: the question to be answered from the index/vector store
70
+ question_category: index category to search (e.g. 'general'; see constants_utils.INDEX_CATEGORY)
71
+ '''
72
+ logger.info(f"Querying from index/vector store")
73
+
74
+ return self.langchain_utils_obj.query(
75
+ question=question,
76
+ question_category=question_category
77
+ )
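
A short usage sketch of the refactored facade above, assuming the repository modules (utils.constants, utils.langchain_utils, etc.) are importable and OPENAI_API_KEY is set in the environment; the question and URL are illustrative only:

import kkms_kssw

kkms = kkms_kssw.KKMS_KSSW()
kkms.load_create_index()                  # load an existing index or build a new one
kkms.upload_data(
    doc_type='urls',                      # a value from constants_utils.DATA_SOURCES
    files_or_urls=['https://www.india.gov.in/farmers-portal'],
    index_category='general'              # a value from constants_utils.INDEX_CATEGORY
)
answer = kkms.query(question='What is an APMC?', question_category='general')
print(answer)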
 
requirements.txt CHANGED
@@ -17,4 +17,5 @@ faiss-cpu
17
  tiktoken
18
  googletrans==3.1.0a0
19
  BeautifulSoup4
20
- PyPDF2
 
 
17
  tiktoken
18
  googletrans==3.1.0a0
19
  BeautifulSoup4
20
+ pymupdf
21
+ PyPDF2
utils/constants.py CHANGED
@@ -1,42 +1,174 @@
1
  DATA_PATH = './data/'
2
- OUTPUT_PATH = './output'
3
- INDEX_FILENAME = f'{OUTPUT_PATH}/index.json'
4
 
5
  URLS = [
6
- 'https://dmi.gov.in/Documents/GrantCAGrapes.pdf',
7
- 'https://dmi.gov.in/Documents/organicfaq.pdf',
8
- 'https://dmi.gov.in/Documents/CAGMOrganic-III.pdf',
9
- 'https://dmi.gov.in/GradesStandard.aspx',
10
- 'https://www.india.gov.in/topics/agriculture',
11
- 'https://www.india.gov.in/farmers-portal',
12
-
13
- # Pest Management related
14
- 'https://niphm.gov.in/IPMPackages/Maize.pdf',
15
-
16
- # Mandi Price related
17
- 'https://agmarknet.gov.in/',
18
- 'https://enam.gov.in/web/dashboard/trade-data',
19
-
20
- # General information related: Information of interests are present on the 2nd level url
21
- 'https://agricoop.nic.in/#gsc.tab=0',
22
- 'https://www.manage.gov.in/nf/nf.asp',
23
-
24
- # Weather forecast related
25
- 'https://nwp.imd.gov.in/blf/blf_temp/', # need to select state -> district (on the new page) -> displays detailed table -> can get info at the block level as well from the same page on selection
26
- 'https://nwp.imd.gov.in/blf/blf_temp/dis.php?value=12gujarat', # to get weather forecast for the given state
27
- 'https://nwp.imd.gov.in/blf/blf_temp/block.php?dis=12BHAVNAGAR', # to get the weather forecast for the given district
28
  ]
29
 
30
 
31
 # Supported Indian languages for translating the English text to an Indian language
32
  INDIC_LANGUAGE = {
33
- 'Hindi': 'hi',
34
- 'Gujarati': 'gu',
35
- 'Kannada': 'kn',
36
- 'Marathi': 'mr',
37
- 'Panjabi': 'pa',
38
- 'Bengali': "bn",
39
- 'Telugu': 'te',
40
- 'Tamil': 'ta',
41
- 'Malayalam': 'ml',
42
  }
 
1
+ import os
2
+ import utils.web_crawler as web_crawler_utils
3
+
4
+ LOAD_FROM_EXISTING_INDEX_STORE = True
5
+ INDEX_TYPE = 'FAISS'
6
+
7
+ # Path from where to load the data (from the local directory)
8
  DATA_PATH = './data/'
9
+
10
+ # Path to store the index/vector db
11
+ OUTPUT_PATH = os.path.join('./output/', INDEX_TYPE)
12
+ # Create OUTPUT_PATH directory if not present
13
+ if not os.path.exists(OUTPUT_PATH):
14
+ os.makedirs(OUTPUT_PATH)
15
+
16
+ # Index categories (one index per category; at query time, the app searches for relevant docs/information only within the selected category's index.)
17
+ INDEX_CATEGORY = [
18
+ # 'crops',
19
+ # 'fruits',
20
+ # 'pest_management',
21
+ # 'govt_policy',
22
+ # 'insurance',
23
+ # 'soil',
24
+ 'general',
25
+ ]
26
+
27
+ # Doc type under which the master index of each index category is stored.
28
+ INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE = 'master'
29
+
30
+ # Output index name if creating the index/vector store using GPTSimpleVectorIndex
31
+ INDEX_FILENAME = os.path.join(OUTPUT_PATH, 'index.json')
32
+
33
+ # Data sources/types from which the data is loaded to create the index/vector store.
35
+ # The value is the source type used by the loader; data can currently come from either a file or a URL.
35
+ DATA_SOURCES = {
36
+ 'PDF': 'pdf',
37
+ 'Text File': 'textfile',
38
+ 'Online PDF': 'online_pdf', # web_crawler_utils.get_ipm_packages_pdfs_urls()[:1]
39
+ 'URLs': 'urls',
40
+ }
41
+
42
+ # LangChain related constants
43
+ TEXT_SPLITTER_CHUNK_SIZE = 1000
44
+ TEXT_SPLITTER_CHUNK_OVERLAP = 0
45
+
46
 
47
  URLS = [
48
+ 'https://agricoop.nic.in/#gsc.tab=0',
49
+
50
+ 'https://dmi.gov.in/Documents/GrantCAGrapes.pdf',
51
+ 'https://dmi.gov.in/Documents/organicfaq.pdf',
52
+ 'https://dmi.gov.in/Documents/CAGMOrganic-III.pdf',
53
+ 'https://dmi.gov.in/GradesStandard.aspx',
54
+ 'https://www.india.gov.in/topics/agriculture',
55
+ 'https://www.india.gov.in/farmers-portal',
56
+
57
+ # Pest Management related
58
+ 'https://niphm.gov.in/IPMPackages/Maize.pdf',
59
+
60
+ # Banned Pesticides
61
+ 'https://ppqs.gov.in/divisions/cib-rc/registered-products', # Online PDF links on the page
62
+
63
+ # Mandi Price related
64
+ 'https://agmarknet.gov.in/',
65
+
66
+ # General information related: information of interest is present on the 2nd-level URL
67
+ 'https://www.manage.gov.in/nf/nf.asp',
68
+
69
+ # Weather forecast related
70
+ 'https://nwp.imd.gov.in/blf/blf_temp/', # need to select state -> district (on the new page) -> displays detailed table -> can get info at the block level as well from the same page on selection
71
+ 'https://nwp.imd.gov.in/blf/blf_temp/dis.php?value=12gujarat', # to get weather forecast for the given state
72
+ 'https://nwp.imd.gov.in/blf/blf_temp/block.php?dis=12BHAVNAGAR', # to get the weather forecast for the given district
73
  ]
74
 
75
 
76
  # Supported Indian laguages for translating the English text to Indian language
77
  INDIC_LANGUAGE = {
78
+ 'Hindi': 'hi',
79
+ 'Gujarati': 'gu',
80
+ 'Kannada': 'kn',
81
+ 'Marathi': 'mr',
82
+ 'Panjabi': 'pa',
83
+ 'Bengali': "bn",
84
+ 'Telugu': 'te',
85
+ 'Tamil': 'ta',
86
+ 'Malayalam': 'ml',
87
  }
88
+
89
+ # State list used in the Mandi Price widget dropdown list
90
+ MANDI_PRICE_STATES = [
91
+ 'ANDAMAN AND NICOBAR ISLANDS',
92
+ 'ANDHRA PRADESH',
93
+ 'ASSAM',
94
+ 'BIHAR',
95
+ 'CHANDIGARH',
96
+ 'CHHATTISGARH',
97
+ 'GOA',
98
+ 'GUJARAT',
99
+ 'HARYANA',
100
+ 'HIMACHAL PRADESH',
101
+ 'JAMMU AND KASHMIR',
102
+ 'JHARKHAND',
103
+ 'KARNATAKA',
104
+ 'KERALA',
105
+ 'MADHYA PRADESH',
106
+ 'MAHARASHTRA',
107
+ 'NAGALAND',
108
+ 'ODISHA',
109
+ 'PUDUCHERRY',
110
+ 'PUNJAB',
111
+ 'RAJASTHAN',
112
+ 'TAMIL NADU',
113
+ 'TELANGANA',
114
+ 'TRIPURA',
115
+ 'UTTAR PRADESH',
116
+ 'UTTARAKHAND',
117
+ 'WEST BENGAL'
118
+ ]
119
+
120
+ # State list used in the Weather forecast widget dropdown list
121
+ WEATHER_FORECAST_STATES = [
122
+ 'Andaman-Nicobar',
123
+ 'Andhra-Pradesh',
124
+ 'Arunachal-Pradesh',
125
+ 'Assam',
126
+ 'Bihar',
127
+ 'Chandigarh',
128
+ 'Chhattisgarh',
129
+ 'Dadra-and-Nagar-Haveli',
130
+ 'Daman-and-Diu',
131
+ 'Delhi',
132
+ 'Goa',
133
+ 'Gujarat',
134
+ 'Haryana',
135
+ 'Himachal-Pradesh',
136
+ 'Jammu-Kashmir',
137
+ 'Jharkhand',
138
+ 'Karnataka',
139
+ 'Kerala',
140
+ 'Lakshadweep',
141
+ 'Madhya-Pradesh',
142
+ 'Maharashtra',
143
+ 'Manipur',
144
+ 'Meghalaya',
145
+ 'Mizoram',
146
+ 'Nagaland',
147
+ 'Odisha',
148
+ 'Pondicherry',
149
+ 'Punjab',
150
+ 'Rajasthan',
151
+ 'Sikkim',
152
+ 'Tamilnadu',
153
+ 'Telangana',
154
+ 'Tripura',
155
+ 'Uttar-Pradesh',
156
+ 'Uttarakhand',
157
+ 'West-Bengal'
158
+ ]
159
+
160
+ # List of pesticides that are banned or restricted for use (list created from: https://pib.gov.in/PressReleaseIframePage.aspx?PRID=1896140)
161
+ BANNED_PESTICIDES_FORMULATIONS = [
162
+ 'Alachlor',
163
+ 'Aldicarb',
164
+ 'Aldrin',
165
+ 'Benzene Hexachloride',
166
+ 'Benomyl',
167
+ 'Calcium Cyanide',
168
+ 'Carbaryl',
169
+ 'Chlorbenzilate',
170
+ 'Chlordane',
171
+ 'Chlorofenvinphos',
172
+ 'Copper Acetoarsenite',
173
+ ]
174
+
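
A small sketch of how the constants above fit together when a custom upload arrives: the Gradio radio label is mapped through DATA_SOURCES to the loader's doc_type, and the index is written under OUTPUT_PATH. The resolve_upload helper is illustrative only (it is not part of the repository); the values are copied from this file:

import os

INDEX_TYPE = 'FAISS'
OUTPUT_PATH = os.path.join('./output/', INDEX_TYPE)
INDEX_FILENAME = os.path.join(OUTPUT_PATH, 'index.json')
DATA_SOURCES = {
    'PDF': 'pdf',
    'Text File': 'textfile',
    'Online PDF': 'online_pdf',
    'URLs': 'urls',
}

def resolve_upload(ui_label):
    # Mirror DomState.click_handler_for_load_files_urls(): translate the UI
    # choice into the loader's doc_type and report where the index lives.
    return DATA_SOURCES[ui_label], INDEX_FILENAME

print(resolve_upload('Online PDF'))   # ('online_pdf', './output/FAISS/index.json') on a POSIX path
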
utils/data_loader.py CHANGED
@@ -1,104 +1,214 @@
1
  import os
 
2
  import pandas as pd
3
  from pathlib import Path
4
- from llama_index import GPTSimpleVectorIndex, download_loader
5
  from langchain.agents import initialize_agent, Tool
6
  from langchain.llms import OpenAI
7
  from langchain.chains.conversation.memory import ConversationBufferMemory
8
 
 
9
 
10
 
11
  class DATA_LOADER:
12
- def __init__(self):
13
- print()
14
-
15
-
16
- def clean_df(self, df, dropna=True, fillna=False):
17
- if fillna:
18
- df.fillna('', inplace=True)
19
- if dropna:
20
- df.dropna(inplace=True)
21
- # df = df[~df.isna()]
22
- df = df.drop_duplicates().reset_index(drop=True)
23
- return df
24
-
25
-
26
- def load_external_links_used_by_FTAs(self,
27
- sheet_filepath='./data/urls_used_by_ftas/external_links_used_by_FTAs.xlsx'
28
- ):
29
- xls = pd.ExcelFile(sheet_filepath)
30
- df = pd.DataFrame(columns=['S.No.', 'Link used for', 'Link type', 'Link'])
31
- for sheet_name in xls.sheet_names:
32
- sheet = pd.read_excel(xls, sheet_name)
33
- if sheet.shape[0] > 0:
34
- df = pd.concat([df, sheet])
35
- else:
36
- print(f'{sheet_name} has no content.')
37
-
38
- df = df[['Link used for', 'Link type', 'Link']]
39
- # Clean df
40
- df = clean_df(df)
41
- print(f'Total links available across all cities: {df.shape[0]}')
42
- return df
43
-
44
-
45
- def load_document(self,
46
- doc_type='pdf',
47
- doc_filepath='',
48
- urls=[]
49
- ):
50
- documents = []
51
-
52
- if doc_type == 'pdf':
53
- PDFReader = download_loader("PDFReader")
54
- loader = PDFReader()
55
- if os.path.exists(doc_filepath):
56
- documents = loader.load_data(file=Path(doc_filepath))
57
-
58
- elif doc_type == 'url':
59
- BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
60
- loader = BeautifulSoupWebReader()
61
- if len(urls) > 0:
62
- # Load data from URLs
63
- documents = loader.load_data(urls=urls)
64
-
65
- elif doc_type == 'url-kb':
66
- KnowledgeBaseWebReader = download_loader("KnowledgeBaseWebReader")
67
- loader = KnowledgeBaseWebReader()
68
- for url in urls:
69
- doc = loader.load_data(
70
- root_url=url,
71
- link_selectors=['.article-list a', '.article-list a'],
72
- article_path='/articles',
73
- body_selector='.article-body',
74
- title_selector='.article-title',
75
- subtitle_selector='.article-subtitle',
76
- )
77
- documents.extend(doc)
78
-
79
- elif doc_type == 'url-chatgpt':
80
- BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
81
- loader = BeautifulSoupWebReader()
82
- if len(urls) > 0:
83
- # Load data from URLs
84
- documents = loader.load_data(urls=urls)
85
- # Build the Vector database
86
- index = GPTSimpleVectorIndex(documents)
87
- tools = [
88
- Tool(
89
- name="Website Index",
90
- func=lambda q: index.query(q),
91
- description=f"Useful when you want answer questions about the text retrieved from websites.",
92
- ),
93
- ]
94
-
95
- # Call ChatGPT API
96
- llm = OpenAI(temperature=0) # Keep temperature=0 to search from the given urls only
97
- memory = ConversationBufferMemory(memory_key="chat_history")
98
- agent_chain = initialize_agent(
99
- tools, llm, agent="zero-shot-react-description", memory=memory
100
- )
101
-
102
- output = agent_chain.run(input="What language is on this website?")
103
-
104
- return documents
 
1
  import os
2
+ import re
3
  import pandas as pd
4
  from pathlib import Path
5
+ import glob
6
+
7
+ from llama_index import GPTSimpleVectorIndex, download_loader, SimpleDirectoryReader
8
+ from langchain.document_loaders import PyPDFLoader, TextLoader
9
  from langchain.agents import initialize_agent, Tool
10
  from langchain.llms import OpenAI
11
  from langchain.chains.conversation.memory import ConversationBufferMemory
12
 
13
+ import utils.utils as utils
14
+
15
+ import logging
16
+ logger = logging.getLogger(__name__)
17
+ logging.basicConfig(
18
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
19
+ )
20
+
21
+ import warnings
22
+ warnings.filterwarnings('ignore')
23
+
24
 
25
 
26
  class DATA_LOADER:
27
+ def __init__(self):
28
+ # Instantiate UTILS class object
29
+ self.utils_obj = utils.UTILS()
30
+
31
+
32
+ def load_documents_from_urls(self, urls=[], doc_type='urls'):
33
+ url_documents = self.load_document(doc_type=doc_type, urls=urls)
34
+ return url_documents
35
+
36
+
37
+ def load_documents_from_pdf(self, doc_filepath='', urls=[], doc_type='pdf'):
38
+ if doc_type == 'pdf':
39
+ pdf_documents = self.load_document(doc_type=doc_type, doc_filepath=doc_filepath)
40
+ elif doc_type == 'online_pdf':
41
+ pdf_documents = self.load_document(doc_type=doc_type, urls=urls)
42
+ return pdf_documents
43
+
44
+
45
+ def load_documents_from_directory(self, doc_filepath='', doc_type='directory'):
46
+ doc_documents = self.load_document(doc_type=doc_type, doc_filepath=doc_filepath)
47
+ return doc_documents
48
+
49
+
50
+ def load_documents_from_text(self, doc_filepath='', doc_type='textfile'):
51
+ text_documents = self.load_document(doc_type=doc_type, doc_filepath=doc_filepath)
52
+ return text_documents
53
+
54
+
55
+ def pdf_loader(self, filepath):
56
+ loader = PyPDFLoader(filepath)
57
+ return loader.load_and_split()
58
+
59
+
60
+ def text_loader(self, filepath):
61
+ loader = TextLoader(filepath)
62
+ return loader.load()
63
+
64
+
65
+ def load_document(self,
66
+ doc_type='pdf',
67
+ doc_filepath='',
68
+ urls=[]
69
+ ):
70
+ logger.info(f'Loading {doc_type} in raw format from: {doc_filepath}')
71
+
72
+ documents = []
73
+
74
+ # Validation checks
75
+ if doc_type in ['directory', 'pdf', 'textfile']:
76
+ if not os.path.exists(doc_filepath):
77
+ logger.warning(f"{doc_filepath} does not exist, nothing can be loaded!")
78
+ return documents
79
+
80
+ elif doc_type in ['online_pdf', 'urls']:
81
+ if len(urls) == 0:
82
+ logger.warning(f"URLs list empty, nothing can be loaded!")
83
+ return documents
84
+
85
+
86
+ ######### Load documents #########
87
+ # Load PDF
88
+ if doc_type == 'pdf':
89
+ # Load multiple PDFs from directory
90
+ if os.path.isdir(doc_filepath):
91
+ pdfs = glob.glob(f"{doc_filepath}/*.pdf")
92
+ logger.info(f'Total PDF files to load: {len(pdfs)}')
93
+ for pdf in pdfs:
94
+ documents.extend(self.pdf_loader(pdf))
95
+
96
+ # Loading from a single PDF file
97
+ elif os.path.isfile(doc_filepath) and doc_filepath.endswith('.pdf'):
98
+ documents.extend(self.pdf_loader(doc_filepath))
99
+
100
+ # Load PDFs from online URLs. Can read multiple PDFs from multiple URLs in one shot
101
+ elif doc_type == 'online_pdf':
102
+ logger.info(f'URLs to load Online PDFs are from: {urls}')
103
+ valid_urls = self.utils_obj.validate_url_format(
104
+ urls=urls,
105
+ url_type=doc_type
106
+ )
107
+ for url in valid_urls:
108
+ # Load and split PDF pages per document
109
+ documents.extend(self.pdf_loader(url))
110
+
111
+ # Load data from URLs (can load data from multiple URLs)
112
+ elif doc_type == 'urls':
113
+ logger.info(f'URLs to load data from are: {urls}')
114
+ valid_urls = self.utils_obj.validate_url_format(
115
+ urls=urls,
116
+ url_type=doc_type
117
+ )
118
+ BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
119
+ loader = BeautifulSoupWebReader()
120
+ # Load data from URLs
121
+ documents = loader.load_data(urls=valid_urls)
122
+
123
+ # Load data from text file(s)
124
+ elif doc_type == 'textfile':
125
+ # Load multiple text files from directory
126
+ if os.path.isdir(doc_filepath):
127
+ text_files = glob.glob(f"{doc_filepath}/*.txt")
128
+ logger.info(f'Total text files to load: {len(text_files)}')
129
+ for tf in text_files:
130
+ documents.extend(self.text_loader(tf))
131
+
132
+ # Loading from a single text file
133
+ elif os.path.isfile(doc_filepath) and doc_filepath.endswith('.txt'):
134
+ documents.extend(self.text_loader(doc_filepath))
135
+
136
+ # Load data from files in the local directory (files may be of type .pdf, .txt, .doc, etc.)
137
+ elif doc_type == 'directory':
138
+ documents = SimpleDirectoryReader(doc_filepath).load_data()
139
+
140
+ # Load data from URLs in Knowledge Base format
141
+ elif doc_type == 'url-kb':
142
+ KnowledgeBaseWebReader = download_loader("KnowledgeBaseWebReader")
143
+ loader = KnowledgeBaseWebReader()
144
+ for url in urls:
145
+ doc = loader.load_data(
146
+ root_url=url,
147
+ link_selectors=['.article-list a', '.article-list a'],
148
+ article_path='/articles',
149
+ body_selector='.article-body',
150
+ title_selector='.article-title',
151
+ subtitle_selector='.article-subtitle',
152
+ )
153
+ documents.extend(doc)
154
+
155
+ # Load data from URLs and create an agent chain using ChatGPT
156
+ elif doc_type == 'url-chatgpt':
157
+ BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
158
+ loader = BeautifulSoupWebReader()
159
+ # Load data from URLs
160
+ documents = loader.load_data(urls=urls)
161
+ # Build the Vector database
162
+ index = GPTSimpleVectorIndex(documents)
163
+ tools = [
164
+ Tool(
165
+ name="Website Index",
166
+ func=lambda q: index.query(q),
167
+ description=f"Useful when you want answer questions about the text retrieved from websites.",
168
+ ),
169
+ ]
170
+
171
+ # Call ChatGPT API
172
+ llm = OpenAI(temperature=0) # Keep temperature=0 to search from the given urls only
173
+ memory = ConversationBufferMemory(memory_key="chat_history")
174
+ agent_chain = initialize_agent(
175
+ tools, llm, agent="zero-shot-react-description", memory=memory
176
+ )
177
+
178
+ output = agent_chain.run(input="What language is on this website?")
179
+
180
+
181
+ # Clean documents
182
+ documents = self.clean_documents(documents)
183
+ logger.info(f'{doc_type} in raw format from: {doc_filepath} loaded successfully!')
184
+ return documents
185
+
186
+
187
+ def clean_documents(
188
+ self,
189
+ documents
190
+ ):
191
+ cleaned_documents = []
192
+ for document in documents:
193
+ document.page_content = self.utils_obj.replace_newlines_and_spaces(document.page_content)
194
+ cleaned_documents.append(document)
195
+ return cleaned_documents
196
+
197
+
198
+ def load_external_links_used_by_FTAs(self,
199
+ sheet_filepath='./data/urls_used_by_ftas/external_links_used_by_FTAs.xlsx'
200
+ ):
201
+ xls = pd.ExcelFile(sheet_filepath)
202
+ df = pd.DataFrame(columns=['S.No.', 'Link used for', 'Link type', 'Link'])
203
+ for sheet_name in xls.sheet_names:
204
+ sheet = pd.read_excel(xls, sheet_name)
205
+ if sheet.shape[0] > 0:
206
+ df = pd.concat([df, sheet])
207
+ else:
208
+ logger.info(f'{sheet_name} has no content.')
209
+
210
+ df = df[['Link used for', 'Link type', 'Link']]
211
+ # Clean df
212
+ df = self.utils_obj.clean_df(df)
213
+ logger.info(f'Total links available across all cities: {df.shape[0]}')
214
+ return df
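
A minimal, self-contained sketch of the 'pdf' branch of DATA_LOADER.load_document() above, assuming langchain and pypdf are installed and that ./data/ contains PDF files (the path is illustrative):

import glob
from langchain.document_loaders import PyPDFLoader

documents = []
for pdf_path in glob.glob('./data/*.pdf'):
    # load_and_split() returns one Document per PDF page by default
    documents.extend(PyPDFLoader(pdf_path).load_and_split())

print(f'Loaded {len(documents)} pages from ./data/')
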
utils/langchain_utils.py CHANGED
@@ -1,169 +1,839 @@
1
  from langchain.llms import OpenAI
2
- from langchain.text_splitter import CharacterTextSplitter
3
  from langchain.chains.summarize import load_summarize_chain
4
  from langchain.docstore.document import Document
5
  from langchain.embeddings.openai import OpenAIEmbeddings
6
  from langchain.vectorstores import Chroma
 
7
  from langchain.chains.question_answering import load_qa_chain
8
  from langchain.chains.qa_with_sources import load_qa_with_sources_chain
9
  from langchain.prompts import PromptTemplate
10
- from llama_index import GPTSimpleVectorIndex
11
  from langchain.vectorstores import FAISS
12
 
13
  import pickle
14
  import os
15
  os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
16
  os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
17
 
18
 
19
 
20
  class LANGCHAIN_UTILS:
21
- def __init__(self):
22
- print()
23
-
24
-
25
- def generate_prompt_template(self, prompt_type='general'):
26
- prompt_template = ''
27
-
28
- if prompt_type == 'general':
29
- prompt_template = """Write a concise summary of the following:
30
-
31
- {text}
32
-
33
- CONCISE SUMMARY IN ENGLISH:"""
34
-
35
- elif prompt_type == 'weather':
36
- prompt_template = """
37
- What would be the weather based on the below data:
38
- {text}
39
- """
40
-
41
- return prompt_template
42
-
43
-
44
-
45
- def get_textual_summary(self,
46
- text,
47
- chain_type="stuff",
48
- custom_prompt=True,
49
- prompt_type='general'
50
- ):
51
- texts = [text]
52
- docs = [Document(page_content=t) for t in texts[:3]]
53
-
54
- llm = OpenAI(temperature=0)
55
- if custom_prompt:
56
- prompt_template = self.generate_prompt_template(prompt_type)
57
- PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
58
- chain = load_summarize_chain(llm, chain_type=chain_type, prompt=PROMPT)
59
- else:
60
- chain = load_summarize_chain(llm, chain_type=chain_type)
61
-
62
- text_summary = chain.run(docs)
63
- return text_summary
64
-
65
-
66
- def get_weather_forecast_summary(self,
67
- text,
68
- chain_type="stuff"
69
- ):
70
- text = f"""
71
- What would be the weather based on the below data:
72
- {text}
73
-
74
- Give simple response without technical numbers which can be explained to human.
75
- """
76
- texts = [text]
77
- docs = [Document(page_content=t) for t in texts[:3]]
78
-
79
- llm = OpenAI(temperature=0)
80
- chain = load_summarize_chain(llm, chain_type=chain_type)
81
- text_summary = chain.run(docs)
82
-
83
- return text_summary
84
-
85
-
86
- def get_answer_from_para(self,
87
- para,
88
- question,
89
- chain_type="stuff",
90
- custom_prompt=True
91
- ):
92
- # Prepare data (Split paragraph into chunks of small documents)
93
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
94
- texts = text_splitter.split_text(para)
95
-
96
- # Find similar docs that are relevant to the question
97
- embeddings = OpenAIEmbeddings()
98
- docsearch = Chroma.from_texts(
99
- texts, embeddings,
100
- metadatas=[{"source": str(i)} for i in range(len(texts))]
101
- )
102
-
103
- # Search for the similar docs
104
- docs = docsearch.similarity_search(question, k=1)
105
-
106
- llm = OpenAI(temperature=0)
107
- # Create a Chain for question answering
108
- if custom_prompt:
109
- prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
110
-
111
- {context}
112
-
113
- Question: {question}
114
- Answer in English:"""
115
-
116
- PROMPT = PromptTemplate(
117
- template=prompt_template, input_variables=["context", "question"]
118
- )
119
- chain = load_qa_chain(llm, chain_type=chain_type, prompt=PROMPT)
120
- else:
121
- # chain = load_qa_with_sources_chain(llm, chain_type=chain_type)
122
- chain = load_qa_chain(llm, chain_type=chain_type)
123
- # chain.run(input_documents=docs, question=question)
124
-
125
- out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
126
- return out_dict['output_text']
127
-
128
-
129
- def store_index(self,
130
- index,
131
- index_type='GPTSimpleVectorIndex',
132
- filepath='./output/index.json'
133
- ):
134
- if index_type == 'GPTSimpleVectorIndex':
135
- index.save_to_disk(filepath)
136
-
137
- elif index_type == 'pickle':
138
- with open(filepath, "wb") as f:
139
- pickle.dump(index, f)
140
-
141
- elif index_type == 'FAISS':
142
- index.save_local(filepath)
143
-
144
-
145
- def load_index(self,
146
- index_type='GPTSimpleVectorIndex',
147
- filepath='./output/index.json'
148
- ):
149
- if index_type == 'GPTSimpleVectorIndex':
150
- index = GPTSimpleVectorIndex.load_from_disk(filepath)
151
-
152
- elif index_type == 'pickle':
153
- with open(filepath, "rb") as f:
154
- index = pickle.load(f)
155
-
156
- elif index_type == 'FAISS':
157
- index = FAISS.load_local(filepath, OpenAIEmbeddings()) # can we use open-source embeddings?
158
-
159
- return index
160
-
161
-
162
- def convert_text_to_documents(self, text_list=[]):
163
- """
164
- Converts the list of text data to Documents format that can be feed to GPT API to build the Vector store
165
- """
166
-
167
- from llama_index import Document
168
- documents = [Document(t) for t in text_list]
169
- return documents
1
+ import utils.constants as constants_utils
2
+ import utils.data_loader as data_loader_utils
3
+ import utils.utils as utils
4
+
5
  from langchain.llms import OpenAI
6
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
7
  from langchain.chains.summarize import load_summarize_chain
8
  from langchain.docstore.document import Document
9
  from langchain.embeddings.openai import OpenAIEmbeddings
10
  from langchain.vectorstores import Chroma
11
+ import chromadb
12
  from langchain.chains.question_answering import load_qa_chain
13
  from langchain.chains.qa_with_sources import load_qa_with_sources_chain
14
  from langchain.prompts import PromptTemplate
15
+ from llama_index import GPTSimpleVectorIndex, GPTListIndex
16
  from langchain.vectorstores import FAISS
17
 
18
  import pickle
19
+ import shutil
20
+ from typing import Dict, List, Optional
21
+
22
  import os
23
  os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
24
  os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
25
 
26
+ import logging
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(
29
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
30
+ )
31
+
32
+ import warnings
33
+ warnings.filterwarnings('ignore')
34
+
35
 
36
 
37
  class LANGCHAIN_UTILS:
38
+ def __init__(self,
39
+ index_type=constants_utils.INDEX_TYPE,
40
+ load_from_existing_index_store=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
41
+ ):
42
+ self.index_type = index_type
43
+ self.load_from_existing_index_store = load_from_existing_index_store
44
+
45
+ # Temporary index in the current context for the doc_type in consideration
46
+ self.index = None
47
+ # Master index containing data from multiple sources (PDF, online PDF, text files, URLs, etc.). It is updated on demand when new files/URLs are uploaded, without application downtime.
48
+ self.master_index = None
49
+
50
+ # Data source wise index
51
+ self.index_category_doc_type_wise_index = dict(
52
+ (ic, dict(
53
+ (ds, None) for ds in list(constants_utils.DATA_SOURCES.values()))
54
+ ) for ic in constants_utils.INDEX_CATEGORY)
55
+
56
+ # Data loaded as a Document format in the current context for the doc_type in consideration
57
+ self.documents = []
58
+
59
+ # Instantiate data_loader_utils class object
60
+ self.data_loader_utils_obj = data_loader_utils.DATA_LOADER()
61
+ # Instantiate UTILS class object
62
+ self.utils_obj = utils.UTILS()
63
+
64
+ # Initialize embeddings (we can also use other embeddings)
65
+ self.embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
66
+
67
+
68
+ def generate_prompt_template(
69
+ self,
70
+ prompt_type='general'
71
+ ):
72
+ prompt_template = ''
73
+
74
+ if prompt_type == 'general':
75
+ prompt_template = """Write a concise summary of the following:
76
+
77
+ {text}
78
+
79
+ SUMMARIZE IN ENGLISH:"""
80
+
81
+ elif prompt_type == 'weather':
82
+ prompt_template = """
83
+ What would be the weather based on the below data:
84
+ {text}
85
+ """
86
+
87
+ return prompt_template
88
+
89
+
90
+
91
+ def get_textual_summary(
92
+ self,
93
+ text,
94
+ chain_type="stuff",
95
+ custom_prompt=True,
96
+ prompt_type='general'
97
+ ):
98
+ texts = [text]
99
+ docs = [Document(page_content=t) for t in texts[:3]]
100
+
101
+ llm = OpenAI(temperature=0)
102
+ if custom_prompt:
103
+ prompt_template = self.generate_prompt_template(prompt_type)
104
+ PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
105
+ chain = load_summarize_chain(llm, chain_type=chain_type, prompt=PROMPT)
106
+ else:
107
+ chain = load_summarize_chain(llm, chain_type=chain_type)
108
+
109
+ text_summary = chain.run(docs)
110
+ return text_summary
111
+
112
+
113
+ def get_weather_forecast_summary(
114
+ self,
115
+ text,
116
+ chain_type="stuff"
117
+ ):
118
+ text = f"""
119
+ What would be the weather based on the below data:
120
+ {text}
121
+
122
+ Give simple response without technical numbers which can be explained to human.
123
+ """
124
+ texts = [text]
125
+ docs = [Document(page_content=t) for t in texts[:3]]
126
+
127
+ llm = OpenAI(temperature=0)
128
+ chain = load_summarize_chain(llm, chain_type=chain_type)
129
+ text_summary = chain.run(docs)
130
+
131
+ return text_summary
132
+
133
+
134
+ def get_answer_from_para(
135
+ self,
136
+ para,
137
+ question,
138
+ chain_type="stuff",
139
+ custom_prompt=True
140
+ ):
141
+ # Prepare data (Split paragraph into chunks of small documents)
142
+ text_splitter = CharacterTextSplitter(chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE, chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP)
143
+ texts = text_splitter.split_text(para)
144
+
145
+ if self.index_type == 'FAISS':
146
+ # Find similar docs that are relevant to the question
147
+ docsearch = FAISS.from_texts(
148
+ texts, self.embeddings,
149
+ metadatas=[{"source": str(i)} for i in range(len(texts))]
150
+ )
151
+
152
+ elif self.index_type == 'Chroma':
153
+ # Find similar docs that are relevant to the question
154
+ docsearch = Chroma.from_texts(
155
+ texts, self.embeddings,
156
+ metadatas=[{"source": str(i)} for i in range(len(texts))]
157
+ )
158
+
159
+ # Search for the similar docs
160
+ docs = docsearch.similarity_search(question, k=1)
161
+
162
+ llm = OpenAI(temperature=0)
163
+ # Create a Chain for question answering
164
+ if custom_prompt:
165
+ prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
166
+
167
+ {context}
168
+
169
+ Question: {question}
170
+ Answer in English:"""
171
+
172
+ PROMPT = PromptTemplate(
173
+ template=prompt_template, input_variables=["context", "question"]
174
+ )
175
+ chain = load_qa_chain(llm, chain_type=chain_type, prompt=PROMPT)
176
+ else:
177
+ # chain = load_qa_with_sources_chain(llm, chain_type=chain_type)
178
+ chain = load_qa_chain(llm, chain_type=chain_type)
179
+ # chain.run(input_documents=docs, question=question)
180
+
181
+ out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
182
+ return out_dict['output_text']
183
+
184
+
185
+ def load_documents(
186
+ self,
187
+ doc_type,
188
+ doc_filepath='',
189
+ urls=[]
190
+ ):
191
+ """
192
+ Load data in Document format of the given doc_type from either doc_filepath or list of urls.
193
+ It can load multiple files/urls in one shot.
194
+
195
+ Args:
196
+ doc_type: can be any of [pdf, online_pdf, urls, textfile]
197
+ doc_filepath: can be a directory or a filepath
198
+ urls: list of urls
199
+ """
200
+
201
+ logger.info(f'Loading {doc_type} data into Documents format')
202
+
203
+ if doc_type == 'pdf':
204
+ # Load data from PDFs stored in local directory
205
+ self.documents.extend(
206
+ self.data_loader_utils_obj.load_documents_from_pdf(
207
+ doc_filepath=doc_filepath,
208
+ doc_type=doc_type
209
+ ))
210
+
211
+ elif doc_type == 'online_pdf':
212
+ # Load data from online PDFs via their URLs
213
+ self.documents.extend(
214
+ self.data_loader_utils_obj.load_documents_from_pdf(
215
+ urls=urls,
216
+ doc_type=doc_type
217
+ ))
218
+
219
+ elif doc_type == 'urls':
220
+ # Load data from URLs
221
+ self.documents.extend(
222
+ self.data_loader_utils_obj.load_documents_from_urls(
223
+ urls=urls,
224
+ doc_type=doc_type
225
+ ))
226
+
227
+ elif doc_type == 'textfile':
228
+ # Load data from text files & Convert texts into Document format
229
+ self.documents.extend(
230
+ self.convert_text_to_documents(
231
+ self.data_loader_utils_obj.load_documents_from_text(
232
+ doc_filepath=doc_filepath,
233
+ doc_type=doc_type
234
+ )
235
+ ))
236
+
237
+ elif doc_type == 'directory':
238
+ # Load data from local directory
239
+ self.documents.extend(
240
+ self.data_loader_utils_obj.load_documents_from_directory(
241
+ doc_filepath=doc_filepath,
242
+ doc_type=doc_type
243
+ ))
244
+
245
+ logger.info(f'{doc_type} data loaded into Documents format successfully!')
246
+
247
+
248
+ def create_index(
249
+ self
250
+ ):
251
+ logger.info(f'Creating index')
252
+
253
+ if not self.documents:
254
+ logger.warning(f'Empty documents. Index cannot be created!')
255
+ return None
256
+
257
+ ############## Build the Vector store for docs ##############
258
+ # Vector store using Facebook AI Similarity Search
259
+ if self.index_type == 'FAISS':
260
+ text_splitter = CharacterTextSplitter(
261
+ chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
262
+ chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
263
+ )
264
+ self.documents = text_splitter.split_documents(self.documents)
265
+
266
+ self.index = FAISS.from_documents(
267
+ self.documents,
268
+ self.embeddings
269
+ )
270
+
271
+ # Vector store using Chroma DB
272
+ elif self.index_type == 'Chroma':
273
+ if not os.path.exists(self.index_filepath):
274
+ os.makedirs(self.index_filepath)
275
+
276
+ text_splitter = CharacterTextSplitter(
277
+ chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
278
+ chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP
279
+ )
280
+ self.documents = text_splitter.split_documents(self.documents)
281
+ self.index = Chroma.from_documents(
282
+ self.documents,
283
+ self.embeddings,
284
+ persist_directory=self.index_filepath
285
+ )
286
+
287
+ # Vector store using GPT vector index
288
+ elif self.index_type == 'GPTSimpleVectorIndex':
289
+ self.index = GPTSimpleVectorIndex.from_documents(self.documents)
290
+
291
+ logger.info(f'Index created successfully!')
292
+ return self.index
293
+
294
+
295
+ def get_index_filepath(
296
+ self,
297
+ index_category,
298
+ doc_type
299
+ ):
300
+ if doc_type == 'master':
301
+ self.index_filepath = os.path.join(
302
+ constants_utils.OUTPUT_PATH, f'index_{index_category}') if self.index_type in ['FAISS', 'Chroma'] else os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}.json')
303
+ else:
304
+ self.index_filepath = os.path.join(
305
+ constants_utils.OUTPUT_PATH, f'index_{index_category}', f'index_{doc_type}') if self.index_type in ['FAISS', 'Chroma'] else os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}', f'index_{doc_type}.json')
306
+
307
+ return self.index_filepath
308
+
309
+
310
+ def load_master_doctype_indices_for_index_category(
311
+ self,
312
+ index_category
313
+ ):
314
+ logger.info(f'Loading master and doc_type indices for: {index_category}')
315
+
316
+ # Set master index of index_category = None
317
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None
318
+
319
+ for doc_type in self.index_category_doc_type_wise_index[index_category].keys():
320
+ self.index = None
321
+ self.index_filepath = self.get_index_filepath(
322
+ index_category=index_category,
323
+ doc_type=doc_type
324
+ )
325
+ self.load_index()
326
+ # Set master/doc_type index
327
+ self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
328
+
329
+ logger.info(f'Master and doc_type indices for: {index_category} loaded successfully!')
330
+
331
+
332
+ def load_create_index(
333
+ self
334
+ ):
335
+ logger.info(f'Loading/Creating index for each index_category')
336
+
337
+ for index_category in constants_utils.INDEX_CATEGORY:
338
+ # Load master index_category index if self.load_from_existing_index_store == True
339
+ if self.load_from_existing_index_store:
340
+ self.load_master_doctype_indices_for_index_category(index_category)
341
+
342
+ # For any reason, if master index is not loaded then create the new index/vector store
343
+ if not self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]:
344
+ logger.info(f'Creating a new Vector/Index store for: {index_category}')
345
+
346
+ doc_filepath = os.path.join(constants_utils.DATA_PATH, index_category)
347
+ urls = []
348
+
349
+ # Build the Vector/Index store
350
+ for doc_type in list(constants_utils.DATA_SOURCES.values()):
351
+ logger.info(f'Creating a new Vector/Index store for: {index_category} from data source: {doc_type}')
352
+
353
+ index = None
354
+ if doc_type in ['pdf', 'textfile']:
355
+ index = self.create_store_index(
356
+ doc_type=doc_type,
357
+ doc_filepath=doc_filepath,
358
+ index_category=index_category
359
+ )
360
+ else:
361
+ # Build the Vector/Index store from web urls
362
+ index = self.create_store_index(
363
+ doc_type=doc_type,
364
+ urls=urls,
365
+ index_category=index_category
366
+ )
367
+
368
+ if index:
369
+ self.index_category_doc_type_wise_index[index_category][doc_type] = index
370
+
371
+ logger.info(f'New Vector/Index store for: {index_category} from data source: {doc_type} created successfully!')
372
+
373
+ logger.info(f'New Vector/Index store for: {index_category} created successfully!')
374
+
375
+ # Merge index of each doc_type into a single index_category
376
+ self.merge_store_master_index(
377
+ index_category=index_category
378
+ )
379
+
380
+ logger.info(f'Index for each index_category loaded successfully!')
381
+
382
+
383
+ def create_store_index(
384
+ self,
385
+ doc_type='pdf',
386
+ doc_filepath=constants_utils.DATA_PATH,
387
+ urls=[],
388
+ index_category=constants_utils.INDEX_CATEGORY[0]
389
+ ):
390
+ logger.info(f'Creating and storing {doc_type} index')
391
+
392
+ self.documents = []
393
+ self.index = None
394
+
395
+ self.index_filepath = self.get_index_filepath(
396
+ index_category=index_category,
397
+ doc_type=doc_type
398
+ )
399
+
400
+ # Delete the old index file
401
+ shutil.rmtree(self.index_filepath, ignore_errors=True)
402
+ logger.info(f'{self.index_filepath} deleted.')
403
+
404
+ # Load data in Documents format that can be consumed for index creation
405
+ self.load_documents(
406
+ doc_type,
407
+ doc_filepath,
408
+ urls
409
+ )
410
+
411
+ # Create the index from documents for search/retrieval
412
+ self.index = self.create_index()
413
+
414
+ # Store index
415
+ self.store_index(
416
+ index=self.index,
417
+ index_filepath=self.index_filepath
418
+ )
419
+
420
+ logger.info(f'{doc_type} index created and stored successfully!')
421
+ # Return the index for this doc_type only. Indices from multiple doc_types are merged into the master index later so that queries can be served from a single index.
422
+ return self.index
423
+
424
+
425
+ def store_index(
426
+ self,
427
+ index,
428
+ index_filepath
429
+ ):
430
+ logger.info(f'Saving index to: {index_filepath}')
431
+
432
+ if not index:
433
+ logger.warning(f'Cannot write an empty index to: {index_filepath}!')
434
+ return
435
+
436
+ if not os.path.exists(index_filepath):
437
+ os.makedirs(index_filepath)
438
+
439
+ if self.index_type == 'FAISS':
440
+ index.save_local(index_filepath)
441
+
442
+ elif self.index_type == 'Chroma':
443
+ index.persist()
444
+
445
+ elif self.index_type == 'GPTSimpleVectorIndex':
446
+ index.save_to_disk(index_filepath)
447
+
448
+ elif self.index_type == 'pickle':
449
+ with open(index_filepath, "wb") as f:
450
+ pickle.dump(index, f)
451
+
452
+ logger.info(f'Index saved to: {index_filepath} successfully!')
453
+
454
+
455
+ def load_index(
456
+ self
457
+ ):
458
+ logger.info(f'Loading index from: {self.index_filepath}')
459
+
460
+ if not os.path.exists(self.index_filepath):
461
+ logger.warning(f"Cannot load index from {self.index_filepath} as the path doest not exist!")
462
+ return
463
+
464
+ if self.index_type == 'FAISS':
465
+ self.index = FAISS.load_local(self.index_filepath, self.embeddings)
466
+
467
+ elif self.index_type == 'Chroma':
468
+ self.index = Chroma(
469
+ persist_directory=self.index_filepath,
470
+ embedding_function=self.embeddings
471
+ )
472
+
473
+ elif self.index_type == 'GPTSimpleVectorIndex':
474
+ self.index = GPTSimpleVectorIndex.load_from_disk(self.index_filepath)
475
+
476
+ elif self.index_type == 'pickle':
477
+ with open(self.index_filepath, "rb") as f:
478
+ self.index = pickle.load(f)
479
+
480
+ logger.info(f'Index loaded from: {self.index_filepath} successfully!')
481
+
482
+
483
+ def convert_text_to_documents(
484
+ self,
485
+ text_list=[]
486
+ ):
487
+ """
488
+ Converts the list of text data to Documents format that can be fed to the GPT API to build the Vector store
489
+ """
490
+
491
+ from llama_index import Document
492
+ documents = [Document(t) for t in text_list]
493
+ return documents
494
+
495
+
496
+ def merge_documents_from_different_sources(
497
+ self,
498
+ doc_documents,
499
+ url_documents
500
+ ):
501
+ # Build the Vector store for docs
502
+ doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
503
+ # Build the Vector store for URLs
504
+ url_index = GPTSimpleVectorIndex.from_documents(url_documents)
505
+
506
+ # Set summary of each index
507
+ doc_index.set_text("index_from_docs")
508
+ url_index.set_text("index_from_urls")
509
+
510
+ # Merge index of different data sources
511
+ index = GPTListIndex([doc_index, url_index])
512
+
513
+ return index
514
+
515
+
516
+ def merge_store_master_index(
517
+ self,
518
+ index_category
519
+ ):
520
+ """
521
+ Merge multiple doc_type indices into a single master index. Query/search would be performed on this merged index.
522
+
523
+ Args:
524
+ index_category: index_category (can be any of: [crops, fruits, pest_management, govt_policy, soil, etc.])
525
+ """
526
+ logger.info('Merging doc_type indices of the given index_category into a master index')
527
+
528
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None
529
+ doc_type_indices = self.index_category_doc_type_wise_index[index_category]
530
+
531
+ if self.index_type == 'FAISS':
532
+ for doc_type, index in doc_type_indices.items():
533
+ if doc_type == constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE:
534
+ # Only merge the non-master doc_type_indices
535
+ continue
536
+ if not index or not isinstance(index, FAISS):
537
+ logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.faiss.FAISS')
538
+ continue
539
+ if not self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]:
540
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = index
541
+ else:
542
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE].merge_from(index)
543
+
544
+ elif self.index_type == 'Chroma':
545
+ for doc_type, index in doc_type_indices.items():
546
+ if not index or not isinstance(index, Chroma):
547
+ logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.Chroma')
548
+ continue
549
+ raise NotImplementedError
550
+
551
+ elif self.index_type == 'GPTSimpleVectorIndex':
552
+ for doc_type, index in doc_type_indices.items():
553
+ if not index or not isinstance(index, GPTSimpleVectorIndex):
554
+ logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTSimpleVectorIndex')
555
+ continue
556
+ raise NotImplementedError
557
+
558
+ # Store index_category master index
559
+ self.store_index(
560
+ index=self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE],
561
+ index_filepath=self.get_index_filepath(
562
+ index_category=index_category,
563
+ doc_type=constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE
564
+ )
565
+ )
566
+
567
+ logger.info('doc_type indices of the given index_category merged into the master index successfully!')
568
+
569
+
570
+ def init_chromadb(self):
571
+ logger.info('Initializing Chroma DB')
572
+
573
+ if not os.path.exists(self.index_filepath):
574
+ os.makedirs(self.index_filepath)
575
+
576
+ client_settings = chromadb.config.Settings(
577
+ chroma_db_impl="duckdb+parquet",
578
+ persist_directory=self.index_filepath,
579
+ anonymized_telemetry=False
580
+ )
581
+
582
+ self.index = Chroma(
583
+ collection_name="langchain_store",
584
+ embedding_function=self.embeddings,
585
+ client_settings=client_settings,
586
+ persist_directory=self.index_filepath,
587
+ )
588
+
589
+ logger.info('Chroma DB initialized successfully!')
590
+
591
+
592
+ def query_chromadb(self, question, k=1):
593
+ return self.index.similarity_search(query=question, k=k)
594
+
595
+
596
+ def query(self,
597
+ question,
598
+ question_category,
599
+ mode='embedding',
600
+ response_mode="default",
601
+ similarity_top_k=2,
602
+ required_keywords=[],
603
+ exclude_keywords=[],
604
+ verbose=False
605
+ ):
606
+ '''
607
+ Args:
608
+ mode: can be any of [default, embedding]
609
+ response_mode: can be any of [default, compact, tree_summarize]
610
+ '''
611
+ logger.info(f'question category: {question_category}; question: {question}')
612
+
613
+ response = None
614
+
615
+ # Get the index of the given question_category
616
+ index = self.index_category_doc_type_wise_index[question_category]['master']
617
+
618
+ if self.index_type == 'FAISS':
619
+ response = index.similarity_search(
620
+ question,
621
+ k=similarity_top_k
622
+ )
623
+
624
+ elif self.index_type == 'Chroma':
625
+ response = index.similarity_search(
626
+ question,
627
+ k=similarity_top_k
628
+ )
629
+
630
+ elif self.index_type == 'GPTSimpleVectorIndex':
631
+ # Querying the index
632
+ response = index.query(
633
+ question,
634
+ mode=mode,
635
+ response_mode=response_mode,
636
+ similarity_top_k=similarity_top_k,
637
+ required_keywords=required_keywords,
638
+ exclude_keywords=exclude_keywords,
639
+ verbose=verbose
640
+ )
641
+
642
+ return response
643
+
644
+
645
+ def load_uploaded_documents(
646
+ self,
647
+ doc_type,
648
+ files_or_urls
649
+ ):
650
+ logger.info(f'Loading uploaded documents from: {doc_type}')
651
+
652
+ if doc_type == 'pdf':
653
+ if not isinstance(files_or_urls, list):
654
+ files_or_urls = [files_or_urls]
655
+ for pdf in files_or_urls:
656
+ if not pdf.name.endswith('.pdf'):
657
+ logger.warning(f'Found a file that is not in .pdf format. Cannot load {pdf.name}!')
658
+ continue
659
+ logger.info(f'Loading PDF from: {pdf.name}')
660
+ # Load PDF as documents
661
+ self.documents.extend(
662
+ self.data_loader_utils_obj.load_documents_from_pdf(
663
+ doc_filepath=pdf.name,
664
+ doc_type=doc_type
665
+ )
666
+ )
667
+
668
+ elif doc_type == 'textfile':
669
+ if not isinstance(files_or_urls, list):
670
+ files_or_urls = [files_or_urls]
671
+ for text_file in files_or_urls:
672
+ if not text_file.name.endswith('.txt'):
673
+ logger.warning(f'Found a file that is not in .txt format. Cannot load {text_file.name}!')
674
+ continue
675
+ logger.info(f'Loading textfile from: {text_file.name}')
676
+ # Load textfile as documents
677
+ self.documents.extend(
678
+ self.data_loader_utils_obj.load_documents_from_text(
679
+ doc_filepath=text_file.name,
680
+ doc_type=doc_type
681
+ )
682
+ )
683
+
684
+ elif doc_type == 'online_pdf':
685
+ files_or_urls = self.utils_obj.split_text(files_or_urls)
686
+ # Load online_pdfs as documents
687
+ self.documents.extend(
688
+ self.data_loader_utils_obj.load_documents_from_pdf(
689
+ doc_type=doc_type,
690
+ urls=files_or_urls
691
+ )
692
+ )
693
+
694
+ elif doc_type == 'urls':
695
+ files_or_urls = self.utils_obj.split_text(files_or_urls)
696
+ # Load URLs as documents
697
+ self.documents.extend(
698
+ self.data_loader_utils_obj.load_documents_from_urls(
699
+ doc_type=doc_type,
700
+ urls=files_or_urls
701
+ )
702
+ )
703
+
704
+ logger.info(f'Uploaded documents from: {doc_type} loaded successfully!')
705
+
706
+
707
+ def upload_data(
708
+ self,
709
+ doc_type,
710
+ files_or_urls,
711
+ index_category
712
+ ):
713
+ logger.info(f'Uploading data for: {index_category}-{doc_type}')
714
+
715
+ self.documents = []
716
+ self.index = None
717
+
718
+ # Create documents of the uploaded files
719
+ self.load_uploaded_documents(
720
+ doc_type,
721
+ files_or_urls
722
+ )
723
+
724
+ # Create the index from documents for search/retrieval
725
+ self.index = self.create_index()
726
+
727
+ # Update the existing index with the newly data
728
+ self.upsert_index(
729
+ doc_type=doc_type,
730
+ index_category=index_category
731
+ )
732
+
733
+ logger.info(f'{index_category}-{doc_type} data uploaded successfully!')
734
+
735
+
736
+ def upsert_index(
737
+ self,
738
+ doc_type,
739
+ index_category
740
+ ):
741
+ """
742
+ Updates the index of the given index_category-doc_type, if present.
743
+ Creates a new index if index_category-doc_type index is not present.
744
+ Also updates the master index for the given index_category.
745
+ """
746
+ logger.info(f'Upserting index for: {index_category}-{doc_type}')
747
+
748
+ if not self.index_category_doc_type_wise_index.get(index_category, None):
749
+ """
750
+ If the index_category index does not exist
751
+ Steps:
752
+ - set index_category index
753
+ - set doc_type index
754
+ - Store new index_category index as master
755
+ - Store new doc_type index
756
+ """
757
+ logger.info(f'Master index does not exist for: {index_category}. A new {index_category} master index & {doc_type} index would be created.')
758
+ self.index_category_doc_type_wise_index.setdefault(index_category, {})
759
+ # Set a master index only if it doesn't exist. Else keep its value as-is.
760
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
761
+ # Set an index for the given doc_type only if it doesn't exist. Else keep its value as-is.
762
+ self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
763
+
764
+ elif not self.index_category_doc_type_wise_index[index_category].get(doc_type, None):
765
+ """
766
+ If the doc_type index does not exist
767
+ Steps:
768
+ - set doc_type index
769
+ - if master index does not exist for the index_category - set a master index
770
+ - if master index exists - update the master index to merge it with doc_type index
771
+ - Store new/updated index_category index as master
772
+ - Store new doc_type index
773
+ """
774
+ logger.info(f'{doc_type} index does not exist for: {index_category}-{doc_type}. A new {doc_type} index would be created.')
775
+ # create doc_type index
776
+ self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
777
+ # if master index does not exist for the index_category - create a master index
778
+ if not self.index_category_doc_type_wise_index[index_category].get(constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE, None):
779
+ logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
780
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
781
+
782
+ else:
783
+ """
784
+ If the new document is of the existing index_category & doc_type
785
+ Steps:
786
+ - if master index does not exist for the index_category - set a master index
787
+ - if master index exists - update the master index to merge it with doc_type index
788
+ - update the doc_type index
789
+ - Store updated index_category index as master
790
+ - Store updated doc_type index
791
+ """
792
+ # if master index does not exist for the index_category - create a master index
793
+ if not self.index_category_doc_type_wise_index[index_category].get(constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE, None):
794
+ logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
795
+ self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
796
+ # Merge new self.index with existing doc_type index
797
+ self.index_category_doc_type_wise_index[index_category][doc_type].merge_from(self.index)
798
+ # Update self.index to store/overwrite the existing index with the updated index
799
+ self.index = self.index_category_doc_type_wise_index[index_category][doc_type]
800
+
801
+
802
+ # Store newly created/merged index
803
+ self.store_index(
804
+ index=self.index,
805
+ index_filepath=self.get_index_filepath(
806
+ index_category=index_category,
807
+ doc_type=doc_type
808
+ )
809
+ )
810
+
811
+ # Merge and store master index for index_category
812
+ self.merge_store_master_index(
813
+ index_category=index_category
814
+ )
815
+
816
+ logger.info(f'Index for: {index_category}-{doc_type} upserted successfully!')
817
+
818
+
819
+ def delete_index(
820
+ self,
821
+ ids: Optional[List[str]] = None,
822
+ # filter: Optional[DocumentMetadataFilter] = None,
823
+ delete_all: Optional[bool] = None,
824
+ ):
825
+ """
826
+ Removes vectors by ids, filter, or everything in the datastore.
827
+ Multiple parameters can be used at once.
828
+ Returns whether the operation was successful.
829
+ """
830
+ logger.info(f'Deleting index')
831
+
832
+ raise NotImplementedError
833
+
834
+ # NOTE: we can delete a specific collection
835
+ self.index.delete_collection()
836
+ self.index.persist()
837
+
838
+ # Or just nuke the persist directory
839
+ # !rm -rf self.index_filepath
utils/mandi_price.py CHANGED
@@ -2,32 +2,32 @@ import requests
2
 
3
 
4
  class MANDI_PRICE:
5
- def __init__(self):
6
- self.base_url = "https://enam.gov.in/web/Ajax_ctrl/trade_data_list"
7
- # "https://enam.gov.in/web/dashboard/trade-data",
8
- # "https://enam.gov.in/web/dashboard/trade_data_list",
9
 
10
 
11
- def get_mandi_price(self,
12
- state_name,
13
- apmc_name,
14
- commodity_name,
15
- from_date,
16
- to_date
17
- ):
18
- # Prepare the payload for POST request
19
- payload = f"language=en&stateName={state_name}&apmcName={apmc_name}&commodityName={commodity_name}&fromDate={from_date}&toDate={to_date}"
20
 
21
- headers = {
22
- "Content-type": "application/x-www-form-urlencoded; charset=UTF-8",
23
- "Referer": "https://enam.gov.in/web/dashboard/trade-data",
24
- "Accept": "application/json, text/javascript, */*; q=0.01",
25
- }
26
 
27
- response = requests.post(
28
- self.base_url,
29
- json=payload,
30
- headers=headers,
31
- )
32
 
33
- return response.json()
 
2
 
3
 
4
  class MANDI_PRICE:
5
+ def __init__(self):
6
+ self.base_url = "https://enam.gov.in/web/Ajax_ctrl/trade_data_list"
7
+ # "https://enam.gov.in/web/dashboard/trade-data",
8
+ # "https://enam.gov.in/web/dashboard/trade_data_list",
9
 
10
 
11
+ def get_mandi_price(self,
12
+ state_name,
13
+ apmc_name,
14
+ commodity_name,
15
+ from_date,
16
+ to_date
17
+ ):
18
+ # Prepare the payload for POST request
19
+ payload = f"language=en&stateName={state_name}&apmcName={apmc_name}&commodityName={commodity_name}&fromDate={from_date}&toDate={to_date}"
20
 
21
+ headers = {
22
+ "Content-type": "application/x-www-form-urlencoded; charset=UTF-8",
23
+ "Referer": "https://enam.gov.in/web/dashboard/trade-data",
24
+ "Accept": "application/json, text/javascript, */*; q=0.01",
25
+ }
26
 
27
+ response = requests.post(
28
+ self.base_url,
29
+ data=payload,  # the payload is a form-encoded string, so send it as form data rather than JSON
30
+ headers=headers,
31
+ )
32
 
33
+ return response.json()
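A short usage sketch of MANDI_PRICE; the state/APMC/commodity strings and date format below are placeholders and must match what the eNAM portal expects:

from utils.mandi_price import MANDI_PRICE

mandi_price_obj = MANDI_PRICE()
# Placeholder values; the eNAM endpoint matches them against its own naming conventions
prices = mandi_price_obj.get_mandi_price(
    state_name='GUJARAT',
    apmc_name='Amreli',
    commodity_name='Wheat',
    from_date='2023-04-01',
    to_date='2023-04-03',
)
print(prices)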
utils/ner_detection.py ADDED
@@ -0,0 +1,58 @@
1
+ import gradio as gr
2
+ import openai
3
+ import os
4
+ import re
5
+ import ast
6
+
7
+ openai.api_key = "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"
8
+ SYSTEM_PROMPT = "You are a smart and intelligent Named Entity Recognition (NER) system. I will provide you the definition of the entities you need to extract, the sentence from where your extract the entities and the output format with examples."
9
+ USER_PROMPT_1 = "Are you clear about your role?"
10
+ ASSISTANT_PROMPT_1 = "Sure, I'm ready to help you with your NER task. Please provide me with the necessary information to get started."
11
+ GUIDELINES_PROMPT = (
12
+ """Entity Definition:\n"
13
+ "1. PEST NAME: Name of the pest which has attacked a particular crop which may lead to crop damage.\n"
14
+ "2. CROP DISEASE: Any kind of crop disease which occurs in agriculture land in india and nearby resgions.\n"
15
+ "3. WEATHER CONDITION: Severe climate conditions like heavy rainfall, hailstorm which has destroyed crops.\n"
16
+ "\n"
17
+ "Output Format:\n"
18
+ "{{'PEST NAME': [list of entities present], 'CROP DISEASE': [list of entities present], 'WEATHER CONDITION': [list of entities present]}}\n"
19
+ "If no entities are presented in any categories keep it None\n"
20
+ "\n"
21
+ "Examples:\n"
22
+ "\n"
23
+ "1. Sentence: Pest attack on maize crop in lower Kangra : The Tribune India. Farmers in lower Kangra are a harried lot as the fall armyworm pest has attacked their maize crop. 'Kolshi' continues to affect Vidarbha's Orange crop cultivation (Citrus Black Fly) | Krishak Jagat. A total of 1,50,000 hectares of land in the Vidarbha region is planted with oranges, and of them, 25% are seriously damaged by Kolshi, a citrus black fly disease. India's June tea output drops 17% as floods hit plucking | Mint. India's June tea production fell 17.4% from a year earlier to 141.31 million kilograms, the state-run Tea Board said, as floods and pest attack dented output in the main producing region\n"
24
+ "Output: {{'PEST NAME': ['fall armyworm'], 'CROP DISEASE': ['citrus black fly disease'], 'WEATHER CONDITION': ['floods']}}\n"
25
+ "\n"
26
+ "2. Sentence: ICAR issues pest alert in Leparada, W/Siang | The Arunachal Times. 70 percent prevalence of fall army worm in maize fields in Pagi, Gori and Bam villages in Leparada district and Darka, Kombo and Jirdin villages in West Siang district was observed. After maize, Kangra vegetable crops under white fly attack : The Tribune India. Vegetable crops are under attack by white fly in the lower hills of Kangra district. The pest attack comes after the recent damage caused by fall armyworm to the maize crop in the area. Pest attacks on paddy crop worry farmers in the integrated Karimnagar district | Hindudayashankar. Crops withering due to stem borer, leaf folder and rice blast; farmers have to incur huge expenditures to control menace. Cyclone Amphan damages crop, vegetable prices shoot up | Cities News,The Indian Express. Cyclone Amphan has damaged vegetables across South Bengal. Farmers lost 80 to 90 per cent of crop as fields were flooded.\n"
27
+ "Output: {{'PEST NAME': ['fall army worm', 'white fly attack', 'stem borer', 'leaf folder'], 'CROP DISEASE': ['rice blast'], 'WEATHER CONDITION': ['Cyclone Amphan']}}\n"
28
+ "\n"
29
+ "3. Sentence: {}\n"
30
+ "Output: """
31
+ )
32
+
33
+ def openai_chat_completion_response(news_article_text):
34
+ final_prompt = GUIDELINES_PROMPT.format(news_article_text)
35
+ response = openai.ChatCompletion.create(
36
+ model="gpt-3.5-turbo",
37
+ messages=[
38
+ {"role": "system", "content": SYSTEM_PROMPT},
39
+ {"role": "user", "content": USER_PROMPT_1},
40
+ {"role": "assistant", "content": ASSISTANT_PROMPT_1},
41
+ {"role": "user", "content": final_prompt}
42
+ ]
43
+ )
44
+ return response['choices'][0]['message']['content'].strip(" \n")
45
+
46
+ # def preprocess(prompt):
47
+ # return GUIDELINES_PROMPT.format(prompt)
48
+ # def main():
49
+ # my_sentence = "Hundreds of hectares of land under the cotton crop, once referred to as white gold, has come under attack of a wide range of insects like whitefly, pink bollworm and mealybug. This is likely to hit the cotton production this year."
50
+ # GUIDELINES_PROMPT = GUIDELINES_PROMPT.format(my_sentence)
51
+ # # print(GUIDELINES_PROMPT)
52
+ # ners = openai_chat_completion_response(GUIDELINES_PROMPT)
53
+ # print(ners)
54
+
55
+ # Define the Gradio interface (gradio is already imported as gr at the top of this file)
56
+ app = gr.Interface(fn=openai_chat_completion_response, inputs="text", outputs="text")
57
+ app.launch(share=True)
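The prompt asks the model for a Python-dict-like string, and ast is imported above but never used; a hedged sketch of the parsing step that import suggests (the sample sentence is a placeholder):

# Sketch only: assumes the model follows the requested output format
import ast

raw_output = openai_chat_completion_response(
    "Farmers in lower Kangra report fall armyworm attacks on maize after heavy rainfall."
)
try:
    entities = ast.literal_eval(raw_output)
except (ValueError, SyntaxError):
    entities = {}  # fall back gracefully when the model deviates from the format
print(entities.get('PEST NAME'), entities.get('WEATHER CONDITION'))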
utils/translator.py CHANGED
@@ -7,55 +7,55 @@ from googletrans import Translator, constants
7
 
8
 
9
  class TRANSLATOR:
10
- def __init__(self):
11
- print()
12
-
13
-
14
- def split_sentences(self, paragraph, language):
15
- if language == "en":
16
- with MosesSentenceSplitter(language) as splitter:
17
- return splitter([paragraph])
18
- elif language in constants_utils.INDIC_LANGUAGE:
19
- return sentence_tokenize.sentence_split(paragraph, lang=language)
20
-
21
-
22
- def get_in_hindi(self, payload):
23
- tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
24
- model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
25
- article = self.split_sentences(payload['inputs'], 'en')
26
- # inputs = tokenizer(payload['input'], return_tensors="pt")
27
- out_text = ""
28
- for a in article:
29
- inputs = tokenizer(a, return_tensors="pt")
30
- translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["hin_Deva"], max_length=100)
31
- translated_sent = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
32
- out_text = out_text.join(translated_sent)
33
- return out_text
34
-
35
-
36
- def get_in_indic(self, text, language='Hindi'):
37
- tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
38
- model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
39
- inputs = tokenizer(text, return_tensors="pt")
40
-
41
- code = "eng_Latn"
42
- if language == 'Hindi':
43
- code= "hin_Deva"
44
- elif language == 'Marathi':
45
- code = "mar_Deva"
46
-
47
- translated_tokens = model.generate(
48
- **inputs,
49
- forced_bos_token_id=tokenizer.lang_code_to_id[code],
50
- max_length=1000
51
- )
52
-
53
- out_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
54
- return out_text
55
-
56
-
57
- def get_indic_google_translate(self, text, language='Hindi'):
58
- # Init the Google API translator
59
- translator = Translator()
60
- translations = translator.translate(text, dest=constants_utils.INDIC_LANGUAGE.get(language, 'en'))
61
- return str(translations.text)
 
7
 
8
 
9
  class TRANSLATOR:
10
+ def __init__(self):
11
+ print()
12
+
13
+
14
+ def split_sentences(self, paragraph, language):
15
+ if language == "en":
16
+ with MosesSentenceSplitter(language) as splitter:
17
+ return splitter([paragraph])
18
+ elif language in constants_utils.INDIC_LANGUAGE:
19
+ return sentence_tokenize.sentence_split(paragraph, lang=language)
20
+
21
+
22
+ def get_in_hindi(self, payload):
23
+ tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
24
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
25
+ article = self.split_sentences(payload['inputs'], 'en')
26
+ # inputs = tokenizer(payload['input'], return_tensors="pt")
27
+ out_text = ""
28
+ for a in article:
29
+ inputs = tokenizer(a, return_tensors="pt")
30
+ translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["hin_Deva"], max_length=100)
31
+ translated_sent = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
32
+ out_text = out_text.join(translated_sent)
33
+ return out_text
34
+
35
+
36
+ def get_in_indic(self, text, language='Hindi'):
37
+ tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
38
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
39
+ inputs = tokenizer(text, return_tensors="pt")
40
+
41
+ code = "eng_Latn"
42
+ if language == 'Hindi':
43
+ code= "hin_Deva"
44
+ elif language == 'Marathi':
45
+ code = "mar_Deva"
46
+
47
+ translated_tokens = model.generate(
48
+ **inputs,
49
+ forced_bos_token_id=tokenizer.lang_code_to_id[code],
50
+ max_length=1000
51
+ )
52
+
53
+ out_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
54
+ return out_text
55
+
56
+
57
+ def get_indic_google_translate(self, text, language='Hindi'):
58
+ # Init the Google API translator
59
+ translator = Translator()
60
+ translations = translator.translate(text, dest=constants_utils.INDIC_LANGUAGE.get(language, 'en'))
61
+ return str(translations.text)
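A usage sketch of TRANSLATOR, assuming constants_utils.INDIC_LANGUAGE maps language names such as 'Hindi' to googletrans codes; the NLLB path downloads model weights on first use:

from utils.translator import TRANSLATOR

translator_utils_obj = TRANSLATOR()
# Local NLLB-200 model (large download on the first call)
hindi_text = translator_utils_obj.get_in_indic('The weather will remain dry this week.', language='Hindi')
# Lighter-weight alternative that goes through the googletrans wrapper
hindi_text_google = translator_utils_obj.get_indic_google_translate('The weather will remain dry this week.', language='Hindi')
print(hindi_text, hindi_text_google)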
utils/utils.py ADDED
@@ -0,0 +1,68 @@
1
+ import os
2
+ import re
3
+ import pandas as pd
4
+ from urllib.parse import urlparse
5
+
6
+ import logging
7
+ logger = logging.getLogger(__name__)
8
+ logging.basicConfig(
9
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
10
+ )
11
+
12
+
13
+ class UTILS:
14
+ def __init__(self):
15
+ pass
16
+
17
+
18
+ def split_text(
19
+ self,
20
+ text
21
+ ):
22
+ text = text.split(',')
23
+ text = [t.strip() for t in text]
24
+ return text
25
+
26
+
27
+ def replace_newlines_and_spaces(
28
+ self,
29
+ text
30
+ ):
31
+ # Replace all newline characters with spaces
32
+ text = text.replace("\n", " ")
33
+ # Replace multiple spaces with a single space
34
+ text = re.sub(r'\s+', ' ', text)
35
+ return text
36
+
37
+
38
+ def clean_df(
39
+ self,
40
+ df,
41
+ dropna=True,
42
+ fillna=False
43
+ ):
44
+ if fillna:
45
+ df.fillna('', inplace=True)
46
+ if dropna:
47
+ df.dropna(inplace=True)
48
+ # df = df[~df.isna()]
49
+ df = df.drop_duplicates().reset_index(drop=True)
50
+ return df
51
+
52
+
53
+ def validate_url_format(
54
+ self,
55
+ urls,
56
+ url_type='urls'
57
+ ):
58
+ valid_urls = []
59
+ for url in urls:
60
+ result = urlparse(url)
61
+ # Check if the url is valid
62
+ if all([result.scheme, result.netloc]):
63
+ # Online PDF urls should end with .pdf extension
64
+ if url_type == 'online_pdf' and not url.endswith('.pdf'):
65
+ continue
66
+ valid_urls.append(url)
67
+ logger.info(f'Valid URLs are: {valid_urls}')
68
+ return valid_urls
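A usage sketch of UTILS as it is used by the upload widgets: a comma-separated string is split into URLs and then filtered to well-formed ones (the example URLs are placeholders):

from utils.utils import UTILS

utils_obj = UTILS()
# Comma-separated string (as typed into the upload widget) -> list of stripped URLs
urls = utils_obj.split_text('https://example.com/a.pdf, https://example.com/page')
# Keep only well-formed URLs; online PDF URLs must additionally end with .pdf
valid_pdf_urls = utils_obj.validate_url_format(urls, url_type='online_pdf')
print(valid_pdf_urls)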
utils/weather.py CHANGED
@@ -3,200 +3,200 @@ from bs4 import BeautifulSoup as bs
3
 
4
 
5
  STATE_CODES = {
6
- 'Andaman-Nicobar': '01',
7
- 'Andhra-Pradesh': '02',
8
- 'Arunachal-Pradesh': '03',
9
- 'Assam': '04',
10
- 'Bihar': '05',
11
- 'Chandigarh': '06',
12
- 'Chhattisgarh': '07',
13
- 'Dadra-and-Nagar-Haveli': '08',
14
- 'Daman-and-Diu': '09',
15
- 'Delhi': '10',
16
- 'Goa': '11',
17
- 'Gujarat': '12',
18
- 'Haryana': '13',
19
- # 14
20
- 'Himachal-Pradesh': '15',
21
- 'Jammu-Kashmir': '16',
22
- 'Jharkhand': '17',
23
- 'Karnataka': '18',
24
- 'Kerala': '19',
25
- 'Lakshadweep': '20',
26
- 'Madhya-Pradesh': '21',
27
- 'Maharashtra': '22',
28
- 'Manipur': '23',
29
- 'Meghalaya': '24',
30
- 'Mizoram': '25',
31
- 'Nagaland': '26',
32
- 'Odisha': '27',
33
- 'Pondicherry': '28',
34
- 'Punjab': '29',
35
- 'Rajasthan': '30',
36
- 'Sikkim': '31',
37
- 'Tamilnadu': '32',
38
- 'Telangana': '33',
39
- 'Tripura': '34',
40
- 'Uttar-Pradesh': '35',
41
- 'Uttarakhand': '36',
42
- 'West-Bengal': '37',
43
  }
44
 
45
  # List of states that are given as the input selection to https://nwp.imd.gov.in/blf/blf_temp/ to get the weather forecast
46
  STATES = {
47
- 'Andaman-Nicobar': {},
48
-
49
- 'Andhra-Pradesh': {},
50
-
51
- 'Arunachal-Pradesh': {},
52
-
53
- 'Assam': {},
54
-
55
- 'Bihar': {},
56
-
57
- 'Chandigarh': {},
58
-
59
- 'Chhattisgarh': {},
60
-
61
- 'Dadra-and-Nagar-Haveli': {},
62
-
63
- 'Daman-and-Diu': {},
64
-
65
- 'Delhi': {
66
- 'CENTRAL-DELHI': ['CENTRAL-DELHI'],
67
- 'EAST-DELHI': ['EAST-DELHI'],
68
- 'NEW-DELHI': ['NEW-DELHI'],
69
- 'NORTH-DELHI': ['NORTH-DELHI'],
70
- 'NORTH-EAST-DELHI': ['NORTH-EAST-DELHI'],
71
- 'NORTH-WEST-DELHI': ['NORTH-WEST-DELHI'],
72
- 'SHAHDARA': ['SHAHDARA'],
73
- 'SOUTH-DELHI': ['SOUTH-DELHI'],
74
- 'SOUTH-EAST-DELHI': ['SOUTH-EAST-DELHI'],
75
- 'SOUTH-WEST-DELHI': ['SOUTH-WEST-DELHI'],
76
- 'WEST-DELHI': ['WEST-DELHI'],
77
- },
78
-
79
- 'Goa': {},
80
-
81
- 'Gujarat': {
82
- 'AHMADABAD': ['AHMEDABAD-CITY', 'BAVLA', 'DASKROI', 'DETROJ-RAMPURA', 'DHANDHUKA', 'DHOLERA', 'DHOLKA', 'MANDAL', 'SANAND', 'VIRAMGAM'],
83
- 'AMRELI': ['AMRELI', 'BABRA', 'BAGASARA', 'DHARI', 'JAFRABAD', 'KHAMBHA', 'KUNKAVAV-VADIA', 'LATHI', 'LILIA', 'RAJULA', 'SAVERKUNDLA'],
84
- 'ANAND': [],
85
- 'ARVALLI': [],
86
- 'BANASKANTHA': [],
87
- 'BHARUCH': [],
88
- 'BHAVNAGAR': [],
89
- 'BOTAD': [],
90
- 'CHHOTAUDEPUR': [],
91
- 'DANG': [],
92
- 'DEVBHUMI-DWARKA': [],
93
- 'DOHAD': [],
94
- 'GANDHINAGAR': [],
95
- 'GIR-SOMNATH': [],
96
- 'JAMNAGAR': [],
97
- 'JUNAGADH': [],
98
- 'KACHCHH': [],
99
- 'KHEDA': [],
100
- 'MAHESANA': [],
101
- 'MAHISAGAR': [],
102
- 'MORBI': [],
103
- 'NARMADA': [],
104
- 'NAVSARI': [],
105
- 'PANCH-MAHALS': [],
106
- 'PATAN': [],
107
- 'PORBANDAR': [],
108
- 'RAJKOT': [],
109
- 'SABAR-KANTHA': [],
110
- 'SURAT': ['BARDOLI', 'CHORASI', 'KAMREJ', 'MAHUVA', 'MANDVI', 'MANGROL', 'OLPAD', 'PALSANA', 'SURAT-CITY', 'UMARPADA'],
111
- 'SURENDRANAGAR': [],
112
- 'TAPI': [],
113
- 'VADODARA': [],
114
- 'VALSAD': [],
115
- },
116
-
117
- 'Haryana': {},
118
-
119
- 'Himachal-Pradesh': {},
120
-
121
- 'Jammu-Kashmir': {},
122
-
123
- 'Jharkhand': {},
124
-
125
- 'Karnataka': {},
126
-
127
- 'Kerala': {},
128
-
129
- 'Lakshadweep': {},
130
-
131
- 'Madhya-Pradesh': {},
132
-
133
- 'Maharashtra': {},
134
-
135
- 'Manipur': {},
136
-
137
- 'Meghalaya': {},
138
-
139
- 'Mizoram': {},
140
-
141
- 'Nagaland': {},
142
-
143
- 'Odisha': {},
144
-
145
- 'Pondicherry': {},
146
-
147
- 'Punjab': {},
148
-
149
- 'Rajasthan': {},
150
-
151
- 'Sikkim': {},
152
-
153
- 'Tamilnadu': {},
154
-
155
- 'Telangana': {},
156
-
157
- 'Tripura': {},
158
-
159
- 'Uttar-Pradesh': {},
160
-
161
- 'Uttarakhand': {},
162
-
163
- 'West-Bengal': {},
164
  }
165
 
166
 
167
 
168
  class WEATHER:
169
- def __init__(self):
170
- self.base_url = 'https://nwp.imd.gov.in/blf/blf_temp'
171
-
172
-
173
- # Weather forecast from Govt. website
174
- def get_weather_forecast(self, state, district, is_block_level=False):
175
- self.district_url = f"{self.base_url}/block.php?dis={STATE_CODES.get(state, '') + district}"
176
- self.block_url = f'{self.base_url}/table2.php'
177
-
178
- response = requests.get(self.district_url if not is_block_level else self.block_url)
179
- soup = bs(response.text, 'html.parser')
180
- scripts = soup.findAll('font')[0]
181
- return scripts.text
182
-
183
-
184
- # Weather using Google weather API
185
- def get_weather(self, city):
186
- city = city + " weather"
187
- city = city.replace(" ", "+")
188
-
189
- headers = {
190
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
191
- }
192
- response = requests.get(
193
- f'https://www.google.com/search?q={city}&oq={city}&aqs=chrome.0.35i39l2j0l4j46j69i60.6128j1j7&sourceid=chrome&ie=UTF-8', headers=headers)
194
-
195
- soup = bs(response.text, 'html.parser')
196
- location = soup.select('#wob_loc')[0].getText().strip()
197
- time = soup.select('#wob_dts')[0].getText().strip()
198
- info = soup.select('#wob_dc')[0].getText().strip()
199
- temperature = soup.select('#wob_tm')[0].getText().strip()
200
- temperature = temperature + "°C"
201
-
202
- return time, info, temperature
 
3
 
4
 
5
  STATE_CODES = {
6
+ 'Andaman-Nicobar': '01',
7
+ 'Andhra-Pradesh': '02',
8
+ 'Arunachal-Pradesh': '03',
9
+ 'Assam': '04',
10
+ 'Bihar': '05',
11
+ 'Chandigarh': '06',
12
+ 'Chhattisgarh': '07',
13
+ 'Dadra-and-Nagar-Haveli': '08',
14
+ 'Daman-and-Diu': '09',
15
+ 'Delhi': '10',
16
+ 'Goa': '11',
17
+ 'Gujarat': '12',
18
+ 'Haryana': '13',
19
+ # 14
20
+ 'Himachal-Pradesh': '15',
21
+ 'Jammu-Kashmir': '16',
22
+ 'Jharkhand': '17',
23
+ 'Karnataka': '18',
24
+ 'Kerala': '19',
25
+ 'Lakshadweep': '20',
26
+ 'Madhya-Pradesh': '21',
27
+ 'Maharashtra': '22',
28
+ 'Manipur': '23',
29
+ 'Meghalaya': '24',
30
+ 'Mizoram': '25',
31
+ 'Nagaland': '26',
32
+ 'Odisha': '27',
33
+ 'Pondicherry': '28',
34
+ 'Punjab': '29',
35
+ 'Rajasthan': '30',
36
+ 'Sikkim': '31',
37
+ 'Tamilnadu': '32',
38
+ 'Telangana': '33',
39
+ 'Tripura': '34',
40
+ 'Uttar-Pradesh': '35',
41
+ 'Uttarakhand': '36',
42
+ 'West-Bengal': '37',
43
  }
44
 
45
  # List of states that are given as the input selection to https://nwp.imd.gov.in/blf/blf_temp/ to get the weather forecast
46
  STATES = {
47
+ 'Andaman-Nicobar': {},
48
+
49
+ 'Andhra-Pradesh': {},
50
+
51
+ 'Arunachal-Pradesh': {},
52
+
53
+ 'Assam': {},
54
+
55
+ 'Bihar': {},
56
+
57
+ 'Chandigarh': {},
58
+
59
+ 'Chhattisgarh': {},
60
+
61
+ 'Dadra-and-Nagar-Haveli': {},
62
+
63
+ 'Daman-and-Diu': {},
64
+
65
+ 'Delhi': {
66
+ 'CENTRAL-DELHI': ['CENTRAL-DELHI'],
67
+ 'EAST-DELHI': ['EAST-DELHI'],
68
+ 'NEW-DELHI': ['NEW-DELHI'],
69
+ 'NORTH-DELHI': ['NORTH-DELHI'],
70
+ 'NORTH-EAST-DELHI': ['NORTH-EAST-DELHI'],
71
+ 'NORTH-WEST-DELHI': ['NORTH-WEST-DELHI'],
72
+ 'SHAHDARA': ['SHAHDARA'],
73
+ 'SOUTH-DELHI': ['SOUTH-DELHI'],
74
+ 'SOUTH-EAST-DELHI': ['SOUTH-EAST-DELHI'],
75
+ 'SOUTH-WEST-DELHI': ['SOUTH-WEST-DELHI'],
76
+ 'WEST-DELHI': ['WEST-DELHI'],
77
+ },
78
+
79
+ 'Goa': {},
80
+
81
+ 'Gujarat': {
82
+ 'AHMADABAD': ['AHMEDABAD-CITY', 'BAVLA', 'DASKROI', 'DETROJ-RAMPURA', 'DHANDHUKA', 'DHOLERA', 'DHOLKA', 'MANDAL', 'SANAND', 'VIRAMGAM'],
83
+ 'AMRELI': ['AMRELI', 'BABRA', 'BAGASARA', 'DHARI', 'JAFRABAD', 'KHAMBHA', 'KUNKAVAV-VADIA', 'LATHI', 'LILIA', 'RAJULA', 'SAVERKUNDLA'],
84
+ 'ANAND': [],
85
+ 'ARVALLI': [],
86
+ 'BANASKANTHA': [],
87
+ 'BHARUCH': [],
88
+ 'BHAVNAGAR': [],
89
+ 'BOTAD': [],
90
+ 'CHHOTAUDEPUR': [],
91
+ 'DANG': [],
92
+ 'DEVBHUMI-DWARKA': [],
93
+ 'DOHAD': [],
94
+ 'GANDHINAGAR': [],
95
+ 'GIR-SOMNATH': [],
96
+ 'JAMNAGAR': [],
97
+ 'JUNAGADH': [],
98
+ 'KACHCHH': [],
99
+ 'KHEDA': [],
100
+ 'MAHESANA': [],
101
+ 'MAHISAGAR': [],
102
+ 'MORBI': [],
103
+ 'NARMADA': [],
104
+ 'NAVSARI': [],
105
+ 'PANCH-MAHALS': [],
106
+ 'PATAN': [],
107
+ 'PORBANDAR': [],
108
+ 'RAJKOT': [],
109
+ 'SABAR-KANTHA': [],
110
+ 'SURAT': ['BARDOLI', 'CHORASI', 'KAMREJ', 'MAHUVA', 'MANDVI', 'MANGROL', 'OLPAD', 'PALSANA', 'SURAT-CITY', 'UMARPADA'],
111
+ 'SURENDRANAGAR': [],
112
+ 'TAPI': [],
113
+ 'VADODARA': [],
114
+ 'VALSAD': [],
115
+ },
116
+
117
+ 'Haryana': {},
118
+
119
+ 'Himachal-Pradesh': {},
120
+
121
+ 'Jammu-Kashmir': {},
122
+
123
+ 'Jharkhand': {},
124
+
125
+ 'Karnataka': {},
126
+
127
+ 'Kerala': {},
128
+
129
+ 'Lakshadweep': {},
130
+
131
+ 'Madhya-Pradesh': {},
132
+
133
+ 'Maharashtra': {},
134
+
135
+ 'Manipur': {},
136
+
137
+ 'Meghalaya': {},
138
+
139
+ 'Mizoram': {},
140
+
141
+ 'Nagaland': {},
142
+
143
+ 'Odisha': {},
144
+
145
+ 'Pondicherry': {},
146
+
147
+ 'Punjab': {},
148
+
149
+ 'Rajasthan': {},
150
+
151
+ 'Sikkim': {},
152
+
153
+ 'Tamilnadu': {},
154
+
155
+ 'Telangana': {},
156
+
157
+ 'Tripura': {},
158
+
159
+ 'Uttar-Pradesh': {},
160
+
161
+ 'Uttarakhand': {},
162
+
163
+ 'West-Bengal': {},
164
  }
165
 
166
 
167
 
168
  class WEATHER:
169
+ def __init__(self):
170
+ self.base_url = 'https://nwp.imd.gov.in/blf/blf_temp'
171
+
172
+
173
+ # Weather forecast from Govt. website
174
+ def get_weather_forecast(self, state, district, is_block_level=False):
175
+ self.district_url = f"{self.base_url}/block.php?dis={STATE_CODES.get(state, '') + district}"
176
+ self.block_url = f'{self.base_url}/table2.php'
177
+
178
+ response = requests.get(self.district_url if not is_block_level else self.block_url)
179
+ soup = bs(response.text, 'html.parser')
180
+ scripts = soup.findAll('font')[0]
181
+ return scripts.text
182
+
183
+
184
+ # Current weather scraped from Google search results
185
+ def get_weather(self, city):
186
+ city = city + " weather"
187
+ city = city.replace(" ", "+")
188
+
189
+ headers = {
190
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
191
+ }
192
+ response = requests.get(
193
+ f'https://www.google.com/search?q={city}&oq={city}&aqs=chrome.0.35i39l2j0l4j46j69i60.6128j1j7&sourceid=chrome&ie=UTF-8', headers=headers)
194
+
195
+ soup = bs(response.text, 'html.parser')
196
+ location = soup.select('#wob_loc')[0].getText().strip()
197
+ time = soup.select('#wob_dts')[0].getText().strip()
198
+ info = soup.select('#wob_dc')[0].getText().strip()
199
+ temperature = soup.select('#wob_tm')[0].getText().strip()
200
+ temperature = temperature + "°C"
201
+
202
+ return time, info, temperature
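A usage sketch of WEATHER; 'Ahmedabad' is a placeholder city and '01' a placeholder district code that gets appended to the state code from STATE_CODES (both scrapers depend on the source pages keeping their current HTML structure):

from utils.weather import WEATHER

weather_utils_obj = WEATHER()
# Current conditions scraped from Google search results
time, info, temperature = weather_utils_obj.get_weather('Ahmedabad')
# District-level forecast from the IMD block-level forecast page
forecast = weather_utils_obj.get_weather_forecast('Gujarat', '01')
print(time, info, temperature, forecast)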
utils/web_crawler.py ADDED
@@ -0,0 +1,58 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup as bs
3
+
4
+
5
+ class LOAD_ONLINE_PDF_IPM_PACKAGES:
6
+ def __init__(self):
7
+ self.base_url = 'https://ppqs.gov.in/ipm-packages'
8
+
9
+ self.ipm_packages = []
10
+ self.pdfs_urls = []
11
+
12
+ self.headers = {
13
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
14
+ }
15
+
16
+
17
+ def _get_ipm_packages_name_list(self):
18
+ """
19
+ Parse HTML page to get the names of each IPM Package
20
+ """
21
+
22
+ response = requests.get(
23
+ self.base_url,
24
+ headers=self.headers,
25
+ )
26
+
27
+ soup = bs(response.text, 'html.parser')
28
+ packages = soup.findAll('span', {'class': 'field-content region-name'}, limit=None)
29
+ for package in packages:
30
+ self.ipm_packages.append(package.a['href'].split('/')[-1])
31
+
32
+
33
+ def get_ipm_packages_pdfs_list(self):
34
+ """
35
+ Parse HTML page to get the PDF URLs of each IPM Package
36
+ """
37
+ self._get_ipm_packages_name_list()
38
+
39
+ for ip in self.ipm_packages:
40
+ source_url = f'{self.base_url}/{ip}'
41
+ print(f'Loading PDFs from: {source_url}')
42
+
43
+ response = requests.get(
44
+ source_url,
45
+ headers=self.headers,
46
+ )
47
+
48
+ soup = bs(response.text, 'html.parser')
49
+ urls = soup.findAll('td', {'class': 'views-field views-field-php'}, limit=None)
50
+ for url in urls:
51
+ self.pdfs_urls.append(url.a['href'])
52
+
53
+
54
+ def get_ipm_packages_pdfs_urls():
55
+ pdf = LOAD_ONLINE_PDF_IPM_PACKAGES()
56
+ pdf.get_ipm_packages_pdfs_list()
57
+ print('Total pdfs:', len(pdf.pdfs_urls))
58
+ return pdf.pdfs_urls
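A usage sketch of the crawler; the collected PDF URLs are intended to be fed to LANGCHAIN_UTILS.load_documents with doc_type='online_pdf':

from utils.web_crawler import LOAD_ONLINE_PDF_IPM_PACKAGES

crawler = LOAD_ONLINE_PDF_IPM_PACKAGES()
# Crawl https://ppqs.gov.in/ipm-packages and collect every package PDF URL
crawler.get_ipm_packages_pdfs_list()
pdf_urls = crawler.pdfs_urls
# pdf_urls can then be passed to LANGCHAIN_UTILS.load_documents(doc_type='online_pdf', urls=pdf_urls)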