#%%
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
import io
import base64
import re


# Scrape Wikipedia for crop information and data.gov.in for market prices.
def scrape(commodity="Apple"):
    # Commodities the app is expected to handle (kept for reference; not used below).
    list1 = ['rice', 'maize', 'jute', 'cotton', 'coconut', 'papaya', 'orange_(fruit)',
             'apple', 'cantaloupe', 'watermelon', 'grapes', 'mango', 'banana',
             'pomegranate', 'lentil', 'blackgram', 'mungbean', 'mothbean',
             'pigeonpea', 'chickpea', 'coffee']

    # Fetch the Wikipedia article for the commodity.
    link = "https://en.wikipedia.org/wiki/"
    page = requests.get(link + commodity.capitalize())
    soup = BeautifulSoup(page.content, 'lxml')
    tags = soup.find_all('h2')

    # Only sections whose <h2> heading contains one of these keywords are kept.
    headings = ['description', 'agronomy', 'pests', 'structure and physiology',
                'pests and diseases', 'cultivation', 'uses', 'use',
                'environmental impact', 'types', 'varieties', 'breeding']

    textjson = {}   # scraped text, keyed as headingN / subheadingN / paraN
    l = []          # key order, so the caller can rebuild the page in reading order
    i, j, k = 1, 1, 1
    for head in tags:
        if any(sub in head.text.lower() for sub in headings):
            # Strip citation markers like [12] from the heading text.
            head_text = re.sub(r'\[\d+\]', '', head.text)
            textjson[f"heading{i}"] = head_text
            l.append(f"heading{i}")
            i += 1

            # Walk the siblings until the next <h2>, collecting subheadings and paragraphs.
            content = head.find_next_sibling()
            while content and content.name != 'h2':
                if content.name == 'h3':
                    subheading = re.sub(r'\[\d+\]', '', content.text)
                    textjson[f"subheading{j}"] = subheading
                    l.append(f"subheading{j}")
                    j += 1
                if content.name == 'p':
                    para = re.sub(r'\[\d+\]', '', content.text)
                    textjson[f"para{k}"] = para
                    l.append(f"para{k}")
                    k += 1
                content = content.find_next_sibling()

    # Fetch current market prices for the commodity from data.gov.in.
    link2 = ("https://api.data.gov.in/resource/9ef84268-d588-465a-a308-a864a43d0070"
             "?api-key=579b464db66ec23bdd000001cdd3946e44ce4aad7209ff7b23ac571b"
             "&format=json&filters%5Bcommodity%5D=")
    info = requests.get(link2 + commodity.capitalize())

    data = []
    if info.status_code == 200:
        data = info.json()['records']
        if not data:
            print("no data available")
    else:
        print("Failed to retrieve data. Status code:", info.status_code)

    # Without price records there is nothing to plot; return the text alone.
    if not data:
        return textjson, None, l

    # Extract the fields needed for plotting.
    markets = [entry['market'] for entry in data]
    min_prices = [float(entry['min_price']) for entry in data]
    max_prices = [float(entry['max_price']) for entry in data]
    modal_prices = [float(entry['modal_price']) for entry in data]

    # Plot min/max/modal prices per market as stacked horizontal bars.
    plt.figure(figsize=(10, 6))
    plt.barh(markets, min_prices, color='lightblue', label='Min Price')
    plt.barh(markets, max_prices, color='skyblue', label='Max Price', left=min_prices)
    plt.barh(markets, modal_prices, color='dodgerblue', label='Modal Price',
             left=[lo + hi for lo, hi in zip(min_prices, max_prices)])
    plt.xlabel('Price')
    plt.ylabel('Market')
    plt.title(f'{commodity} Prices in Different Markets')
    plt.legend()
    plt.tight_layout()

    # Render the figure to an in-memory PNG and close it to free the figure.
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    plt.close()
    buffer.seek(0)
    plot_image = buffer.getvalue()
    buffer.close()

    # Encode the plot image as base64 for the response.
    plot_image_base64 = base64.b64encode(plot_image).decode('utf-8')
    return textjson, plot_image_base64, l


# %%