Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- Dockerfile +11 -0
- app.py +14 -0
- requirements.txt +13 -0
- webscraping.py +128 -0
Dockerfile
ADDED
@@ -0,0 +1,11 @@
# Python base image matching the app's tested runtime.
FROM python:3.10

# All application files live under /code inside the container.
WORKDIR /code

# Install dependencies first so Docker's layer cache skips the reinstall
# when only application source changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the rest of the source tree.
COPY . .

# Serve the Flask app on the port Hugging Face Spaces expects (7860),
# bound to all interfaces so the Space proxy can reach it.
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
app.py
ADDED
@@ -0,0 +1,14 @@
from flask import Flask, request, jsonify
from webscraping import scrape

app = Flask(__name__)
# Preserve insertion order of keys in JSON responses. Flask 3 (pinned in
# requirements.txt) ignores the old JSON_SORT_KEYS config key, so the
# provider attribute must be set directly; the config entry is kept for
# compatibility with older Flask versions.
app.config['JSON_SORT_KEYS'] = False
app.json.sort_keys = False


@app.route('/', methods=['POST', 'GET'])
def give():
    """Scrape data for the commodity named in the JSON request body.

    Expects a body like ``{"commodity": "Apple"}`` and responds with
    ``{"text": <dict>, "image": <base64 PNG>, "indices": <key order>}``.

    Responds 400 when the body is missing or lacks a "commodity" field
    (previously a GET without a body made ``request.get_json()`` return
    None and the handler crashed with a TypeError).
    """
    filters = request.get_json(silent=True)
    if not filters or 'commodity' not in filters:
        return jsonify({"error": "JSON body with a 'commodity' field is required"}), 400
    text, image, indices = scrape(filters['commodity'])
    return jsonify({"text": text, "image": image, "indices": indices})


if __name__ == "__main__":
    app.run()
requirements.txt
ADDED
@@ -0,0 +1,13 @@
beautifulsoup4==4.10.0
Flask==3.0.2
requests==2.25.1
Jinja2==3.1.3
lxml==4.9.4
numpy==1.26.4
Werkzeug==3.0.1
itsdangerous==2.1.2
blinker==1.7.0
click==8.1.7
urllib3==1.26.5
matplotlib==3.5.1
webscraping.py
ADDED
@@ -0,0 +1,128 @@
#%%
from bs4 import BeautifulSoup
import requests

import matplotlib
matplotlib.use("Agg")  # headless backend: the app runs inside a container with no display
import matplotlib.pyplot as plt

import base64
import io
import re

# Strips Wikipedia citation markers such as "[3]" from extracted text.
_CITATION_RE = re.compile(r'\[\d+\]')

# Section titles (matched as lowercase substrings) worth keeping from a
# crop's Wikipedia article.
_RELEVANT_HEADINGS = ['description', 'agronomy', 'pests', 'structure and physiology',
                      'pests and diseases', 'cultivation', 'uses', 'use',
                      'environmental impact', 'types', 'varieties', 'breeding']

_WIKI_URL = "https://en.wikipedia.org/wiki/"
# NOTE(security): the data.gov.in API key is hard-coded in this URL; move it
# to an environment variable before any public deployment.
_PRICE_API_URL = ("https://api.data.gov.in/resource/9ef84268-d588-465a-a308-a864a43d0070"
                  "?api-key=579b464db66ec23bdd000001cdd3946e44ce4aad7209ff7b23ac571b"
                  "&format=json&filters%5Bcommodity%5D=")


def scrape(commodity="Apple"):
    """Scrape Wikipedia text and Indian market prices for a commodity.

    Parameters
    ----------
    commodity : str
        Crop/commodity name, e.g. "Apple"; capitalized before use in URLs.

    Returns
    -------
    tuple (textjson, plot_image_base64, order)
        textjson : dict mapping keys like "heading1"/"subheading2"/"para3"
            to citation-free text scraped from the Wikipedia article.
        plot_image_base64 : str — base64-encoded PNG bar chart of market
            prices (an empty chart when the price API has no records).
        order : list of the textjson keys in document order.
    """
    page = requests.get(_WIKI_URL + commodity.capitalize())

    # Parse the article; the 'lxml' parser must be installed (pinned in
    # requirements.txt).
    soup = BeautifulSoup(page.content, 'lxml')

    textjson = {}
    order = []           # textjson keys in the order they were scraped
    i, j, k = 1, 1, 1    # running counters for heading/subheading/para keys

    for head in soup.find_all('h2'):
        if not any(sub in head.text.lower() for sub in _RELEVANT_HEADINGS):
            continue
        textjson[f"heading{i}"] = _CITATION_RE.sub('', head.text)
        order.append(f"heading{i}")
        i += 1

        # Walk siblings until the next <h2>, collecting sub-headings and
        # paragraphs that belong to this section.
        content = head.find_next_sibling()
        while content and content.name != 'h2':
            if content.name == 'h3':
                textjson[f"subheading{j}"] = _CITATION_RE.sub('', content.text)
                order.append(f"subheading{j}")
                j += 1
            if content.name == 'p':
                textjson[f"para{k}"] = _CITATION_RE.sub('', content.text)
                order.append(f"para{k}")
                k += 1
            content = content.find_next_sibling()

    # --- commodity prices -------------------------------------------------
    # BUGFIX: `data` used to be referenced even when the request failed,
    # raising NameError. Defaulting to [] makes a failed/empty response
    # yield an empty chart instead of crashing the request.
    data = []
    info = requests.get(_PRICE_API_URL + commodity.capitalize())
    if info.status_code == 200:
        data = info.json().get('records', [])
        if not data:
            print("no data available")
    else:
        print("Failed to retrieve data. Status code:", info.status_code)

    markets = [entry['market'] for entry in data]
    min_prices = [float(entry['min_price']) for entry in data]
    max_prices = [float(entry['max_price']) for entry in data]
    modal_prices = [float(entry['modal_price']) for entry in data]

    # Plot the three price series as horizontal bars.
    # NOTE(review): the `left=` offsets stack max on min and modal on
    # (min + max); confirm this is the intended visual — it looks odd.
    plt.figure(figsize=(10, 6))
    plt.barh(markets, min_prices, color='lightblue', label='Min Price')
    plt.barh(markets, max_prices, color='skyblue', label='Max Price', left=min_prices)
    plt.barh(markets, modal_prices, color='dodgerblue', label='Modal Price',
             left=[mn + mx for mn, mx in zip(min_prices, max_prices)])
    plt.xlabel('Price')
    plt.ylabel('Market')
    plt.title(f'{commodity} Prices in Different Markets')
    plt.legend()
    plt.tight_layout()

    # Serialize the figure to a base64 PNG and release it (BUGFIX: the
    # figure was never closed, leaking memory on every request).
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    plt.close()
    buffer.seek(0)
    plot_image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
    buffer.close()

    return textjson, plot_image_base64, order
# %%