Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- Dockerfile +11 -0
- app.py +14 -0
- requirements.txt +13 -0
- webscraping.py +128 -0
Dockerfile
ADDED
@@ -0,0 +1,11 @@
# Python base image matching the app's tested runtime.
FROM python:3.10

# All application files live under /code inside the container.
WORKDIR /code

# Install dependencies first so Docker's layer cache skips the reinstall
# when only application source changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the rest of the source tree.
COPY . .

# Serve the Flask app on the port Hugging Face Spaces expects (7860),
# bound to all interfaces so the Space proxy can reach it.
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
app.py
ADDED
@@ -0,0 +1,14 @@
from flask import Flask, request, jsonify
from webscraping import scrape

app = Flask(__name__)
# Preserve insertion order of keys in JSON responses. Flask 3 (pinned in
# requirements.txt) ignores the old JSON_SORT_KEYS config key, so the
# provider attribute must be set directly; the config entry is kept for
# compatibility with older Flask versions.
app.config['JSON_SORT_KEYS'] = False
app.json.sort_keys = False


@app.route('/', methods=['POST', 'GET'])
def give():
    """Scrape data for the commodity named in the JSON request body.

    Expects a body like ``{"commodity": "Apple"}`` and responds with
    ``{"text": <dict>, "image": <base64 PNG>, "indices": <key order>}``.

    Responds 400 when the body is missing or lacks a "commodity" field
    (previously a GET without a body made ``request.get_json()`` return
    None and the handler crashed with a TypeError).
    """
    filters = request.get_json(silent=True)
    if not filters or 'commodity' not in filters:
        return jsonify({"error": "JSON body with a 'commodity' field is required"}), 400
    text, image, indices = scrape(filters['commodity'])
    return jsonify({"text": text, "image": image, "indices": indices})


if __name__ == "__main__":
    app.run()
requirements.txt
ADDED
@@ -0,0 +1,13 @@
beautifulsoup4==4.10.0
Flask==3.0.2
requests==2.25.1
Jinja2==3.1.3
lxml==4.9.4
numpy==1.26.4
Werkzeug==3.0.1
itsdangerous==2.1.2
blinker==1.7.0
click==8.1.7
urllib3==1.26.5
matplotlib==3.5.1
webscraping.py
ADDED
@@ -0,0 +1,128 @@
#%%
from bs4 import BeautifulSoup
import requests

import matplotlib
matplotlib.use("Agg")  # headless backend: the app runs inside a container with no display
import matplotlib.pyplot as plt

import base64
import io
import re

# Strips Wikipedia citation markers such as "[3]" from extracted text.
_CITATION_RE = re.compile(r'\[\d+\]')

# Section titles (matched as lowercase substrings) worth keeping from a
# crop's Wikipedia article.
_RELEVANT_HEADINGS = ['description', 'agronomy', 'pests', 'structure and physiology',
                      'pests and diseases', 'cultivation', 'uses', 'use',
                      'environmental impact', 'types', 'varieties', 'breeding']

_WIKI_URL = "https://en.wikipedia.org/wiki/"
# NOTE(security): the data.gov.in API key is hard-coded in this URL; move it
# to an environment variable before any public deployment.
_PRICE_API_URL = ("https://api.data.gov.in/resource/9ef84268-d588-465a-a308-a864a43d0070"
                  "?api-key=579b464db66ec23bdd000001cdd3946e44ce4aad7209ff7b23ac571b"
                  "&format=json&filters%5Bcommodity%5D=")


def scrape(commodity="Apple"):
    """Scrape Wikipedia text and Indian market prices for a commodity.

    Parameters
    ----------
    commodity : str
        Crop/commodity name, e.g. "Apple"; capitalized before use in URLs.

    Returns
    -------
    tuple (textjson, plot_image_base64, order)
        textjson : dict mapping keys like "heading1"/"subheading2"/"para3"
            to citation-free text scraped from the Wikipedia article.
        plot_image_base64 : str — base64-encoded PNG bar chart of market
            prices (an empty chart when the price API has no records).
        order : list of the textjson keys in document order.
    """
    page = requests.get(_WIKI_URL + commodity.capitalize())

    # Parse the article; the 'lxml' parser must be installed (pinned in
    # requirements.txt).
    soup = BeautifulSoup(page.content, 'lxml')

    textjson = {}
    order = []           # textjson keys in the order they were scraped
    i, j, k = 1, 1, 1    # running counters for heading/subheading/para keys

    for head in soup.find_all('h2'):
        if not any(sub in head.text.lower() for sub in _RELEVANT_HEADINGS):
            continue
        textjson[f"heading{i}"] = _CITATION_RE.sub('', head.text)
        order.append(f"heading{i}")
        i += 1

        # Walk siblings until the next <h2>, collecting sub-headings and
        # paragraphs that belong to this section.
        content = head.find_next_sibling()
        while content and content.name != 'h2':
            if content.name == 'h3':
                textjson[f"subheading{j}"] = _CITATION_RE.sub('', content.text)
                order.append(f"subheading{j}")
                j += 1
            if content.name == 'p':
                textjson[f"para{k}"] = _CITATION_RE.sub('', content.text)
                order.append(f"para{k}")
                k += 1
            content = content.find_next_sibling()

    # --- commodity prices -------------------------------------------------
    # BUGFIX: `data` used to be referenced even when the request failed,
    # raising NameError. Defaulting to [] makes a failed/empty response
    # yield an empty chart instead of crashing the request.
    data = []
    info = requests.get(_PRICE_API_URL + commodity.capitalize())
    if info.status_code == 200:
        data = info.json().get('records', [])
        if not data:
            print("no data available")
    else:
        print("Failed to retrieve data. Status code:", info.status_code)

    markets = [entry['market'] for entry in data]
    min_prices = [float(entry['min_price']) for entry in data]
    max_prices = [float(entry['max_price']) for entry in data]
    modal_prices = [float(entry['modal_price']) for entry in data]

    # Plot the three price series as horizontal bars.
    # NOTE(review): the `left=` offsets stack max on min and modal on
    # (min + max); confirm this is the intended visual — it looks odd.
    plt.figure(figsize=(10, 6))
    plt.barh(markets, min_prices, color='lightblue', label='Min Price')
    plt.barh(markets, max_prices, color='skyblue', label='Max Price', left=min_prices)
    plt.barh(markets, modal_prices, color='dodgerblue', label='Modal Price',
             left=[mn + mx for mn, mx in zip(min_prices, max_prices)])
    plt.xlabel('Price')
    plt.ylabel('Market')
    plt.title(f'{commodity} Prices in Different Markets')
    plt.legend()
    plt.tight_layout()

    # Serialize the figure to a base64 PNG and release it (BUGFIX: the
    # figure was never closed, leaking memory on every request).
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    plt.close()
    buffer.seek(0)
    plot_image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
    buffer.close()

    return textjson, plot_image_base64, order
# %%