import io
import os

import pandas as pd
import requests
import yaml
def internal_data(type):
    """
    Extract internal data from either catalog or query.

    :param type: str, 'catalog' or 'query'
    :return: pandas.DataFrame, dataframe containing product name and category name
    """
    if type not in ('catalog', 'query'):
        raise ValueError("type must be either 'catalog' or 'query'")
    # Read every Excel file in the matching folder and stack them into one dataframe
    dfs = []
    for file in os.listdir(type):
        if file.endswith('.xlsx'):
            dfs.append(pd.read_excel(os.path.join(type, file)))
    return pd.concat(dfs, ignore_index=True)
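
# A minimal usage sketch, assuming 'catalog/' and 'query/' folders of .xlsx
# files exist next to this script:
#   catalog_df = internal_data('catalog')
#   query_df = internal_data('query')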
def registered_fertilizer_data():
    """
    Scrape registered fertilizer data from the Ministry of Agriculture website.

    :return: pandas.DataFrame, dataframe containing registered fertilizer data
    """
    # Reuse the cached copy if the "external" folder already holds one
    if os.listdir('external'):
        return pd.read_csv('external/registered_fertilizers.csv')

    print('External folder is empty. Extracting data from Ministry of Agriculture website...')
    # Load the paginated base URLs once instead of re-reading config.yaml per page
    with open('config.yaml') as f:
        scraping_url = yaml.safe_load(f)['scraping_url']

    print('Extracting Organic Fertilizer Data...')
    dfs1 = []
    # Scrape every table in every page: Organic
    i = 1
    while True:
        url = scraping_url['organik'][0] + str(i)
        result = requests.get(url).text
        try:
            df = pd.read_html(io.StringIO(result))[5].iloc[2:-1, [2, 3, 6]].rename(columns={2: 'Merek', 3: 'Jenis', 6: 'Nomor Pendaftaran'})
            df['Page Number'] = i
            dfs1.append(df)
            i += 1
        except IndexError:
            # Past the last page there is no results table, so stop
            break
    registered_organic_fertilizers = pd.concat(dfs1, ignore_index=True).dropna()

    print('Extracting Inorganic Fertilizer Data...')
    dfs2 = []
    # Scrape every table in every page: Inorganic
    i = 1
    while True:
        url = scraping_url['anorganik'][0] + str(i)
        result = requests.get(url).text
        try:
            df = pd.read_html(io.StringIO(result))[5].iloc[2:-1, 5:8].rename(columns={5: 'Merek', 6: 'Jenis', 7: 'Nomor Pendaftaran'})
            df['Page Number'] = i
            dfs2.append(df)
            i += 1
        except IndexError:
            break
    registered_inorganic_fertilizers = pd.concat(dfs2, ignore_index=True).dropna()

    registered_fertilizers = pd.concat([registered_organic_fertilizers, registered_inorganic_fertilizers], ignore_index=True)
    # Full product name: fertilizer type followed by its brand
    registered_fertilizers['Nama Lengkap'] = registered_fertilizers['Jenis'] + ' ' + registered_fertilizers['Merek']
    # Cache the result so the check above finds it on subsequent calls
    registered_fertilizers.to_csv('external/registered_fertilizers.csv', index=False)
    return registered_fertilizers
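
# Assumed config.yaml shape, inferred from the lookups above (the URLs are
# placeholders, not the real endpoints):
#
#   scraping_url:
#     organik:
#       - <organic-list-base-url-ending-in-page-number>
#     anorganik:
#       - <inorganic-list-base-url-ending-in-page-number>
#
# Usage sketch, assuming an 'external/' folder exists:
#   registered = registered_fertilizer_data()
#   registered[['Nama Lengkap', 'Nomor Pendaftaran']].head()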
def scrape_result():
    """
    Extract scraped result data.

    :return: pandas.DataFrame, dataframe containing scraped result data
    """
    # Read every CSV in the scrape_result folder and combine them into one dataframe
    dfs = []
    for filename in os.listdir('scrape_result'):
        dfs.append(pd.read_csv(os.path.join('scrape_result', filename)))
    return pd.concat(dfs, ignore_index=True)
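
# End-to-end sketch, assuming the catalog/, query/, external/ and
# scrape_result/ folders described above all exist alongside config.yaml:
if __name__ == '__main__':
    catalog = internal_data('catalog')
    registered = registered_fertilizer_data()
    results = scrape_result()
    print(catalog.shape, registered.shape, results.shape)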