# NOTE: If the scraper stops returning data, first check that the hard-coded
# cookie string for each content type (see ``_initialize_cookies_infos``) is
# still valid.
import requests
import time
from tqdm import tqdm
from src.utils.utils import load_json, save_json
import os
import argparse
import logging

# Listing endpoint; ``name`` is the category slug, ``page_num`` the page index
# and ``categorie_num`` the numeric category id.
URL_TEMPLATE = "https://www.foody.vn/vung-tau/{name}?ds=Restaurant&vt=row&st=1&c=7&page={page_num}&provinceId=223&categoryId={categorie_num}&append=true"


def parser():
    """Parse the command-line arguments of the crawler.

    :return: Namespace with ``ids_location_path`` and ``save_path``.
    """
    # Named ``arg_parser`` so the local does not shadow this function.
    arg_parser = argparse.ArgumentParser(description='Crawling Foody Vung Tau')
    arg_parser.add_argument('--ids_location_path', type=str,
                            help='Specify the path to locations index')
    arg_parser.add_argument("--save_path", type=str,
                            help="Specify the path to save the scraped data")
    args = arg_parser.parse_args()
    return args


class FoodyScraper:
    """Download the paginated listing JSON for every content type/category.

    The mapping loaded from ``args.ids_location_path`` is indexed as
    ``ids_location[content][category] -> category id``. One JSON file per
    category is written to ``<save_path>/<content>/<category>.json``.
    """

    def __init__(self, args):
        """Store the configuration and load the category index.

        :param args: Object exposing ``ids_location_path`` and ``save_path``.
        """
        self.headers = self._initialize_headers()
        self.cookies_infos = self._initialize_cookies_infos()
        self.ids_location = load_json(args.ids_location_path)
        self.args = args
        self.max_pages = 10
        self.sleep_duration = 0.5
        # Seconds to wait on a request; ``requests`` never times out by
        # default, so a stalled connection would otherwise hang the crawl.
        self.request_timeout = 30
        self.list_output = []

    def _initialize_headers(self):
        """Return the base request headers; ``cookie`` is filled in later."""
        return {
            'authority': 'www.foody.vn',
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'accept-language': 'en-US,en;q=0.9',
            'cookie': '',
            'referer': 'https://www.foody.vn/vung-tau/tiec-cuoi-hoi-nghi?CategoryGroup=wedding&c=tiec-cuoi-hoi-nghi',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Linux"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
            'x-foody-user-token': 'null',
            'x-requested-with': 'XMLHttpRequest'
        }

    def _initialize_cookies_infos(self):
        """Return the hard-coded cookie string for each content type."""
        return {
            "food": "bc-jcb=1; flg=vn; __ondemand_sessionid=mgzbsjrydfrpj505xyuuc2su; floc=223; _ga=GA1.2.947999007.1713953117; _gid=GA1.2.1159064546.1713953117; _fbp=fb.1.1713953116606.1641731163; __utmc=257500956; __utmz=257500956.1713953117.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); fd.keys=; fd.res.view.223=1000073114; __utma=257500956.947999007.1713953117.1713953117.1713957714.2; __utmt_UA-33292184-1=1; gcat=food; _ga_6M8E625L9H=GS1.2.1713957714.2.1.1713959027.26.0.0; __utmb=257500956.9.10.1713957714",
            "entertain": "bc-jcb=1; flg=vn; __ondemand_sessionid=mgzbsjrydfrpj505xyuuc2su; floc=223; _ga=GA1.2.947999007.1713953117; _gid=GA1.2.1159064546.1713953117; _fbp=fb.1.1713953116606.1641731163; __utmc=257500956; __utmz=257500956.1713953117.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); fd.keys=; fd.res.view.223=1000073114; __utma=257500956.947999007.1713953117.1713953117.1713957714.2; __utmt_UA-33292184-1=1; _gat=1; gcat=entertain; __utmb=257500956.14.10.1713957714; _ga_6M8E625L9H=GS1.2.1713957714.2.1.1713959827.15.0.0",
            "shop": "bc-jcb=1; flg=vn; __ondemand_sessionid=mgzbsjrydfrpj505xyuuc2su; floc=223; _ga=GA1.2.947999007.1713953117; _gid=GA1.2.1159064546.1713953117; _fbp=fb.1.1713953116606.1641731163; __utmc=257500956; __utmz=257500956.1713953117.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); fd.keys=; fd.res.view.223=1000073114; __utma=257500956.947999007.1713953117.1713953117.1713957714.2; __utmt_UA-33292184-1=1; gcat=shop; _gat=1; __utmb=257500956.13.10.1713957714; _ga_6M8E625L9H=GS1.2.1713957714.2.1.1713959782.60.0.0",
            "travel": "bc-jcb=1; flg=vn; __ondemand_sessionid=mgzbsjrydfrpj505xyuuc2su; floc=223; _ga=GA1.2.947999007.1713953117; _gid=GA1.2.1159064546.1713953117; _fbp=fb.1.1713953116606.1641731163; __utmc=257500956; __utmz=257500956.1713953117.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); fd.keys=; fd.res.view.223=1000073114; __utma=257500956.947999007.1713953117.1713953117.1713957714.2; __utmt_UA-33292184-1=1; gcat=travel; __utmb=257500956.12.10.1713957714; _ga_6M8E625L9H=GS1.2.1713957714.2.1.1713959616.49.0.0"
        }

    def update_cookie(self, content):
        """Set the ``cookie`` request header to the one stored for *content*.

        :param content: Content-type key of ``self.cookies_infos``.
        :raises KeyError: If no cookie is stored for *content*.
        """
        self.headers['cookie'] = self.cookies_infos[content]

    def request_data(self, category, page_num, category_id):
        """Fetch one listing page and return the raw response.

        :param category: Category slug placed in the URL path.
        :param page_num: Page index placed in the ``page`` query parameter.
        :param category_id: Numeric id placed in ``categoryId``.
        :return: The ``requests.Response`` of the GET request.
        :raises requests.RequestException: On network failure or timeout.
        """
        url = URL_TEMPLATE.format(
            name=category, page_num=page_num, categorie_num=category_id)
        return requests.get(
            url, headers=self.headers, timeout=self.request_timeout)

    def scrape_category(self, content, category):
        """Download up to ``max_pages`` pages of one category and save them.

        Paging stops early on the last page or on the first error; whatever
        was collected up to that point is still written to disk.

        :param content: Content-type key of ``self.ids_location``.
        :param category: Category slug within that content type.
        """
        self.update_cookie(content)
        list_json = []
        category_id = self.ids_location[content][category]

        for page_num in range(self.max_pages):
            try:
                response = self.request_data(category, page_num, category_id)
                page_json = response.json()
                # NOTE(review): 12 is presumably the listing page size, so a
                # smaller total means nothing is left to fetch -- confirm.
                is_last_page = (not page_json["searchItems"]
                                or page_json["totalResult"] < 12)
            except requests.RequestException as e:
                logging.error("Request error: %s", e)
                break
            except ValueError as e:
                logging.error("JSON decoding error: %s", e)
                break
            except (KeyError, TypeError) as e:
                # Payload is valid JSON but not the expected listing shape;
                # stop paging and keep what was already collected.
                logging.error("Unexpected response format: %r", e)
                break

            list_json.append(page_json)
            if is_last_page:
                break
            time.sleep(self.sleep_duration)

        # Use the arguments held by the instance, not a module-level global,
        # so the class also works when imported from another module.
        output_dir = os.path.join(self.args.save_path, content)
        os.makedirs(output_dir, exist_ok=True)
        save_json(os.path.join(output_dir, f"{category}.json"), list_json)

    def scrape_data(self):
        """Scrape every category of every content type in the index."""
        for content in tqdm(self.ids_location):
            for category in tqdm(self.ids_location[content]):
                self.scrape_category(content, category)


if __name__ == "__main__":
    args = parser()
    scraper = FoodyScraper(args)
    scraper.scrape_data()