# url = input("enter Restro url : ")
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from config import settings
import pymongo
import json
import pandas as pd
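
# NOTE: the local `config` module is assumed to expose a `settings` object
# with MONGO_URL (the MongoDB connection string); it is not shown in this file.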


def main_all_extract(url):
    client = pymongo.MongoClient(settings.MONGO_URL)
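
    # Scrape the restaurant's name from the page <h1> and store one metadata
    # document (url, platform, name, location) in the Restaurant_details
    # collection.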
    def restro_details(url, location):
        db = client.Restaurants_in_dubai
        collection = db.Restaurant_details
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        parsed_url = urlparse(url)
        # e.g. "www.talabat.com" -> "Talabat"
        platform_name = parsed_url.netloc.split('.')[1].capitalize()
        restaurant_tag = soup.find('h1', {'data-testid': 'restaurant-title'})
        restaurant_name = restaurant_tag.contents[0].strip()
        restaurant_details = {
            'url': url,
            'platform_name': platform_name,
            'restaurant_name': restaurant_name,
            'location': location,
        }
        result = collection.insert_one(restaurant_details)
        print("Inserted document ID:", result.inserted_id)
    def excel_extract(url):
        def extract_choices(item_id, restaurant_id):
            choice_url = f"https://www.talabat.com/nextMenuApi/v2/branches/{restaurant_id}/menu/{item_id}/choices"
            response = requests.get(choice_url, headers=headers)
            if response.status_code == 200:
                return response.json()
            print("Failed to retrieve choices for item ID:", item_id)
            return None

        # The restaurant id and name are the last two path segments of the URL.
        parsed_url = urlparse(url)
        path_segments = parsed_url.path.split('/')
        restaurant_id = path_segments[-2]
        restaurant_name = path_segments[-1]
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        response = requests.get(url, headers=headers)
        category_name_list = []
        j = 0
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            script_tag = soup.find('script', id='__NEXT_DATA__')
            if script_tag:
                json_content = json.loads(script_tag.string.strip())
                menu_data = json_content['props']['pageProps']['initialMenuState']['menuData']['items']
                menu_items_list = []
                for i, item in enumerate(menu_data):
                    item_id = item['id']
                    name = item['name']
                    description = item['description']
                    price = item['price']
                    original_image = item['originalImage']
                    original_section = item['originalSection']
                    has_choices = item['hasChoices']
                    # The category position counter advances each time a new
                    # section is seen; items in a known section reuse it.
                    if original_section not in category_name_list:
                        category_name_list.append(original_section)
                        j = j + 1
                    Category_position = j
                    menu_item = {
                        "Category": original_section,
                        "Category_position": Category_position,
                        "Item_name": name,
                        "Item_position": i,
                        "Image": original_image,
                        "description": description,
                        "price": price,
                        "id": item_id
                    }
                    menu_items_list.append(menu_item)
                    if has_choices:
                        choice_data = extract_choices(item_id, restaurant_id)
                        if choice_data:
                            # Only the first choiceForItem entry is used, when present.
                            choice_for_item = (choice_data["result"].get('choiceForItem') or [{}])[0]
                            choice_sections = choice_for_item.get('choiceSections', [])
                            grouped_data = {}
                            for option_group in choice_sections:
                                option_group_name = option_group.get('nm', '')
                                min_quantity = option_group.get('mnq', 0)
                                max_quantity = option_group.get('mxq', 0)
                                options = option_group.get('ich', [])
                                for option_index, option in enumerate(options, start=1):
                                    option_name = option.get('nm', '')
                                    option_price = option.get('pr', '')
                                    grouped_data.setdefault(option_group_name, {
                                        "Option_group_name": option_group_name,
                                        "Min_quantity": min_quantity,
                                        "Max_quantity": max(max_quantity, 1)
                                    })
                                    grouped_data[option_group_name][f"Option_{option_index}_Name"] = option_name
                                    grouped_data[option_group_name][f"Option_{option_index}_Price"] = option_price
                            menu_items_list.extend(grouped_data.values())
                df = pd.DataFrame(menu_items_list)
                if "Max_quantity" in df.columns:
                    # Blank out the generated Option_* headers and shift the
                    # option columns up one row so each option-group row lines
                    # up with the item row above it.
                    max_column_index = df.columns.get_loc('Max_quantity')
                    for i in range(max_column_index + 1, len(df.columns)):
                        df.rename(columns={df.columns[i]: ''}, inplace=True)
                    option_group_name_index = df.columns.get_loc('Option_group_name')
                    for i in range(option_group_name_index, len(df.columns)):
                        df.iloc[:, i] = df.iloc[:, i].shift(-1)
                excel_file = f"{restaurant_name}_menu.xlsx"
                df.to_excel(excel_file, index=False)
                print(f"Menu items saved to {excel_file}")
            else:
                print("Script tag with id '__NEXT_DATA__' not found.")
        else:
            print(f"Failed to get menu items. Status code: {response.status_code}")
    def main(url):
        def extract_choices(item_id, restaurant_id):
            choice_url = f"https://www.talabat.com/nextMenuApi/v2/branches/{restaurant_id}/menu/{item_id}/choices"
            # The endpoint returns JSON, so parse the response directly rather
            # than routing it through BeautifulSoup.
            response = requests.get(choice_url, headers=headers)
            if response.status_code == 200:
                return response.json()
            print("Failed to retrieve choices for item ID:", item_id)
            return None

        parsed_url = urlparse(url)
        path_segments = parsed_url.path.split('/')
        restaurant_id = path_segments[-2]
        restaurant_name = path_segments[-1]
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # Defaults, so the return statement is safe even when the request fails.
        items, json_content, location = [], None, None
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            script_tag = soup.find('script', id='__NEXT_DATA__')
            j = 0
            category_name_list = []
            if script_tag:
                json_content = json.loads(script_tag.string.strip())
                menu_data = json_content['props']['pageProps']['initialMenuState']['menuData']['items']
                location = json_content.get('props', {}).get('pageProps', {}).get('gtmEventData', {}).get('area', {}).get('name')
                for i, item in enumerate(menu_data):
                    item_id = item['id']
                    name = item['name']
                    description = item['description']
                    price = item['price']
                    original_image = item['originalImage']
                    original_section = item['originalSection']
                    Category_id = item['sectionId']
                    has_choices = item['hasChoices']
                    if original_section not in category_name_list:
                        category_name_list.append(original_section)
                        j = j + 1
                    Category_position = j
                    item_info = {
                        'category': original_section,
                        'category_position': Category_position,
                        'category_id': Category_id,
                        'item_name': name,
                        'item_position': i,
                        'item_image': original_image,
                        'description': description,
                        'price': price,
                        'item_id': item_id,
                        'has_choices': has_choices,
                    }
                    if has_choices:
                        option_groups_info = []
                        choice_data = extract_choices(item_id, restaurant_id)
                        if choice_data:
                            # Only the first choiceForItem entry is used, when present.
                            choice_for_item = (choice_data["result"].get('choiceForItem') or [{}])[0]
                            choice_sections = choice_for_item.get('choiceSections', [])
                            for option_group in choice_sections:
                                option_group_info = {
                                    'option_group_name': option_group['nm'],
                                    'min_quantity': option_group['mnq'],
                                    'max_quantity': option_group['mxq'],
                                    'option_group_names': []
                                }
                                for option in option_group.get('ich', []):
                                    option_group_info['option_group_names'].append({
                                        'option_name': option['nm'],
                                        'option_price': option['pr']
                                    })
                                option_groups_info.append(option_group_info)
                        item_info['option_groups'] = option_groups_info
                    items.append(item_info)
                # with open(f"{restaurant_name}.json", "w") as json_file:
                #     json.dump(items, json_file, indent=4)
                print(f"Extracted {len(items)} menu items for {restaurant_name}")
                # excel_extract(url)
                # print("Excel created successfully")
            else:
                print("Script tag with id '__NEXT_DATA__' not found.")
        else:
            print("Failed to retrieve the webpage. Status code:", response.status_code)
        return items, json_content, location
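
    # Insert one document per menu item into the Items collection, linking each
    # item back to its Category and Restaurant_details documents by ObjectId.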
    def extract_item(items, url):
        db = client.Restaurants_in_dubai
        collection = db.Items
        category_collection = db['Category']
        restro_collection = db['Restaurant_details']
        # The parent restaurant document is the same for every item, so look it
        # up once instead of once per item.
        restro = restro_collection.find_one({'url': url})
        restro_ref_id = restro['_id'] if restro else None
        items_info = []
        for item in items:
            item_id = item['item_id']
            name = item['item_name']
            description = item['description']
            price = item['price']
            img_url = item['item_image']
            category_name = item['category']
            item_position = item['item_position']
            has_choices = item['has_choices']
            if has_choices:
                modifiers = [option_group['option_group_name']
                             for option_group in item.get('option_groups', [])]
            else:
                modifiers = "None"
            category = category_collection.find_one({
                'category_name': category_name,
                'restro_ref_id': restro_ref_id
            })
            ref_id = category['_id'] if category else None
            item_info = {
                'item_id': item_id,
                'name': name,
                'description': description,
                'amount': price,
                'image': img_url,
                'category_name': category_name,
                'item_position': item_position,
                'modifiers': modifiers,
                'ref_id_category': ref_id,
                'restro_ref_id': restro_ref_id
            }
            items_info.append(item_info)
        if items_info:
            result = collection.insert_many(items_info)
            print("Inserted document IDs:", result.inserted_ids)
    def extract_category(items, json_content, url):
        db = client.Restaurants_in_dubai
        collection = db.Category
        restro_collection = db['Restaurant_details']

        def item_extract_category(json_content, name):
            # Collect the (id, name) pairs of every item the embedded menu
            # JSON lists under the category called `name`.
            menu_data = json_content['props']['pageProps']['initialMenuState']['menuData']['categories']
            items_list = []
            for category in menu_data:
                if category["name"] == name:
                    for item in category["items"]:
                        items_list.append({
                            "id": item["id"],
                            "name": item["name"]
                        })
            return items_list

        restro = restro_collection.find_one({'url': url})
        restro_ref_id = restro['_id'] if restro else None
        categories_info = []
        existing_categories = set()
        for item in items:
            name = item['category']
            if name not in existing_categories:
                category_items = item_extract_category(json_content, name)
                category_info = {
                    'category_name': name,
                    # Position tracking is not implemented; every category is
                    # stored with position 1 and marked active.
                    'category_position': 1,
                    'category_isActive': True,
                    'items': category_items,
                    'restro_ref_id': restro_ref_id
                }
                categories_info.append(category_info)
                existing_categories.add(name)
        if categories_info:
            result = collection.insert_many(categories_info)
            print("Inserted document IDs:", result.inserted_ids)
    def extract_option_group(items, url):
        db = client.Restaurants_in_dubai
        collection = db.OptionGroup
        restro_collection = db['Restaurant_details']
        restro = restro_collection.find_one({'url': url})
        restro_ref_id = restro['_id'] if restro else None
        option_group_info = []
        existing_groups = []
        for item in items:
            for option_group in item.get('option_groups', []):
                option_group_name = option_group["option_group_name"]
                min_quantity = option_group["min_quantity"]
                max_quantity = option_group["max_quantity"]
                option_names = []
                option_names_trial_all = []
                for option in option_group.get("option_group_names", []):
                    option_names.append(option["option_name"])
                    option_names_trial_all.append({
                        'option_name': option["option_name"],
                        'option_price': option["option_price"]
                    })
                # Collect every item that carries an identical option group
                # (same group name and same option list). Distinct loop
                # variables avoid clobbering the outer `item`/`option_group`.
                item_ids = []
                for other_item in items:
                    for other_group in other_item.get('option_groups', []):
                        other_options = other_group.get('option_group_names', [])
                        if (other_group["option_group_name"] == option_group_name and
                                sorted(other_options, key=lambda x: x['option_name']) ==
                                sorted(option_names_trial_all, key=lambda x: x['option_name'])):
                            item_ids.append(other_item['item_id'])
                option_group_information = {
                    "option_group_name": option_group_name,
                    "min_quantity": min_quantity,
                    "max_quantity": max_quantity,
                    "option_names": option_names,
                    "item_id": item_ids,
                    "restro_ref_id": restro_ref_id
                }
                option_group_check = {
                    "option_group_name": option_group_name,
                    "option_names": option_names
                }
                already_seen = any(
                    seen['option_group_name'] == option_group_check['option_group_name'] and
                    sorted(seen['option_names']) == sorted(option_group_check['option_names'])
                    for seen in existing_groups
                )
                if not already_seen:
                    option_group_info.append(option_group_information)
                    existing_groups.append(option_group_check)
        if option_group_info:
            result = collection.insert_many(option_group_info)
            print("Inserted document IDs:", result.inserted_ids)
    def extract_option_group_names(items, url):
        db = client.Restaurants_in_dubai
        collection = db.OptionName
        option_group_collection = db['OptionGroup']
        restro_collection = db['Restaurant_details']
        restro = restro_collection.find_one({'url': url})
        restro_ref_id = restro['_id'] if restro else None
        option_names = []
        option_names_list = []
        for item in items:
            if not item['has_choices']:
                continue
            for option_group in item.get('option_groups', []):
                for option in option_group.get("option_group_names", []):
                    option_name = option["option_name"]
                    option_price = option["option_price"]
                    # Skip names already recorded; checking per option (rather
                    # than per item) keeps later unique options from being lost.
                    if option_name in option_names_list:
                        continue
                    query = {
                        'restro_ref_id': restro_ref_id,
                        'option_names': option_name
                    }
                    matching_ids = [doc['_id'] for doc in option_group_collection.find(query)]
                    option_names.append({
                        "option_name": option_name,
                        "option_price": option_price,
                        "ref_option_group_id": matching_ids,
                        "restro_ref_id": restro_ref_id
                    })
                    option_names_list.append(option_name)
        if option_names:
            result = collection.insert_many(option_names)
            print("Inserted document IDs:", result.inserted_ids)
    items, json_content, location = main(url)
    restro_details(url, location)
    extract_category(items, json_content, url)
    extract_item(items, url)
    extract_option_group(items, url)
    extract_option_group_names(items, url)
    return True

# main_all_extract(url) |
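

# A minimal usage sketch, assuming this module is executed directly and that
# settings.MONGO_URL points at a reachable MongoDB instance. The URL below is
# hypothetical; the scraper expects a Talabat menu URL whose last two path
# segments are the restaurant id and name.
if __name__ == "__main__":
    main_all_extract("https://www.talabat.com/uae/restaurant/123456/example-restaurant")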