# TE-Scrapper / Talabat_files / main_function.py
# Author: viraj — initial commit e79fbb1
# url = input("enter Restro url : ")
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from config import settings
import pymongo
import json
import pandas as pd
def main_all_extract(url):
    """Scrape a Talabat restaurant page and persist everything to MongoDB.

    The page's ``__NEXT_DATA__`` JSON blob is parsed for the menu, then the
    restaurant, its categories, items, option groups and option names are
    inserted into the ``Restaurants_in_dubai`` database.

    Parameters
    ----------
    url : str
        Talabat restaurant menu URL, e.g.
        ``https://www.talabat.com/uae/restaurant/<branch-id>/<slug>``.

    Returns
    -------
    bool
        ``True`` once every collection has been populated.
    """
    client = pymongo.MongoClient(settings.MONGO_URL)
    # One browser-like User-Agent shared by every request in this function so
    # Talabat serves the regular HTML page instead of a bot-block response.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    def restro_details(url, location):
        """Insert a single document describing the restaurant itself."""
        db = client.Restaurants_in_dubai
        collection = db.Restaurant_details
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # BUG FIX: urlparse()'s second positional argument is a default
        # *scheme*, not a location — the URL must be parsed on its own.
        parsed_url = urlparse(url)
        # e.g. netloc "www.talabat.com" -> "Talabat"
        platform_name = parsed_url.netloc.split('.')[1].capitalize()
        restaurant_tag = soup.find('h1', {'data-testid': 'restaurant-title'})
        restaurant_name = restaurant_tag.contents[0].strip()
        restaurant_details = {
            'url': url,
            'platform_name': platform_name,
            "restaurant_name": restaurant_name,
            "location": location
        }
        result = collection.insert_one(restaurant_details)
        print("Inserted document IDs:", result.inserted_id)

    def excel_extract(url):
        """Write the menu to ``<restaurant>_menu.xlsx``.

        Standalone helper — the main flow currently leaves its call commented
        out, but it is kept for manual exports.
        """
        def extract_choices(item_id, restaurant_id):
            # Talabat's internal menu API lists the option groups of an item.
            choice_url = f"https://www.talabat.com/nextMenuApi/v2/branches/{restaurant_id}/menu/{item_id}/choices"
            response = requests.get(choice_url, headers=headers)
            if response.status_code == 200:
                return response.json()
            print("Failed to retrieve choices for item ID:", item_id)
            return None

        parsed_url = urlparse(url)
        path_segments = parsed_url.path.split('/')
        # URL shape: /<country>/restaurant/<branch-id>/<slug>
        restaurant_id = path_segments[-2]
        restaurant_name = path_segments[-1]
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to get menu items. Status code: {response.status_code}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        script_tag = soup.find('script', id='__NEXT_DATA__')
        if not script_tag:
            print("Script tag with id '__NEXT_DATA__' not found.")
            return
        json_content = json.loads(script_tag.string.strip())
        menu_data = json_content['props']['pageProps']['initialMenuState']['menuData']['items']
        category_name_list = []
        j = 0  # running 1-based position of the current category
        menu_items_list = []
        for i, item in enumerate(menu_data):
            item_id = item['id']
            name = item['name']
            description = item['description']
            price = item['price']
            original_image = item['originalImage']
            original_section = item['originalSection']
            has_choices = item['hasChoices']
            # Items arrive grouped by section, so each previously unseen
            # section name starts the next category position.
            if original_section not in category_name_list:
                category_name_list.append(original_section)
                j = j + 1
            Category_position = j
            menu_item = {
                "Category": original_section,
                "Category_positon": Category_position,
                "Item_name": name,
                "Item_position": i,
                "Image": original_image,
                "description": description,
                "price": price,
                "id": item_id
            }
            menu_items_list.append(menu_item)
            if has_choices:
                choice_data = extract_choices(item_id, restaurant_id)
                if choice_data:
                    # Only the first choiceForItem entry is used.
                    # NOTE(review): an empty 'choiceForItem' list would raise
                    # IndexError here — confirm the API always returns one.
                    choice_for_item = choice_data["result"].get('choiceForItem', [])[0]
                    choice_sections = choice_for_item.get('choiceSections', [])
                    grouped_data = {}
                    for option_group in choice_sections:
                        option_group_name = option_group.get('nm', '')
                        min_quantity = option_group.get('mnq', '')
                        max_quantity = option_group.get('mxq', '')
                        options = option_group.get('ich', [])
                        for option_index, option in enumerate(options, start=1):
                            option_name = option.get('nm', '')
                            option_price = option.get('pr', '')
                            grouped_data.setdefault(option_group_name, {
                                "Option_group_name": option_group_name,
                                "Min_quantity": min_quantity,
                                # Clamp a zero max to 1 so the sheet never
                                # shows an unselectable group.
                                "Max_quantity": max(max_quantity, 1)
                            })
                            grouped_data[option_group_name][f"Option_{option_index}_Name"] = option_name
                            grouped_data[option_group_name][f"Option_{option_index}_Price"] = option_price
                    menu_items_list.extend(grouped_data.values())
        df = pd.DataFrame(menu_items_list)
        if "Max_quantity" in df.columns:
            # Blank out the generated Option_N_* headers and shift the option
            # columns up one row so each option row lines up with its parent
            # item row in the sheet.
            max_column_index = df.columns.get_loc('Max_quantity')
            for i in range(max_column_index + 1, len(df.columns)):
                df.rename(columns={df.columns[i]: ''}, inplace=True)
            option_group_name_index = df.columns.get_loc('Option_group_name')
            for i in range(option_group_name_index, len(df.columns)):
                df.iloc[:, i] = df.iloc[:, i].shift(-1)
        excel_file = f"{restaurant_name}_menu.xlsx"
        df.to_excel(excel_file, index=False)
        print(f"Menu items saved to {excel_file}")

    def main(url):
        """Parse the menu page into a list of item dicts.

        Returns
        -------
        tuple
            ``(items, json_content, location)`` — the parsed menu items, the
            raw ``__NEXT_DATA__`` payload, and the branch area name (``None``
            when unavailable).
        """
        def extract_choices(item_id, restaurant_id):
            choice_url = f"https://www.talabat.com/nextMenuApi/v2/branches/{restaurant_id}/menu/{item_id}/choices"
            response = requests.get(choice_url, headers=headers)
            if response.status_code == 200:
                # BUG FIX: the endpoint returns JSON; the old code round-
                # tripped it through BeautifulSoup's HTML parser, which can
                # corrupt payloads containing '<' or '&'.
                return response.json()
            print("Failed to retrieve choices for item ID:", item_id)
            return None

        # Fail-safe defaults so the final return can never hit unbound
        # locals when the page request or parsing fails (BUG FIX).
        items = []
        json_content = {}
        location = None
        parsed_url = urlparse(url)
        path_segments = parsed_url.path.split('/')
        restaurant_id = path_segments[-2]
        restaurant_name = path_segments[-1]
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            script_tag = soup.find('script', id='__NEXT_DATA__')
            j = 0
            category_name_list = []
            if script_tag:
                json_content = json.loads(script_tag.string.strip())
                menu_data = json_content['props']['pageProps']['initialMenuState']['menuData']['items']
                location = json_content.get('props', {}).get('pageProps', {}).get('gtmEventData', {}).get('area', {}).get('name')
                for i, item in enumerate(menu_data):
                    item_id = item['id']
                    name = item['name']
                    description = item['description']
                    price = item['price']
                    original_image = item['originalImage']
                    original_section = item['originalSection']
                    Category_id = item['sectionId']
                    has_choices = item['hasChoices']
                    # New section name -> next 1-based category position.
                    if original_section not in category_name_list:
                        category_name_list.append(original_section)
                        j = j + 1
                    Category_position = j
                    item_info = {
                        'category': original_section,
                        'category_postion': Category_position,
                        'category_id': Category_id,
                        'item_name': name,
                        'item_position': i,
                        'item_image': original_image,
                        'description': description,
                        'price': price,
                        'item_id': item_id,
                        'has_choices': has_choices,
                    }
                    if has_choices:
                        option_groups_info = []
                        choice_data = extract_choices(item_id, restaurant_id)
                        if choice_data:
                            # Only the first choiceForItem entry is used.
                            choice_for_item = choice_data["result"]['choiceForItem'][0]
                            for option_group in choice_for_item['choiceSections']:
                                option_group_info = {
                                    'option_group_name': option_group['nm'],
                                    'min_quantity': option_group['mnq'],
                                    'max_quantity': option_group['mxq'],
                                    'option_group_names': []
                                }
                                for choice in option_group.get('ich', []):
                                    option_group_info['option_group_names'].append({
                                        'option_name': choice['nm'],
                                        'option_price': choice['pr']
                                    })
                                option_groups_info.append(option_group_info)
                        item_info['option_groups'] = option_groups_info
                    items.append(item_info)
                # with open(f"{restaurant_name}.json", "w") as json_file:
                #     json.dump(items, json_file, indent=4)
                print(f"JSON named {restaurant_name}.json created successfully")
                # excel_extract(url)
                # print("excel Created succesfully")
            else:
                print("Script tag with id '__NEXT_DATA__' not found.")
        else:
            print("Failed to retrieve the webpage. Status code:", response.status_code)
        return items, json_content, location

    def extract_item(items, url):
        """Insert one document per menu item into ``Items``.

        Each document references its restaurant and (when found) its
        category by ObjectId.
        """
        db = client.Restaurants_in_dubai
        collection = db.Items
        category_collection = db['Category']
        restro_collection = db['Restaurant_details']
        items_info = []
        for item in items:
            item_id = item['item_id']
            name = item['item_name']
            description = item['description']
            price = item['price']
            img_url = item['item_image']
            category_name = item['category']
            item_position = item['item_position']
            has_choices = item['has_choices']
            if has_choices:
                # Names of every option group attached to the item.
                modifires = [og['option_group_name'] for og in item.get('option_groups', [])]
            else:
                modifires = "None"
            restro = restro_collection.find_one({'url': url})
            # BUG FIX: was unbound when the restaurant lookup missed.
            restro_ref_id = restro['_id'] if restro else None
            category = category_collection.find_one({
                'category_name': category_name,
                'restro_ref_id': restro_ref_id
            })
            ref_id = category['_id'] if category else None
            items_info.append({
                'item_id': item_id,
                'name': name,
                'description': description,
                'amount': price,
                'image': img_url,
                'category_name': category_name,
                'item_position': item_position,
                'modifires': modifires,
                'ref_id_category': ref_id,
                'restro_ref_id': restro_ref_id
            })
        result = collection.insert_many(items_info)
        print("Inserted document IDs:", result.inserted_ids)

    def extract_category(items, json_content, url):
        """Insert one document per distinct menu category into ``Category``."""
        db = client.Restaurants_in_dubai
        collection = db.Category
        restro_collection = db['Restaurant_details']

        def item_extract_category(json_content, name):
            # (id, name) pairs of every item the raw payload lists under
            # the category called *name*.
            menu_data = json_content['props']['pageProps']['initialMenuState']['menuData']['categories']
            items_list = []
            for category in menu_data:
                if category["name"] == name:
                    for entry in category["items"]:
                        items_list.append({"id": entry["id"], "name": entry["name"]})
            return items_list

        categories_info = []
        existing_categories = set()
        for item in items:
            name = item['category']
            if name in existing_categories:
                continue
            restro = restro_collection.find_one({'url': url})
            # BUG FIX: was unbound when the restaurant lookup missed.
            restro_ref_id = restro['_id'] if restro else None
            categories_info.append({
                'category_name': name,
                # NOTE(review): position is hard-coded to 1 in the original
                # flow; real ordering lives on the item documents.
                'category_position': 1,
                'category_isActive': True,
                'items': item_extract_category(json_content, name),
                'restro_ref_id': restro_ref_id
            })
            existing_categories.add(name)
        result = collection.insert_many(categories_info)
        print("Inserted document IDs:", result.inserted_ids)

    def extract_option_group(items, url):
        """Insert de-duplicated option groups into ``OptionGroup``.

        Two groups are the same when both the group name and the full option
        (name, price) list match; each stored group carries the ids of every
        item that uses it.
        """
        db = client.Restaurants_in_dubai
        collection = db.OptionGroup
        restro_collection = db['Restaurant_details']
        option_group_info = []
        seen_groups = []
        for item in items:
            for option_group in item.get('option_groups', []):
                restro = restro_collection.find_one({'url': url})
                # BUG FIX: was unbound when the restaurant lookup missed.
                restro_ref_id = restro['_id'] if restro else None
                option_group_name = option_group["option_group_name"]
                min_quantity = option_group["min_quantity"]
                max_quantity = option_group["max_quantity"]
                option_names = []
                option_details = []
                for option in option_group.get("option_group_names", []):
                    option_names.append(option["option_name"])
                    option_details.append({
                        'option_name': option["option_name"],
                        'option_price': option["option_price"]
                    })
                # Collect every item carrying an identical group (inner loop
                # variables renamed — the original shadowed 'item' and
                # 'option_group', which was accidental-but-harmless).
                sorted_details = sorted(option_details, key=lambda x: x['option_name'])
                item_ids = []
                for other_item in items:
                    for other_group in other_item.get('option_groups', []):
                        if (other_group["option_group_name"] == option_group_name and
                                sorted(other_group.get('option_group_names', []),
                                       key=lambda x: x['option_name']) == sorted_details):
                            item_ids.append(other_item['item_id'])
                duplicate = any(
                    seen['option_group_name'] == option_group_name and
                    sorted(seen['option_names']) == sorted(option_names)
                    for seen in seen_groups
                )
                if not duplicate:
                    option_group_info.append({
                        "option_group_name": option_group_name,
                        "min_quantity": min_quantity,
                        "max_quantity": max_quantity,
                        "option_names": option_names,
                        "item_id": item_ids,
                        "restro_ref_id": restro_ref_id
                    })
                    seen_groups.append({
                        "option_group_name": option_group_name,
                        "option_names": option_names
                    })
        if option_group_info:
            result = collection.insert_many(option_group_info)
            print("Inserted document IDs:", result.inserted_ids)

    def extract_option_group_names(items, url):
        """Insert de-duplicated option names into ``OptionName``.

        Each option document references every OptionGroup whose
        ``option_names`` list contains it.
        """
        db = client.Restaurants_in_dubai
        collection = db.OptionName
        option_group_collection = db['OptionGroup']
        restro_collection = db['Restaurant_details']
        option_docs = []
        seen_names = []
        for item in items:
            if not item['has_choices']:
                continue
            for option_group in item.get('option_groups', []):
                for option in option_group.get("option_group_names", []):
                    option_name = option["option_name"]
                    # BUG FIX: the old per-item 'flag' silently dropped every
                    # option AFTER the first duplicate within an item; skip
                    # only the duplicate itself.
                    if option_name in seen_names:
                        continue
                    restro = restro_collection.find_one({'url': url})
                    restro_ref_id = restro['_id'] if restro else None
                    matching_ids = [
                        doc['_id']
                        for doc in option_group_collection.find({
                            'restro_ref_id': restro_ref_id,
                            'option_names': option_name
                        })
                    ]
                    option_docs.append({
                        "option_name": option_name,
                        "option_price": option["option_price"],
                        "ref_option_group_id": matching_ids,
                        "restro_ref_id": restro_ref_id
                    })
                    seen_names.append(option_name)
        if option_docs:
            result = collection.insert_many(option_docs)
            print("Inserted document IDs:", result.inserted_ids)

    items, json_content, location = main(url)
    restro_details(url, location)
    extract_category(items, json_content, url)
    extract_item(items, url)
    extract_option_group(items, url)
    extract_option_group_names(items, url)
    client.close()  # release the MongoDB connection pool
    return True
# main_all_extract(url)