|
import json |
|
import os |
|
import re |
|
import sys |
|
from tools.flights.apis import Flights |
|
from tools.accommodations.apis import Accommodations |
|
from tools.restaurants.apis import Restaurants |
|
from tools.googleDistanceMatrix.apis import GoogleDistanceMatrix |
|
from tools.googlePlaces.apis import GooglePlaces |
|
from tools.attractions.apis import Attractions |
|
from annotation.src.utils import get_valid_name_city,extract_before_parenthesis |
|
from tqdm import tqdm |
|
|
|
# Append the parent of the launch directory to the import search path, then
# switch the working directory to the folder that holds this file.
# NOTE(review): this runs after the project imports above, so the added path
# entry cannot help resolve them -- confirm the ordering is intended.
_launch_parent = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(_launch_parent)

_script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(_script_dir)
|
|
|
|
|
# One module-level client per data source, constructed once at import time in
# the order listed below. ``googlePlaces`` is created here although nothing in
# this part of the file calls it.
(
    flight,
    accommodations,
    restaurants,
    googleDistanceMatrix,
    googlePlaces,
    attractions,
) = (
    Flights(),
    Accommodations(),
    Restaurants(),
    GoogleDistanceMatrix(),
    GooglePlaces(),
    Attractions(),
)
|
|
|
|
|
def load_line_json_data(filename):
    """Load a JSON Lines file into a list of decoded objects.

    Every non-blank line is parsed on its own with ``json.loads``. Blank or
    whitespace-only lines are skipped, so an empty file yields an empty list
    instead of raising ``json.JSONDecodeError``.

    Args:
        filename: Path of the UTF-8 encoded JSON Lines file to read.

    Returns:
        list: The decoded objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of reading it whole and
        # splitting; ``json.loads`` tolerates the trailing newline.
        return [json.loads(line) for line in f if line.strip()]
|
|
|
def extract_numbers_from_filenames(directory):
    """Collect the numeric ids of the ``annotation_<N>.json`` files in a directory.

    Args:
        directory: Path of the directory to list.

    Returns:
        list: One ``int`` per matching file name, in the order returned by
        ``os.listdir`` (which is arbitrary).

    Raises:
        OSError: If the directory cannot be listed.
    """
    # The dot is escaped and the whole name has to match, so look-alikes such
    # as "annotation_3xjson" or "annotation_3.json.bak" are not picked up.
    pattern = re.compile(r'annotation_(\d+)\.json')
    # Match each name once and reuse the match object for the number.
    matches = (pattern.fullmatch(name) for name in os.listdir(directory))
    return [int(match.group(1)) for match in matches if match]
|
|
|
def extract_from_to(text: str):
    """Pull the two endpoints out of a "from A to B" phrase.

    The first such phrase found in ``text`` is used; ``B`` runs up to the next
    comma or, when there is none, to the end of the string.

    Args:
        text (str): String to search.

    Returns:
        tuple: ``(A, B)`` when the phrase is present, otherwise ``(None, None)``.
    """
    found = re.search(r"from\s+(.+?)\s+to\s+([^,]+)(?=[,\s]|$)", text)
    if found is None:
        return (None, None)
    return found.groups()
|
|
|
def extract_city_list(query_data, annotated_data):
    """List the distinct cities of a plan in order of first appearance.

    Only the first ``query_data['days']`` entries of ``annotated_data`` are
    read. An entry whose ``current_city`` contains "from" is split into its two
    endpoints with ``extract_from_to``; any other entry contributes its
    ``current_city`` as a single name. Every name is passed through
    ``extract_before_parenthesis`` before it is recorded.

    Args:
        query_data: Mapping whose ``days`` entry is used as the slice length.
        annotated_data: Sequence of mappings, each with a ``current_city`` string.

    Returns:
        list: The recorded names, without duplicates.
    """
    ordered_cities = []
    day_count = query_data['days']
    for day_plan in annotated_data[:day_count]:
        location = day_plan['current_city']
        # A travel day yields (origin, destination); a stay yields one name.
        if 'from' in location:
            raw_names = extract_from_to(location)
        else:
            raw_names = (location,)
        for raw_name in raw_names:
            name = extract_before_parenthesis(raw_name)
            if name not in ordered_cities:
                ordered_cities.append(name)
    return ordered_cities
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _as_text(result):
    """Return ``result`` as is when it is a string, else ``result.to_string(index=False)``."""
    # presumably the non-string results are pandas DataFrames -- confirm
    # against the tool classes; only ``to_string(index=False)`` is needed here.
    if isinstance(result, str):
        return result
    return result.to_string(index=False)


if __name__ == '__main__':
    # For every query of the selected split: read the matching plan file, look
    # up attractions/restaurants/accommodations for each city after the first
    # and flight/self-driving/taxi data for each "from A to B" day, then write
    # the collected entries to plan/human_collected_info_<number>.json.
    set_type = ['train', 'dev', 'test'][2]
    directory = '/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{}'.format(set_type)
    query_data_list = load_line_json_data('/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{}/query/query.jsonl'.format(set_type))

    # Plan files are numbered from 1, in the same order as the query list.
    for number, query_data in enumerate(tqdm(query_data_list), start=1):
        plan_path = os.path.join(directory, 'plan/plan_{}.json'.format(number))
        # Open inside ``with`` so the handle is closed right after parsing;
        # read as UTF-8, matching how the other files here are read/written.
        with open(plan_path, 'r', encoding='utf-8') as plan_file:
            json_data = json.load(plan_file)[1]

        city_list = extract_city_list(query_data, json_data)
        human_collected_info = []

        # The first entry of city_list is skipped -- presumably it is the
        # trip's origin, for which no local information is needed (confirm).
        for city in city_list[1:]:
            attractions_data = _as_text(attractions.run(city))
            restaurants_data = _as_text(restaurants.run(city))
            accommodations_data = _as_text(accommodations.run(city))
            human_collected_info.append({"Description": "Attractions in {}".format(city), "Content": attractions_data})
            human_collected_info.append({"Description": "Restaurants in {}".format(city), "Content": restaurants_data})
            human_collected_info.append({"Description": "Accommodations in {}".format(city), "Content": accommodations_data})

        for idx, unit in enumerate(json_data):
            # Empty entries and days without a "from ... to ..." leg carry no
            # transport information.
            if unit == {} or 'from' not in unit['current_city']:
                continue

            from_city, to_city = extract_from_to(unit['current_city'])
            from_city = extract_before_parenthesis(from_city)
            to_city = extract_before_parenthesis(to_city)
            # The travel date is looked up by the entry's position in the plan.
            date = query_data['date'][idx]

            flight_data = _as_text(flight.run(from_city, to_city, date))
            human_collected_info.append({"Description": "Flight from {} to {} on {}".format(from_city, to_city, date), "Content": flight_data})

            self_driving_data = googleDistanceMatrix.run(from_city, to_city, mode="self-driving")
            human_collected_info.append({"Description": "Self-driving from {} to {}".format(from_city, to_city), "Content": self_driving_data})

            taxi_data = googleDistanceMatrix.run(from_city, to_city, mode='taxi')
            human_collected_info.append({"Description": "Taxi from {} to {}".format(from_city, to_city), "Content": taxi_data})

        output_path = os.path.join(directory, 'plan/human_collected_info_{}.json'.format(number))
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(human_collected_info, f, indent=4, ensure_ascii=False)
|
|
|
|