hsaest committed on
Commit
ed66543
1 Parent(s): b0783b4

Delete utils/extract_human_annotation.py

Browse files
Files changed (1) hide show
  1. utils/extract_human_annotation.py +0 -183
utils/extract_human_annotation.py DELETED
@@ -1,183 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- import sys
5
- from tools.flights.apis import Flights
6
- from tools.accommodations.apis import Accommodations
7
- from tools.restaurants.apis import Restaurants
8
- from tools.googleDistanceMatrix.apis import GoogleDistanceMatrix
9
- from tools.googlePlaces.apis import GooglePlaces
10
- from tools.attractions.apis import Attractions
11
- from annotation.src.utils import get_valid_name_city,extract_before_parenthesis
12
- from tqdm import tqdm
13
-
14
# Make the parent of the current working directory importable.
# NOTE(review): this runs after the `tools.*` / `annotation.*` imports above,
# so it cannot influence whether those imports resolve - confirm the order.
_parent_of_cwd = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(_parent_of_cwd)

# Switch the working directory to the folder that contains this script.
_script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(_script_dir)


# One shared instance of each data-source wrapper, created after the chdir
# above (presumably so relative data paths resolve - TODO confirm).
flight = Flights()
accommodations = Accommodations()
restaurants = Restaurants()
googleDistanceMatrix = GoogleDistanceMatrix()
googlePlaces = GooglePlaces()
attractions = Attractions()
24
-
25
-
26
def load_line_json_data(filename):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        filename: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order. An
        empty or whitespace-only file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (and the lone '' an empty file used to produce) are
            # not records; passing them to json.loads would raise.
            if not line:
                continue
            data.append(json.loads(line))
    return data
33
-
34
def extract_numbers_from_filenames(directory):
    """Collect the numeric ids of the ``annotation_<N>.json`` files in a folder.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        list[int]: The ``<N>`` of every entry named exactly
        ``annotation_<N>.json``, in the order ``os.listdir`` reports them.
    """
    # The dot is escaped and the whole name must match, so look-alikes such as
    # 'annotation_3xjson' or 'annotation_3.json.bak' are no longer picked up.
    pattern = re.compile(r'annotation_(\d+)\.json')

    numbers = []
    for file in os.listdir(directory):
        found = pattern.fullmatch(file)
        if found:
            numbers.append(int(found.group(1)))

    return numbers
45
-
46
def extract_from_to(text: str):
    """
    Pull the origin and destination out of a "from A to B" phrase.

    The destination runs up to the next comma, or to the end of the string
    when there is no comma.

    Args:
    - text (str): The string to search.

    Returns:
    - tuple: ('A', 'B') for the first match, or (None, None) when the text
      contains no such phrase.
    """
    found = re.search(r"from\s+(.+?)\s+to\s+([^,]+)(?=[,\s]|$)", text)
    if found is None:
        return (None, None)
    return found.groups()
59
-
60
def extract_city_list(query_data, annotated_data):
    """List the distinct cities of a plan in order of first appearance.

    Only the first ``query_data['days']`` entries of ``annotated_data`` are
    read. An entry whose ``'current_city'`` contains ``'from'`` is split with
    ``extract_from_to`` and contributes both ends of the trip; any other entry
    contributes its ``'current_city'`` directly. Every name is passed through
    ``extract_before_parenthesis`` before it is recorded.

    Args:
        query_data: Mapping that provides the ``'days'`` count.
        annotated_data: Sequence of per-day mappings with a ``'current_city'``
            entry.

    Returns:
        list: City names without repeats, in first-seen order.
    """
    ordered_cities = []

    def remember(name):
        # Keep first-seen order while dropping repeats.
        if name not in ordered_cities:
            ordered_cities.append(name)

    for day_plan in annotated_data[:query_data['days']]:
        current = day_plan['current_city']
        if 'from' in current:
            origin, destination = extract_from_to(current)
            origin = extract_before_parenthesis(origin)
            destination = extract_before_parenthesis(destination)
            remember(origin)
            remember(destination)
        else:
            remember(extract_before_parenthesis(current))

    return ordered_cities
77
-
78
-
79
- # if __name__ == '__main__':
80
- # user_name = 'all'
81
- # directory = '../data/annotation/{}'.format(user_name)
82
- # query_data_list = load_line_json_data('../data/query/{}.jsonl'.format(user_name))
83
- # numbers = extract_numbers_from_filenames(directory)
84
- # print(numbers)
85
- # for number in tqdm(numbers):
86
- # json_data = json.load(open(os.path.join(directory, 'annotation_{}.json'.format(number))))
87
- # query_data = query_data_list[number-1]
88
- # city_list = extract_city_list(query_data,json_data)
89
- # human_collected_info = []
90
-
91
- # for city in city_list[1:]:
92
- # attractions_data = attractions.run(city)
93
- # if type(attractions_data) != str:
94
- # attractions_data.drop(['Latitude','Longitude','Address','Phone','Website','City'],axis=1,inplace=True)
95
- # if type(attractions_data) != str:
96
- # attractions_data = attractions_data.to_string(index=False)
97
- # restaurants_data = restaurants.run(city)
98
- # restaurants_data.drop(['City'],axis=1,inplace=True)
99
- # if type(restaurants_data) != str:
100
- # restaurants_data = restaurants_data.to_string(index=False)
101
- # accommodations_data = accommodations.run(city)
102
- # accommodations_data.drop(['city'],axis=1,inplace=True)
103
- # if type(accommodations_data) != str:
104
- # accommodations_data = accommodations_data.to_string(index=False)
105
- # human_collected_info.append({"Description":"Attractions in {}".format(city),"Content":attractions_data})
106
- # human_collected_info.append({"Description":"Restaurants in {}".format(city),"Content":restaurants_data})
107
- # human_collected_info.append({"Description":"Accommodations in {}".format(city),"Content":accommodations_data})
108
-
109
-
110
- # for idx, unit in enumerate(json_data):
111
- # if unit != {}:
112
- # if 'from' in unit['current_city']:
113
- # from_city, to_city = extract_from_to(unit['current_city'])
114
- # from_city = extract_before_parenthesis(from_city)
115
- # to_city = extract_before_parenthesis(to_city)
116
- # date = query_data_list[number-1]['date'][idx]
117
- # flight_data = flight.run(from_city, to_city, date)
118
- # if type(flight_data) != str:
119
- # flight_data.drop(['OriginCityName','DestCityName','Distance','FlightDate'],axis=1,inplace=True)
120
- # flight_data = flight_data.to_string(index=False)
121
- # human_collected_info.append({"Description":"Flight from {} to {} on {}".format(from_city, to_city, date), "Content":flight_data})
122
- # self_driving_data = googleDistanceMatrix.run(from_city, to_city,mode="self-driving")
123
- # human_collected_info.append({"Description":"Self-driving from {} to {}".format(from_city, to_city), "Content":self_driving_data})
124
- # taxi_data = googleDistanceMatrix.run(from_city, to_city, mode='taxi')
125
- # human_collected_info.append({"Description":"Taxi from {} to {}".format(from_city, to_city), "Content":taxi_data})
126
-
127
- # # write to json file
128
- # with open(os.path.join(directory, 'human_collected_info_{}.json'.format(number)), 'w', encoding='utf-8') as f:
129
- # json.dump(human_collected_info, f, indent=4, ensure_ascii=False)
130
- # # break
131
-
132
-
133
def _as_text(result):
    """Return ``result`` unchanged when it is a string, else ``result.to_string(index=False)``.

    The tool ``run`` calls below hand back either a plain string or an object
    with a ``to_string`` method (presumably a pandas DataFrame - TODO confirm).
    """
    if isinstance(result, str):
        return result
    return result.to_string(index=False)


if __name__ == '__main__':
    # Index 2 selects the 'test' split; change the index to process another one.
    set_type = ['train','dev','test'][2]
    # NOTE(review): absolute, machine-specific paths - adjust before running elsewhere.
    directory = '/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{}'.format(set_type)
    query_data_list = load_line_json_data('/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{}/query/query.jsonl'.format(set_type))

    # Plan files are numbered from 1, one per line of the query file.
    for number in tqdm(range(1, len(query_data_list) + 1)):
        plan_path = os.path.join(directory, 'plan/plan_{}.json'.format(number))
        # Context manager so the handle is closed at once instead of leaking
        # one open file per iteration; element [1] of the file is the plan.
        with open(plan_path) as plan_file:
            json_data = json.load(plan_file)[1]
        query_data = query_data_list[number-1]
        city_list = extract_city_list(query_data, json_data)
        human_collected_info = []

        # The first entry of the city list is skipped; only the remaining
        # cities get attraction / restaurant / accommodation listings.
        for city in city_list[1:]:
            attractions_data = _as_text(attractions.run(city))
            restaurants_data = _as_text(restaurants.run(city))
            accommodations_data = _as_text(accommodations.run(city))
            human_collected_info.append({"Description":"Attractions in {}".format(city),"Content":attractions_data})
            human_collected_info.append({"Description":"Restaurants in {}".format(city),"Content":restaurants_data})
            human_collected_info.append({"Description":"Accommodations in {}".format(city),"Content":accommodations_data})

        # For every travel entry ("from A to B") record the transport options.
        for idx, unit in enumerate(json_data):
            # Skip empty entries and entries that stay within one city.
            if unit == {} or 'from' not in unit['current_city']:
                continue
            from_city, to_city = extract_from_to(unit['current_city'])
            from_city = extract_before_parenthesis(from_city)
            to_city = extract_before_parenthesis(to_city)
            # The query's 'date' list is indexed by the entry's position in the plan.
            date = query_data['date'][idx]
            flight_data = _as_text(flight.run(from_city, to_city, date))
            human_collected_info.append({"Description":"Flight from {} to {} on {}".format(from_city, to_city, date), "Content":flight_data})
            self_driving_data = googleDistanceMatrix.run(from_city, to_city, mode="self-driving")
            human_collected_info.append({"Description":"Self-driving from {} to {}".format(from_city, to_city), "Content":self_driving_data})
            taxi_data = googleDistanceMatrix.run(from_city, to_city, mode='taxi')
            human_collected_info.append({"Description":"Taxi from {} to {}".format(from_city, to_city), "Content":taxi_data})

        # write to json file
        with open(os.path.join(directory, 'plan/human_collected_info_{}.json'.format(number)), 'w', encoding='utf-8') as f:
            json.dump(human_collected_info, f, indent=4, ensure_ascii=False)