Joshua Sundance Bailey committed on
Commit
d362ca8
1 Parent(s): 20c266f

kml_tricks init

Browse files
geospatial-data-converter/kml_tricks.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import zipfile
2
+
3
+ import bs4
4
+ import fiona
5
+ import geopandas as gpd
6
+ import pandas as pd
7
+
8
+ fiona.drvsupport.supported_drivers["KML"] = "rw"  # mark KML as read/write in fiona's driver table; the readers below pass driver="KML" (presumably not enabled by default - confirm)
9
+
10
+
11
def desctogdf(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Build a GeoDataFrame from the HTML tables embedded in feature descriptions.

    Every value of ``gdf["Description"]`` is parsed with ``pandas.read_html``.
    When a description holds more than one table the second one is used,
    otherwise the first. The chosen table is transposed; its first row becomes
    the column names and the remaining rows become data.

    Args:
        gdf: GeoDataFrame with ``Description`` and ``geometry`` columns.

    Returns:
        GeoDataFrame of the parsed attributes plus the geometry column and
        CRS of ``gdf``.

    Raises:
        ValueError: Propagated from pandas, e.g. when a description contains
            no table or when ``gdf`` has no rows to concatenate.
    """
    # Local import keeps this block self-contained; read_html is given a
    # file-like object because newer pandas deprecates literal HTML strings.
    from io import StringIO

    frames = []
    for desc in gdf["Description"]:
        # Parse once, then pick the table (the original re-parsed on IndexError).
        tables = pd.read_html(StringIO(desc))
        table = (tables[1] if len(tables) > 1 else tables[0]).T
        table.columns = table.iloc[0]
        frames.append(table.iloc[1:])

    attributes = pd.concat(frames, ignore_index=True)
    # The concatenated frame has a fresh 0..n-1 index; reset the geometry index
    # as well so rows pair up by position even if gdf's index is not the default.
    attributes["geometry"] = gdf["geometry"].reset_index(drop=True)
    return gpd.GeoDataFrame(attributes, crs=gdf.crs)
29
+
30
+
31
def readkmz(path: str) -> gpd.GeoDataFrame:
    """Read a kmz with geopandas/fiona without parsing Descriptions.

    Args:
        path: Path to the kmz archive.

    Returns:
        GeoDataFrame read from the single kml found inside the archive.

    Raises:
        IndexError: If the archive does not contain exactly one kml.
    """
    # The kml is normally named doc.kml, but that is not assumed here.
    # Match the extension case-insensitively, as kmlcode_fromfile does.
    with zipfile.ZipFile(path, "r") as kmz:
        kmls = [name for name in kmz.namelist() if name.lower().endswith(".kml")]

    if not kmls:
        raise IndexError("kmz does not contain a kml.")
    if len(kmls) > 1:
        raise IndexError(
            "kmz contains more than one kml. Extract or convert to multiple kmls.",
        )

    # "zip://<archive>!<member>" is the archive-member form; the previous
    # backslash separator only had a chance of working on Windows.
    return gpd.read_file(f"zip://{path}!{kmls[0]}", driver="KML")
43
+
44
+
45
def ge_togdf(path: str) -> gpd.GeoDataFrame:
    """Read a kml or kmz and parse its feature Descriptions into columns.

    Args:
        path: Path to a Google Earth file ending in .kml or .kmz
            (the extension is matched case-insensitively).

    Returns:
        GeoDataFrame produced by ``desctogdf`` from the file's features.

    Raises:
        ValueError: If ``path`` ends with neither .kml nor .kmz.
    """
    # Lowercase only for the extension test; the original path is what gets read.
    lowered = path.lower()
    if lowered.endswith(".kml"):
        raw = gpd.read_file(path, driver="KML")
    elif lowered.endswith(".kmz"):
        raw = readkmz(path)
    else:
        raise ValueError("File must end with .kml or .kmz")
    return desctogdf(raw)
54
+
55
+
56
def simpledata_fromcode(kmlcode: str) -> pd.DataFrame:
    """Extract the simpledata fields of a KML document into a DataFrame.

    Only ``schemadata``/``simpledata`` tags are read; tables embedded in
    feature descriptions are ignored.

    Args:
        kmlcode: KML source code.

    Returns:
        DataFrame with one row per ``schemadata`` tag and one column per
        ``simpledata`` ``name`` attribute, holding each field's text.
    """
    # Tag names are searched in lowercase; presumably html.parser lowercases
    # <SchemaData>/<SimpleData> while parsing - verify if the parser changes.
    parsed = bs4.BeautifulSoup(kmlcode, "html.parser")

    records = []
    for schemadata in parsed.find_all("schemadata"):
        record = {}
        for cell in schemadata.find_all("simpledata"):
            record[cell.get("name")] = cell.text
        records.append(record)

    return pd.DataFrame(records)
72
+
73
+
74
def kmlcode_fromfile(gefile: str) -> str:
    """Return the kml source code contained in a Google Earth file.

    Args:
        gefile: Absolute or relative path to a Google Earth file. The
            extension (kml or kmz) is matched case-insensitively.

    Returns:
        The kml document as text, decoded as UTF-8.

    Raises:
        IndexError: If a kmz does not contain exactly one kml.
        ValueError: If ``gefile`` ends with neither .kml nor .kmz.
    """
    extension = gefile.lower().split(".")[-1]

    if extension == "kml":
        # Explicit encoding so the result does not depend on the platform
        # locale and matches the UTF-8 decode used for kmz members below.
        with open(gefile, "r", encoding="utf-8") as kml:
            return kml.read()

    if extension == "kmz":
        with zipfile.ZipFile(gefile) as kmz:
            # There should be a single kml named doc.kml, but that is not assumed.
            kmls = [name for name in kmz.namelist() if name.lower().endswith(".kml")]
            if not kmls:
                raise IndexError("kmz does not contain a kml.")
            if len(kmls) > 1:
                raise IndexError(
                    "kmz contains more than one kml. Extract or convert to multiple kmls.",
                )
            with kmz.open(kmls[0]) as kml:
                # ZipFile.open(name).read() returns bytes.
                return kml.read().decode("utf-8")

    raise ValueError("parameter gefile must end with .kml or .kmz")
99
+
100
+
101
def simpledata_fromfile(gefile: str) -> pd.DataFrame:
    """Return the simpledata attributes of a Google Earth file with geometry.

    Attributes come from the file's simpledata tags, NOT from tables embedded
    in feature descriptions. Geometry and CRS come from reading the same file
    with geopandas.

    Args:
        gefile: Absolute or relative path to a Google Earth file (kmz or kml;
            the extension is matched case-insensitively).

    Returns:
        A GeoDataFrame (a DataFrame subclass) holding the simpledata columns
        and the file's geometry.
    """
    df = simpledata_fromcode(kmlcode_fromfile(gefile))
    # kmlcode_fromfile lowercases the extension, so do the same here; otherwise
    # "X.KMZ" would be handed to the plain-KML reader.
    if gefile.lower().endswith(".kmz"):
        gefile_gdf = readkmz(gefile)
    else:
        gefile_gdf = gpd.read_file(gefile, driver="KML")
    return gpd.GeoDataFrame(df, geometry=gefile_gdf["geometry"], crs=gefile_gdf.crs)
114
+
115
+
116
def readge(gefile: str) -> pd.DataFrame:
    """Extract attribute data and geometry from a Google Earth file.

    Tables embedded in feature descriptions are tried first; if that raises
    ``pandas.errors.ParserError`` or ``ValueError``, the simpledata tags are
    read instead.

    Args:
        gefile: Absolute or relative path to a Google Earth file.

    Returns:
        The frame produced by ``ge_togdf`` or, on fallback, by
        ``simpledata_fromfile``.
    """
    try:
        # Preferred source: tables embedded in feature descriptions.
        return ge_togdf(gefile)
    except (pd.errors.ParserError, ValueError):
        # Fallback source: simpledata tags.
        return simpledata_fromfile(gefile)
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 
1
  geopandas==0.14.0
 
2
  pyogrio==0.6.0
3
  streamlit==1.27.2
 
1
+ beautifulsoup4==4.12.2
2
  geopandas==0.14.0
3
+ lxml==4.9.3
4
  pyogrio==0.6.0
5
  streamlit==1.27.2