英文:
Create, save, and load spatial index using GeoPandas
问题
我想使用GeoPandas创建一个空间索引,并将其保存到文件中,而不是每次都重新创建它。我该如何做到这一点?
英文:
I want to create a spatial index using GeoPandas once and save it to a file instead of recreating it every time. How do I do this?
答案1
得分: 1
你可以使用Python的pickle
库将GeoDataFrame写入磁盘。这将产生与使用Pandas方法.to_pickle()
相同的结果。
# 使用pickle保存GeoDataFrame
with open('gdf_polygons.pkl', 'wb') as pickle_file:
pickle.dump(gdf_polygons, pickle_file)
# 使用pickle加载GeoDataFrame
with open('gdf_polygons.pkl', 'rb') as pickle_file:
gdf_polygons = pickle.load(pickle_file)
值得注意的是,对于点在多边形分析,使用空间连接比使用迭代查找匹配要快。您可以使用GeoPandas的.sjoin()
方法。作为一个额外的好处,当您执行空间连接时,如果还没有空间索引,GeoPandas会自动创建一个。我改进了Tyler的示例以显示这种方法更快:
import geopandas as gpd
from shapely.geometry import Point
import os
import pandas as pd
import time
shapefile_path = "C:/Data/GIS/US/ZipCodes/tl_2022_us_zcta520.shp"
gdf_zip_codes_file = "gdf_zip_codes.pkl"
# 如果数据框已经创建,则加载数据框。
if os.path.exists(gdf_zip_codes_file):
gdf_zip_codes = pd.read_pickle(gdf_zip_codes_file)
else:
# 创建并保存数据框和空间索引。
gdf_zip_codes = gpd.read_file(shapefile_path)
gdf_zip_codes.sindex
gdf_zip_codes.to_pickle(gdf_zip_codes_file)
# 查找给定纬度和经度的邮政编码的函数。
def find_zip_code(latitude, longitude):
point = Point(longitude, latitude)
possible_matches = list(gdf_zip_codes.sindex.intersection((point.x, point.y)))
for idx in possible_matches:
if gdf_zip_codes.geometry.iloc[idx].contains(point):
return gdf_zip_codes['ZCTA5CE20'].iloc[idx]
return None
def find_zip_code_sjoin(test_coordinates: list) -> gpd.GeoDataFrame:
# 创建Point几何对象的列表
geometry = [Point(lon, lat) for lat, lon in test_coordinates]
# 创建GeoDataFrame - 假设点位于NAD83坐标系,epsg 4269
gdf_points = gpd.GeoDataFrame(geometry=geometry, columns=['lat', 'lng'], crs=gdf_zip_codes.crs)
# 向DataFrame中添加'lat'和'lng'值
gdf_points['lat'] = [lat for lat, lon in test_coordinates]
gdf_points['lng'] = [lon for lat, lon in test_coordinates]
# 执行空间连接 - 为每个点找到相交的多边形
gdf_joined = gpd.sjoin(gdf_points, gdf_zip_codes, how='left', predicate='intersects', lsuffix='point',
rsuffix='poly')
for index, row in gdf_joined.iterrows():
print(f"The zip code for latitude {row['lat']}, longitude {row['lng']} is {row['ZCTA5CE20']}.")
return gdf_joined
# 示例坐标(纬度,经度)。
test_coordinates = [
(40.7128, -74.0060), # 纽约市,纽约
(34.0522, -118.2437), # 洛杉矶,加利福尼亚
(41.8781, -87.6298), # 芝加哥,伊利诺伊
# ... (其他坐标)
]
# 使用迭代测试函数的每一对纬度和经度。
# 方法#1 使用迭代
start_time = time.time()
for latitude, longitude in test_coordinates:
zip_code = find_zip_code(latitude, longitude)
if zip_code:
print(f"The zip code for latitude {latitude}, longitude {longitude} is {zip_code}.")
else:
print(f"No zip code found for latitude {latitude}, longitude {longitude}.")
end_time = time.time()
total_time = end_time - start_time
print(f'Time for method with iteration: {total_time:.6f} seconds')
# 方法#2 使用空间连接。
start_time = time.time()
find_zip_code_sjoin(test_coordinates)
end_time = time.time()
total_time = end_time - start_time
print(f'Time with spatial join: {total_time:.6f} seconds')
结果:
Time for method with iteration: 0.189 seconds
...
Time with spatial join: 0.036 seconds
英文:
You can use Python's pickle
library to write a GeoDataFrame to disk. It will give identical results to using Pandas
method .to_pickle()
.
# Save GeoDataFrame with pickle
with open('gdf_polygons.pkl', 'wb') as pickle_file:
pickle.dump(gdf_polygons, pickle_file)
# Load GeoDataFrame with pickle
with open('gdf_polygons.pkl', 'rb') as pickle_file:
gdf_polygons = pickle.load(pickle_file)
It is worth noting that for a points-in-polygons analysis, it is faster to use a spatial join, rather than using iteration to find a match. You can use GeoPandas' .sjoin()
method. As a bonus, when you do a spatial join, GeoPandas will automatically create a spatial index if there is not one already. I reworked Tyler's example to show that this method is faster:
import geopandas as gpd
from shapely.geometry import Point
import os
import pandas as pd
import time
shapefile_path = "C:/Data/GIS/US/ZipCodes/tl_2022_us_zcta520.shp"
gdf_zip_codes_file = "gdf_zip_codes.pkl"
# Load the dataframe if it's already been created.
if os.path.exists(gdf_zip_codes_file):
gdf_zip_codes = pd.read_pickle(gdf_zip_codes_file)
else:
# Create and save the dataframe and spatial index.
gdf_zip_codes = gpd.read_file(shapefile_path)
gdf_zip_codes.sindex
gdf_zip_codes.to_pickle(gdf_zip_codes_file)
# Function to find the zip code at a given latitude and longitude.
def find_zip_code(latitude, longitude):
point = Point(longitude, latitude)
possible_matches = list(gdf_zip_codes.sindex.intersection((point.x, point.y)))
for idx in possible_matches:
if gdf_zip_codes.geometry.iloc[idx].contains(point):
return gdf_zip_codes['ZCTA5CE20'].iloc[idx]
return None
def find_zip_code_sjoin(test_coordinates: list) -> gpd.GeoDataFrame:
# Create a list of Point geometries
geometry = [Point(lon, lat) for lat, lon in test_coordinates]
# Create a GeoDataFrame - assume points are in NAD83, epsg 4269
gdf_points = gpd.GeoDataFrame(geometry=geometry, columns=['lat', 'lng'], crs=gdf_zip_codes.crs)
# Add the 'lat' and 'lng' values to the DataFrame
gdf_points['lat'] = [lat for lat, lon in test_coordinates]
gdf_points['lng'] = [lon for lat, lon in test_coordinates]
# Perform the spatial join - finds the intersecting polygon for each point
gdf_joined = gpd.sjoin(gdf_points, gdf_zip_codes, how='left', predicate='intersects', lsuffix='point',
rsuffix='poly')
for index, row in gdf_joined.iterrows():
print(f"The zip code for latitude {row['lat']}, longitude {row['lng']} is {row['ZCTA5CE20']}.")
return gdf_joined
# Example coordinates (latitude, longitude).
test_coordinates = [
(40.7128, -74.0060), # New York City, NY
(34.0522, -118.2437), # Los Angeles, CA
(41.8781, -87.6298), # Chicago, IL
(29.7604, -95.3698), # Houston, TX
(33.4484, -112.0740), # Phoenix, AZ
(39.9526, -75.1652), # Philadelphia, PA
(32.7157, -117.1611), # San Diego, CA
(29.9511, -90.0715), # New Orleans, LA
(37.7749, -122.4194), # San Francisco, CA
(38.9072, -77.0369), # Washington, D.C.
(33.7490, -84.3880), # Atlanta, GA
(35.2271, -80.8431), # Charlotte, NC
(42.3601, -71.0589), # Boston, MA
(36.7783, -119.4179), # Fresno, CA
(30.2672, -97.7431), # Austin, TX
(32.7767, -96.7970), # Dallas, TX
(25.7617, -80.1918), # Miami, FL
(39.7684, -86.1581), # Indianapolis, IN
(47.6062, -122.3321), # Seattle, WA
(35.7796, -78.6382), # Raleigh, NC
]
# Test the function for each pair of latitude and longitude.
# Method #1 with iteration
start_time = time.time()
for latitude, longitude in test_coordinates:
zip_code = find_zip_code(latitude, longitude)
if zip_code:
print(f"The zip code for latitude {latitude}, longitude {longitude} is {zip_code}.")
else:
print(f"No zip code found for latitude {latitude}, longitude {longitude}.")
end_time = time.time()
total_time = end_time - start_time
print(f'Time for method with iteration: {total_time:.6f} seconds')
# Method #2 with spatial joins.
start_time = time.time()
find_zip_code_sjoin(test_coordinates)
end_time = time.time()
total_time = end_time - start_time
print(f'Time with spatial join: {total_time:.6f} seconds')
Results:
Time for method with iteration: 0.189 seconds
...
Time with spatial join: 0.036 seconds
答案2
得分: 0
您可以通过将GeoPandas数据帧进行pickling来避免每次都重新创建空间索引。
import geopandas as gpd
from shapely.geometry import Point
import os
import pandas as pd
import datetime
start_time = datetime.datetime.now().time().strftime('%H:%M:%S')
shapefile_path = "tl_2022_us_zcta520.shp"
gdf_zip_codes_file = "gdf_zip_codes.pkl"
# 如果已经创建了数据帧,则加载数据帧。
if os.path.exists(gdf_zip_codes_file):
gdf_zip_codes = pd.read_pickle(gdf_zip_codes_file)
# 否则,创建并保存数据帧和空间索引。
else:
gdf_zip_codes = gpd.read_file(shapefile_path)
gdf_zip_codes.sindex
gdf_zip_codes.to_pickle(gdf_zip_codes_file)
# 用于查找给定纬度和经度的邮政编码的函数。
def find_zip_code(latitude, longitude):
point = Point(longitude, latitude)
possible_matches = list(gdf_zip_codes.sindex.intersection((point.x, point.y)))
for idx in possible_matches:
if gdf_zip_codes.geometry.iloc[idx].contains(point):
return gdf_zip_codes['ZCTA5CE20'].iloc[idx]
return None
# 示例坐标(纬度,经度)。
test_coordinates = [
(40.7128, -74.0060), # 纽约市,纽约
(34.0522, -118.2437), # 洛杉矶,加利福尼亚
(41.8781, -87.6298), # 芝加哥,伊利诺伊
(29.7604, -95.3698), # 休斯敦,得克萨斯
(33.4484, -112.0740), # 凤凰城,亚利桑那
(39.9526, -75.1652), # 费城,宾夕法尼亚
(32.7157, -117.1611), # 圣迭戈,加利福尼亚
(29.9511, -90.0715), # 新奥尔良,路易斯安那
(37.7749, -122.4194), # 旧金山,加利福尼亚
(38.9072, -77.0369), # 华盛顿特区
(33.7490, -84.3880), # 亚特兰大,佐治亚
(35.2271, -80.8431), # 夏洛特,北卡罗来纳
(42.3601, -71.0589), # 波士顿,马萨诸塞
(36.7783, -119.4179), # 弗雷斯诺,加利福尼亚
(30.2672, -97.7431), # 奥斯汀,得克萨斯
(32.7767, -96.7970), # 达拉斯,得克萨斯
(25.7617, -80.1918), # 迈阿密,佛罗里达
(39.7684, -86.1581), # 印第安纳波利斯,印第安纳
(47.6062, -122.3321), # 西雅图,华盛顿
(35.7796, -78.6382), # 里尔,北卡罗来纳
]
# 为每对纬度和经度测试函数。
for latitude, longitude in test_coordinates:
zip_code = find_zip_code(latitude, longitude)
if zip_code:
print(f"纬度{latitude},经度{longitude}的邮政编码是{zip_code}。")
else:
print(f"未找到纬度{latitude},经度{longitude}的邮政编码。")
end_time = datetime.datetime.now().time().strftime('%H:%M:%S')
total_time = (datetime.datetime.strptime(end_time, '%H:%M:%S') - datetime.datetime.strptime(start_time, '%H:%M:%S'))
print('总时间:' + str(total_time))
第一次运行:
纬度40.7128,经度-74.006的邮政编码为10007。
纬度34.0522,经度-118.2437的邮政编码为90012。
纬度41.8781,经度-87.6298的邮政编码为60604。
...
总时间:0:01:00
第二次运行:
...
总时间:0:00:03
英文:
You can avoid recreating the spatial index each time by pickling the GeoPandas dataframe.
Using 2022 US Census Zip Code Tabulation Areas as an example:
import geopandas as gpd
from shapely.geometry import Point
import os
import pandas as pd
import datetime
start_time = datetime.datetime.now().time().strftime('%H:%M:%S')
shapefile_path = "tl_2022_us_zcta520.shp"
gdf_zip_codes_file = "gdf_zip_codes.pkl"
# Load the dataframe if it's already been created.
if os.path.exists(gdf_zip_codes_file):
gdf_zip_codes = pd.read_pickle(gdf_zip_codes_file)
# Create and save the dataframe and spatial index.
else:
gdf_zip_codes = gpd.read_file(shapefile_path)
gdf_zip_codes.sindex
gdf_zip_codes.to_pickle(gdf_zip_codes_file)
# Function to find the zip code at a given latitude and longitude.
def find_zip_code(latitude, longitude):
point = Point(longitude, latitude)
possible_matches = list(gdf_zip_codes.sindex.intersection((point.x, point.y)))
for idx in possible_matches:
if gdf_zip_codes.geometry.iloc[idx].contains(point):
return gdf_zip_codes['ZCTA5CE20'].iloc[idx]
return None
# Example coordinates (latitude, longitude).
test_coordinates = [
(40.7128, -74.0060), # New York City, NY
(34.0522, -118.2437), # Los Angeles, CA
(41.8781, -87.6298), # Chicago, IL
(29.7604, -95.3698), # Houston, TX
(33.4484, -112.0740), # Phoenix, AZ
(39.9526, -75.1652), # Philadelphia, PA
(32.7157, -117.1611), # San Diego, CA
(29.9511, -90.0715), # New Orleans, LA
(37.7749, -122.4194), # San Francisco, CA
(38.9072, -77.0369), # Washington, D.C.
(33.7490, -84.3880), # Atlanta, GA
(35.2271, -80.8431), # Charlotte, NC
(42.3601, -71.0589), # Boston, MA
(36.7783, -119.4179), # Fresno, CA
(30.2672, -97.7431), # Austin, TX
(32.7767, -96.7970), # Dallas, TX
(25.7617, -80.1918), # Miami, FL
(39.7684, -86.1581), # Indianapolis, IN
(47.6062, -122.3321), # Seattle, WA
(35.7796, -78.6382), # Raleigh, NC
]
# Test the function for each pair of latitude and longitude.
for latitude, longitude in test_coordinates:
zip_code = find_zip_code(latitude, longitude)
if zip_code:
print(f"The zip code for latitude {latitude}, longitude {longitude} is {zip_code}.")
else:
print(f"No zip code found for latitude {latitude}, longitude {longitude}.")
end_time = datetime.datetime.now().time().strftime('%H:%M:%S')
total_time = (datetime.datetime.strptime(end_time,'%H:%M:%S') - datetime.datetime.strptime(start_time,'%H:%M:%S'))
print('Total time: ' + str(total_time))
First run:
The zip code for latitude 40.7128, longitude -74.006 is 10007.
The zip code for latitude 34.0522, longitude -118.2437 is 90012.
The zip code for latitude 41.8781, longitude -87.6298 is 60604.
...
Total time: 0:01:00
Second run:
...
Total time: 0:00:03
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论