rchardptrsn · August 15, 2023 00:43
diff --git a/read_geojson.py b/read_geojson.py
 import geopandas as gpd
 # Load the WFIGS 2022 wildland fire perimeters GeoJSON into a GeoDataFrame
 gdf = gpd.read_file('WFIGS_-_2022_Wildland_Fire_Perimeters_to_Date.geojson')

 # Print a description of the dataframe.
 # info() writes the summary to stdout itself and returns None, so it is called
 # directly; wrapping it in print() would add a stray "None" line.
 gdf.info()

 # Check for NaN records
 # Result: geometry does not have any NaN but many other columns do
 print(f'Columns with nan records: {gdf.columns[gdf.isna().any()].tolist()}')

 # Count NULLs in poly_IncidentName
 # Result: 20
 print(f'Count of Nulls in poly_IncidentName: {gdf.poly_IncidentName.isnull().sum()}')

 # Identify duplicate records: count every row whose poly_IncidentName occurs
 # more than once (keep=False flags all members of a duplicate group).
 print(f"Number of duplicate records: {gdf.duplicated(['poly_IncidentName'], keep=False).sum()}")

 # Drop duplicate incident names, keeping the row with the largest SHAPE_Area
 # for each name, then restore the original row order.
 # NOTE(review): pandas treats NaN names as equal to each other, so the
 # null-named rows collapse to a single row here -- confirm this is intended.
 gdf = gdf.sort_values('SHAPE_Area', ascending=False).drop_duplicates('poly_IncidentName').sort_index()

 # Test to make sure the duplicate values were dropped (expected count: 0)
 print(f"Number of duplicate records: {gdf.duplicated(['poly_IncidentName'], keep=False).sum()}")

 # Subset by only rows where poly_FeatureCategory = 'Wildfire Final Fire Perimeter'
 # NOTE(review): this runs after de-duplication, so an incident whose largest
 # perimeter is not a final perimeter is removed entirely -- confirm the order.
 gdf = gdf[gdf['poly_FeatureCategory']=='Wildfire Final Fire Perimeter']
	import geopandas as gpd
	# Load the WFIGS 2022 wildland fire perimeters GeoJSON into a GeoDataFrame
	gdf = gpd.read_file('WFIGS_-_2022_Wildland_Fire_Perimeters_to_Date.geojson')

	# Print a description of the dataframe.
	# info() writes the summary to stdout itself and returns None, so it is called
	# directly; wrapping it in print() would add a stray "None" line.
	gdf.info()

	# Check for NaN records
	# Result: geometry does not have any NaN but many other columns do
	print(f'Columns with nan records: {gdf.columns[gdf.isna().any()].tolist()}')

	# Count NULLs in poly_IncidentName
	# Result: 20
	print(f'Count of Nulls in poly_IncidentName: {gdf.poly_IncidentName.isnull().sum()}')

	# Identify duplicate records: count every row whose poly_IncidentName occurs
	# more than once (keep=False flags all members of a duplicate group).
	print(f"Number of duplicate records: {gdf.duplicated(['poly_IncidentName'], keep=False).sum()}")

	# Drop duplicate incident names, keeping the row with the largest SHAPE_Area
	# for each name, then restore the original row order.
	# NOTE(review): pandas treats NaN names as equal to each other, so the
	# null-named rows collapse to a single row here -- confirm this is intended.
	gdf = gdf.sort_values('SHAPE_Area', ascending=False).drop_duplicates('poly_IncidentName').sort_index()

	# Test to make sure the duplicate values were dropped (expected count: 0)
	print(f"Number of duplicate records: {gdf.duplicated(['poly_IncidentName'], keep=False).sum()}")

	# Subset by only rows where poly_FeatureCategory = 'Wildfire Final Fire Perimeter'
	# NOTE(review): this runs after de-duplication, so an incident whose largest
	# perimeter is not a final perimeter is removed entirely -- confirm the order.
	gdf = gdf[gdf['poly_FeatureCategory']=='Wildfire Final Fire Perimeter']