Created
October 24, 2022 10:48
-
-
Save toebR/fe1f60e501320260652b822978cfaea8 to your computer and use it in GitHub Desktop.
data wrangling basics in python pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import geopandas as gpd | |
import pandas as pd | |
import numpy as np | |
from matplotlib import pyplot as plt | |
#set pandas options so that all column names are diplayed when printes | |
pd.set_option('display.max_columns', None) | |
pd.set_option('display.max_rows', None) | |
######load gdb with arcpy and convert to pd dataframe######### | |
input = r'my_project.gdb\my_table' | |
arr = arcpy.da.TableToNumPyArray(input, '*') | |
# convert to a Pandas DataFrame | |
df = pd.DataFrame(arr) | |
##################### | |
#load .gdb | |
gdb = gpd.read_file(r"D:\learn_py\basics\data\_STILLBERG_GIS\_STILLBERG_GIS.gdb") | |
#plot gdb | |
gdb.plot() | |
#first few rows of gdb | |
gdb.head() | |
#transform to pd dataframe | |
df = pd.DataFrame(gdb) | |
df | |
#column names | |
df.columns | |
#select columns | |
sel = df[["Code_group", 'Code_description', "Quality_pos____hgt_", "Quality_pos_"]] | |
sel | |
#select rows (filter) with "pkt" and return the column Code_group and Code_description, if second statement is empty, it returns the whole df | |
sel.loc[(sel.Code_group == "pkt") & (sel.Code_description == "aluprofil"), ["Code_group", "Code_description"]] | |
#unique values in a column (returns array) | |
a = np.unique(sel[['Code']]) | |
a | |
#mutate a new column based on values in 2 existing columns (deprec) | |
sel = sel.assign(test = sel["Quality_pos____hgt_"] - sel["Quality_pos_"]) | |
#conditional column value formatting (TRUE, False can be swapped with strings of my own choice) | |
sel['ifesle'] = np.where((sel['Code_description'] == 'aluprofil') & (sel["Quality_pos____hgt_"] < 0.25), True, False) | |
sel | |
#plot hist of dataframe column | |
sel.hist(column="Quality_pos____hgt_") | |
#look into python list comprehension | |
#grouped summary statistics over multiple columns | |
summary_sel = sel.groupby(['ifesle', "Code_group"]).describe() | |
#count entries per categorial variable in column | |
summary_count = sel.groupby('ifesle')["Code_group"].count() | |
summary_count = pd.DataFrame(summary_count) | |
########################## calculate total of row in a new column | |
summary_count_coltot = summary_count.assign(total = np.sum(summary_count["Code_group"].values)) #add total to dataframe based on values in Code_group #total in a new column | |
summary_count_coltot #now with this car we can e.g. calcualte rowwise percentages etc. | |
summary_count_coltot = summary_count_coltot.reset_index() #turn descriptors to column name (reset index) | |
summary_count_coltot["ifesle"] = summary_count_coltot["ifesle"].astype(str) #turn boolians into strings for plotting | |
plt.bar(x = summary_count_coltot["ifesle"], height= summary_count_coltot["Code_group"]) | |
############################ | |
#calcualate total of column in a new row | |
summary_count.loc['Column_Total']= summary_count.sum(numeric_only=True, axis=0) | |
summary_count | |
summary_count = summary_count.reset_index() #turn descriptors to column name (reset index) | |
summary_count["ifesle"] = summary_count["ifesle"].astype(str) #turn ifesle into one columns type (now its a string and 2 boolians) | |
plt.bar(x = summary_count["ifesle"], height = summary_count["Code_group"]) #codegroups count per total | |
#for loop of features | |
bsp = range(100) | |
rangetolist = [num for num in bsp] #jede nummer im range object wird gepostet (conditions können geadded werden) --> gibt list zurück (nicht mehr eine range)) | |
bsp = [num for num in bsp if num < 50] #[return for iterator in bsp if condition] | |
sel.shape #shape array def | |
sel.shape[0] #number of rows in the pandas dataframe | |
bsp = bsp[::-2] | |
#to loop over all rows of a pd dataframe e.g. (here base on entries of shape). | |
for x in range(len(bsp)): | |
#print(sel[x,2]) | |
print(x) | |
for x in enumerate(bsp): #enumerate "enumerates" iterator entries - creates tupples with iterator values | |
print(x) | |
for x,num in enumerate(bsp): #enumerate "enumerates" iterator entries - creates tupples with iterator values | |
print(x) | |
print(num) #adding a ",num" 'unzipps' the tupples into a singular vector (see print(num) result) | |
#function | |
def functioname(dfvar): | |
result = dfvar.columns | |
return result | |
functioname(sel) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment