bennyistanto · June 8, 2024 07:04
diff --git a/imerg_thresholdbymonth.py b/imerg_thresholdbymonth.py
 import xarray as xr
 import numpy as np
 import os
 import glob
 import pandas as pd
 from tqdm import tqdm
 from scipy.stats import genextreme

 # Define directories and filenames for threshold calculations
 threshold_calculations = [
    {"days": 1, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_1day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_1day"},
    {"days": 2, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_2day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_2day"},
    {"days": 3, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_3day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_3day"},
    {"days": 4, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_4day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_4day"},
    {"days": 5, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_5day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_5day"},
 ]

 # Threshold definitions
 threshold_definitions = [
    {"name": "moderate", "percentile": 50, "years": 2, "output_suffix": "02yr"},
    {"name": "heavy", "percentile": 80, "years": 5, "output_suffix": "05yr"},
    {"name": "intense", "percentile": None, "years": 10, "output_suffix": "10yr"},
    {"name": "extreme", "percentile": None, "years": 25, "output_suffix": "25yr"},
 ]

 # Global variable to store user's choice
 user_choice = None

 def set_user_decision():
    """
    Prompt user for decision on existing files and store it globally.
    
    This function prompts the user to decide whether to replace, skip, or abort
    if an output file already exists. The decision is stored in a global variable
    to be used throughout the script.
    """
    global user_choice
    if user_choice is None:
        decision = input("An output file already exists. Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
        while decision not in ['R', 'S', 'A']:
            print("Invalid choice. Please choose again.")
            decision = input("Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
        user_choice = decision

 def calculate_percentile_threshold(data, percentile):
    """
    Calculate percentile threshold for given data.
    
    Parameters:
        data (numpy.ndarray): The input precipitation data.
        percentile (int): The percentile to calculate (e.g., 50 for P50).
    
    Returns:
        numpy.ndarray: The calculated percentile threshold.
    """
    return np.nanpercentile(data, percentile, axis=0)

 def calculate_gev_threshold(data, return_period):
    """
    Calculate GEV threshold using Maximum Likelihood Estimation (MLE) for fitting the GEV distribution.
    
    Parameters:
        data (numpy.ndarray): The input precipitation data.
        return_period (int): The return period for the threshold (e.g., 10 years).
    
    Returns:
        float: The calculated GEV threshold.
    """
    shape, loc, scale = genextreme.fit(data)  # MLE is used by default
    return genextreme.ppf(1 - 1 / return_period, shape, loc=loc, scale=scale)

 # Process each accumulation period
 for calc in threshold_calculations:
    days = calc["days"]
    input_dir = calc["input_dir"]
    output_dir = calc["output_dir"]

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Create a list of files
    file_list = sorted(glob.glob(os.path.join(input_dir, f"wld_cli_imerg_fddmax_{days}day_*.nc4")))

    # Group files by month
    files_by_month = {}
    for file_path in file_list:
        date_str = file_path.split('_')[-1].split('.')[0]
        month = date_str[4:6]
        if month not in files_by_month:
            files_by_month[month] = []
        files_by_month[month].append(file_path)

    # Process each month
    with tqdm(total=len(files_by_month), desc=f"Processing {days}-day data by month") as month_pbar:
        for month, month_files in files_by_month.items():
            with tqdm(total=len(threshold_definitions), desc=f"Calculating thresholds for month {month}", leave=False) as threshold_pbar:
                for threshold_def in threshold_definitions:
                    name = threshold_def["name"]
                    percentile = threshold_def["percentile"]
                    years = threshold_def["years"]
                    output_suffix = threshold_def["output_suffix"]

                    output_filename = f"wld_cli_imerg_{days}day_extreme_threshold_{output_suffix}_{month}.nc4"
                    output_filepath = os.path.join(output_dir, output_filename)

                    if os.path.exists(output_filepath):
                        if user_choice is None:
                            set_user_decision()
                        if user_choice == 'S':
                            print(f"Skipping existing file: {output_filepath}")
                            continue  # Skip to the next file
                        elif user_choice == 'A':
                            print("Aborting process.")
                            exit(0)  # Exit the script
                        elif user_choice == 'R':
                            pass  # Replace the file

                    try:
                        # Load data for the current month
                        datasets = [xr.open_dataset(f) for f in month_files]
                        combined_ds = xr.concat(datasets, dim='time')
                        data = combined_ds['precipitation'].values

                        # Calculate threshold
                        if percentile is not None:
                            threshold = calculate_percentile_threshold(data, percentile)
                        else:
                            threshold = calculate_gev_threshold(data.flatten(), years)

                        # Create an xarray DataArray for the threshold
                        threshold_da = xr.DataArray(threshold, dims=["lat", "lon"], coords={"lat": combined_ds["lat"], "lon": combined_ds["lon"]})
                        threshold_ds = threshold_da.to_dataset(name="threshold")
                        threshold_ds.attrs['Conventions'] = 'CF-1.8'

                        # Ensure CF conventions and compression
                        encoding = {var: {'zlib': True, 'complevel': 5} for var in threshold_ds.data_vars}
                        threshold_ds.to_netcdf(output_filepath, encoding=encoding)
                        print(f"Saved: {output_filepath}")

                    except Exception as e:
                        print(f"Error processing month {month} for {name} threshold: {e}")

                    finally:
                        for ds in datasets:
                            ds.close()

                    threshold_pbar.update(1)
            month_pbar.update(1)

 print("All thresholds calculated and saved.")
	import xarray as xr
	import numpy as np
	import os
	import glob
	import pandas as pd
	from tqdm import tqdm
	from scipy.stats import genextreme

	# Define directories and filenames for threshold calculations
	threshold_calculations = [
	{"days": 1, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_1day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_1day"},
	{"days": 2, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_2day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_2day"},
	{"days": 3, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_3day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_3day"},
	{"days": 4, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_4day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_4day"},
	{"days": 5, "input_dir": "/mnt/d/temp/imerg/data/fddmaxm_5day", "output_dir": "/mnt/d/temp/imerg/data/threshold_fddmaxm_5day"},
	]

	# Threshold definitions
	threshold_definitions = [
	{"name": "moderate", "percentile": 50, "years": 2, "output_suffix": "02yr"},
	{"name": "heavy", "percentile": 80, "years": 5, "output_suffix": "05yr"},
	{"name": "intense", "percentile": None, "years": 10, "output_suffix": "10yr"},
	{"name": "extreme", "percentile": None, "years": 25, "output_suffix": "25yr"},
	]

	# Global variable to store user's choice
	user_choice = None

	def set_user_decision():
	"""
	Prompt user for decision on existing files and store it globally.

	This function prompts the user to decide whether to replace, skip, or abort
	if an output file already exists. The decision is stored in a global variable
	to be used throughout the script.
	"""
	global user_choice
	if user_choice is None:
	decision = input("An output file already exists. Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
	while decision not in ['R', 'S', 'A']:
	print("Invalid choice. Please choose again.")
	decision = input("Choose an action - Replace (R), Skip (S), Abort (A): ").upper()
	user_choice = decision

	def calculate_percentile_threshold(data, percentile):
	"""
	Calculate percentile threshold for given data.

	Parameters:
	data (numpy.ndarray): The input precipitation data.
	percentile (int): The percentile to calculate (e.g., 50 for P50).

	Returns:
	numpy.ndarray: The calculated percentile threshold.
	"""
	return np.nanpercentile(data, percentile, axis=0)

	def calculate_gev_threshold(data, return_period):
	"""
	Calculate GEV threshold using Maximum Likelihood Estimation (MLE) for fitting the GEV distribution.

	Parameters:
	data (numpy.ndarray): The input precipitation data.
	return_period (int): The return period for the threshold (e.g., 10 years).

	Returns:
	float: The calculated GEV threshold.
	"""
	shape, loc, scale = genextreme.fit(data) # MLE is used by default
	return genextreme.ppf(1 - 1 / return_period, shape, loc=loc, scale=scale)

	# Process each accumulation period
	for calc in threshold_calculations:
	days = calc["days"]
	input_dir = calc["input_dir"]
	output_dir = calc["output_dir"]

	# Ensure output directory exists
	os.makedirs(output_dir, exist_ok=True)

	# Create a list of files
	file_list = sorted(glob.glob(os.path.join(input_dir, f"wld_cli_imerg_fddmax_{days}day_*.nc4")))

	# Group files by month
	files_by_month = {}
	for file_path in file_list:
	date_str = file_path.split('_')[-1].split('.')[0]
	month = date_str[4:6]
	if month not in files_by_month:
	files_by_month[month] = []
	files_by_month[month].append(file_path)

	# Process each month
	with tqdm(total=len(files_by_month), desc=f"Processing {days}-day data by month") as month_pbar:
	for month, month_files in files_by_month.items():
	with tqdm(total=len(threshold_definitions), desc=f"Calculating thresholds for month {month}", leave=False) as threshold_pbar:
	for threshold_def in threshold_definitions:
	name = threshold_def["name"]
	percentile = threshold_def["percentile"]
	years = threshold_def["years"]
	output_suffix = threshold_def["output_suffix"]

	output_filename = f"wld_cli_imerg_{days}day_extreme_threshold_{output_suffix}_{month}.nc4"
	output_filepath = os.path.join(output_dir, output_filename)

	if os.path.exists(output_filepath):
	if user_choice is None:
	set_user_decision()
	if user_choice == 'S':
	print(f"Skipping existing file: {output_filepath}")
	continue # Skip to the next file
	elif user_choice == 'A':
	print("Aborting process.")
	exit(0) # Exit the script
	elif user_choice == 'R':
	pass # Replace the file

	try:
	# Load data for the current month
	datasets = [xr.open_dataset(f) for f in month_files]
	combined_ds = xr.concat(datasets, dim='time')
	data = combined_ds['precipitation'].values

	# Calculate threshold
	if percentile is not None:
	threshold = calculate_percentile_threshold(data, percentile)
	else:
	threshold = calculate_gev_threshold(data.flatten(), years)

	# Create an xarray DataArray for the threshold
	threshold_da = xr.DataArray(threshold, dims=["lat", "lon"], coords={"lat": combined_ds["lat"], "lon": combined_ds["lon"]})
	threshold_ds = threshold_da.to_dataset(name="threshold")
	threshold_ds.attrs['Conventions'] = 'CF-1.8'

	# Ensure CF conventions and compression
	encoding = {var: {'zlib': True, 'complevel': 5} for var in threshold_ds.data_vars}
	threshold_ds.to_netcdf(output_filepath, encoding=encoding)
	print(f"Saved: {output_filepath}")

	except Exception as e:
	print(f"Error processing month {month} for {name} threshold: {e}")

	finally:
	for ds in datasets:
	ds.close()

	threshold_pbar.update(1)
	month_pbar.update(1)

	print("All thresholds calculated and saved.")