Created
November 4, 2023 20:26
-
-
Save wvandeweyer/e5aeaa5fa486549fbb1a1b7be9fdd692 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import pytz | |
from datetime import datetime | |
from pytz.exceptions import AmbiguousTimeError | |
from tqdm import tqdm | |
# Read the CSV file - Replace fluviusdata with filename | |
df = pd.read_csv('fluviusdata.csv', delimiter=';') | |
# Replace commas with periods in the 'Volume' column | |
df['Volume'] = df['Volume'].str.replace(',', '.') | |
# Initialize a set to keep track of previous UTC times | |
previous_utc_times = set() | |
# Function to handle the daylight saving time change | |
def handle_dst(row): | |
local_time = datetime.strptime(f"{row['Van Datum']} {row['Van Tijdstip']}", "%d-%m-%Y %H:%M:%S") | |
brussels = pytz.timezone('Europe/Brussels') | |
# Check if this is the ambiguous 2 AM hour on the day of DST change | |
if local_time.hour == 2 and local_time.month == 10 and (local_time.day == 29 or local_time.day == 30): | |
try: | |
# First try to localize assuming DST (is_dst=True) | |
local_dt = brussels.localize(local_time, is_dst=True) | |
utc_dt = local_dt.astimezone(pytz.utc) | |
utc_str = utc_dt.strftime("%Y-%m-%d %H:%M:%S") | |
# If the UTC time already exists, it means we've hit the second instance of 2 AM, hence we re-localize as non-DST | |
if utc_str in previous_utc_times: | |
local_dt = brussels.localize(local_time, is_dst=False) | |
utc_dt = local_dt.astimezone(pytz.utc) | |
# Add the UTC time to the set to keep track | |
previous_utc_times.add(utc_dt.strftime("%Y-%m-%d %H:%M:%S")) | |
return utc_dt.strftime("%Y-%m-%d %H:%M:%S") | |
except AmbiguousTimeError: | |
# In case of an error, assume standard time | |
local_dt = brussels.localize(local_time, is_dst=False) | |
utc_dt = local_dt.astimezone(pytz.utc) | |
previous_utc_times.add(utc_dt.strftime("%Y-%m-%d %H:%M:%S")) | |
return utc_dt.strftime("%Y-%m-%d %H:%M:%S") | |
else: | |
# For non-ambiguous times, just localize without DST assumptions | |
local_dt = brussels.localize(local_time, is_dst=None) | |
utc_dt = local_dt.astimezone(pytz.utc) | |
return utc_dt.strftime("%Y-%m-%d %H:%M:%S") | |
# Function to convert volume to Wh and replace nulls with zero | |
def handle_volume(volume): | |
if pd.isnull(volume) or volume == '': | |
return 0 | |
else: | |
return int(float(volume) * 1000) | |
# Function to map register values | |
def map_register(register): | |
register_map = { | |
'Afname Nacht': 'C', | |
'Afname Dag': 'C', | |
'Injectie Dag': 'I', | |
'Injectie Nacht': 'I' | |
} | |
return register_map.get(register, register) | |
# Function to process each row | |
def process_row(row): | |
row['UTC Timestamp'] = handle_dst(row) | |
row['Volume'] = handle_volume(row['Volume']) | |
row['Register'] = map_register(row['Register']) | |
return row | |
# Apply the processing function to each row with a progress bar | |
tqdm.pandas() # Enable progress_apply in pandas with tqdm | |
processed_df = df.progress_apply(process_row, axis=1) | |
# Select only the desired columns | |
final_df = processed_df[['UTC Timestamp', 'Register', 'Volume']] | |
# Write the final DataFrame to a new CSV file | |
final_df.to_csv('fluviusdata_converted.csv', index=False, sep=';') | |
print("Conversion completed. The output is saved in 'fluviusdata_converted.csv'.") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment