Created
September 27, 2023 21:03
-
-
Save ingenieroariel/2ef747074efcb1b62d7589e125603f32 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Development environment definition: declares the tools on PATH, three helper
# scripts (`google`, `arrow`, `pstac`) and a shell-entry hook.
{ pkgs, ... }:

{
  # Command-line tools plus a Python 3.11 interpreter bundled with the
  # libraries the conversion script imports.
  packages = with pkgs; [
    git
    google-cloud-sdk
    (python311.withPackages (ps: with ps; [
      pyarrow
      duckdb
      h3
    ]))
  ];

  # `google <dest>`: sync the bucket path below into <dest>.
  # "$1" is quoted so a destination containing spaces stays one argument.
  # NOTE(review): `-avhP` looks like rsync(1) flags; `gsutil rsync` documents
  # `-a` as taking a canned-ACL value, so this may parse as `-a vhP`.
  # Confirm the intended flags against the gsutil documentation.
  scripts.google.exec = ''gsutil -m rsync -avhP gs://open-buildings-data/v3/points_s2_level_4_gzip "$1"'';

  # `arrow <google_dir> <out_dir>`: run the converter under `time`, with
  # Python warnings silenced. Both arguments are quoted for the same reason.
  scripts.arrow.exec = ''time python -W ignore google_to_arrow.py "$1" "$2"'';

  # Placeholder script; only prints a fixed string.
  scripts.pstac.exec = ''echo "psych"'';

  # Runs when the shell is entered.
  enterShell = ''
    git --version
  '';
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import glob | |
import subprocess | |
import multiprocessing | |
import pyarrow as pa | |
import duckdb as db | |
import h3.unstable.vect as vect | |
def process_file(filename, out_dir=None):
    """Convert one points file into an Arrow IPC file of H3 cell indexes.

    Reads the ``latitude`` and ``longitude`` columns of ``filename`` through
    DuckDB, computes the H3 index at resolution 15 for every row and the
    resolution-7 parent of each index, and writes both as ``uint64`` columns
    (``h3_15`` and ``h3_7``) to ``<out_dir>/<basename of filename>.arrow``.

    Args:
        filename: Path of the input file. It is embedded in the DuckDB query
            as a quoted string literal.
        out_dir: Directory that receives the output file; created when it
            does not exist. Defaults to ``sys.argv[2]`` so the function still
            works as a one-argument ``Pool.map`` callback.
    """
    print(f"Processing {filename}")
    if out_dir is None:
        out_dir = sys.argv[2]

    # Double any single quote so a path containing ' cannot terminate the
    # SQL string literal early.
    quoted_path = filename.replace("'", "''")
    b = db.sql(f"SELECT latitude, longitude FROM '{quoted_path}'").arrow()

    # The vectorised h3 functions take NumPy arrays, so flatten each chunked
    # Arrow column into one contiguous array first.
    lats = b["latitude"].combine_chunks().to_numpy()
    lons = b["longitude"].combine_chunks().to_numpy()

    h3_15 = vect.geo_to_h3(lats, lons, 15)
    h3_7 = vect.h3_to_parent(h3_15, 7)

    table = pa.table({
        'h3_15': pa.array(h3_15, type=pa.uint64()),
        'h3_7': pa.array(h3_7, type=pa.uint64()),
    })

    os.makedirs(out_dir, exist_ok=True)
    target_file = os.path.join(out_dir, os.path.basename(filename)) + ".arrow"
    with pa.OSFile(target_file, "wb") as sink:
        with pa.RecordBatchFileWriter(sink, table.schema) as writer:
            writer.write_table(table)
    print(f"Wrote {target_file}")
if __name__ == '__main__': | |
google_dir = sys.argv[1] | |
print(f"Entering {google_dir}") | |
with multiprocessing.Pool(multiprocessing.cpu_count() - 2) as processing_pool: | |
processing_pool.map(process_file, glob.glob(f"{google_dir}/*.csv.gz")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import duckdb as db | |
import pandas as pd | |
import pyarrow as pa | |
from glob import glob | |
# `glob` is imported as a function (`from glob import glob`), so it is called
# directly; `glob.glob(...)` would raise AttributeError here.
file_paths = glob("/Users/x/data/points_arrow_h3_15_and_7/*")

# types_mapper callback: wrap each Arrow type in a pandas ArrowDtype.
tm = lambda pa_dtype: pd.ArrowDtype(pa_dtype)

# Open and read each file, keeping only the h3_15 column of every table
tables = []
for path in file_paths:
    with pa.ipc.open_file(path) as f:
        tables.append(f.read_all().to_pandas(types_mapper=tm)[["h3_15"]])

# Stack the per-file frames into one DataFrame (original row indexes are kept).
df = pd.concat(tables)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment