Created
April 7, 2020 14:52
-
-
Save robertvunabandi/767675136c3dfa8600aa6310ee253008 to your computer and use it in GitHub Desktop.
A script that converts the grape-harvest data from https://www.ncdc.noaa.gov/paleo-search/study/13194 into a single table with the fields year, location, abbreviation, latitude, longitude, and value (where value is harvest_days_after_august_31st).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Dict, Generic, List, Tuple, TypeVar | |
# Type parameters for the generic Pair class declared below.
T = TypeVar("T")
# The name given to TypeVar must match the variable it is bound to; this one
# was previously created as "T", which type checkers reject.
S = TypeVar("S")
class Pair(Generic[T, S]):
    """Empty generic placeholder naming a (first, second) pairing.

    It defines no attributes or methods; within this file it only appears
    inside type annotations.
    """
# Filenames. We chose the names of the first three files ourselves, so either
# keep your files under these exact names or change the constants here to
# whatever names you used.
F_LOC = "locations.csv"  # extracted locations into its own csv
F_ABBR = "abbreviations.csv"  # extracted abbreviations into its own csv
F_DATA = "data.csv"  # extracted the data into its own csv
F_OUT = "clean_data.csv"  # the final output csv
# fields
class Field:
    """Names of the columns that make up the output table."""

    # when and where
    YEAR = "year"
    LOC, ABBR = "location", "abbreviation"
    # coordinates of the location
    LAT, LON = "latitude", "longitude"
    # the measurement recorded for that year at that location
    VAL = "value"
# Column order of the output table; this is the first row written to it.
NEW_HEADERS = (
    Field.YEAR,
    Field.LOC,
    Field.ABBR,
    Field.LAT,
    Field.LON,
    Field.VAL,
)
def create_parsed() -> None:
    """Merge the three input CSVs into one long-format table written to F_OUT.

    The output starts with the NEW_HEADERS row, followed by one row for every
    (year, location) combination: years in the order returned by get_data(),
    and within each year the locations in the order returned by
    get_locations().

    Raises:
        KeyError: if a location from get_locations() is missing from the
            abbreviations or from a year's values.
    """
    # load the three inputs: abbreviations, coordinates, then the yearly data
    abbreviation_of = get_abbreviations()
    coordinates_of = get_locations()
    yearly_values = get_data()

    # header first, then every year/location record; the whole table is built
    # before the output file is opened
    records = [list(NEW_HEADERS)]
    records.extend(
        [year, place, abbreviation_of[place], lat, lon, values[place]]
        for year, values in yearly_values
        for place, (lat, lon) in coordinates_of.items()
    )

    # every cell is already a string, so a plain comma join is enough
    with open(F_OUT, "w") as out:
        out.writelines(",".join(record) + "\n" for record in records)
def get_abbreviations() -> Dict[str, str]:
    """Dict[Location, LocationAbbreviation]

    Reads F_ABBR and pairs each cell of its first line with the cell at the
    same position on its second line; cells are comma-separated and stripped
    of surrounding whitespace. Any further lines are ignored.

    Raises:
        IndexError: if the file has fewer than two lines.
    """
    with open(F_ABBR) as f:
        content = f.read()

    table = []
    for raw_line in content.splitlines():
        table.append([cell.strip() for cell in raw_line.split(",")])

    keys, abbreviations = table[0], table[1]
    return dict(zip(keys, abbreviations))
def get_locations() -> Dict[str, Tuple[str, str]]:
    """Dict[Location, Tuple[Latitude, Longitude]]

    Reads F_LOC, skips its first line, and expects exactly three
    comma-separated cells (location, latitude, longitude) on every remaining
    line. Both coordinates are round-tripped through float so they come back
    as normalized strings.

    Raises:
        ValueError: if a line does not have exactly three cells, or a
            coordinate is not a valid float.
    """
    with open(F_LOC) as f:
        content = f.read()

    coordinates = {}
    # the first line is not data, so start from the second one
    for raw_line in content.splitlines()[1:]:
        loc, lat, lon = (cell.strip() for cell in raw_line.split(","))
        coordinates[loc] = (str(float(lat)), str(float(lon)))
    return coordinates
def get_data() -> List[Pair[str, Dict[str, str]]]:
    """List[Pair[Year, Dict[Location, Value]]]

    Reads F_DATA. The first non-blank line is the header: its first column is
    the year column and every other column is a location name. Each later
    non-blank line becomes one ``[year, {location: value}]`` entry (a two-item
    list). The year is normalized through ``int`` and each value through
    ``try_float``, so both are strings.

    Returns:
        One entry per data line, in file order; an empty list when the file
        has no non-blank lines.

    Raises:
        ValueError: if the first column of a data line is not an integer.
    """
    with open(F_DATA) as f:
        lines = f.read().splitlines()

    # blank lines carry no data and would otherwise crash the int() below
    rows = [line.split(",") for line in lines if line.strip()]
    if not rows:
        return []

    header, rows = rows[0], rows[1:]
    year_idx = 0
    locations = [loc.strip() for loc in header[1:]]
    return [
        [
            str(int(row[year_idx])),
            # zip stops at the shorter side, so a row with more cells than the
            # header (e.g. a trailing comma) no longer raises KeyError
            {loc: try_float(value) for loc, value in zip(locations, row[1:])},
        ]
        for row in rows
    ]
def try_float(s: str) -> str:
    """Return ``str(float(s))``, or an empty string if *s* is not a float."""
    try:
        parsed = float(s)
    except ValueError:
        # not numeric (e.g. an empty cell): represent it as a blank value
        return ""
    return str(parsed)
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    create_parsed()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment