kissmygritts · January 26, 2024 01:09
diff --git a/feature_catalog.py b/feature_catalog.py
 from __future__ import annotations

 from abc import ABC, abstractmethod, abstractstaticmethod
 from dataclasses import dataclass
 from enum import Enum

 import geopandas as gpd

 # The current feature catalog class is doing a lot and feels overloaded.
 # It's trying to be a factory/formatter class and a dataset class. I think
 # we can split the dataset classes into a few separate concrete implementations
 # of a FeatureCatalog abstract base class. This class will have its own
 # loader (as an abstract method) that creates an instance of itself.
 #
 # My current understanding is each FeatureCatalog class uses the same inventory
 # file, but parses that inventory file differently for each class. This makes
 # a lot of the code more difficult to reason about. For example the LiveDeadCatalog
 # needs several months of sentinel data in the FeatureCatalog. The CanopyCoverCatalog
 # needs only 2 months of sentinel data.
 #
 # Then create a FeatureCatalogFactory to do the initial loading of the S3 inventory
 # file. Parsing this inventory file into each of the catalogs is delegated to
 # the correct FeatureCatalog class. The factory will instantiate the dataset class.

 class CatalogType(Enum):
    """Identifiers for the different kinds of feature catalog."""

    # Every value is the lowercase string form of its member name.
    CANOPY_COVER = "canopy_cover"
    CHM = "chm"
    LIVE_DEAD = "live_dead"

 @dataclass
 class FeatureCatalog(ABC):
    """Abstract base class shared by the dataset-specific feature catalogs.

    Concrete catalogs must override ``format``; until they do, the class
    stays abstract and cannot be instantiated.

    Attributes:
        gdf: GeoDataFrame holding the catalog's features.
    """

    gdf: gpd.GeoDataFrame

    # ``abc.abstractstaticmethod`` is deprecated; stacking ``staticmethod``
    # on top of ``abstractmethod`` is the supported spelling.
    @staticmethod
    @abstractmethod
    def format() -> FeatureCatalog:
        """Create the feature catalog for the implementing class."""

    def n_features(self) -> int:
        """Return the number of rows in ``gdf``."""
        # common feature catalog methods
        return len(self.gdf)

 @dataclass
 class CanopyCoverFeatureCatalog(FeatureCatalog):
    """Feature catalog for canopy cover."""

    # ``format`` takes no parameters, so it must be a static method;
    # otherwise calling it on an instance raises TypeError.
    @staticmethod
    def format() -> FeatureCatalog:
        """Create a canopy cover feature catalog.

        Placeholder: the formatting logic is not written yet, so this
        currently returns ``None``.
        """
        # add concrete implementation here, i.e. the formatting logic
        # that creates a canopy cover feature catalog
        return None

 @dataclass
 class CHMFeatureCatalog(FeatureCatalog):
    """Feature catalog for CHM."""

    # ``format`` takes no parameters, so it must be a static method;
    # otherwise calling it on an instance raises TypeError.
    @staticmethod
    def format() -> FeatureCatalog:
        """Create a chm feature catalog.

        Placeholder: the formatting logic is not written yet, so this
        currently returns ``None``.
        """
        # add concrete implementation here, i.e. the formatting logic
        # that creates a chm feature catalog
        return None

 class FeatureCatalogFactory:
    """Load an optional inventory file and build feature catalogs by type.

    Args:
        inventory_url: Location of a custom inventory file, passed to
            ``geopandas.read_file``. When ``None`` (the default) or empty,
            nothing is loaded.
    """

    def __init__(self, inventory_url: str | None = None):
        self._full_inventory = None
        self._inventory_url = inventory_url

        self.load_custom_inventory()

    def load_custom_inventory(self) -> None:
        """Read the inventory at the configured URL, if one was given."""
        # some initialization logic if necessary, this can be used
        # in the init or by outside callers
        if self._inventory_url:
            self._full_inventory = gpd.read_file(self._inventory_url)

    def create(self, catalog_type: CatalogType) -> FeatureCatalog:
        """Return the result of ``format()`` on the class mapped to ``catalog_type``.

        Raises:
            ValueError: If no catalog class is mapped to ``catalog_type``.
        """
        # do something with inventory URL if needed
        try:
            catalog_cls = self._catalog_class_map()[catalog_type]
        except KeyError:
            raise ValueError(
                f"no feature catalog class is registered for {catalog_type!r}"
            ) from None
        return catalog_cls.format()

    @staticmethod
    def _catalog_class_map() -> dict[CatalogType, type[FeatureCatalog]]:
        """Return the mapping from catalog type to catalog class."""
        # keyed by the CatalogType enum for better type safety
        return {
            CatalogType.CANOPY_COVER: CanopyCoverFeatureCatalog,
            CatalogType.CHM: CHMFeatureCatalog,
        }

 # usage
 # default behavior: no custom inventory file
 catalog_formatter = FeatureCatalogFactory(None)
 # the factory instance is not callable; catalogs are built through create()
 canopy_cover_catalog: CanopyCoverFeatureCatalog = catalog_formatter.create(CatalogType.CANOPY_COVER)

 # use a custom inventory file
 catalog_formatter = FeatureCatalogFactory("s3://vp-eng-test-data/inventory/inventory.csv")
 chm_catalog: CHMFeatureCatalog = catalog_formatter.create(CatalogType.CHM)
	from __future__ import annotations
	from enum import Enum
	from dataclasses import dataclass
	from abc import ABC, abstractstaticmethod
	import geopandas as gpd

	# The current feature catalog class is doing a lot and feels overloaded.
	# It's trying to be a factory/formatter class and a dataset class. I think
	# we can split the dataset classes into a few separate concrete implementations
	# of a FeatureCatalog abstract base class. This class will have its own
	# loader (as an abstract method) that creates an instance of itself.
	#
	# My current understanding is each FeatureCatalog class uses the same inventory
	# file, but parses that inventory file differently for each class. This makes
	# a lot of the code more difficult to reason about. For example the LiveDeadCatalog
	# needs several months of sentinel data in the FeatureCatalog. The CanopyCoverCatalog
	# needs only 2 months of sentinel data.
	#
	# Then create a FeatureCatalogFactory to do the initial loading of the S3 inventory
	# file. Parsing this inventory file into each of the catalogs is delegated to
	# the correct FeatureCatalog class. The factory will instantiate the dataset class.

	class CatalogType(Enum):
	    """Identifiers for the different kinds of feature catalog."""

	    # Every value is the lowercase string form of its member name.
	    CANOPY_COVER = "canopy_cover"
	    CHM = "chm"
	    LIVE_DEAD = "live_dead"

	@dataclass
	class FeatureCatalog(ABC):
	    """Abstract base class shared by the dataset-specific feature catalogs.

	    Concrete catalogs must override ``format``; until they do, the class
	    stays abstract and cannot be instantiated.

	    Attributes:
	        gdf: GeoDataFrame holding the catalog's features.
	    """

	    gdf: gpd.GeoDataFrame

	    # ``abc.abstractstaticmethod`` is deprecated; stacking ``staticmethod``
	    # on top of ``abstractmethod`` is the supported spelling.
	    @staticmethod
	    @abstractmethod
	    def format() -> FeatureCatalog:
	        """Create the feature catalog for the implementing class."""

	    def n_features(self) -> int:
	        """Return the number of rows in ``gdf``."""
	        # common feature catalog methods
	        return len(self.gdf)

	@dataclass
	class CanopyCoverFeatureCatalog(FeatureCatalog):
	    """Feature catalog for canopy cover."""

	    # ``format`` takes no parameters, so it must be a static method;
	    # otherwise calling it on an instance raises TypeError.
	    @staticmethod
	    def format() -> FeatureCatalog:
	        """Create a canopy cover feature catalog.

	        Placeholder: the formatting logic is not written yet, so this
	        currently returns ``None``.
	        """
	        # add concrete implementation here, i.e. the formatting logic
	        # that creates a canopy cover feature catalog
	        return None

	@dataclass
	class CHMFeatureCatalog(FeatureCatalog):
	    """Feature catalog for CHM."""

	    # ``format`` takes no parameters, so it must be a static method;
	    # otherwise calling it on an instance raises TypeError.
	    @staticmethod
	    def format() -> FeatureCatalog:
	        """Create a chm feature catalog.

	        Placeholder: the formatting logic is not written yet, so this
	        currently returns ``None``.
	        """
	        # add concrete implementation here, i.e. the formatting logic
	        # that creates a chm feature catalog
	        return None

	class FeatureCatalogFactory:
	    """Load an optional inventory file and build feature catalogs by type.

	    Args:
	        inventory_url: Location of a custom inventory file, passed to
	            ``geopandas.read_file``. When ``None`` (the default) or empty,
	            nothing is loaded.
	    """

	    def __init__(self, inventory_url: str | None = None):
	        self._full_inventory = None
	        self._inventory_url = inventory_url

	        self.load_custom_inventory()

	    def load_custom_inventory(self) -> None:
	        """Read the inventory at the configured URL, if one was given."""
	        # some initialization logic if necessary, this can be used
	        # in the init or by outside callers
	        if self._inventory_url:
	            self._full_inventory = gpd.read_file(self._inventory_url)

	    def create(self, catalog_type: CatalogType) -> FeatureCatalog:
	        """Return the result of ``format()`` on the class mapped to ``catalog_type``.

	        Raises:
	            ValueError: If no catalog class is mapped to ``catalog_type``.
	        """
	        # do something with inventory URL if needed
	        try:
	            catalog_cls = self._catalog_class_map()[catalog_type]
	        except KeyError:
	            raise ValueError(
	                f"no feature catalog class is registered for {catalog_type!r}"
	            ) from None
	        return catalog_cls.format()

	    @staticmethod
	    def _catalog_class_map() -> dict[CatalogType, type[FeatureCatalog]]:
	        """Return the mapping from catalog type to catalog class."""
	        # keyed by the CatalogType enum for better type safety
	        return {
	            CatalogType.CANOPY_COVER: CanopyCoverFeatureCatalog,
	            CatalogType.CHM: CHMFeatureCatalog,
	        }

	# usage
	# default behavior: no custom inventory file
	catalog_formatter = FeatureCatalogFactory(None)
	# the factory instance is not callable; catalogs are built through create()
	canopy_cover_catalog: CanopyCoverFeatureCatalog = catalog_formatter.create(CatalogType.CANOPY_COVER)

	# use a custom inventory file
	catalog_formatter = FeatureCatalogFactory("s3://vp-eng-test-data/inventory/inventory.csv")
	chm_catalog: CHMFeatureCatalog = catalog_formatter.create(CatalogType.CHM)