noklam · April 6, 2022 22:27
diff --git a/gistfile1.txt b/gistfile1.txt
 # Here you can define all your data sets by using simple YAML syntax.
 #
 # Documentation for this file format can be found in "The Data Catalog"
 # Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html
 #
 # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
 #
 # An example data set definition can look as follows:
 #
 #bikes:
 #  type: pandas.CSVDataSet
 #  filepath: "data/01_raw/bikes.csv"
 #
 #weather:
 #  type: spark.SparkDataSet
 #  filepath: s3a://your_bucket/data/01_raw/weather*
 #  file_format: csv
 #  credentials: dev_s3
 #  load_args:
 #    header: True
 #    inferSchema: True
 #  save_args:
 #    sep: '|'
 #    header: True
 #
 #scooters:
 #  type: pandas.SQLTableDataSet
 #  credentials: scooters_credentials
 #  table_name: scooters
 #  load_args:
 #    index_col: ['name']
 #    columns: ['name', 'gear']
 #  save_args:
 #    if_exists: 'replace'
 #    # if_exists: 'fail'
 #    # if_exists: 'append'
 #
 # The Data Catalog supports being able to reference the same file using two different DataSet implementations
 # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
 # https://kedro.readthedocs.io/en/stable/data/data_catalog.html

 # Companies table, read from a CSV file in the raw data folder.
 companies:
  type: pandas.CSVDataSet
  # more about layers in the Data Engineering Convention:
  # https://kedro.readthedocs.io/en/stable/tutorial/visualise_pipeline.html#interact-with-data-engineering-convention
  layer: raw
  filepath: data/01_raw/companies.csv

 # Reviews table, read from a CSV file in the raw data folder.
 reviews:
  type: pandas.CSVDataSet
  layer: raw
  filepath: data/01_raw/reviews.csv

 # Shuttles table, read from an Excel workbook in the raw data folder.
 shuttles:
  type: pandas.ExcelDataSet
  layer: raw
  filepath: data/01_raw/shuttles.xlsx

 # Preprocessed companies, stored as Parquet in the intermediate folder.
 # The dataset name carries the "data_processing." prefix.
 data_processing.preprocessed_companies:
  type: pandas.ParquetDataSet
  layer: intermediate
  filepath: data/02_intermediate/preprocessed_companies.pq

 # Preprocessed shuttles, stored as Parquet in the intermediate folder.
 # The dataset name carries the "data_processing." prefix.
 data_processing.preprocessed_shuttles:
  type: pandas.ParquetDataSet
  layer: intermediate
  filepath: data/02_intermediate/preprocessed_shuttles.pq

 # Model input table, stored as Parquet in the primary folder.
 model_input_table:
  type: pandas.ParquetDataSet
  layer: primary
  filepath: data/03_primary/model_input_table.pq

 # Pickled regressor for the active modelling pipeline; versioning is enabled.
 data_science.active_modelling_pipeline.regressor:
  type: pickle.PickleDataSet
  layer: models
  versioned: true
  filepath: data/06_models/regressor_active.pickle

 # Pickled regressor for the candidate modelling pipeline; versioning is enabled.
 data_science.candidate_modelling_pipeline.regressor:
  type: pickle.PickleDataSet
  layer: models
  versioned: true
  filepath: data/06_models/regressor_candidate.pickle
  
 # Tracked metrics for the candidate modelling pipeline.
 # The file name follows the pipeline named in the dataset key, the same way
 # the regressor entries do (candidate -> *_candidate, active -> *_active).
 # These two paths were previously crossed; move or rename any existing
 # files under data/09_tracking/ accordingly.
 data_science.candidate_modelling_pipeline.metrics:
  type: tracking.MetricsDataSet
  filepath: data/09_tracking/metrics_candidate.json

 # Tracked metrics for the active modelling pipeline.
 data_science.active_modelling_pipeline.metrics:
  type: tracking.MetricsDataSet
  filepath: data/09_tracking/metrics_active.json

 # Tracked JSON output named "companies_columns", written to the tracking folder.
 # The dataset name carries the "data_processing." prefix.
 data_processing.companies_columns:
  type: tracking.JSONDataSet
  filepath: data/09_tracking/companies_columns.json
 # Here you can define all your data sets by using simple YAML syntax.
 #
 # Documentation for this file format can be found in "The Data Catalog"
 # Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html
 #
 # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
 #
 # An example data set definition can look as follows:
 #
 #bikes:
 #  type: pandas.CSVDataSet
 #  filepath: "data/01_raw/bikes.csv"
 #
 #weather:
 #  type: spark.SparkDataSet
 #  filepath: s3a://your_bucket/data/01_raw/weather*
 #  file_format: csv
 #  credentials: dev_s3
 #  load_args:
 #    header: True
 #    inferSchema: True
 #  save_args:
 #    sep: '|'
 #    header: True
 #
 #scooters:
 #  type: pandas.SQLTableDataSet
 #  credentials: scooters_credentials
 #  table_name: scooters
 #  load_args:
 #    index_col: ['name']
 #    columns: ['name', 'gear']
 #  save_args:
 #    if_exists: 'replace'
 #    # if_exists: 'fail'
 #    # if_exists: 'append'
 #
 # The Data Catalog supports being able to reference the same file using two different DataSet implementations
 # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
 # https://kedro.readthedocs.io/en/stable/data/data_catalog.html

 # Companies table, read from a CSV file in the raw data folder.
 # Child keys are nested under the entry with spaces (YAML forbids tabs
 # for indentation).
 companies:
  type: pandas.CSVDataSet
  filepath: data/01_raw/companies.csv
  # more about layers in the Data Engineering Convention:
  # https://kedro.readthedocs.io/en/stable/tutorial/visualise_pipeline.html#interact-with-data-engineering-convention
  layer: raw

 # Reviews table, read from a CSV file in the raw data folder.
 # Child keys are nested under the entry with spaces, not tabs.
 reviews:
  type: pandas.CSVDataSet
  filepath: data/01_raw/reviews.csv
  layer: raw

 # Shuttles table, read from an Excel workbook in the raw data folder.
 # Child keys are nested under the entry with spaces, not tabs.
 shuttles:
  type: pandas.ExcelDataSet
  filepath: data/01_raw/shuttles.xlsx
  layer: raw

 # Preprocessed companies, stored as Parquet in the intermediate folder.
 # Child keys are nested under the entry with spaces, not tabs.
 data_processing.preprocessed_companies:
  type: pandas.ParquetDataSet
  filepath: data/02_intermediate/preprocessed_companies.pq
  layer: intermediate

 # Preprocessed shuttles, stored as Parquet in the intermediate folder.
 # Child keys are nested under the entry with spaces, not tabs.
 data_processing.preprocessed_shuttles:
  type: pandas.ParquetDataSet
  filepath: data/02_intermediate/preprocessed_shuttles.pq
  layer: intermediate

 # Model input table, stored as Parquet in the primary folder.
 # Child keys are nested under the entry with spaces, not tabs.
 model_input_table:
  type: pandas.ParquetDataSet
  filepath: data/03_primary/model_input_table.pq
  layer: primary

 # Pickled regressor for the active modelling pipeline; versioning is enabled.
 # Child keys are nested under the entry with spaces, not tabs.
 data_science.active_modelling_pipeline.regressor:
  type: pickle.PickleDataSet
  filepath: data/06_models/regressor_active.pickle
  versioned: true
  layer: models

 # Pickled regressor for the candidate modelling pipeline; versioning is enabled.
 # Child keys are nested under the entry with spaces, not tabs.
 data_science.candidate_modelling_pipeline.regressor:
  type: pickle.PickleDataSet
  filepath: data/06_models/regressor_candidate.pickle
  versioned: true
  layer: models

 # Tracked metrics for the candidate modelling pipeline.
 # Child keys are nested under each entry with spaces, not tabs.
 # The file name follows the pipeline named in the dataset key, the same way
 # the regressor entries do (candidate -> *_candidate, active -> *_active).
 # These two paths were previously crossed; move or rename any existing
 # files under data/09_tracking/ accordingly.
 data_science.candidate_modelling_pipeline.metrics:
  type: tracking.MetricsDataSet
  filepath: data/09_tracking/metrics_candidate.json

 # Tracked metrics for the active modelling pipeline.
 data_science.active_modelling_pipeline.metrics:
  type: tracking.MetricsDataSet
  filepath: data/09_tracking/metrics_active.json

 # Tracked JSON output named "companies_columns", written to the tracking folder.
 # Child keys are nested under the entry with spaces, not tabs.
 data_processing.companies_columns:
  type: tracking.JSONDataSet
  filepath: data/09_tracking/companies_columns.json