# params.yaml

# This file contains all the control parameters, hyperparameters, toggles, etc.
# needed to run the CCAO's automated valuation model

# Parameters in this file are associated with certain stages in the modeling
# pipeline. Changing a parameter means any pipeline stage associated with that
# parameter must be re-run. See dvc.yaml for parameter <-> stage associations
# and https://dvc.org/doc/command-reference/params for more information


# Run Control ------------------------------------------------------------------

# Model tag used to identify the purpose of the run. Must be one of:
# "junk", "rejected", "test", "baseline", "candidate", or "final"
run_type: "test"

# Note included with each run. Use this to summarize what changed about the run
# or add context. Written as a literal block scalar (|), so line breaks are
# preserved and the value ends with a single trailing newline
run_note: |
  Final 2024 residential model with updated data and new features

# On/off switches for optional work in the train, interpret, and upload stages
toggle:
  # Should the train stage run full cross-validation? Otherwise, the model
  # will be trained with the default hyperparameters specified below (see
  # model.hyperparameter.default)
  cv_enable: FALSE

  # Should SHAP values be calculated for this run in the interpret stage?
  # Disabling this can save time when testing many models
  shap_enable: TRUE

  # Should comps be calculated for this run in the interpret stage?
  comp_enable: TRUE

  # Upload all modeling artifacts and results to S3 in the upload stage. Set
  # to FALSE if you are not a CCAO employee
  upload_enable: TRUE


# Data/Ingest ------------------------------------------------------------------

# Assessment context and dates. Years and dates are quoted so that they are
# always read as strings rather than as integers or dates
assessment:
  # Year of assessment. Used to pull land rates, HIEs, and other information
  year: "2024"

  # The statutorily set "sale date" for the purpose of prediction
  date: "2024-01-01"

  # Added context for model artifacts stored in S3. Does not change behavior
  triad: "city"
  group: "residential"

  # Year from which property characteristics are pulled. Usually lags the
  # assessment year by 1
  data_year: "2023"

  # Year used to partition data on S3. Working year in this case means the year
  # the Data Department is currently creating models for
  working_year: "2024"

# Parameters used to define the input/training data
input:
  # The min and max year of sales to use for the training data sample. Quoted
  # so that they are read as strings
  min_sale_year: "2015"
  max_sale_year: "2023"

  # Number of years back to look for the count_past_n_years feature
  # (presumably the meta_sale_count_past_n_years predictor listed under
  # model.predictor.all; confirm against the ingest stage code)
  n_years_prior: 4

  # Parameters used to generate townhome complex identifiers
  complex:
    # Townhomes (class 210/295) should match exactly on these variables to be
    # considered in the same complex
    match_exact:
      - "meta_township_code"
      - "meta_class"
      - "char_bsmt"
      - "char_gar1_size"
      - "char_attic_fnsh"
      - "char_beds"
      - "meta_pin_num_cards"
      - "meta_tieback_proration_rate"

    # Townhomes should match fuzzily on these variables to be in the same
    # complex e.g. a PIN with 2000 and a PIN with 2020 square feet will match
    # (the difference of 20 is within the bldg_sf value of 25).
    # NOTE(review): each value appears to be the maximum allowed difference
    # for that variable; confirm exact semantics and units in the ingest code
    match_fuzzy:
      rooms: 1
      bldg_sf: 25
      yrblt: 4
      dist_ft: 250


# Cross-validation -------------------------------------------------------------

# Cross-validation parameters used in the train stage
cv:
  # Proportion of the training data to use for training vs test, split by time.
  # 0.9 means the most recent 10% of sales are used as a test set
  split_prop: 0.9

  # Number of folds to use for cross-validation. For v-fold CV, the data will be
  # randomly split. For rolling-origin, the data will be split into V chunks by
  # time, with each chunk/period calculated automatically
  num_folds: 10

  # The number of months time-based folds should overlap each other. Only
  # applicable to rolling-origin CV. See https://www.tmwr.org/resampling#rolling
  fold_overlap: 9

  # Number of initial iterations to create before tuning. Recommend this number
  # be greater than the number of hyperparameters being tuned
  initial_set: 20

  # Max number of total search iterations
  max_iterations: 50

  # Max number of search iterations without improvement before stopping search
  no_improve: 15

  # The number of iterations with no improvement before an uncertainty sample
  # is created, i.e. a sample with high predicted variance is chosen
  uncertain: 8

  # Metric used to select the "best" set of parameters from CV iterations. Must
  # be manually included in the metric_set() passed to tune_bayes()
  best_metric: "rmse"


# Model (Hyper)parameters ------------------------------------------------------

# Static and tuneable parameters that define the structure and behavior of the
# model itself
model:
  engine: "lightgbm"

  # Objective/loss function minimized by LightGBM. See website for possible
  # options: https://lightgbm.readthedocs.io/en/latest/Parameters.html#objective
  objective: "rmse"

  # Parameters related to model determinism. Current settings should force
  # the same output every time if the same hyperparameters are used
  seed: 2024
  deterministic: TRUE
  force_row_wise: TRUE

  # Model verbosity: < 0: Fatal, = 0: Error (Warning), = 1: Info, > 1: Debug
  verbose: -1

  predictor:
    # Vector of predictors from the training data included in the model. Edit
    # this list to add or remove variables from the model
    all:
      - "meta_township_code"
      - "meta_nbhd_code"
      - "meta_sale_count_past_n_years"
      - "char_yrblt"
      - "char_air"
      - "char_apts"
      - "char_attic_fnsh"
      - "char_attic_type"
      - "char_beds"
      - "char_bldg_sf"
      - "char_bsmt"
      - "char_bsmt_fin"
      - "char_class"
      - "char_ext_wall"
      - "char_fbath"
      - "char_frpl"
      - "char_gar1_att"
      - "char_gar1_cnst"
      - "char_gar1_size"
      - "char_hbath"
      - "char_land_sf"
      - "char_heat"
      - "char_ncu"
      - "char_porch"
      - "char_roof_cnst"
      - "char_rooms"
      - "char_tp_dsgn"
      - "char_type_resd"
      - "char_recent_renovation"
      - "loc_longitude"
      - "loc_latitude"
      - "loc_census_tract_geoid"
      - "loc_env_flood_fs_factor"
      - "loc_env_airport_noise_dnl"
      - "loc_school_elementary_district_geoid"
      - "loc_school_secondary_district_geoid"
      - "loc_access_cmap_walk_nta_score"
      - "loc_access_cmap_walk_total_score"
      - "loc_tax_municipality_name"
      - "prox_num_pin_in_half_mile"
      - "prox_num_bus_stop_in_half_mile"
      - "prox_num_foreclosure_per_1000_pin_past_5_years"
      - "prox_num_school_in_half_mile"
      - "prox_num_school_with_rating_in_half_mile"
      - "prox_avg_school_rating_in_half_mile"
      - "prox_airport_dnl_total"
      - "prox_nearest_bike_trail_dist_ft"
      - "prox_nearest_cemetery_dist_ft"
      - "prox_nearest_cta_route_dist_ft"
      - "prox_nearest_cta_stop_dist_ft"
      - "prox_nearest_hospital_dist_ft"
      - "prox_lake_michigan_dist_ft"
      - "prox_nearest_major_road_dist_ft"
      - "prox_nearest_metra_route_dist_ft"
      - "prox_nearest_metra_stop_dist_ft"
      - "prox_nearest_park_dist_ft"
      - "prox_nearest_railroad_dist_ft"
      - "prox_nearest_secondary_road_dist_ft"
      - "prox_nearest_university_dist_ft"
      - "prox_nearest_vacant_land_dist_ft"
      - "prox_nearest_water_dist_ft"
      - "prox_nearest_golf_course_dist_ft"
      - "acs5_percent_age_children"
      - "acs5_percent_age_senior"
      - "acs5_median_age_total"
      - "acs5_percent_mobility_no_move"
      - "acs5_percent_mobility_moved_from_other_state"
      - "acs5_percent_household_family_married"
      - "acs5_percent_household_nonfamily_alone"
      - "acs5_percent_education_high_school"
      - "acs5_percent_education_bachelor"
      - "acs5_percent_education_graduate"
      - "acs5_percent_income_below_poverty_level"
      - "acs5_median_income_household_past_year"
      - "acs5_median_income_per_capita_past_year"
      - "acs5_percent_income_household_received_snap_past_year"
      - "acs5_percent_employment_unemployed"
      - "acs5_median_household_total_occupied_year_built"
      - "acs5_median_household_renter_occupied_gross_rent"
      - "acs5_percent_household_owner_occupied"
      - "acs5_percent_household_total_occupied_w_sel_cond"
      - "acs5_percent_mobility_moved_in_county"
      - "other_tax_bill_rate"
      - "other_school_district_elementary_avg_rating"
      - "other_school_district_secondary_avg_rating"
      - "ccao_is_active_exe_homeowner"
      - "ccao_is_corner_lot"
      - "ccao_n_years_exe_homeowner"
      - "time_sale_year"
      - "time_sale_day"
      - "time_sale_quarter_of_year"
      - "time_sale_month_of_year"
      - "time_sale_day_of_year"
      - "time_sale_day_of_month"
      - "time_sale_day_of_week"
      - "time_sale_post_covid"

    # List of predictors included in predictor.all which are categoricals.
    # It is CRITICAL that any categorical variables are included in this list,
    # else LightGBM will treat them as numeric
    categorical:
      - "meta_township_code"
      - "meta_nbhd_code"
      - "char_air"
      - "char_apts"
      - "char_attic_fnsh"
      - "char_attic_type"
      - "char_bsmt"
      - "char_bsmt_fin"
      - "char_class"
      - "char_ext_wall"
      - "char_gar1_att"
      - "char_gar1_cnst"
      - "char_gar1_size"
      - "char_heat"
      - "char_porch"
      - "char_roof_cnst"
      - "char_tp_dsgn"
      - "char_type_resd"
      - "loc_census_tract_geoid"
      - "loc_tax_municipality_name"
      - "loc_school_elementary_district_geoid"
      - "loc_school_secondary_district_geoid"
      - "time_sale_quarter_of_year"

    # List of identifiers for each observation, can be ignored
    id:
      - "meta_year"
      - "meta_pin"
      - "meta_class"
      - "meta_card_num"
      - "meta_sale_document_num"

  parameter:
    # For CV only, proportion of the training data to hold out for use in
    # early stopping + the metric to evaluate. See R docs for details:
    # https://lightgbm.readthedocs.io/en/latest/R/reference/lgb.train.html#arguments
    # WARNING: See GitLab issue #82 for critical notes about early stopping / CV
    validation_prop: 0.1
    validation_type: "random"
    validation_metric: "rmse"

    # Custom parameters added by the CCAO's lightsnip wrapper package. Setting
    # to TRUE will set max_depth = floor(log2(num_leaves)) + add_to_linked_depth
    # This is to prevent tune_bayes from exploring useless parameter space
    link_max_depth: TRUE

    # During CV, the number of iterations to go without improvement before
    # stopping training. Early stopping is deactivated when NULL
    stop_iter: 50

  hyperparameter:
    # Default set of hyperparameters to use if CV is not enabled
    default:
      # Total/maximum number of iterations. Usually changed in tandem with
      # learning_rate. One of the most important params controlling complexity

      # If cross-validation and early stopping are disabled then the model will
      # always train to exactly this number of iterations. The ideal strategy
      # for setting this parameter is to discover a good fixed value using CV,
      # then manually set that value for non-CV runs (which don't use
      # early stopping by default)
      num_iterations: 1575
      learning_rate: 0.015

      # Maximum number of bins for discretizing continuous features. Lower uses
      # less memory and speeds up training
      max_bin: 512

      # See docs for details on each of the remaining parameters:
      # https://lightgbm.readthedocs.io/en/latest/Parameters.html
      num_leaves: 185
      add_to_linked_depth: 4
      feature_fraction: 0.61
      min_gain_to_split: 75.5
      min_data_in_leaf: 31
      max_cat_threshold: 165
      min_data_per_group: 300
      cat_smooth: 82.0
      cat_l2: 1.00
      lambda_l1: 0.022
      lambda_l2: 0.152

    # Range of possible hyperparameter values for tuning to explore. Each entry
    # is a two-element [lower, upper] list. Entries marked "10 ^ X" give their
    # bounds as base-10 exponents, e.g. [-3.0, -0.4] spans 10^-3.0 to 10^-0.4
    range:
      # NOTE: If cross-validation is used and/or early stopping is enabled, then
      # the upper bound for num_iterations is effectively the MAXIMUM number of
      # trees (i.e. the model can stop before reaching this number), and the
      # number actually used is reported in the lgbm_final_params object in the
      # train stage and parameter_final table in the run outputs/Athena
      num_iterations: [100, 2500]
      learning_rate: [-3.0, -0.4] # 10 ^ X
      max_bin: [50, 512]
      num_leaves: [32, 2048]
      add_to_linked_depth: [1, 7]
      feature_fraction: [0.3, 0.7]
      min_gain_to_split: [-3.0, 4.0] # 10 ^ X
      min_data_in_leaf: [2, 400]
      max_cat_threshold: [10, 250]
      min_data_per_group: [2, 400]
      cat_smooth: [10.0, 200.0]
      cat_l2: [-3, 2] # 10 ^ X
      lambda_l1: [-3, 2] # 10 ^ X
      lambda_l2: [-3, 2] # 10 ^ X


# Post-Valuation ---------------------------------------------------------------

# Parameters used in the assess stage to finalize the initial model predictions
pv:
  # For multi-card PINs (rare), implement a heuristic that caps the potential
  # change in value. See assess stage code for details
  multicard_yoy_cap: 2.2

  # Cap the proportion of the PIN's total value dedicated to land. This is
  # necessary since sometimes the model provides low predictions relative to the
  # land rates created by Valuations
  land_pct_of_total_cap: 0.5

  # Rounding settings to apply to initial predictions. Rounding is done to
  # indicate to property owners that model values are estimates, not exact.
  # NOTE(review): round_to_nearest has one more entry than round_break,
  # presumably one rounding increment per interval defined by the breaks;
  # confirm against the assess stage code
  round_break: [1000, 10000, 100000]
  round_to_nearest: [1, 500, 5000, 10000]
  round_type: "floor"


# Ratio Study ------------------------------------------------------------------

# Years and assessment stages used to calculate YoY changes in the evaluate
# stage. Typically we want to compare new model values with the finalized values
# (post-appeal) from the last reassessment and the most recent values from the
# prior year
ratio_study:
  far_year: "2021"
  far_stage: "board"
  far_column: "meta_2yr_pri_board_tot"
  near_year: "2023"
  near_stage: "certified"
  near_column: "meta_certified_tot"

  # Min. number of sales to calculate ratio statistics, per CCAO SOPs. Groups
  # must have greater than or equal to the number of sales specified
  min_n_sales: 30

  # Quantile breakouts to use in the evaluate stage. For example, 3 will split
  # each geography in evaluate into terciles
  num_quantile: [3, 5, 10]

  # Geographies for which to calculate performance statistics in the evaluate
  # stage. Each geography is also broken out by class
  geographies:
    - "meta_township_code"
    - "meta_nbhd_code"
    - "loc_tax_municipality_name"
    - "loc_ward_num"
    - "loc_census_puma_geoid"
    - "loc_census_tract_geoid"
    - "loc_school_elementary_district_geoid"
    - "loc_school_secondary_district_geoid"
    - "loc_school_unified_district_geoid"

  # PINs for which to generate Quarto reports using reports/pin/pin.qmd. Can
  # also be appended to using the environment variable REPORT_ADDITIONAL_PINS
  pins:
    - "13253180150000" # Typical 211 2-flat in Logan Square, 2100 sqft
    - "05174150240000" # Large 6 bed Georgian in Winnetka, 4600 sqft
    - "14074220130000" # Uptown 205, 3 bed, 2.5 bath, 2000 sqft
    - "20102120200000" # Bronzeville greystone, 211, 11 beds, 4100 sqft
    - "20321100160000" # Auburn Gresham 203, 5 bed, 3 bath, 2400 sqft
    - "20202080210000" # West Englewood 211, 6 bed, 2 bath post-fire, 2000 sqft
    - "17203230280000" # Pilsen 211 2-flat, 4 bed, 2 bath, 1800 sqft
    - "14321100320000" # Lincoln Park 206, 4 bed, 3 bath, 4000 sqft
    - "16054270120000" # Austin 205, 6 bed, 2 bath, 2000 sqft
    - "20142140140000" # Hyde Park greystone, 210, 5 beds, 4 baths, 3000 sqft


# Comparables ------------------------------------------------------------------

# Parameters related to the experimental comparables finding feature
# in the interpret pipeline stage. Only used if comp_enable = TRUE
comp:
  # Number of comps to generate for each PIN/card
  num_comps: 20

  # Number of price bins to use when binning properties for the purpose of
  # comparison. Corresponds to ntile of predicted FMV, e.g. 10 creates deciles
  # of predicted values. Fewer bins means larger bins, and therefore a larger
  # comparables search range
  num_price_bins: 20


# Export -----------------------------------------------------------------------

# Final run ID(s) chosen for export to Desk Review spreadsheets and iasWorld
# upload. Both values are quoted so that they are read as strings
export:
  triad_code: "1"
  run_id: "2024-03-17-stupefied-maya"