Source code for er_evaluation.error_analysis._subgroup_discovery

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor


def fit_dt_regressor(
    X,
    y,
    numerical_features=None,
    categorical_features=None,
    sample_weights=None,
    random_state=0,
    criterion="squared_error",
    **kwargs
):
    """
    Fit a decision tree regressor with optional preprocessing for numerical and categorical features.

    The returned pipeline has two named steps: ``"preprocessor"`` (a
    ``ColumnTransformer``) and ``"regressor"`` (a ``DecisionTreeRegressor``).

    * Columns listed in ``numerical_features`` have their missing values
      replaced by the constant ``-1``.
    * Columns listed in ``categorical_features`` are one-hot encoded.
    * When at least one of the two lists is given, columns of ``X`` that
      appear in neither list are dropped (the ``ColumnTransformer`` default).
    * When both lists are ``None``, no preprocessing is applied and every
      column of ``X`` is passed to the regressor unchanged, so ``X`` must
      already be in a form the decision tree accepts.

    Args:
        X (numpy array or pandas DataFrame): The input features.
        y (numpy array or pandas Series): The target values.
        numerical_features (list of int or str, optional): The column indices or column names of numerical
            features. Default is None.
        categorical_features (list of int or str, optional): The column indices or column names of
            categorical features. Default is None.
        sample_weights (numpy array, optional): Individual weights for each sample, forwarded to the
            regressor's ``fit`` method. Default is None.
        random_state (int): Random state for the decision tree regressor.
        criterion (str): The function to measure the quality of a split. Supported criteria are
            "squared_error", "friedman_mse", "absolute_error", and "poisson". Default is "squared_error".
        **kwargs: Additional keyword arguments passed to the DecisionTreeRegressor constructor.

    Returns:
        sklearn.pipeline.Pipeline: A fitted decision tree regressor model with preprocessing steps.

    Examples:
        >>> import numpy as np
        >>> import pandas as pd
        >>> X = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": ["a", "b", "a"]})
        >>> y = np.array([2, 4, 6])
        >>> numerical_features = ["A", "B"]
        >>> categorical_features = ["C"]
        >>> model = fit_dt_regressor(X, y, numerical_features, categorical_features)
        >>> isinstance(model, Pipeline)
        True
    """
    clf = DecisionTreeRegressor(**kwargs, criterion=criterion, random_state=random_state)

    preprocess_steps = []
    if numerical_features is not None:
        preprocess_steps.append(
            ("imputer", SimpleImputer(strategy="constant", fill_value=-1), numerical_features)
        )
    if categorical_features is not None:
        preprocess_steps.append(("one_hot_encoder", OneHotEncoder(), categorical_features))

    # A ColumnTransformer with no transformers and the default remainder="drop"
    # discards every column, so the regressor would be fitted on zero features
    # and fail. When the caller supplied no feature lists at all, forward X
    # untouched instead; otherwise keep the default of dropping unlisted columns.
    no_feature_lists = numerical_features is None and categorical_features is None
    remainder = "passthrough" if no_feature_lists else "drop"

    model = Pipeline(
        steps=[
            ("preprocessor", ColumnTransformer(preprocess_steps, remainder=remainder)),
            ("regressor", clf),
        ]
    )
    # The "regressor__" prefix routes the weights to the final pipeline step only.
    model.fit(X, y, regressor__sample_weight=sample_weights)

    return model