Source code for er_evaluation.error_analysis._subgroup_discovery
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
def fit_dt_regressor(
    X,
    y,
    numerical_features=None,
    categorical_features=None,
    sample_weights=None,
    random_state=0,
    criterion="squared_error",
    **kwargs
):
    """
    Fit a decision tree regressor with optional preprocessing of numerical and categorical features.

    Numerical features have missing values replaced by the constant ``-1`` and categorical
    features are one-hot encoded. When at least one feature group is given, columns of ``X``
    listed in neither group are dropped before fitting (the ``ColumnTransformer`` default).
    When neither group is given, ``X`` is passed to the regressor unchanged.

    Args:
        X (numpy array or pandas DataFrame): The input features.
        y (numpy array or pandas Series): The target values.
        numerical_features (list of int or str, optional): The column indices or column names of numerical features. Default is None.
        categorical_features (list of int or str, optional): The column indices or column names of categorical features. Default is None.
        sample_weights (numpy array, optional): Individual weights for each sample, forwarded to the regressor's ``fit``. Default is None.
        random_state (int): Random state for the decision tree regressor.
        criterion (str): The function to measure the quality of a split. Supported criteria are "squared_error", "friedman_mse", "absolute_error", and "poisson". Default is "squared_error".
        **kwargs: Additional keyword arguments passed to the DecisionTreeRegressor constructor.

    Returns:
        sklearn.pipeline.Pipeline: A fitted pipeline with a "preprocessor" step followed by a
        "regressor" step. The "preprocessor" step is a ColumnTransformer, or the string
        "passthrough" when no feature group was specified.

    Examples:
        >>> import numpy as np
        >>> import pandas as pd
        >>> X = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": ["a", "b", "a"]})
        >>> y = np.array([2, 4, 6])
        >>> numerical_features = ["A", "B"]
        >>> categorical_features = ["C"]
        >>> model = fit_dt_regressor(X, y, numerical_features, categorical_features)
        >>> isinstance(model, Pipeline)
        True
    """
    regressor = DecisionTreeRegressor(**kwargs, criterion=criterion, random_state=random_state)

    transformers = []
    if numerical_features is not None:
        transformers.append(("imputer", SimpleImputer(strategy="constant", fill_value=-1), numerical_features))
    if categorical_features is not None:
        transformers.append(("one_hot_encoder", OneHotEncoder(), categorical_features))

    # A ColumnTransformer without transformers outputs zero columns, which makes the
    # regressor fail to fit. With no feature group requested, hand X over untouched
    # while keeping the "preprocessor" step name stable for callers.
    preprocessor = ColumnTransformer(transformers) if transformers else "passthrough"

    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("regressor", regressor),
        ]
    )
    # The "regressor__" prefix routes the weights to the final step's fit().
    model.fit(X, y, regressor__sample_weight=sample_weights)
    return model