implementing XGBoost from scratch on python

  Kiến thức lập trình

I tried to write my own implementation

class XGBoost_own(BaseEstimator, ClassifierMixin):
    def __init__(self, n_estimators=100, learning_rate=0.2, max_depth=3,
                 loss='logistic', reg_alpha=0, reg_lambda=1, random_state=0):

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.loss = loss
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.initialization = lambda y: np.mean(y) * np.ones([y.shape[0]])
        self.loss_by_iter = []
        self.trees_ = []
        self.boosted_pred = None
        self.random_state = random_state

    def _sigmoid(self, predictions):
        return 1 / (1 + np.exp(-predictions))

    def _softmax(self, predictions):
        exp = np.exp(predictions)
        return exp / np.sum(exp, axis=1, keepdims=True)

    def _compute_loss_gradient(self, F, y):
        if self.loss == 'logistic':
            return y - self._sigmoid(F)
        else:
            raise ValueError("Unsupported loss function")

    def _compute_regularization_penalty(self):
        if self.reg_alpha == 0:
            return 0
        elif self.reg_alpha > 0 and self.reg_lambda == 0:
            return self.reg_alpha * np.sum(np.abs(tree.coef_))
        elif self.reg_alpha == 0 and self.reg_lambda > 0:
            return 0.5 * self.reg_lambda * np.sum(tree.coef_ ** 2)
        elif self.reg_alpha > 0 and self.reg_lambda > 0:
            return self.reg_alpha * np.sum(np.abs(tree.coef_)) + 0.5 * self.reg_lambda * np.sum(tree.coef_ ** 2)
        else:
            raise ValueError("Invalid combination of regularization parameters")

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.boosted_pred = np.zeros_like(y)
        F = self.initialization(y)

        for t in range(self.n_estimators):
            residuals = self._compute_loss_gradient(F, y)
            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=self.random_state)
            tree.fit(X, residuals)
            self.trees_.append(tree)
            update = self.learning_rate * tree.predict(X)
            F += update
            self.boosted_pred += update

        return self

    def predict_proba(self, X):
        F = np.zeros(X.shape[0])
        for tree in self.trees_:
            F += self.learning_rate * tree.predict(X)
        F = np.clip(F, -700, 700)
        if self.loss == 'logistic':
            return self._sigmoid(F).reshape(-1, 1)
        elif self.loss == 'softmax':
            return self._softmax(F)
        else:
            raise ValueError("Unsupported loss function")

    def predict(self, X):
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)

    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'learning_rate': self.learning_rate,
            'max_depth': self.max_depth,
            'loss': self.loss,
            'reg_alpha': self.reg_alpha,
            'reg_lambda': self.reg_lambda,
            'random_state': self.random_state,
        }

running the model through the function. Maybe I’m implementing it incorrectly. I’m confused. Also, someone has done it, I’ll be glad to see self-written implementations of LightGBM, CatBoost, Histogram-Based Gradient Boosting

def hyperparameter_tuning(models, param_grid, Xre_train, Yre_train, Xre_test, Yre_test, results):
    for name, model in models.items():
        print(f"Tuning hyperparameters for {name}...")
        param_grid_name = param_grid[name]
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid_name, scoring='accuracy', cv=5, n_jobs=-1)
        grid_search.fit(Xre_train, Yre_train)
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_
        Yre_pred = best_model.predict(Xre_test)
        accuracy = accuracy_score(Yre_test, Yre_pred)
        precision = precision_score(Yre_test, Yre_pred, average='weighted')
        recall = recall_score(Yre_test, Yre_pred, average='weighted')
        f1 = f1_score(Yre_test, Yre_pred, average='weighted')
        roc_auc = roc_auc_score(Yre_test, best_model.predict_proba(Xre_test), multi_class='ovr', average='weighted')
        confusion = confusion_matrix(Yre_test, Yre_pred)

I get an error.Dataset for multiclass classification

numpy.core._exceptions._UFuncOutputCastingError: Cannot cast ufunc 'add' output from dtype('float64') to dtype('int32') with casting rule 'same_kind'

New contributor

Аслан Алибеков is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.

LEAVE A COMMENT