How to implement the AdaBoost algorithm in Python

Published 2025-01-27 by the 千家信息网 editorial team

This article explains how to implement the AdaBoost algorithm in Python. The explanation is simple, clear, and easy to follow; work through it step by step to study how AdaBoost can be built from scratch.

Implementing AdaBoost as a hand-written class

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


def create_data():
    # The first 100 iris samples give a two-class problem; keep two features
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, [0, 1, -1]])
    # Relabel class 0 as -1 so the labels are in {-1, +1}
    for i in range(len(data)):
        if data[i, -1] == 0:
            data[i, -1] = -1
    return data[:, :2], data[:, -1]


class AdaBoost:
    def __init__(self, n_estimators=50, learning_rate=1.0):
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        self.X = datasets
        self.Y = labels
        self.M, self.N = datasets.shape
        # Collection of weak classifiers
        self.clf_sets = []
        # Initialize the sample weights uniformly
        self.weights = [1.0 / self.M] * self.M
        # Coefficients alpha of the G(x)'s
        self.alpha = []

    def _G(self, features, labels, weights):
        # Search one feature column for the threshold stump with the
        # smallest weighted error
        m = len(features)
        error = 100000.0  # effectively infinity
        best_v = 0.0
        features_min = min(features)
        features_max = max(features)
        n_step = (features_max - features_min +
                  self.learning_rate) // self.learning_rate
        # print('n_step:{}'.format(n_step))
        direct, compare_array = None, None
        for i in range(1, int(n_step)):
            v = features_min + self.learning_rate * i
            if v not in features:
                # Weighted error of the 'positive' stump: predict +1 if x > v
                compare_array_positive = np.array(
                    [1 if features[k] > v else -1 for k in range(m)])
                weight_error_positive = sum([
                    weights[k] for k in range(m)
                    if compare_array_positive[k] != labels[k]
                ])
                # Weighted error of the 'negative' stump: predict -1 if x > v
                compare_array_negative = np.array(
                    [-1 if features[k] > v else 1 for k in range(m)])
                weight_error_negative = sum([
                    weights[k] for k in range(m)
                    if compare_array_negative[k] != labels[k]
                ])
                if weight_error_positive < weight_error_negative:
                    weight_error = weight_error_positive
                    _compare_array = compare_array_positive
                    direct = 'positive'
                else:
                    weight_error = weight_error_negative
                    _compare_array = compare_array_negative
                    direct = 'negative'
                # print('v:{} error:{}'.format(v, weight_error))
                if weight_error < error:
                    error = weight_error
                    compare_array = _compare_array
                    best_v = v
        return best_v, direct, error, compare_array

    # Classifier coefficient alpha = 0.5 * ln((1 - e) / e)
    def _alpha(self, error):
        # Clamp the error to avoid division by zero when a stump is perfect
        error = max(error, 1e-10)
        return 0.5 * np.log((1 - error) / error)

    # Normalization factor Z
    def _Z(self, weights, a, clf):
        return sum([
            weights[i] * np.exp(-1 * a * self.Y[i] * clf[i])
            for i in range(self.M)
        ])

    # Weight update: w_i <- w_i * exp(-a * y_i * G(x_i)) / Z
    def _w(self, a, clf, Z):
        for i in range(self.M):
            self.weights[i] = self.weights[i] * np.exp(
                -1 * a * self.Y[i] * clf[i]) / Z

    # Linear combination f(x) of the G(x)'s (placeholder, unused)
    def _f(self, alpha, clf_sets):
        pass

    def G(self, x, v, direct):
        if direct == 'positive':
            return 1 if x > v else -1
        else:
            return -1 if x > v else 1

    def fit(self, X, y):
        self.init_args(X, y)
        for epoch in range(self.clf_num):
            axis = 0
            final_direct = 'null'
            best_clf_error, best_v, clf_result = 100000, None, None
            # Across feature dimensions, pick the stump with the smallest error
            for j in range(self.N):
                features = self.X[:, j]
                # Threshold, direction, weighted error, predictions
                v, direct, error, compare_array = self._G(
                    features, self.Y, self.weights)
                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j  # index of the chosen feature column
                # print('epoch:{}/{} feature:{} error:{} v:{}'.format(epoch, self.clf_num, j, error, best_v))
                if best_clf_error == 0:
                    break
            # Guard: _G may return None when no candidate threshold yields a
            # valid split; skip this round (see the note below the code)
            if clf_result is None:
                continue
            # Coefficient a of this G(x)
            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            # Record the weak classifier
            self.clf_sets.append((axis, best_v, final_direct))
            # Normalization factor
            Z = self._Z(self.weights, a, clf_result)
            # Weight update
            self._w(a, clf_result, Z)

    def predict(self, feature):
        result = 0.0
        for i in range(len(self.clf_sets)):
            axis, clf_v, direct = self.clf_sets[i]
            f_input = feature[axis]
            result += self.alpha[i] * self.G(f_input, clf_v, direct)
        # sign of the weighted vote
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        right_count = 0
        for i in range(len(X_test)):
            feature = X_test[i]
            if self.predict(feature) == y_test[i]:
                right_count += 1
        return right_count / len(X_test)


X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = AdaBoost(n_estimators=3, learning_rate=0.5)
clf.fit(X_train, y_train)
print("Score: {}".format(clf.score(X_test, y_test)))

Result: the score varies from run to run (sometimes 1.0, sometimes 0.75, 0.6, or 0.4), since train_test_split draws a different random split each time and only three weak classifiers are used. Note that without the `if clf_result is None: continue` guard in fit(), this program can raise TypeError: 'NoneType' object is not subscriptable when computing the normalization factor: if the candidate thresholds v happen to leave one side of every split empty and the other full, _G returns None for its comparison array, so the clf argument passed to _Z is None and indexing clf[i] raises the error.
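To make the formulas in _alpha, _Z, and _w concrete, here is a minimal, self-contained sketch (with made-up numbers, not taken from the article) that performs one round of the AdaBoost weight update by hand. With four samples under uniform weights and a stump that misclassifies exactly one of them, the weighted error is e = 0.25, so alpha = 0.5 * ln(3), and the misclassified sample's weight grows while the others shrink:

import numpy as np

y = np.array([1, 1, -1, -1])             # true labels
g = np.array([1, 1, -1, 1])              # stump predictions; the last one is wrong
w = np.array([0.25, 0.25, 0.25, 0.25])   # uniform initial weights

e = w[y != g].sum()                      # weighted error: 0.25
a = 0.5 * np.log((1 - e) / e)            # alpha = 0.5 * ln(3), about 0.549
Z = np.sum(w * np.exp(-a * y * g))       # normalization factor
w_new = w * np.exp(-a * y * g) / Z       # updated weights, sum to 1

print(w_new)  # [1/6, 1/6, 1/6, 0.5]: the hard sample now carries half the mass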

Using the existing scikit-learn implementation

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier


def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, [0, 1, -1]])
    # Relabel class 0 as -1 so the labels are in {-1, +1}
    for i in range(len(data)):
        if data[i, -1] == 0:
            data[i, -1] = -1
    return data[:, :2], data[:, -1]


X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5)
clf.fit(X_train, y_train)
print("Score: {}".format(clf.score(X_test, y_test)))
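By default, AdaBoostClassifier boosts depth-1 decision trees (decision stumps), which matches the hand-written stumps above. Below is a small sketch of customizing the weak learner; note that the parameter is named estimator from scikit-learn 1.2 onward and base_estimator in older releases, so adjust it to your installed version:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Make the decision-stump weak learner explicit; a larger max_depth
# would give each weak learner more capacity
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=0.5,
)
clf.fit(X_train, y_train)
print("Score: {}".format(clf.score(X_test, y_test)))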

Thank you for reading. That is all for "How to implement the AdaBoost algorithm in Python". Hopefully you now have a deeper understanding of the topic; the details are best confirmed by experimenting yourself. Follow this site for more articles on related topics!
