决策树ID3 C4.5 CART代码
生活随笔
收集整理的这篇文章主要介绍了
决策树ID3 C4.5 CART代码
小编觉得挺不错的，现在分享给大家，帮大家做个参考。
ID3
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np


class DecisionTree:
    """ID3 decision tree for categorical attributes.

    The fitted model is a nested dict {'Node': attribute, 'Edge': {value: subtree}}
    whose leaves are class labels.
    """

    def __init__(self):
        self.model = None  # set by fit()

    def calEntropy(self, y):
        """Return the base-2 Shannon entropy of the label series *y*."""
        valRate = y.value_counts().apply(lambda x: x / y.size)  # class probabilities
        # value_counts() never reports zero counts, so log2() is safe here
        return np.inner(valRate, np.log2(valRate)) * -1

    def fit(self, xTrain, yTrain=None):
        """Build the tree from *xTrain*.

        If *yTrain* is None (or empty), the last column of *xTrain* is taken
        as the class label.  Returns the tree (also stored in self.model).
        """
        # NOTE: the original used a pd.Series(...) instance as the default
        # argument (mutable default); None is the idiomatic replacement.
        if yTrain is None or yTrain.size == 0:
            yTrain = xTrain.iloc[:, -1]
            xTrain = xTrain.iloc[:, :-1]
        self.model = self.buildDecisionTree(xTrain, yTrain)
        return self.model

    def buildDecisionTree(self, xTrain, yTrain):
        """Recursively build a subtree; returns a leaf label or a node dict."""
        yTrainCounts = yTrain.value_counts()
        if yTrainCounts.size == 1:
            return yTrainCounts.index[0]  # pure node -> leaf
        if len(xTrain.columns) == 0:
            # No attributes left to split on -> majority vote.
            # (The original crashed here with xTrain[None].)
            return yTrainCounts.index[0]
        entropyD = self.calEntropy(yTrain)
        maxGain = None  # best information gain so far
        maxEntropyPropName = None  # attribute achieving it
        for propName in xTrain.columns:
            propDatas = xTrain[propName]
            # frequency of each value of this attribute
            propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)
            sumEntropyByProp = 0  # conditional entropy H(D|a)
            for propClass, dvRate in propClassSummary.items():
                yDataByPropClass = yTrain[xTrain[propName] == propClass]
                sumEntropyByProp += self.calEntropy(yDataByPropClass) * dvRate
            gainEach = entropyD - sumEntropyByProp
            if maxGain is None or gainEach > maxGain:
                maxGain = gainEach
                maxEntropyPropName = propName
        retClassByProp = {}
        for propClass in xTrain[maxEntropyPropName].value_counts().index:
            whichIndex = xTrain[maxEntropyPropName] == propClass
            # drop() returns a copy, avoiding the SettingWithCopy pitfall of
            # the original `del view[col]` on a filtered slice
            xDataByPropClass = xTrain[whichIndex].drop(columns=[maxEntropyPropName])
            retClassByProp[propClass] = self.buildDecisionTree(xDataByPropClass, yTrain[whichIndex])
        return {'Node': maxEntropyPropName, 'Edge': retClassByProp}

    def predictBySeries(self, modelNode, data=None):
        """Walk the tree for one sample (a Series); None when no edge matches."""
        if not isinstance(modelNode, dict):
            return modelNode  # leaf
        prpVal = data.get(modelNode['Node'])  # sample's value for this node's attribute
        for edge, nextNode in modelNode['Edge'].items():
            if prpVal == edge:
                return self.predictBySeries(nextNode, data)
        return None  # attribute value unseen during training

    def predict(self, data):
        """Predict a single sample (Series) or a whole DataFrame (row by row)."""
        if isinstance(data, pd.Series):
            return self.predictBySeries(self.model, data)
        elif isinstance(data, pd.DataFrame):
            return data.apply(lambda d: self.predictBySeries(self.model, d), axis=1)


if __name__ == "__main__":  # guard so the module is importable without xigua.csv
    data = pd.read_csv("xigua.csv", encoding="gbk")
    data_train = data.iloc[:, :-1]  # drop the label column
    decisionTree = DecisionTree()
    treeData = decisionTree.fit(data)
    print("树结构为:\n", treeData)
    print(pd.DataFrame({'预测值': decisionTree.predict(data_train), '真实标签': data.iloc[:, -1]}))

# C4.5
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np


class DecisionTree:
    """C4.5 decision tree: splits on the attribute with the largest gain ratio.

    The fitted model is a nested dict {'Node': attribute, 'Edge': {value: subtree}}
    whose leaves are class labels.
    """

    def __init__(self):
        self.model = None  # set by fit()

    def calEntropy(self, y):
        """Return the base-2 Shannon entropy of the label series *y*."""
        valRate = y.value_counts().apply(lambda x: x / y.size)  # class probabilities
        # value_counts() never reports zero counts, so log2() is safe here
        return np.inner(valRate, np.log2(valRate)) * -1

    def fit(self, xTrain, yTrain=None):
        """Build the tree from *xTrain*.

        If *yTrain* is None (or empty), the last column of *xTrain* is taken
        as the class label.  Returns the tree (also stored in self.model).
        """
        # NOTE: the original used a pd.Series(...) instance as the default
        # argument (mutable default); None is the idiomatic replacement.
        if yTrain is None or yTrain.size == 0:
            yTrain = xTrain.iloc[:, -1]
            xTrain = xTrain.iloc[:, :-1]
        self.model = self.buildDecisionTree(xTrain, yTrain)
        return self.model

    def buildDecisionTree(self, xTrain, yTrain):
        """Recursively build a subtree; returns a leaf label or a node dict.

        Picks the attribute with the highest gain ratio
        (information gain / intrinsic value, both base 2).
        """
        yTrainCounts = yTrain.value_counts()
        if yTrainCounts.size == 1:
            return yTrainCounts.index[0]  # pure node -> leaf
        if len(xTrain.columns) == 0:
            # No attributes left to split on -> majority vote.
            # (The original crashed here with xTrain[None].)
            return yTrainCounts.index[0]
        entropyD = self.calEntropy(yTrain)
        maxGainRatio = None  # best gain ratio so far
        bestPropName = None  # attribute achieving it
        for propName in xTrain.columns:
            propDatas = xTrain[propName]
            # frequency of each value of this attribute
            propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)
            sumEntropyByProp = 0  # conditional entropy H(D|a)
            IV = 0  # intrinsic value of the attribute
            for propClass, dvRate in propClassSummary.items():
                yDataByPropClass = yTrain[xTrain[propName] == propClass]
                sumEntropyByProp += self.calEntropy(yDataByPropClass) * dvRate
                # original used np.log (base e) here, inconsistent with the
                # base-2 entropy used for the gain numerator
                IV -= dvRate * np.log2(dvRate)
            if IV == 0:
                continue  # single-valued attribute: useless split; avoids div-by-zero
            gainRatio = (entropyD - sumEntropyByProp) / IV
            if maxGainRatio is None or gainRatio > maxGainRatio:
                maxGainRatio = gainRatio
                bestPropName = propName
        if bestPropName is None:
            return yTrainCounts.index[0]  # every attribute single-valued -> majority vote
        retClassByProp = {}
        for propClass in xTrain[bestPropName].value_counts().index:
            whichIndex = xTrain[bestPropName] == propClass
            # drop() returns a copy, avoiding the SettingWithCopy pitfall of
            # the original `del view[col]` on a filtered slice
            xDataByPropClass = xTrain[whichIndex].drop(columns=[bestPropName])
            retClassByProp[propClass] = self.buildDecisionTree(xDataByPropClass, yTrain[whichIndex])
        return {'Node': bestPropName, 'Edge': retClassByProp}

    def predictBySeries(self, modelNode, data=None):
        """Walk the tree for one sample (a Series); None when no edge matches."""
        if not isinstance(modelNode, dict):
            return modelNode  # leaf
        prpVal = data.get(modelNode['Node'])  # sample's value for this node's attribute
        for edge, nextNode in modelNode['Edge'].items():
            if prpVal == edge:
                return self.predictBySeries(nextNode, data)
        return None  # attribute value unseen during training

    def predict(self, data):
        """Predict a single sample (Series) or a whole DataFrame (row by row)."""
        if isinstance(data, pd.Series):
            return self.predictBySeries(self.model, data)
        elif isinstance(data, pd.DataFrame):
            return data.apply(lambda d: self.predictBySeries(self.model, d), axis=1)


if __name__ == "__main__":  # guard so the module is importable without xigua.csv
    data = pd.read_csv("xigua.csv", encoding="gbk")
    data_train = data.iloc[:, :-1]  # drop the label column
    decisionTree = DecisionTree()
    treeData = decisionTree.fit(data)
    print("树结构为:\n", treeData)
    print(pd.DataFrame({'预测值': decisionTree.predict(data_train), '真实标签': data.iloc[:, -1]}))

# CART
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np


class DecisionTree:
    """CART-style decision tree: splits on the attribute with the smallest
    weighted Gini index (multiway splits on categorical attributes).

    The fitted model is a nested dict {'Node': attribute, 'Edge': {value: subtree}}
    whose leaves are class labels.
    """

    def __init__(self):
        self.model = None  # set by fit()

    def Gini(self, y):
        """Return the Gini impurity 1 - sum(p^2) of the label series *y*."""
        valRate = y.value_counts().apply(lambda x: x / y.size)  # class probabilities
        # BUG FIX: the original returned 1 - (sum p^2) * -1 == 1 + sum p^2;
        # the correct Gini impurity is 1 - sum p^2 (0 for a pure node).
        return 1 - np.inner(valRate, valRate)

    def fit(self, xTrain, yTrain=None):
        """Build the tree from *xTrain*.

        If *yTrain* is None (or empty), the last column of *xTrain* is taken
        as the class label.  Returns the tree (also stored in self.model).
        """
        # NOTE: the original used a pd.Series(...) instance as the default
        # argument (mutable default); None is the idiomatic replacement.
        if yTrain is None or yTrain.size == 0:
            yTrain = xTrain.iloc[:, -1]
            xTrain = xTrain.iloc[:, :-1]
        self.model = self.buildDecisionTree(xTrain, yTrain)
        return self.model

    def buildDecisionTree(self, xTrain, yTrain):
        """Recursively build a subtree; returns a leaf label or a node dict.

        Selects the attribute that MINIMIZES the weighted Gini index.  (The
        original maximized it — with its sign-flipped Gini() the two errors
        cancelled, so the produced tree is unchanged by this fix.)
        """
        yTrainCounts = yTrain.value_counts()
        if yTrainCounts.size == 1:
            return yTrainCounts.index[0]  # pure node -> leaf
        if len(xTrain.columns) == 0:
            # No attributes left to split on -> majority vote.
            # (The original crashed here with xTrain[None].)
            return yTrainCounts.index[0]
        minGiniIndex = None  # smallest weighted Gini so far
        bestPropName = None  # attribute achieving it
        for propName in xTrain.columns:
            propDatas = xTrain[propName]
            # frequency of each value of this attribute
            propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)
            sumGiniByProp = 0  # weighted Gini index of this split
            for propClass, dvRate in propClassSummary.items():
                yDataByPropClass = yTrain[xTrain[propName] == propClass]
                sumGiniByProp += self.Gini(yDataByPropClass) * dvRate
            if minGiniIndex is None or sumGiniByProp < minGiniIndex:
                minGiniIndex = sumGiniByProp
                bestPropName = propName
        retClassByProp = {}
        for propClass in xTrain[bestPropName].value_counts().index:
            whichIndex = xTrain[bestPropName] == propClass
            # drop() returns a copy, avoiding the SettingWithCopy pitfall of
            # the original `del view[col]` on a filtered slice
            xDataByPropClass = xTrain[whichIndex].drop(columns=[bestPropName])
            retClassByProp[propClass] = self.buildDecisionTree(xDataByPropClass, yTrain[whichIndex])
        return {'Node': bestPropName, 'Edge': retClassByProp}

    def predictBySeries(self, modelNode, data=None):
        """Walk the tree for one sample (a Series); None when no edge matches."""
        if not isinstance(modelNode, dict):
            return modelNode  # leaf
        prpVal = data.get(modelNode['Node'])  # sample's value for this node's attribute
        for edge, nextNode in modelNode['Edge'].items():
            if prpVal == edge:
                return self.predictBySeries(nextNode, data)
        return None  # attribute value unseen during training

    def predict(self, data):
        """Predict a single sample (Series) or a whole DataFrame (row by row)."""
        if isinstance(data, pd.Series):
            return self.predictBySeries(self.model, data)
        elif isinstance(data, pd.DataFrame):
            return data.apply(lambda d: self.predictBySeries(self.model, d), axis=1)


if __name__ == "__main__":  # guard so the module is importable without xigua.csv
    data = pd.read_csv("xigua.csv", encoding="gbk")
    data_train = data.iloc[:, :-1]  # drop the label column
    decisionTree = DecisionTree()
    treeData = decisionTree.fit(data)
    print("树结构为:\n", treeData)
    print(pd.DataFrame({'预测值': decisionTree.predict(data_train), '真实标签': data.iloc[:, -1]}))

# xigua.csv dataset
总结
以上是生活随笔为你收集整理的决策树ID3 C4.5 CART代码的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: 作者:宋璇(1993-),女,食品安全大
- 下一篇: 作者:张群(1988-),女,博士,中国