数据挖掘中决策树ID3非递归算法

最近看了下ID3算法,虽然很经典,但是网上90%的实现方式都是用的递归,大家都知道递归的效率低下,特别是当数据集很多的时候,刚好最近在看python,因此无聊就把它改成非递归的了

数据集:

Sunny Hot High Weak No
Sunny Hot High Strong No
Overcast Hot High Weak Yes
Rain Mild High Weak Yes
Rain Cool Normal Weak Yes
Rain Cool Normal Strong No
Overcast Cool Normal Strong Yes
Sunny Mild High Weak No
Sunny Cool Normal Weak Yes
Rain Mild Normal Weak Yes
Sunny Mild Normal Strong Yes
Overcast Mild High Strong Yes
Overcast Hot Normal Weak Yes
Rain Mild High Strong No

代码:

#encoding:utf-8
__author__ = 'wangjiewen'

import json
from math import log

class ID3(object):
    """Non-recursive ID3 decision-tree learner.

    The training data is a matrix of rows whose last column is the class
    label; ``self.labels`` names the feature columns.  The learned tree is
    nested dicts of the shape::

        {featureName: {featureValue: subtree-or-classLabel, ...}}
    """

    def __init__(self):
        # One row per training sample; the last column is the class label.
        self.matrix = []
        # Feature names aligned with the feature columns of ``matrix``.
        self.labels = ['Outlook', 'Temperature', 'Humidity', 'Wind']
        # Learned decision tree; populated by createTree().
        self.tree = {}
        self.loadFile('trainset.txt')

    def loadFile(self, filename):
        """Load a space-separated training file into ``self.matrix``.

        :param filename: path to the data file (one sample per line)
        :return: None
        """
        # ``with`` guarantees the handle is closed (the original leaked it).
        with open(filename) as fr:
            for lineStr in fr:
                lineArr = lineStr.strip().split(' ')
                self.matrix.append(lineArr)

    def calcEntropy(self, dataSet):
        """Compute the Shannon entropy  -sum(p(x) * log2 p(x))  of *dataSet*.

        :param dataSet: rows whose last column is the class label
        :return: float entropy in bits (0.0 for an empty set)
        """
        nums = len(dataSet)
        if nums == 0:
            return 0.0

        # Count how many samples fall into each class.
        labelsCount = {}
        for featVec in dataSet:
            classLabel = featVec[-1]
            labelsCount[classLabel] = labelsCount.get(classLabel, 0) + 1

        entropy = 0.0
        for count in labelsCount.values():
            prob = float(count) / nums
            entropy -= prob * log(prob, 2)
        return entropy

    def splitDataSet(self, dataSet, col):
        """Group the rows of *dataSet* by their value in column *col*.

        Rows are copied unmodified (the split column is kept), so column
        indices remain valid for subsequent splits.

        :param dataSet: list of rows
        :param col: column index to group by
        :return: dict mapping each distinct value in *col* to its rows
        """
        result = {}
        for featVec in dataSet:
            result.setdefault(featVec[col], []).append(featVec[:])
        return result

    def selectMaxGainCol(self, dataSet):
        """Select the feature column with the largest information gain.

        :param dataSet: rows whose last column is the class label
        :return: column index, or -1 if no column yields positive gain
        """
        numsOfCol = len(dataSet[0]) - 1  # last column is the class label
        numsOfData = len(dataSet)

        maxGain = 0.0
        maxFeatCol = -1

        # Entropy(S) of the unsplit set.
        entropyS = self.calcEntropy(dataSet)

        for col in range(numsOfCol):
            featDict = self.splitDataSet(dataSet, col)

            # Gain(S, A) = Entropy(S) - sum(|Sv|/|S| * Entropy(Sv))
            tmpGain = entropyS
            for featArr in featDict.values():
                weight = len(featArr) / float(numsOfData)
                tmpGain -= weight * self.calcEntropy(featArr)

            if tmpGain > maxGain:
                maxGain = tmpGain
                maxFeatCol = col
        return maxFeatCol

    def createTree(self):
        """Build the decision tree iteratively with an explicit stack.

        NOTE(review): assumes the root split has positive information gain
        (true for the shipped dataset); a single-class training set would
        produce a degenerate tree, as in the original.

        :return: the root dict (also stored in ``self.tree``)
        """
        initCol = self.selectMaxGainCol(self.matrix)
        root = {self.labels[initCol]: {}}

        # Each frame: (value-dict to fill in, split column, rows to split).
        stack = [(root[self.labels[initCol]], initCol, self.matrix)]

        while stack:
            pCur, col, dataSet = stack.pop()

            # Partition the current rows by the chosen attribute.
            splitDict = self.splitDataSet(dataSet, col)

            for key, data in splitDict.items():
                classSet = set(example[-1] for example in data)

                # Pure branch: record the class label as a leaf.
                if len(classSet) == 1:
                    pCur[key] = classSet.pop()
                    continue

                subCol = self.selectMaxGainCol(data)
                if subCol == -1:
                    # No attribute gives positive gain: fall back to the
                    # majority class.  (The original indexed labels[-1] here
                    # and split on the class-label column.)
                    pCur[key] = max(
                        classSet,
                        key=lambda c: sum(1 for ex in data if ex[-1] == c))
                    continue

                # Impure branch: hang a new feature node and schedule its
                # split.  The child dict is nested under the feature name —
                # the original attached deeper nodes as siblings of the
                # feature key, corrupting trees deeper than two splits.
                subLabel = self.labels[subCol]
                pCur[key] = {subLabel: {}}
                stack.append((pCur[key][subLabel], subCol, data))

        print(json.dumps(root, indent=4))
        self.tree = root
        return root

    def classify(self, featLabels=None, testVec=None):
        """Predict the class label for one sample.

        :param featLabels: feature names matching the columns of *testVec*
        :param testVec: the sample's feature values
        :return: predicted class label, or None for an unseen branch value
        """
        # Avoid mutable default arguments (original used featLabels=[]).
        featLabels = [] if featLabels is None else featLabels
        testVec = [] if testVec is None else testVec

        # list(...) keeps this working on both Python 2 lists and Python 3
        # dict views (the original's .keys()[0] breaks on Python 3).
        curKey = list(self.tree.keys())[0]
        curNode = self.tree[curKey]

        # The tree strictly alternates {featureName: {value: ...}}, so a
        # single downward walk suffices; no stack is needed.
        while True:
            featIndex = featLabels.index(curKey)
            value = testVec[featIndex]
            if value not in curNode:
                return None  # attribute value never seen during training

            child = curNode[value]
            if not isinstance(child, dict):
                return child  # leaf: the class label

            curKey = list(child.keys())[0]
            curNode = child[curKey]

    def test(self):
        """Swap in a tiny hard-coded dataset for quick manual checks."""
        dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
        labels = ['no surfacing', 'flippers']
        self.matrix = dataSet
        self.labels = labels



# Guard the demo so importing this module does not trigger training I/O.
if __name__ == '__main__':
    # ID3.__init__ loads 'trainset.txt'; createTree prints the tree as JSON.
    model = ID3()
    tree = model.createTree()

    featLabels = ['Outlook', 'Temperature', 'Humidity', 'Wind']
    testVec = "Rain Mild High Weak".split(" ")
    testVec2 = "Overcast Mild High Weak".split(" ")

    result = model.classify(featLabels, testVec)
    # print() call instead of the Python-2-only print statement.
    print(result)

最近看了下ID3算法,虽然很经典,但是网上90%的实现方式都是用的递归,大家都知道递归的效率低下,特别是当数据集很多的时候,因此无聊就把它改成非递归的了

上代码:

    原文作者:递归算法
    原文地址: https://blog.csdn.net/yinchuandong2/article/details/41682293
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞