Python ID3 DecisionTree

2024年6月10日 145次阅读

#coding=UTF-8
from math import log
import operator
#from DecisionTree import treePlotter
#from DecisionTree.treePlotter import createPlot



def createDataSet():
    """Return the toy training set together with its feature names.

    Each sample is a list ``[feature_0, feature_1, class_label]``; the class
    label (last column) is either ``'yes'`` or ``'no'``.

    Returns:
        A ``(dataSet, labels)`` pair where ``dataSet`` is a fresh list of
        sample lists and ``labels`` holds the name of each feature column.
    """
    # Names of the feature columns, in column order.
    feature_names = ['no surfacing', 'flippers']
    rows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    # Materialise every row as its own list so callers get mutable samples.
    samples = [list(row) for row in rows]
    return samples, feature_names

# Compute the Shannon entropy of a data set.
def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in *dataSet*.

    Args:
        dataSet: sequence of samples; the last element of each sample is
            taken as its class label.

    Returns:
        The entropy as a float; ``0.0`` for an empty data set.
    """
    total = len(dataSet)
    # Map each class label to the number of samples carrying it.
    tally = {}
    for row in dataSet:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    entropy = 0.0
    for count in tally.values():
        # Relative frequency of the class, used as its probability.
        fraction = float(count) / total
        entropy -= fraction * log(fraction, 2)
    return entropy

# Partition the data set on one feature value (used for conditional entropy).
def splitDataSet(dataSet, axis, value):
    """Return the samples whose column *axis* equals *value*, minus that column.

    For example, splitting on an "age" column with value "young" keeps every
    "young" row but drops the age column itself from each kept row.

    Args:
        dataSet: sequence of sample lists.
        axis: index of the feature column to test and remove.
        value: feature value a sample must have to be kept.

    Returns:
        A new list of new sample lists; *dataSet* is not modified.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]

# Pick the feature to split on: the one with the largest information gain.
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Information gain is the entropy of the whole data set minus the
    conditional entropy after partitioning on the feature.

    Args:
        dataSet: non-empty sequence of samples whose last column is the
            class label; all other columns are features.

    Returns:
        The column index of the best feature, or ``-1`` when no feature
        yields a strictly positive information gain.
    """
    sample_total = float(len(dataSet))
    # The last column is the class label, so it is not a candidate feature.
    feature_count = len(dataSet[0]) - 1
    base_entropy = calcShannonEnt(dataSet)
    best_index, best_gain = -1, 0.0
    for index in range(feature_count):
        # Entropy remaining after splitting on this feature: the
        # size-weighted sum of the entropies of the resulting subsets.
        conditional_entropy = 0.0
        for value in set(row[index] for row in dataSet):
            subset = splitDataSet(dataSet, index, value)
            conditional_entropy += len(subset) / sample_total * calcShannonEnt(subset)
        gain = base_entropy - conditional_entropy
        # Strict comparison: on ties the earliest feature wins.
        if gain > best_gain:
            best_index, best_gain = index, gain
    return best_index

# Majority vote: decides the class of a leaf once all features are used up.
def majorityCnt(classList):
    """Return the class label that occurs most often in *classList*.

    Args:
        classList: non-empty sequence of hashable class labels.

    Returns:
        The most frequent label.

    Raises:
        IndexError: if *classList* is empty.
    """
    # Map each label to its number of occurrences.
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    # Sort by count, largest first, so the winner is the first entry.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Build the decision tree.
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Example result:
    ``{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}``

    Args:
        dataSet: non-empty sequence of sample lists whose last column is the
            class label.
        labels: feature names, one per feature column. The list is not
            modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    # Class label of every sample (last column).
    classList = [example[-1] for example in dataSet]
    # All samples share one class: no further split is needed.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left: no features remain, use majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    # -1 means no feature gives a positive information gain. Indexing with
    # -1 would split on the class-label column itself, so stop here instead.
    if bestFeat < 0:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Remaining feature names, built as a new list so the caller's list is
    # left untouched.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    # One branch per distinct value of the chosen feature.
    for value in set(example[bestFeat] for example in dataSet):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


# Classify a test vector with a trained decision tree.
def classify(inputTree, featLabels, testVec):
    """Return the class label the decision tree assigns to *testVec*.

    Args:
        inputTree: tree of the form
            ``{feature_name: {feature_value: subtree_or_label, ...}}``.
        featLabels: feature names in column order; used to find which
            position of *testVec* a tree node refers to.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that *testVec* reaches.

    Raises:
        ValueError: if a node's feature name is not in *featLabels*.
        KeyError: if the tree has no branch for the value in *testVec*.
    """
    # The single key of the node is the name of the feature it tests,
    # e.g. 'no surfacing'. next(iter(...)) works on Python 2 and 3, whereas
    # keys()[0] fails on Python 3 because dict views are not indexable.
    firstStr = next(iter(inputTree))
    # Branches of this node, e.g. {0: 'no', 1: {'flippers': {...}}}.
    secondDict = inputTree[firstStr]
    # Column of the test vector that holds this feature.
    featIndex = featLabels.index(firstStr)
    valueOfFeat = secondDict[testVec[featIndex]]
    # A dict is an inner node: keep descending. Anything else is a leaf.
    if isinstance(valueOfFeat, dict):
        return classify(valueOfFeat, featLabels, testVec)
    return valueOfFeat

# Persist a decision tree: pickle serialises the object to a file on disk.
def storeTree(inputTree, filename):
    """Serialise *inputTree* with pickle and write it to *filename*.

    Args:
        inputTree: the object (decision tree) to store.
        filename: path of the file to create or overwrite.
    """
    import pickle
    # pickle produces bytes, so the file must be opened in binary mode; the
    # with-block closes it even if dumping fails.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)

# Load a previously stored object back from disk when it is needed.
def grabTree(filename):
    """Read and return the pickled object stored in *filename*.

    Args:
        filename: path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    import pickle
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    # Binary mode is required for pickle data; the with-block closes the
    # file once it has been read.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)



if __name__ == "__main__":
    # Demo: train on the toy data set and classify one sample.
    dataSet, labels = createDataSet()

    # Build the tree from a copy of the feature names so that `labels` is
    # guaranteed to stay complete for the classify() call below.
    inputTree = createTree(dataSet, labels[:])
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(inputTree)

    print(classify(inputTree, labels, [0, 1]))

    # Optional example: store the tree on disk, reload it and classify with
    # the reloaded copy.
    # storeTree(inputTree, "inputTree.tree")
    # inputTree = grabTree("inputTree.tree")
    # print(classify(inputTree, labels, [0, 1]))

    # Optional example: contact-lens data set read from a tab-separated
    # 'lenses.txt' file (the last column is the class label).
    # with open('lenses.txt') as fr:
    #     lenses = [inst.strip().split('\t') for inst in fr]
    # lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    # inputTree = createTree(lenses, lensesLabels[:])
    # print("---------------------------------------------------")
    # print(inputTree)
    # print(classify(inputTree, lensesLabels,
    #                ['pre', 'myope', 'yes', 'normal', 'hard']))
    # The tree can also be drawn with treePlotter.createPlot(inputTree) when
    # the DecisionTree.treePlotter module is available.

lenses.txt文件内容

young	myope	no	reduced	no lenses
young	myope	no	normal	soft
young	myope	yes	reduced	no lenses
young	myope	yes	normal	hard
young	hyper	no	reduced	no lenses
young	hyper	no	normal	soft
young	hyper	yes	reduced	no lenses
young	hyper	yes	normal	hard
pre	myope	no	reduced	no lenses
pre	myope	no	normal	soft
pre	myope	yes	reduced	no lenses
pre	myope	yes	normal	hard
pre	hyper	no	reduced	no lenses
pre	hyper	no	normal	soft
pre	hyper	yes	reduced	no lenses
pre	hyper	yes	normal	no lenses
presbyopic	myope	no	reduced	no lenses
presbyopic	myope	no	normal	no lenses
presbyopic	myope	yes	reduced	no lenses
presbyopic	myope	yes	normal	hard
presbyopic	hyper	no	reduced	no lenses
presbyopic	hyper	no	normal	soft
presbyopic	hyper	yes	reduced	no lenses
presbyopic	hyper	yes	normal	no lenses