您好,登錄后才能下訂單哦!
ID3算法
# coding=utf-8
"""ID3 decision-tree construction and classification.

The input data set is a list of rows; each row's last column is the
class label and all other columns are discrete (nominal) feature values.
The tree is represented as nested dicts: {feature_name: {value: subtree}}.
"""
from math import log
import operator
import pickle


def createDataSet():
    """Return a tiny demo data set and the list of its feature names."""
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # feature names (discrete values)
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class-label column of dataSet."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # last column is the class label
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / float(numEntries)
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature `axis` equals `value`, with that
    feature column removed (it has been consumed by the split)."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # chop out the axis used for splitting
            retDataSet.append(featVec[:axis] + featVec[axis + 1:])
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the largest information gain.

    Returns -1 when no feature yields a positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueVals:
            # expected entropy of the partition induced by feature i
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # reduction in entropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class label (majority vote)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Py3 fix: dict.iteritems() no longer exists; use items().
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Bug fix vs. the original: the caller's `labels` list is no longer
    mutated (`del labels[bestFeat]` removed) — recursion works on a copy,
    so `labels` can be reused afterwards, e.g. by classify().
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]            # all rows share one label: leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # features exhausted: majority leaf
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # private copy without the consumed feature — do not mutate caller's list
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    featValues = set(example[bestFeat] for example in dataSet)
    for value in featValues:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """Classify testVec (feature values ordered as featLabels) with a
    tree produced by createTree; returns the predicted class label."""
    # Py3 fix: dict.keys() is a view and not indexable; take the root key.
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):
        return classify(valueOfFeat, featLabels, testVec)  # internal node
    return valueOfFeat                                     # leaf label


def storeTree(inputTree, filename):
    """Serialize a decision tree to disk with pickle."""
    # Py3 fix: pickle requires a binary-mode file; 'with' guarantees close.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a pickled decision tree from disk.

    SECURITY NOTE: pickle.load executes arbitrary code embedded in the
    file — only load files from a trusted source.
    """
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
輸入矩陣是一張表,最后一列是結果。
ID3算法是比較粗糙的算法,只能處理標稱型(離散)特徵的分類,無法直接處理連續數值型數據。
而且在選擇特征值時,傾向于選擇種類較多的特征值,因此需要改進。
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。