您好,登錄后才能下訂單哦!
ID3算法
# coding=utf-8
"""ID3 decision-tree construction and classification.

The input data set is a list of rows; each row's last column is the
class label and all other columns are discrete (nominal) feature values.
The tree is represented as nested dicts: {feature_name: {value: subtree}}.
"""
from math import log
import operator
import pickle


def createDataSet():
    """Return a tiny demo data set and the list of its feature names."""
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # feature names (discrete values)
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class-label column of dataSet."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # last column is the class label
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / float(numEntries)
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature `axis` equals `value`, with that
    feature column removed (it has been consumed by the split)."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # chop out the axis used for splitting
            retDataSet.append(featVec[:axis] + featVec[axis + 1:])
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the largest information gain.

    Returns -1 when no feature yields a positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueVals:
            # expected entropy of the partition induced by feature i
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # reduction in entropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class label (majority vote)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Py3 fix: dict.iteritems() no longer exists; use items().
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Bug fix vs. the original: the caller's `labels` list is no longer
    mutated (`del labels[bestFeat]` removed) — recursion works on a copy,
    so `labels` can be reused afterwards, e.g. by classify().
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]            # all rows share one label: leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # features exhausted: majority leaf
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # private copy without the consumed feature — do not mutate caller's list
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    featValues = set(example[bestFeat] for example in dataSet)
    for value in featValues:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """Classify testVec (feature values ordered as featLabels) with a
    tree produced by createTree; returns the predicted class label."""
    # Py3 fix: dict.keys() is a view and not indexable; take the root key.
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):
        return classify(valueOfFeat, featLabels, testVec)  # internal node
    return valueOfFeat                                     # leaf label


def storeTree(inputTree, filename):
    """Serialize a decision tree to disk with pickle."""
    # Py3 fix: pickle requires a binary-mode file; 'with' guarantees close.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a pickled decision tree from disk.

    SECURITY NOTE: pickle.load executes arbitrary code embedded in the
    file — only load files from a trusted source.
    """
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
輸入矩陣是一張表,最后一列是結果。
ID3算法是比較粗糙的算法,只能處理標稱型(離散)特徵的分類,無法直接處理連續數值型數據。
而且在選擇特征值時,傾向于選擇種類較多的特征值,因此需要改進。
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。