您好,登錄后才能下訂單哦!
這篇文章將為大家詳細講解有關解決keras讀取多個hdf5文件進行訓練的方法,小編覺得挺實用的,因此分享給大家做個參考,希望大家閱讀完這篇文章後可以有所收獲。
用keras進行大數據訓練,為了加快訓練,需要提前製作訓練集。
由於HDF5的特性,所有數據需要一次性讀入到內存中,才能保存。
為此,我採用分批次的方式,分為2個以上HDF5文件進行存儲。
1、先讀取每個標籤下的圖片,並設置標籤
def load_dataset(path_name,data_path):
    """Read every class folder under ``path_name`` and collect resized images.

    This is the first part of the function as excerpted by the article; the
    rest of the per-class loop body (split, accumulate, write) is shown in
    the following steps.

    Args:
        path_name: directory holding one sub-directory per label.
        data_path: output directory for the HDF5 files (used by later steps).
    """
    images = []
    labels = []
    train_images = []
    valid_images = []
    train_labels = []
    valid_labels = []
    counter = 0  # integer label of the class folder currently being read
    allpath = os.listdir(path_name)
    nb_classes = len(allpath)
    print("label_num: ",nb_classes)
    for child_dir in allpath:
        child_path = os.path.join(path_name, child_dir)
        for dir_image in os.listdir(child_path):
            if not dir_image.endswith('.jpg'):
                continue
            img = cv2.imread(os.path.join(child_path, dir_image))
            if img is None:
                # cv2.imread signals an unreadable/corrupt file with None
                # instead of raising; skip it rather than crash in resize.
                continue
            # scipy.misc.imresize no longer exists in current SciPy; cv2.resize
            # with bilinear interpolation is the drop-in replacement.
            image = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE),
                               interpolation=cv2.INTER_LINEAR)
            images.append(image)
            labels.append(counter)
2、該標籤下的數據集分割為訓練集(train images),驗證集(val images),訓練標籤(train labels),驗證標籤(val labels)
def split_dataset(images, labels):
    """Split one label's samples into training and validation parts.

    20% of the samples go to validation. The split seed is drawn at random
    (an integer in 0..100) on every call, so the split is not reproducible.

    Returns:
        (train_images, valid_images, train_labels, valid_labels)
    """
    seed = random.randint(0, 100)
    parts = train_test_split(images, labels, test_size=0.2, random_state=seed)
    train_images, valid_images, train_labels, valid_labels = parts
    return train_images, valid_images, train_labels, valid_labels
3、分割后的數據分別添加到總的訓練集,驗證集,訓練標簽,驗證標簽。
其次,清空原有的圖片集和標籤集,目的是節省內存。假如一次性讀入多個標籤的數據集與標籤集,進行數據分割後,佔用的內存會是逐個標籤進行上述操作時的兩倍以上。
# Excerpt from the per-class loop body of load_dataset.
# Split the current class and fold the pieces into the running totals.
images = np.array(images)
t_images, v_images, t_labels, v_labels = split_dataset(images, labels)
train_images.extend(t_images)
train_labels.extend(t_labels)
valid_images.extend(v_images)
valid_labels.extend(v_labels)

# Progress report once every 50 classes.
if (counter + 1) % 50 == 0:
    print( counter+1 , "is read to the memory!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

# Drop the per-class buffers so only the accumulated sets stay in memory.
images, labels = [], []
counter += 1
print("train_images num: ", len(train_images), " ", "valid_images num: ",len(valid_images))
4、進行判斷,直到讀到自己分割的那個標籤。
開始進行寫入。寫入之前,為了更好地訓練模型,需要把對應的圖片集和標簽打亂順序。
# Excerpt from the per-class loop body of load_dataset.
# Flush the accumulated samples to a new HDF5 directory whenever a chunk of
# classes is complete, or when the last class has been read.
classes_per_file = 4316
if counter % classes_per_file == classes_per_file - 1 or counter == nb_classes - 1:
    print("start write images and labels data...................................................................")
    # Derive the chunk index from the SAME chunk size used by the trigger
    # above; dividing by a different constant makes later chunks map to the
    # same directory and overwrite each other.
    num = counter // classes_per_file
    # The "- 1" offset is kept from the original naming scheme (the first
    # directory is therefore "h6_-1").
    dirs = data_path + "/" + "h6_" + str(num - 1)
    os.makedirs(dirs, exist_ok=True)
    # Write the accumulated sets, not just the last class's split.
    data2h6(dirs, train_images, valid_images, train_labels, valid_labels)
    # Start the next chunk empty so files do not repeat earlier samples.
    train_images = []
    valid_images = []
    train_labels = []
    valid_labels = []
對應打亂順序并寫入到HDF5
def data2h6(dirs_path, train_images, valid_images, train_labels ,valid_labels):
    """Shuffle the train/validation sets and write them as two HDF5 files.

    Creates ``train.hdf5`` (datasets ``x_train``/``y_train``) and ``val.hdf5``
    (datasets ``x_val``/``y_val``) inside ``dirs_path``. The four sequences
    are shuffled IN PLACE; images and labels stay aligned.

    Raises:
        ValueError: if an image sequence and its label sequence differ in
            length (the paired shuffle would silently misalign them).
    """
    # Local import: the HDF5 binding is h5py ("h6py" is not an importable
    # module), and importing here keeps this snippet self-contained.
    import h5py

    if len(train_images) != len(train_labels):
        raise ValueError("train_images and train_labels differ in length")
    if len(valid_images) != len(valid_labels):
        raise ValueError("valid_images and valid_labels differ in length")

    TRAIN_HDF5 = dirs_path + '/' + "train.hdf5"
    VAL_HDF5 = dirs_path + '/' + "val.hdf5"

    # Shuffle images, rewind the RNG to the saved state, then shuffle the
    # labels: equal-length sequences receive the identical permutation.
    for images, labels in ((train_images, train_labels),
                           (valid_images, valid_labels)):
        state = np.random.get_state()
        np.random.shuffle(images)
        np.random.set_state(state)
        np.random.shuffle(labels)

    datasets = [
        ("train", train_images, train_labels, TRAIN_HDF5),
        ("val", valid_images, valid_labels, VAL_HDF5)]
    for (dType, images, labels, outputPath) in datasets:
        # The context manager closes the file even if a write fails.
        with h5py.File(outputPath, "w") as f:
            f.create_dataset("x_" + dType, data=images)
            f.create_dataset("y_" + dType, data=labels)
5、判斷文件全部讀入
def read_dataset(dirs):
    """Print the sample counts stored in every HDF5 file directly in ``dirs``.

    Each file ``<name>.hdf5`` is expected to hold datasets ``x_<name>`` and
    ``y_<name>``; the first dimension of each is printed.
    """
    # Local import: the HDF5 binding is h5py ("h6py" is not an importable
    # module), and importing here keeps this snippet self-contained.
    import h5py

    files = os.listdir(dirs)
    print(files)
    for file_name in files:
        path = dirs + '/' + file_name
        stem = file_name.split('.')[0]
        # Open read-only and close deterministically; the handles used to
        # stay open until garbage collection.
        with h5py.File(path, "r") as dataset:
            set_x_orig = dataset["x_" + stem].shape[0]
            set_y_orig = dataset["y_" + stem].shape[0]
        print(set_x_orig)
        print(set_y_orig)
6、訓練中,採用迭代器讀入數據
def generator(self, datagen, mode):
    """Yield training or validation batches forever from the HDF5 shards.

    Every sub-directory of ``self.data_path`` is expected to contain
    ``train.hdf5`` (``x_train``/``y_train``) and ``val.hdf5``
    (``x_val``/``y_val``). Batches of ``self.BS`` samples are read in order,
    scaled to [0, 1], optionally augmented, and optionally one-hot encoded.

    Args:
        datagen: truthy to pass each batch through ImageDataGenerator.
        mode: ``"train"`` or ``"val"``; selects which file is read.

    Yields:
        ``({'input_1': images}, {'softmax': labels})``

    Raises:
        ValueError: on an unknown ``mode`` (raised at the first ``next()``);
            previously this made the generator loop forever without yielding.
    """
    # Local import: the HDF5 binding is h5py ("h6py" is not an importable
    # module), and importing here keeps this snippet self-contained.
    import h5py

    if mode not in ("train", "val"):
        raise ValueError("mode must be 'train' or 'val', got %r" % (mode,))

    passes = np.inf
    aug = ImageDataGenerator(
        featurewise_center = False,
        samplewise_center = False,
        featurewise_std_normalization = False,
        samplewise_std_normalization = False,
        zca_whitening = False,
        rotation_range = 20,
        width_shift_range = 0.2,
        height_shift_range = 0.2,
        horizontal_flip = True,
        vertical_flip = False)

    channels_first = K.image_data_format() == 'channels_first'
    x_key = "x_" + mode
    y_key = "y_" + mode

    epochs = 0
    # Loops forever by default (passes is infinite).
    while epochs < passes:
        for shard in os.listdir(self.data_path):
            hdf5_path = os.path.join(self.data_path, shard, mode + ".hdf5")
            # Open only the file this mode needs, read-only, and close it
            # when the shard is exhausted; handles used to leak on every pass.
            with h5py.File(hdf5_path, "r") as db:
                num_images = db[y_key].shape[0]
                for i in np.arange(0, num_images, self.BS):
                    images = db[x_key][i: i + self.BS]
                    labels = db[y_key][i: i + self.BS]

                    images = images.reshape(
                        images.shape[0], IMAGE_SIZE, IMAGE_SIZE, 3)
                    if channels_first:
                        # NOTE(review): assumes the shards hold channels-last
                        # arrays, as the writer code in this article produces.
                        # Moving the channel axis needs a transpose; a bare
                        # reshape would scramble the pixels.
                        images = np.transpose(images, (0, 3, 1, 2))

                    images = images.astype('float32')
                    images = images / 255

                    if datagen:
                        (images, labels) = next(
                            aug.flow(images, labels, batch_size = self.BS))

                    # One-hot encode the integer labels.
                    if self.binarize:
                        labels = np_utils.to_categorical(labels, self.classes)

                    yield ({'input_1': images}, {'softmax': labels})
        epochs += 1
7、至此,就大功告成了
完整的代碼:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 12 20:46:12 2018
@author: william_yue

Build a sharded HDF5 training set: read one folder of .jpg images per label,
split each label into train/validation parts, and write the accumulated
samples to a new directory of HDF5 files every CLASSES_PER_FILE labels.
"""
import os
import numpy as np
import cv2
import random
from scipy import misc
import h5py
from sklearn.model_selection import train_test_split
from keras import backend as K
K.clear_session()
from keras.utils import np_utils

IMAGE_SIZE = 128
# Number of labels stored per HDF5 directory. The same value must drive both
# the "chunk is full" test and the chunk index, otherwise chunks collide.
CLASSES_PER_FILE = 4316


def split_dataset(images, labels):
    """Split one label's samples: 80% for training, 20% for validation.

    The split seed is drawn at random on every call.

    Returns:
        (train_images, valid_images, train_labels, valid_labels)
    """
    train_images, valid_images, train_labels, valid_labels = train_test_split(
        images, labels, test_size = 0.2, random_state = random.randint(0, 100))
    return train_images, valid_images, train_labels ,valid_labels


def data2h6(dirs_path, train_images, valid_images, train_labels ,valid_labels):
    """Shuffle the train/validation sets and write them as two HDF5 files.

    Creates ``train.hdf5`` (``x_train``/``y_train``) and ``val.hdf5``
    (``x_val``/``y_val``) inside ``dirs_path``. The four sequences are
    shuffled IN PLACE; images and labels stay aligned.

    Raises:
        ValueError: if an image sequence and its label sequence differ in
            length (the paired shuffle would silently misalign them).
    """
    if len(train_images) != len(train_labels):
        raise ValueError("train_images and train_labels differ in length")
    if len(valid_images) != len(valid_labels):
        raise ValueError("valid_images and valid_labels differ in length")

    TRAIN_HDF5 = dirs_path + '/' + "train.hdf5"
    VAL_HDF5 = dirs_path + '/' + "val.hdf5"

    # Shuffle images, rewind the RNG to the saved state, then shuffle the
    # labels: equal-length sequences receive the identical permutation.
    for images, labels in ((train_images, train_labels),
                           (valid_images, valid_labels)):
        state = np.random.get_state()
        np.random.shuffle(images)
        np.random.set_state(state)
        np.random.shuffle(labels)

    datasets = [
        ("train", train_images, train_labels, TRAIN_HDF5),
        ("val", valid_images, valid_labels, VAL_HDF5)]
    for (dType, images, labels, outputPath) in datasets:
        # The context manager closes the file even if a write fails.
        with h5py.File(outputPath, "w") as f:
            f.create_dataset("x_" + dType, data=images)
            f.create_dataset("y_" + dType, data=labels)


def read_dataset(dirs):
    """Print the sample counts of every HDF5 file in each sub-directory.

    ``dirs`` holds one sub-directory per chunk; each file ``<name>.hdf5`` in
    it is expected to hold datasets ``x_<name>`` and ``y_<name>``.
    """
    files = os.listdir(dirs)
    print(files)
    for chunk_dir in files:
        path = dirs + '/' + chunk_dir
        for file_name in os.listdir(path):
            path_read = os.path.join(path, file_name)
            stem = file_name.split('.')[0]
            # Open read-only and close deterministically.
            with h5py.File(path_read, "r") as dataset:
                set_x_orig = dataset["x_" + stem].shape[0]
                set_y_orig = dataset["y_" + stem].shape[0]
            print(set_x_orig)
            print(set_y_orig)


def load_dataset(path_name,data_path):
    """Read all label folders under ``path_name`` and write HDF5 chunks.

    Each sub-directory of ``path_name`` is one label; its position in the
    directory listing becomes the integer label. Samples are accumulated and
    flushed to ``data_path/h6_<k>`` every CLASSES_PER_FILE labels and once
    more after the last label. Finally the written files are summarised.
    """
    train_images = []
    valid_images = []
    train_labels = []
    valid_labels = []
    allpath = os.listdir(path_name)
    nb_classes = len(allpath)
    print("label_num: ",nb_classes)

    for counter, child_dir in enumerate(allpath):
        child_path = os.path.join(path_name, child_dir)

        # Per-label buffers; rebuilt for every label to keep memory low.
        images = []
        for dir_image in os.listdir(child_path):
            if not dir_image.endswith('.jpg'):
                continue
            img = cv2.imread(os.path.join(child_path, dir_image))
            if img is None:
                # cv2.imread signals an unreadable file with None.
                print("skip unreadable image: ", os.path.join(child_path, dir_image))
                continue
            # scipy.misc.imresize no longer exists in current SciPy; cv2.resize
            # with bilinear interpolation replaces it.
            images.append(cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE),
                                     interpolation=cv2.INTER_LINEAR))
        labels = [counter] * len(images)

        if len(images) >= 2:
            t_images, v_images, t_labels, v_labels = split_dataset(
                np.array(images), labels)
            train_images.extend(t_images)
            train_labels.extend(t_labels)
            valid_images.extend(v_images)
            valid_labels.extend(v_labels)
        else:
            # train_test_split cannot split fewer than two samples; keep what
            # little there is for training instead of aborting the whole run.
            print("label ", counter, " has fewer than 2 images, not split")
            train_images.extend(images)
            train_labels.extend(labels)

        if counter % 50 == 49:
            print( counter+1 , "is read to the memory!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

        chunk_full = counter % CLASSES_PER_FILE == CLASSES_PER_FILE - 1
        if chunk_full or counter == nb_classes - 1:
            print("train_images num: ", len(train_images), " ", "valid_images num: ",len(valid_images))
            print("start write images and labels data...................................................................")
            num = counter // CLASSES_PER_FILE
            # The "- 1" offset is kept from the original naming scheme (the
            # first directory is therefore "h6_-1").
            dirs = data_path + "/" + "h6_" + str(num - 1)
            os.makedirs(dirs, exist_ok=True)
            data2h6(dirs, train_images, valid_images, train_labels ,valid_labels)
            print("File HDF5_%d "%num, " id done!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            # Start the next chunk empty.
            train_images = []
            valid_images = []
            train_labels = []
            valid_labels = []

    print("All File HDF5 done!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    read_dataset(data_path)


def read_name_list(path_name):
    """Return the names of the entries in ``path_name`` as a list."""
    return list(os.listdir(path_name))


if __name__ == '__main__':
    path = "data"
    data_path = "data_hdf5_half"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    load_dataset(path,data_path)
關于解決keras讀取多個hdf5文件進行訓練的方法就分享到這里了,希望以上內容可以對大家有一定的幫助,可以學到更多知識。如果覺得文章不錯,可以把它分享出去讓更多的人看到。
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。