Why start with the pixel value rather than a texture metric for this image?
Because it gives the best split of the input data.
How do we pick the node that gives the best split?
Use Gini impurity → pick the split that maximizes the Gini gain.
Gini impurity is the probability of incorrectly classifying a randomly chosen element in the dataset if it were randomly labeled according to the class distribution in the dataset. It's calculated as
G = \sum_{i=1}^{C} p(i)\,(1 - p(i))
where C is the number of classes and p(i) is the probability of randomly picking an element of class i.
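For instance, with 10 pixels split evenly between two classes, G = 1 - (0.5^2 + 0.5^2) = 0.5. A minimal sketch (toy labels, not from the original example) of how the Gini gain of a candidate split is computed and used to choose the best split:

import numpy as np

def gini_impurity(labels):
    # G = sum_i p(i) * (1 - p(i)), which equals 1 - sum_i p(i)^2
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def gini_gain(parent, left, right):
    # Gain = impurity of the parent node minus the weighted impurity of the two children
    n = len(parent)
    weighted = (len(left) / n) * gini_impurity(left) + (len(right) / n) * gini_impurity(right)
    return gini_impurity(parent) - weighted

parent = ['A'] * 5 + ['B'] * 5         # 5 pixels of each class -> G = 0.5
left   = ['A'] * 4 + ['B'] * 1         # mostly class A         -> G = 0.32
right  = ['A'] * 1 + ['B'] * 4         # mostly class B         -> G = 0.32
print(gini_gain(parent, left, right))  # 0.5 - 0.32 = 0.18; the split with the largest gain wins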
Primary disadvantage of decision trees:
They often suffer from overfitting → the model works well on training data but fails on new data, leading to low accuracy.
Random Forest to the rescue!
A random forest works around the shortcomings of a single decision tree.
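A minimal sketch of the difference in behaviour (synthetic toy data, not the microscopy features used below): a single unpruned tree typically scores perfectly on its own training data but noticeably worse on held-out data, while a forest of trees built on bootstrapped samples and random feature subsets usually generalizes better.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=20, n_informative=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)        # single tree
forest = RandomForestClassifier(n_estimators=50, random_state=0).fit(X_train, y_train)  # ensemble

print("Tree   train/test:", tree.score(X_train, y_train), tree.score(X_test, y_test))
print("Forest train/test:", forest.score(X_train, y_train), forest.score(X_test, y_test))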
60 - How to use Random Forest in Python
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2    # Needed for the Gabor/Canny feature extraction below
import glob   # Needed to loop over the images to segment
# All features generated must match the way features are generated for TRAINING.
# Feature1 is our original image pixels
df = pd.DataFrame()    # Dataframe to hold one feature per column
img2 = img.reshape(-1)
df['Original Image'] = img2

# Generate Gabor features
num = 1  # To count numbers up in order to give Gabor features a label in the data frame
kernels = []
for theta in range(2):  # Define number of thetas
    theta = theta / 4. * np.pi
    for sigma in (1, 3):  # Sigma with 1 and 3
        for lamda in np.arange(0, np.pi, np.pi / 4):  # Range of wavelengths
            for gamma in (0.05, 0.5):  # Gamma values of 0.05 and 0.5
                gabor_label = 'Gabor' + str(num)  # Label Gabor columns as Gabor1, Gabor2, etc.
                ksize = 9
                kernel = cv2.getGaborKernel((ksize, ksize), sigma, theta, lamda, gamma, 0, ktype=cv2.CV_32F)
                kernels.append(kernel)
                # Now filter the image and add values to a new column
                fimg = cv2.filter2D(img2, cv2.CV_8UC3, kernel)
                filtered_img = fimg.reshape(-1)
                df[gabor_label] = filtered_img  # Labels columns as Gabor1, Gabor2, etc.
                print(gabor_label, ': theta =', theta, ': sigma =', sigma, ': lamda =', lamda, ': gamma =', gamma)
                num += 1  # Increment for gabor column label
# Generate Gabor features
num = 1
kernels = []
for theta in range(2):
    theta = theta / 4. * np.pi
    for sigma in (1, 3):
        for lamda in np.arange(0, np.pi, np.pi / 4):
            for gamma in (0.05, 0.5):
                gabor_label = 'Gabor' + str(num)
                ksize = 9
                kernel = cv2.getGaborKernel((ksize, ksize), sigma, theta, lamda, gamma, 0, ktype=cv2.CV_32F)
                kernels.append(kernel)
                # Now filter image and add values to new column
                fimg = cv2.filter2D(img2, cv2.CV_8UC3, kernel)
                filtered_img = fimg.reshape(-1)
                df[gabor_label] = filtered_img  # Modify this to add new column for each gabor
                num += 1

########################################
# Generate OTHER FEATURES and add them to the data frame
# Feature 3 is canny edge
edges = cv2.Canny(img, 100, 200)  # Image, min and max values
edges1 = edges.reshape(-1)
df['Canny Edge'] = edges1  # Add column to original dataframe
from skimage.filters import roberts, sobel, scharr, prewitt
# Feature 4 is Roberts edge
edge_roberts = roberts(img)
edge_roberts1 = edge_roberts.reshape(-1)
df['Roberts'] = edge_roberts1
path = "images/Train_images/*.tif" for file in glob.glob(path): print(file) # just stop here to see all file names printed img = cv2.imread(file, 0) # Call the feature extraction function. X = feature_extraction(img) result = loaded_model.predict(X) segmented = result.reshape((img.shape))
    name = file.split("e_")
    cv2.imwrite('images/Segmented/' + name[1], segmented)
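matplotlib is imported above but not used in these excerpts; a quick, optional way to inspect one result (here simply the last segmented image left over from the loop):

plt.imshow(segmented, cmap='jet')  # Visualize the last segmented image from the loop
plt.show()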
67b - Feature based image segmentation using traditional machine learning. -Multi-training images-
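The feature-extraction snippets below assume an outer loop that reads each training image into a temporary dataframe df and accumulates everything in image_dataset. That loop is not shown in the excerpts, so here is a hedged sketch of what it could look like; the folder name images/train_images/ is an assumption, and the blocks that follow would sit inside this loop (their indentation is omitted in the excerpts).

import os
import cv2
import numpy as np
import pandas as pd
from scipy import ndimage as nd    # 'nd' is used for the median/variance filters below

image_dataset = pd.DataFrame()     # Dataframe to capture features of all training images

img_path = "images/train_images/"  # Assumed folder; adjust to your own training-image location
for image in os.listdir(img_path):   # Iterate through each training image
    print(image)
    df = pd.DataFrame()            # Temporary dataframe to capture features of the current image
    input_img = cv2.imread(img_path + image)
    # ... the RGB check, pixel values, Gabor, Canny, Median and Variance features below are
    #     computed here for each image, and df is appended to image_dataset at the end.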
# Check if the input image is RGB or grey and convert to grey if RGB
if input_img.ndim == 3 and input_img.shape[-1] == 3:
    img = cv2.cvtColor(input_img, cv2.COLOR_BGR2GRAY)
elif input_img.ndim == 2:
    img = input_img
else:
    raise Exception("The module works only with grayscale and RGB images!")
################################################################
# START ADDING DATA TO THE DATAFRAME
# Add pixel values to the data frame
pixel_values = img.reshape(-1)
df['Pixel_Value'] = pixel_values  # Pixel value itself as a feature
df['Image_Name'] = image          # Capture image name as we read multiple images
############################################################################
# Generate Gabor features
num = 1  # To count numbers up in order to give Gabor features a label in the data frame
kernels = []
for theta in range(2):  # Define number of thetas
    theta = theta / 4. * np.pi
    for sigma in (1, 3):  # Sigma with 1 and 3
        for lamda in np.arange(0, np.pi, np.pi / 4):  # Range of wavelengths
            for gamma in (0.05, 0.5):  # Gamma values of 0.05 and 0.5
                gabor_label = 'Gabor' + str(num)  # Label Gabor columns as Gabor1, Gabor2, etc.
                ksize = 9
                kernel = cv2.getGaborKernel((ksize, ksize), sigma, theta, lamda, gamma, 0, ktype=cv2.CV_32F)
                kernels.append(kernel)
                # Now filter the image and add values to a new column
                fimg = cv2.filter2D(img, cv2.CV_8UC3, kernel)
                filtered_img = fimg.reshape(-1)
                df[gabor_label] = filtered_img  # Labels columns as Gabor1, Gabor2, etc.
                print(gabor_label, ': theta=', theta, ': sigma=', sigma, ': lamda=', lamda, ': gamma=', gamma)
                num += 1  # Increment for gabor column label

########################################
# Generate OTHER FEATURES and add them to the data frame
# CANNY EDGE
edges = cv2.Canny(img, 100, 200)  # Image, min and max values
edges1 = edges.reshape(-1)
df['Canny Edge'] = edges1  # Add column to original dataframe
from skimage.filters import roberts, sobel, scharr, prewitt
# MEDIAN with size=3
median_img = nd.median_filter(img, size=3)
median_img1 = median_img.reshape(-1)
df['Median s3'] = median_img1
# VARIANCE with size=3
variance_img = nd.generic_filter(img, np.var, size=3)
variance_img1 = variance_img.reshape(-1)
df['Variance s3'] = variance_img1  # Add column to original dataframe
######################################
# Update dataframe for images to include details for each image in the loop
image_dataset = image_dataset.append(df)  # Note: DataFrame.append was removed in pandas 2.0; use pd.concat([image_dataset, df]) there
STEP 2: READ LABELED IMAGES (MASKS) AND CREATE ANOTHER DATAFRAME WITH LABEL VALUES AND LABEL FILE NAMES
mask_dataset = pd.DataFrame() # Create dataframe to capture mask info.
mask_path = "images/train_masks/" for mask in os.listdir(mask_path): # iterate through each file to perform some action print(mask)
    df2 = pd.DataFrame()  # Temporary dataframe to capture info for each mask in the loop
    input_mask = cv2.imread(mask_path + mask)
    # Check if the input mask is RGB or grey and convert to grey if RGB
    if input_mask.ndim == 3 and input_mask.shape[-1] == 3:
        label = cv2.cvtColor(input_mask, cv2.COLOR_BGR2GRAY)
    elif input_mask.ndim == 2:
        label = input_mask
    else:
        raise Exception("The module works only with grayscale and RGB images!")
    # Add pixel values to the data frame
    label_values = label.reshape(-1)
    df2['Label_Value'] = label_values
    df2['Mask_Name'] = mask
    mask_dataset = mask_dataset.append(df2)  # Update mask dataframe with all the info from each mask (use pd.concat on pandas >= 2.0)
STEP 3: GET DATA READY FOR RANDOM FOREST (or other classifier) - COMBINE BOTH DATAFRAMES INTO A SINGLE DATASET
dataset = pd.concat([image_dataset, mask_dataset], axis=1) # Concatenate both image and mask datasets
# If you expect image and mask names to be the same this is where we can perform a sanity check
# dataset['Image_Name'].equals(dataset['Mask_Name'])

# If we do not want to include pixels with value 0
# e.g. sometimes unlabeled pixels may be given a value of 0.
dataset = dataset[dataset.Label_Value != 0]
# Assign training features to X and labels to Y
# Drop columns that are not relevant for training (non-features)
X = dataset.drop(labels=["Image_Name", "Mask_Name", "Label_Value"], axis=1)
# Assign label values to Y (our prediction)
Y = dataset["Label_Value"].values
# Split data into train and test to verify accuracy after fitting the model.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=20)
STEP 4: Define the classifier and fit a model with our training data
# Import training classifier
from sklearn.ensemble import RandomForestClassifier

# Instantiate model with n number of decision trees
model = RandomForestClassifier(n_estimators=50, random_state=42)
# Train the model on training data
model.fit(X_train, y_train)
STEP 5: Accuracy check
from sklearn import metrics
prediction_test = model.predict(X_test)

# Check accuracy on test dataset.
print("Accuracy = ", metrics.accuracy_score(y_test, prediction_test))
STEP 6: SAVE MODEL FOR FUTURE USE
# You can store the model for future use. In fact, this is how you do machine learning:
# train on training images, validate on test images, and deploy the model on unknown images.

# Save the trained model as a pickle string to disk for future use
import pickle
model_name = "sandstone_model"
pickle.dump(model, open(model_name, 'wb'))
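When the saved model is needed again (for example in the glob loop earlier that calls loaded_model.predict on new images), it can be read back with pickle; a minimal sketch:

import pickle

# Load the trained model back from disk
loaded_model = pickle.load(open("sandstone_model", 'rb'))

# loaded_model behaves exactly like the original RandomForestClassifier object,
# e.g. result = loaded_model.predict(X) on features extracted from a new image.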