From 6a1eb30f641b9c7a5f7f4b82b6da480fbf663c9e Mon Sep 17 00:00:00 2001
From: Nimshi Venkat
Date: Mon, 9 Oct 2017 01:22:24 -0600
Subject: [PATCH] Upload code

---
 analyze.py      |  47 ++++++++++++++
 data_augment.py | 140 ++++++++++++++++++++++++++++++++++++++++++
 preprocess.py   | 113 ++++++++++++++++++++++++++++++++++
 train_model.py  | 159 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 459 insertions(+)
 create mode 100644 analyze.py
 create mode 100644 data_augment.py
 create mode 100644 preprocess.py
 create mode 100644 train_model.py

diff --git a/analyze.py b/analyze.py
new file mode 100644
index 0000000..62ab8b7
--- /dev/null
+++ b/analyze.py
@@ -0,0 +1,47 @@
+import pandas as pd
+import numpy as np
+import seaborn as sn
+import matplotlib.pyplot as plt
+import pickle
+from sklearn.metrics import confusion_matrix, f1_score
+
+
+# Ground-truth test labels; the first column ("images") holds the filenames.
+test = pd.read_csv("/home/venkat/ClothingAttributeDataset/preprocessed/category_test.csv")
+labels = list(test.columns)
+del labels[0]
+y_true = np.asarray(test[labels])
+print(y_true.shape)
+
+# Softmax scores pickled by train_model.py.
+with open('/home/venkat/y_pred.pkl', 'rb') as f:
+    y_pred = pickle.load(f)
+
+# Reduce both the predicted scores and the one-hot labels to hard class indices.
+y_pred = y_pred.argmax(1)
+y_true = y_true.argmax(1)
+
+
+# Micro and macro F1 scores
+print(f1_score(y_true, y_pred, average='micro'))
+print(f1_score(y_true, y_pred, average='macro'))
+
+
+# Plot the confusion matrix and its row-normalized version
+cm = confusion_matrix(y_true, y_pred)
+
+df_cm = pd.DataFrame(cm, index=labels, columns=labels)
+plt.figure(figsize=(10, 7))
+sn.heatmap(df_cm, annot=True)
+plt.show()
+
+# Normalize each row by that class's true-sample count.
+cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
+df_cm_norm = pd.DataFrame(cm_norm, index=labels, columns=labels)
+plt.figure(figsize=(10, 7))
+sn.heatmap(df_cm_norm, annot=True)
+plt.show()
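+
+# Added sketch (not in the original script): per-class precision/recall/F1,
+# to complement the micro/macro averages above.
+from sklearn.metrics import classification_report
+print(classification_report(y_true, y_pred, target_names=labels))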
diff --git a/data_augment.py b/data_augment.py
new file mode 100644
index 0000000..a4a0a12
--- /dev/null
+++ b/data_augment.py
@@ -0,0 +1,140 @@
+import cv2
+import numpy as np
+import random
+import pandas as pd
+import pickle
+
+# Luminance weights in BGR order, matching the channel order cv2.imread returns.
+coef = np.array([[[0.114, 0.587, 0.299]]])
+
+
+def random_crop(img, size):
+    # Crop a size x size window at a random offset (unused below; kept for reference).
+    w, h = img.shape[0], img.shape[1]
+    rangew = (w - size) // 2
+    rangeh = (h - size) // 2
+    offsetw = 0 if rangew == 0 else np.random.randint(rangew)
+    offseth = 0 if rangeh == 0 else np.random.randint(rangeh)
+    return img[offsetw:offsetw + size, offseth:offseth + size, :]
+
+
+def center_crop(img, size):
+    centerw, centerh = img.shape[0] // 2, img.shape[1] // 2
+    halfw, halfh = size // 2, size // 2
+    return img[centerw - halfw:centerw + halfw, centerh - halfh:centerh + halfh, :]
+
+
+def resize(img, size):
+    return cv2.resize(img, (size, size), interpolation=cv2.INTER_CUBIC)
+
+
+def random_flip(img):
+    # Flip horizontally or vertically with equal probability.
+    if np.random.uniform() < 0.5:
+        img = img[:, ::-1, :]  # horizontal flip
+    else:
+        img = img[::-1, :, :]  # vertical flip
+    return img
+
+
+def brightness_aug(img, brightness=0.2):
+    # Scale all channels by a random factor in [1 - brightness, 1 + brightness].
+    alpha = 1.0 + np.random.uniform(-brightness, brightness)
+    img *= alpha
+    return img
+
+
+def contrast_aug(img, contrast=0.2):
+    # Blend the image with its mean gray level.
+    alpha = 1.0 + np.random.uniform(-contrast, contrast)
+    gray = img * coef
+    gray = (3.0 * (1.0 - alpha) / gray.size) * np.sum(gray)
+    img *= alpha
+    img += gray
+    return img
+
+
+def saturation_aug(img, saturation=0.4):
+    # Blend each pixel with its own grayscale value.
+    alpha = 1.0 + np.random.uniform(-saturation, saturation)
+    gray = img * coef
+    gray = np.sum(gray, axis=2, keepdims=True)
+    gray *= (1.0 - alpha)
+    img *= alpha
+    img += gray
+    return img
+
+
+def color_jitter(img):
+    # Apply brightness, contrast and saturation jitter in random order, then
+    # clip before casting so out-of-range values don't wrap around in uint8.
+    lst = [brightness_aug, contrast_aug, saturation_aug]
+    random.shuffle(lst)
+    for aug in lst:
+        img = aug(img)
+    return np.clip(img, 0, 255).astype(np.uint8)
+
+
+def normalize(img):
+    # Subtract the ImageNet BGR channel means (VGG-style preprocessing).
+    mean_pixel = [103.939, 116.779, 123.68]
+    img = img.astype(np.float32, copy=False)
+    for c in range(3):
+        img[:, :, c] = img[:, :, c] - mean_pixel[c]
+    return img
+
+
+IMAGES_FOLDER = "/home/venkat/ClothingAttributeDataset/images/"
+
+# Preprocess train data: each image yields four samples -- a plain resize, a
+# random flip of it, a center crop of the original (assumes source images are
+# at least 224 px on each side), and a color-jittered resize.
+train_df = pd.read_csv("/home/venkat/ClothingAttributeDataset/preprocessed/category_train.csv")
+train_imgs = list(train_df["images"])
+train_labels = train_df[['shirt', 'sweater', 't-shirt', 'outerwear', 'suit', 'tank_top', 'dress']].values
+
+X_train = []
+y_train = []
+
+for i in range(len(train_imgs)):
+    img_path = IMAGES_FOLDER + train_imgs[i]
+    img = cv2.imread(img_path)
+    img_resize = normalize(resize(img, 224))
+    img_rf = random_flip(img_resize)
+    img_crop = normalize(center_crop(img, 224))
+    img_cj = normalize(color_jitter(resize(img, 224).astype(np.float64)))
+    X_train += [img_resize, img_rf, img_crop, img_cj]
+    y_train += [list(train_labels[i])] * 4
+
+X_train = np.asarray(X_train)
+y_train = np.asarray(y_train)
+
+pickle.dump(X_train, open("X_train.pkl", "wb"), pickle.HIGHEST_PROTOCOL)
+pickle.dump(y_train, open("y_train.pkl", "wb"), pickle.HIGHEST_PROTOCOL)
+
+# Preprocess test data: a plain resize only.
+test_df = pd.read_csv("/home/venkat/ClothingAttributeDataset/preprocessed/category_test.csv")
+test_imgs = list(test_df["images"])
+test_labels = test_df[['shirt', 'sweater', 't-shirt', 'outerwear', 'suit', 'tank_top', 'dress']].values
+
+X_test = []
+y_test = []
+
+for i in range(len(test_imgs)):
+    img_path = IMAGES_FOLDER + test_imgs[i]
+    img = cv2.imread(img_path)
+    X_test += [normalize(resize(img, 224))]
+    y_test += [list(test_labels[i])]
+
+X_test = np.asarray(X_test)
+y_test = np.asarray(y_test)
+
+pickle.dump(X_test, open("X_test.pkl", "wb"), pickle.HIGHEST_PROTOCOL)
+pickle.dump(y_test, open("y_test.pkl", "wb"), pickle.HIGHEST_PROTOCOL)
+
+print(X_train.shape)
+print(X_test.shape)
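+
+# Added sanity check (a sketch, not in the original script): undo the mean
+# subtraction on one augmented sample and write it to disk for visual inspection.
+sample = X_train[1].copy()  # index 1 is the flipped version of the first image
+for c, m in enumerate([103.939, 116.779, 123.68]):
+    sample[:, :, c] += m
+cv2.imwrite("augmented_sample.jpg", np.clip(sample, 0, 255).astype(np.uint8))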
diff --git a/preprocess.py b/preprocess.py
new file mode 100644
index 0000000..8cddd61
--- /dev/null
+++ b/preprocess.py
@@ -0,0 +1,113 @@
+import pandas as pd
+import numpy as np
+import glob
+import scipy.io
+import shutil
+import os
+
+
+def merge_dicts(*dict_args):
+    # Merge any number of dicts; later dicts win on key collisions.
+    result = {}
+    for dictionary in dict_args:
+        result.update(dictionary)
+    return result
+
+
+ROOT = "/home/venkat/ClothingAttributeDataset/"
+LABELS = "/home/venkat/ClothingAttributeDataset/labels/"
+PREPROCESS = "/home/venkat/ClothingAttributeDataset/preprocessed/"
+
+if not os.path.exists(PREPROCESS):
+    os.makedirs(PREPROCESS)
+
+# Possible values for every attribute in the dataset. Only "category" is used
+# below; the rest document the full label space.
+val = ["No", "Yes"]
+data_colors = {'black': val, 'blue': val, 'brown': val, 'cyan': val, 'gray': val, 'green': val,
+               'purple': val, 'red': val, 'white': val, 'yellow': val, 'many_colors': val}
+
+data_pattern = {'pattern_floral': val, 'pattern_graphics': val, 'pattern_plaid': val,
+                'pattern_solid': val, 'pattern_spot': val, 'pattern_stripe': val}
+
+data_binary = {'collar': val, 'gender': ["male", "female"], 'necktie': val,
+               'placket': val, 'skin_exposure': ["low", "high"], 'scarf': val}
+
+data_multi = {'sleevelength': ["no", "short", "long"], 'neckline': ["v-shape", "round", "other"],
+              'category': ["shirt", "sweater", "t-shirt", "outerwear", "suit", "tank_top", "dress"]}
+
+data = merge_dicts(data_colors, data_binary, data_pattern)
+
+category_df = pd.DataFrame()
+
+for filename in glob.iglob(LABELS + '*.mat'):
+    # e.g. "category_GT.mat" -> "category" (strip the "_GT" suffix).
+    feature_name = filename.split("/")[-1].split(".")[0][:-3]
+
+    if feature_name == "category":
+        labels = data_multi[feature_name]
+        mat = scipy.io.loadmat(filename)['GT'].flatten()
+        # One-hot encode the 1..7 category codes; NaN (unlabeled) entries become
+        # all-zero rows and are dropped by the mask below.
+        category_df = pd.get_dummies(mat, prefix="category")
+        category_df.columns = labels
+        category_df.insert(0, "images", category_df.index.map(lambda val: "{:06d}.jpg".format(val + 1)))
+        category_df = category_df[~np.isnan(mat)]
+
+# Random ~80/20 train-test split (unseeded, so it varies between runs).
+msk = np.random.rand(len(category_df)) < 0.8
+train = category_df[msk]
+test = category_df[~msk]
+
+# Data percentage for each category
+for key in data_multi['category']:
+    print("{} {}".format(key, round(100 * category_df[key].value_counts()[1] / float(category_df.shape[0]), 2)))
+
+train.to_csv(PREPROCESS + "category_train" + ".csv", index=False)
+test.to_csv(PREPROCESS + "category_test" + ".csv", index=False)
+
+
+# For Keras ImageDataGenerator - flow_from_directory: copy images into one
+# subdirectory per class.
+"""
+train_label_map = {}
+for item in data_multi['category']:
+    train_label_map[item] = list(train.loc[train[item] == 1]["images"])
+
+test_label_map = {}
+for item in data_multi['category']:
+    test_label_map[item] = list(test.loc[test[item] == 1]["images"])
+
+label_cols = list(train.columns)
+del label_cols[0]
+y_train = train[label_cols].values
+y_test = test[label_cols].values
+
+copy_path_train = ROOT + "category_train/"
+copy_path_test = ROOT + "category_test/"
+
+if not os.path.exists(copy_path_train):
+    os.makedirs(copy_path_train)
+
+if not os.path.exists(copy_path_test):
+    os.makedirs(copy_path_test)
+
+for key in train_label_map.keys():
+    class_path = copy_path_train + key
+
+    if not os.path.exists(class_path):
+        os.makedirs(class_path)
+    img_paths = train_label_map[key]
+
+    for path in img_paths:
+        src_path = ROOT + "images/" + path
+        copy_path = class_path + "/" + path
+        shutil.copyfile(src_path, copy_path)
+
+for key in test_label_map.keys():
+    class_path = copy_path_test + key
+
+    if not os.path.exists(class_path):
+        os.makedirs(class_path)
+    img_paths = test_label_map[key]
+
+    for path in img_paths:
+        src_path = ROOT + "images/" + path
+        copy_path = class_path + "/" + path
+        shutil.copyfile(src_path, copy_path)
+"""
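+
+# Added check (a sketch, not in the original script): report how balanced the
+# unseeded split came out, per class, in train vs. test.
+print("train: {} rows, test: {} rows".format(train.shape[0], test.shape[0]))
+for key in data_multi['category']:
+    print("{}: train {:.2f}%, test {:.2f}%".format(key, 100 * train[key].mean(), 100 * test[key].mean()))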
diff --git a/train_model.py b/train_model.py
new file mode 100644
index 0000000..725fe64
--- /dev/null
+++ b/train_model.py
@@ -0,0 +1,159 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import pickle
+from keras import applications
+from keras import optimizers
+from keras.models import Sequential, Model, model_from_json
+from keras.layers import Dropout, Dense
+from keras.layers.convolutional import Conv2D, MaxPooling2D
+from keras.layers.pooling import GlobalAveragePooling2D
+from keras.layers.normalization import BatchNormalization
+
+# read labels
+train = pd.read_csv("/home/gopal/venkat/ClothingAttributeDataset/preprocessed/category_train.csv")
+test = pd.read_csv("/home/gopal/venkat/ClothingAttributeDataset/preprocessed/category_test.csv")
+
+label_cols = list(train.columns)
+del label_cols[0]
+y_train = train[label_cols].values
+y_test = test[label_cols].values
+
+print(y_train.shape)
+print(y_test.shape)
+
+
+# dimensions of our images
+img_width, img_height = 224, 224
+img_rows, img_cols = 224, 224
+
+# Directory paths for the (unused) flow_from_directory variant.
+train_data_dir = '/home/gopal/venkat/ClothingAttributeDataset/category_train'
+validation_data_dir = '/home/gopal/venkat/ClothingAttributeDataset/category_test'
+nb_train_samples = y_train.shape[0]
+nb_validation_samples = y_test.shape[0]
+epochs = 25
+batch_size = 32
+
+
+def vgg16_model(X_train, y_train, X_test, y_test, img_rows=224, img_cols=224, img_channel=3, num_classes=7):
+
+    base_model = applications.VGG16(weights='imagenet', include_top=False,
+                                    input_shape=(img_rows, img_cols, img_channel))
+
+    # Truncate VGG16 at the block4 max-pool, i.e. drop the entire block5 stack
+    # (equivalent to popping the last four layers).
+    inp = base_model.input
+    out = base_model.get_layer('block4_pool').output
+    mod_model = Model(inp, out)
+
+    print(mod_model.layers[-1])
+    print(mod_model.output_shape)
+
+    # Small trainable head on top of the truncated backbone.
+    add_model = Sequential()
+    add_model.add(Conv2D(64, kernel_size=(3, 3), activation='relu', input_shape=mod_model.output_shape[1:]))
+    add_model.add(BatchNormalization())
+    # add_model.add(MaxPooling2D(pool_size=(2, 2)))
+    add_model.add(Dropout(0.1))
+
+    add_model.add(Conv2D(32, (3, 3), activation='relu'))
+    add_model.add(BatchNormalization())
+    # add_model.add(MaxPooling2D(pool_size=(2, 2)))
+    # add_model.add(Dropout(0.2))
+
+    add_model.add(GlobalAveragePooling2D())
+    # add_model.add(Flatten())
+    # add_model.add(Dropout(0.2))
+    # add_model.add(Dense(64, activation='relu'))
+
+    add_model.add(Dense(num_classes, activation='softmax'))
+
+    print(add_model.summary())
+
+    final_model = Model(inputs=mod_model.input, outputs=add_model(mod_model.output))
+    # final_model.load_weights("final_model.h5")
+    # sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)  # unused alternative optimizer
+    final_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
+
+    history = final_model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
+                              validation_data=(X_test, y_test), shuffle=True)
+
+    print(history.history.keys())
+    # summarize history for accuracy
+    plt.plot(history.history['acc'])
+    plt.plot(history.history['val_acc'])
+    plt.title('model accuracy')
+    plt.ylabel('accuracy')
+    plt.xlabel('epoch')
+    plt.legend(['train', 'test'], loc='upper left')
+    plt.show()
+    # summarize history for loss
+    plt.plot(history.history['loss'])
+    plt.plot(history.history['val_loss'])
+    plt.title('model loss')
+    plt.ylabel('loss')
+    plt.xlabel('epoch')
+    plt.legend(['train', 'test'], loc='upper left')
+    plt.show()
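+
+    # Added sketch (not in the original): persist the training curves so they
+    # can be re-plotted without retraining.
+    with open("history.pkl", "wb") as hf:
+        pickle.dump(history.history, hf)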
+
+    """
+    # Average the four augmented predictions per test sample (only valid when
+    # X_test is built with four copies per image, as for the training set).
+    pred_list = []
+    print(X_test.shape)
+    for idx in range(0, X_test.shape[0]):
+        if (idx + 1) % 4 == 0:
+            print(idx)
+            pred = final_model.predict(X_test[idx - 3:idx + 1])
+            pred_list.append(pred.mean(0))
+
+    preds = np.asarray(pred_list)
+    """
+    preds = final_model.predict(X_test)
+    print(preds.shape)
+    pickle.dump(preds, open("y_pred.pkl", "wb"))
+
+    # serialize model to JSON
+    model_json = final_model.to_json()
+    with open("final_model.json", "w") as json_file:
+        json_file.write(model_json)
+    # serialize weights to HDF5
+    final_model.save_weights("final_model.h5")
+    print("Saved model to disk")
+
+    # later... load the JSON and rebuild the model with its trained weights
+    json_file = open('final_model.json', 'r')
+    loaded_model_json = json_file.read()
+    json_file.close()
+    loaded_model = model_from_json(loaded_model_json)
+    loaded_model.load_weights("final_model.h5")
+
+
+# Load the tensors pickled by data_augment.py (these overwrite the CSV-derived
+# y_train/y_test above with the augmentation-expanded versions).
+with open('./X_train.pkl', 'rb') as f:
+    X_train = pickle.load(f)
+
+with open('./X_test.pkl', 'rb') as f:
+    X_test = pickle.load(f)
+
+with open('./y_train.pkl', 'rb') as f:
+    y_train = pickle.load(f)
+
+with open('./y_test.pkl', 'rb') as f:
+    y_test = pickle.load(f)
+
+
+vgg16_model(X_train, y_train, X_test, y_test)
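+
+# Added check (a sketch, not in the original script): top-1 accuracy straight
+# from the predictions that vgg16_model() pickled above.
+with open("y_pred.pkl", "rb") as f:
+    preds = pickle.load(f)
+print("top-1 accuracy: {:.4f}".format((preds.argmax(1) == y_test.argmax(1)).mean()))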