-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMLE_1_Feature_Engineering.py
443 lines (323 loc) · 12.5 KB
/
MLE_1_Feature_Engineering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
#!python2
###############################################################################
# MachineLearningExperiment.py - Machine Learning Comic Book Cover Finding Experiment
# jamesj223
###############################################################################
# Initialisation
import os, cPickle, gzip
import re
pattern = re.compile(r"\W+")
from datetime import datetime
import imutils, cv2, pytesseract
import numpy as np
from tqdm import tqdm
#Attempting parallel processing! :s
from multiprocessing import Pool
try:
from PIL import Image
except ImportError:
import Image
###############################################################################
# Config
supportedImageExtensions = ['.jpg', '.jpeg']
inputDirectory = ""
outputCSV = ""
# Clean flag, will delete pickles and recreate everything from scratch
# Not Yet Implemented
clean = False
# Attempt Parallel Processing :s
parallel = True
###############################################################################
# Classes
################################################################################
# Pickle Functions - With gzip
def save_obj(obj, name, protocol=2):
    """Pickle obj to obj/<name>.pkl (uncompressed)."""
    targetPath = 'obj/' + name + '.pkl'
    with open(targetPath, 'wb') as handle:
        cPickle.dump(obj, handle, protocol)
def load_obj(name ):
    """Unpickle and return the object stored at obj/<name>.pkl."""
    sourcePath = 'obj/' + name + '.pkl'
    with open(sourcePath, 'rb') as handle:
        return cPickle.load(handle)
def saveAndZip(obj, name, protocol=2):
    """Pickle obj, gzip the bytes to obj/<name>.pkz, and return that path."""
    filename = 'obj/' + name + '.pkz'
    payload = cPickle.dumps(obj, protocol)
    zipped = gzip.GzipFile(filename, 'wb')
    try:
        zipped.write(payload)
    finally:
        # Close even if the write fails, so the file handle is not leaked.
        zipped.close()
    return filename
def loadAndUnZip(name):
    """Read obj/<name>.pkz, gunzip it, and return the unpickled object."""
    filename = 'obj/' + name + '.pkz'
    zipped = gzip.GzipFile(filename, 'rb')
    # Accumulate chunks and join once, instead of quadratic string +=.
    chunks = []
    while True:
        data = zipped.read()
        if data == "":
            break
        chunks.append(data)
    # Renamed locals: the originals shadowed the builtins buffer/object.
    unpickled = cPickle.loads("".join(chunks))
    zipped.close()
    return unpickled
# Returns pickle if exists or False if it doesn't
def loadPickleIfExists( Name ):
    """Return the unpickled obj/<Name>.pkz, or False when it cannot be loaded.

    Callers test the result with `is False`, so False (not None) is the
    sentinel for "no usable pickle".
    """
    try:
        return loadAndUnZip( Name )
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit are
        # no longer swallowed; a missing or corrupt pickle still yields False.
        return False
###############################################################################
# Functions
# Creates directory if it doesn't already exist
def createDirectoryIfNotExists(path):
    """Create path (including parents) unless it already exists."""
    if os.path.exists(path):
        return
    os.makedirs(path)
def recursiveFileSearchByExtension(folder, extensionList):
    """Walk folder recursively and return files whose (lowercased) name ends
    with one of the extensions in extensionList (e.g. ['.jpg', '.jpeg']).

    Also tallies a per-extension count dict locally (currently unused by
    callers, kept for parity with the original debugging intent).
    """
    files = []
    count = {}
    for root, dirnames, filenames in os.walk(folder):
        for filename in filenames:
            fullFilePathAndName = os.path.join(root, filename)
            for extension in extensionList:
                # BUG FIX: the original used endswith(tuple(extension)),
                # which explodes '.jpg' into ('.', 'j', 'p', 'g') and matches
                # ANY filename ending in any single one of those characters.
                # Match the whole extension string instead.
                if filename.lower().endswith(extension):
                    if fullFilePathAndName not in files:
                        files.append(fullFilePathAndName)
                    if extension in count:
                        count[extension] += 1
                    else:
                        count[extension] = 1
            if fullFilePathAndName not in files:
                # Unmatched file: tally it under its own extension.
                extension = fullFilePathAndName.split(".")[-1]
                if extension in count:
                    count[extension] += 1
                else:
                    count[extension] = 1
    return files
def makeCSV(filename, data):
    """Write the feature rows in data to filename(.csv), one row per tuple.

    Commas inside field values are stripped (file names can contain commas)
    so the naive comma-joined CSV stays well-formed.
    """
    if not filename.endswith(".csv"):
        filename += ".csv"
    # Text mode ('w', not 'wb') so writing str works on both Python 2 and 3.
    f = open(filename, 'w')
    headerLine = ",".join(
        (
            "fileName",
            "FNhasVariant",
            "height",
            "width",
            "hLinesBlack",
            "hLinesWhite",
            "numWhitePixels",
            "numBlackPixels",
            "OCRwordCount",
            "OCRhasVariant",
            "OCRhasMarvel",
            "IMGhasMarvelBool",
            "IMGhasMarvelScore"
        )
    )
    f.write(headerLine + "\n")
    for line in data:
        # BUG FIX: the original called thing.replace(...) before str(), which
        # raises AttributeError for the int/float fields. Stringify first,
        # then strip embedded commas.
        fields = [str(thing).replace(',', '') for thing in line]
        f.write(",".join(fields) + "\n")
    f.close()
def clean_obj_folder():
    """Delete every gzipped pickle (*.pkz) from the local ./obj cache."""
    objDir = './obj'
    for entry in os.listdir(objDir):
        # NOTE: matches the bare 'pkz' suffix exactly as before (no dot).
        if entry.endswith('pkz'):
            os.remove(os.path.join(objDir, entry))
# Create a training data set file list
def createTrainingFileSet(folder, numRandomFiles, numRandomImages):
    """Stub: build a training-data file list. Not yet implemented.

    Design notes from the original author:
      * grab the 1st image plus a random number of non-first, non-last
        images, OR
      * extract everything from a predefined list of archives — less
        training-set bias, but longer extraction/training and potentially
        less data.
    Intended record shape:
      (ArchiveName, FileName, PositionInArchive?, ExtractedImagePath)
    """
    # Parenthesized print is valid (and identical) on Python 2 and 3.
    print("TODO")
# CreateIndivualImageFeatureSet(Image) -> Return full image feature set for given image
# Excludes meta features (ArchiveName, ImageName) - these can be added outside this function
def createFeatureSet(FileList):
    """Build the per-image feature tuple for every file in FileList.

    Uses a multiprocessing Pool when the module-level `parallel` flag is set;
    otherwise runs serially with a tqdm progress bar.
    Returns a list of feature tuples (see createIndivualImageFeatureSet).
    """
    featureSet = []
    if not parallel:
        # Serial path: progress bar plus the current file name.
        for file in tqdm(FileList, ascii=True):
            fileName = os.path.basename(file)
            tqdm.write( fileName , end="" )
            featureSet.append( createIndivualImageFeatureSet(file) )
    else:
        # Parallel path: one worker per CPU core.
        p = Pool()
        featureSet += p.map(createIndivualImageFeatureSet, FileList)
        # Fix: shut the pool down instead of leaking worker processes.
        p.close()
        p.join()
    # Removed the unused local archiveName = "CBR"; archive-position features
    # were abandoned because the planned training set would bias toward the
    # first couple of positions, and the goal is to catch covers mid-archive.
    return featureSet
def createIndivualImageFeatureSet(File):
    """Compute the full feature tuple for a single image file.

    Excludes meta features (archive name, etc.) — those are added by the
    caller. Boolean features are cast to int so the CSV stays numeric.
    """
    fileName = os.path.basename(File)
    print("Building feature set for: " + fileName)

    # File-name derived feature: does the name contain the word "variant"?
    nameWords = set( re.split(pattern, fileName.lower()) )
    FNhasVariant = "variant" in nameWords

    # Geometry, line, colour, OCR and template-matching features.
    (height, width, channels) = cv2.imread(File).shape
    (hLinesBlack, hLinesWhite) = getHorizontalLineFeatures(File)
    (numWhitePixels, numBlackPixels) = getColourFeatures(File)
    # NOTE(review, from original comments): the image-recognition feature has
    # many false positives/negatives and is the most expensive feature here.
    (OCRwordCount, OCRhasVariant, OCRhasMarvel) = getOCRFeatures(File)
    (IMGhasMarvelBool, IMGhasMarvelScore) = getImageRecognitionFeatures(File)

    return (
        fileName,
        int(FNhasVariant),
        height,
        width,
        hLinesBlack,
        hLinesWhite,
        numWhitePixels,
        numBlackPixels,
        OCRwordCount,
        int(OCRhasVariant),
        int(OCRhasMarvel),
        int(IMGhasMarvelBool),
        IMGhasMarvelScore
    )
# Try to find panel borders/gaps
def getHorizontalLineFeatures(File):
    """Count near-black and near-white horizontal rows (panel borders/gaps).

    A row counts as black when its pixel sum is below threshold*width, and as
    white when above (255-threshold)*width. True black/white would be 0 and
    255*width, but scans need some slack.
    Returns (hLinesBlack, hLinesWhite).
    """
    BW = cv2.imread(File, cv2.IMREAD_GRAYSCALE)
    # Grayscale images have no channel axis; this also avoids the original's
    # second full cv2.imread just to get the dimensions.
    (height, width) = BW.shape
    hLinesWhite = 0
    hLinesBlack = 0
    threshold = 10
    minValue = threshold * width
    maxValue = (255 - threshold) * width
    # OpenCV indexing is [row, col]:
    # https://stackoverflow.com/questions/25642532
    for i in range(height):
        # Sum the FULL row; the original's [i, 0:-1] dropped the last pixel
        # while the thresholds were computed from the full width.
        lineSum = np.sum( BW[i, :] )
        if lineSum < minValue:
            hLinesBlack += 1
        elif lineSum > maxValue:
            hLinesWhite += 1
    # BUG FIX: previously returned (hLinesWhite, hLinesBlack) while the caller
    # unpacks (hLinesBlack, hLinesWhite); the order now matches the caller.
    return (hLinesBlack, hLinesWhite)
# Get number of black and white pixels
def getColourFeatures(File):
    """Count pure-white (255) and pure-black (0) pixels in the grayscale image.

    Returns (whitePixelCount, blackPixelCount).
    """
    BW = cv2.imread(File, cv2.IMREAD_GRAYSCALE)
    blackPixelCount = np.sum(BW == 0)
    whitePixelCount = np.sum(BW == 255)
    # BUG FIX: previously returned (black, white) while the caller unpacks
    # (numWhitePixels, numBlackPixels); the order now matches the caller.
    return (whitePixelCount, blackPixelCount)
# Use OCR to try to find words on the page.
def getOCRFeatures(File):
    """OCR the page and derive word features.

    Returns (wordCount, hasVariant, hasMarvel): the number of distinct
    lowercased tokens, and whether "variant" / "marvel" appear among them.
    """
    text = pytesseract.image_to_string(Image.open(File)).lower()
    # Split on non-word runs using the module-level compiled pattern.
    words = set( re.split(pattern, text) )
    return (len(words), "variant" in words, "marvel" in words)
# Red Mask Helper for Image Rec
# Red Mask Helper for Image Rec
def redMaskThing(img):
    """Return a copy of the BGR image with everything but strong reds zeroed.

    Red here means B<=50, G<=50, R>=150 (OpenCV BGR channel order).
    Parameter renamed from 'Image', which shadowed the PIL Image import;
    both call sites in this file pass the argument positionally.
    """
    lower = np.array([0, 0, 150], dtype = "uint8")
    upper = np.array([50, 50, 255], dtype = "uint8")
    mask = cv2.inRange(img, lower, upper)
    output = cv2.bitwise_and(img, img, mask = mask)
    return output
# Use Image/Template Matching to try and find features
# At this stage just the marvel logo
def getImageRecognitionFeatures(File):
    """Multi-scale template match of the (red-masked) Marvel logo.

    Matches an edge map of marvel.jpg against edge maps of the image at ~20
    scales from 1.0 down to 0.2, keeping the best TM_CCOEFF correlation.
    Returns (matched, score) where matched is score > threshold.
    """
    template = cv2.imread("marvel.jpg")
    template = redMaskThing(template)
    template = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
    template = cv2.Canny(template, 50, 200)
    (tH, tW) = template.shape[:2]
    threshold = 10000000  # empirically chosen; was 11000000.0

    image = cv2.imread(File)
    gray = redMaskThing(image)
    gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY)
    found = None
    # Loop over scales of the image, largest first, tracking the best match.
    for scale in np.linspace(0.2, 1.0, 20)[::-1]:
        # Resize by scale and remember the resize ratio.
        resized = imutils.resize(gray, width = int(gray.shape[1] * scale))
        r = gray.shape[1] / float(resized.shape[1])
        # Once the image is smaller than the template, no further scale fits.
        if resized.shape[0] < tH or resized.shape[1] < tW:
            break
        # Edge-detect at this scale and run template matching.
        edged = cv2.Canny(resized, 50, 200)
        result = cv2.matchTemplate(edged, template, cv2.TM_CCOEFF)
        (_, maxVal, _, maxLoc) = cv2.minMaxLoc(result)
        # Keep the best correlation seen so far.
        if found is None or maxVal > found[0]:
            found = (maxVal, maxLoc, r)
    # BUG FIX: if the image was smaller than the template at every scale,
    # 'found' stayed None and the unpack below raised a TypeError.
    if found is None:
        return (False, 0)
    (maxVal, maxLoc, r) = found
    return ( ( maxVal > threshold ), maxVal )
def OLDgetImageRecognitionFeatures(File):
    """Legacy single-scale template match of marvel.jpg (kept for reference).

    Returns (matched, score) where matched is score above a fixed ~75%%
    confidence threshold for cv2.TM_CCOEFF.
    """
    template = cv2.imread('marvel.jpg')
    #fiftyPercentThreshold = 530830296
    seventyFivePercentThreshold = 665854060
    img = cv2.imread(File)
    # Use the constant directly instead of eval('cv2.TM_CCOEFF'); eval on a
    # string was needless. Also dropped the unused w/h locals.
    res = cv2.matchTemplate(img, template, cv2.TM_CCOEFF)
    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)
    return ( (max_val > seventyFivePercentThreshold), max_val )
###############################################################################
# Main
if __name__ == '__main__':
    # Entry point: build (or load cached) file list and feature set, then
    # dump everything to the output CSV. Parenthesized single-argument
    # print works identically on Python 2 and 3.
    startTime = datetime.now()
    print("Start - " + str(datetime.now()))

    # The obj/ folder caches gzipped pickles between runs.
    createDirectoryIfNotExists('obj')
    if clean:
        clean_obj_folder()

    # --- File list (cached as obj/fileList.pkz) ---
    fileList = loadPickleIfExists("fileList")
    if fileList is False:
        print("No pickle found for fileList, creating from scratch")
        fileList = recursiveFileSearchByExtension(inputDirectory, supportedImageExtensions)
        saveAndZip(fileList, "fileList")

    # --- Feature set (cached as obj/featureSet.pkz) ---
    featureSet = loadPickleIfExists("featureSet")
    if featureSet is False:
        print("No pickle found for featureSet, creating from scratch")
        featureSet = createFeatureSet(fileList)
        saveAndZip(featureSet, "featureSet")

    # --- CSV output ---
    makeCSV(outputCSV, featureSet)
    print("End - " + str(datetime.now()))
    print("Took: " + str( datetime.now() - startTime ))