#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        move duplicates
# Purpose:     Find files with identical SHA512 hashes and move (or copy)
#              them into a separate folder.
#
# Author:      new
#
# Created:     27/06/2014
# Copyright:   (c) new 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import hashlib
import logging
import os
import shutil
import derpibooru_dl
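# derpibooru_dl is the companion module in this repository; only its
# save_pickle() and read_pickle() helpers are used here.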
def setup_logging(log_file_path):
    # Set up logging before running any other code.
    # http://inventwithpython.com/blog/2012/04/06/stop-using-print-for-debugging-a-5-minute-quickstart-guide-to-pythons-logging-module/
    assert len(log_file_path) > 1
    assert isinstance(log_file_path, str)
    global logger
    log_file_folder = os.path.split(log_file_path)[0]
    # makedirs() chokes on an empty string, so only create the folder if the
    # path actually contains one.
    if log_file_folder and not os.path.exists(log_file_folder):
        os.makedirs(log_file_folder)
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    # Log everything to a file...
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # ...and echo it to the console.
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logging.debug('Logging started.')
    return
def uniquify(seq, idfun=None):
# List uniquifier from
# http://www.peterbe.com/plog/uniqifiers-benchmark
# order preserving
if idfun is None:
def idfun(x): return x
seen = {}
result = []
for item in seq:
marker = idfun(item)
# in old Python versions:
# if seen.has_key(marker)
# but in new ones:
if marker in seen: continue
seen[marker] = 1
result.append(item)
return result
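# Note: on Python 3.7+, the default-idfun case above can also be written as
# list(dict.fromkeys(seq)), since dicts preserve insertion order.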
def walk_for_file_paths(start_path):
    """Use os.walk to collect the paths of all files below a starting path."""
    logging.debug("Starting walk. start_path: " + start_path)
    assert isinstance(start_path, str)
    matches = []
    for root, dirs, files in os.walk(start_path):
        # Scanning /json/ is far too slow for large folders, so skip it.
        dirs[:] = [d for d in dirs if d not in ['json']]
        c = 0
        logging.debug("root: " + root)
        for filename in files:
            c += 1
            if (c % 1000) == 0:
                logging.debug("File #" + str(c) + ": " + filename)
            match = os.path.join(root, filename)
            matches.append(match)
        logging.debug("end folder")
    logging.debug("Finished walk.")
    return matches
def hash_file(file_path):
"""Generate a SHA512 hash for a file"""
# http://www.pythoncentral.io/hashing-files-with-python/
BLOCKSIZE = 65536
hasher = hashlib.sha512()
with open(file_path, 'rb') as afile:
buf = afile.read(BLOCKSIZE)
while len(buf) > 0:
hasher.update(buf)
buf = afile.read(BLOCKSIZE)
    file_hash = u"" + hasher.hexdigest()  # convert to unicode
return file_hash
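# SHA512 collisions are not practically constructible, so equal hashes are
# treated below as proof that two files have identical contents, with no
# byte-for-byte comparison.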
def find_duplicates(input_folder):
    """Find duplicate files in a folder by comparing SHA512 hashes"""
    logging.info("Looking for duplicates in " + input_folder)
    file_paths = walk_for_file_paths(input_folder)
    hash_dict = {}  # {hash: first file_path seen with that hash}
    hash_matches = []
    logging.info("Generating and comparing hashes")
    c = 0
    for file_path in file_paths:
        c += 1
        if (c % 1000) == 0:
            logging.debug("Hashing file #" + str(c) + ": " + file_path)
        file_hash = hash_file(file_path)
        # Check whether this hash has been seen before
        if file_hash in hash_dict:
            logging.info("Match! " + hash_dict[file_hash] + " has the same hash as " + file_path)
            # Record both copies: the first file seen with this hash and the current one
            hash_matches.append(hash_dict[file_hash])
            hash_matches.append(file_path)
        else:
            # No match; remember this path for future comparisons
            hash_dict[file_hash] = file_path
    # The first file of a set is appended once per later duplicate, so
    # uniquify the move list before returning it.
    files_to_move = uniquify(hash_matches)
    return files_to_move
def move_duplicates(input_folder, output_folder, pickle_path, no_move=False):
    """Find and move all duplicate files in a folder"""
    duplicates_to_move = find_duplicates(input_folder)
    logging.info("Found " + str(len(duplicates_to_move)) + " items with hashes matching another file")
    logging.debug("Duplicates found: " + str(duplicates_to_move))
    # Save the list first so it can be replayed later with move_from_pickle()
    derpibooru_dl.save_pickle(pickle_path, duplicates_to_move)
    move_files(duplicates_to_move, output_folder, no_move=no_move)
    logging.info("Done moving duplicates")
    return
def move_from_pickle(pickle_path, output_folder, no_move=False):
    """Move (or copy) the files listed in a previously saved pickle"""
    logging.info("Moving files from pickle: " + pickle_path)
    file_paths = derpibooru_dl.read_pickle(pickle_path)
    # Pass no_move through instead of hardcoding it
    move_files(file_paths, output_folder, no_move=no_move)
    return
def move_files(file_paths, output_folder, no_move=False):
    """Move (or copy, if no_move is True) every file in a list to output_folder"""
    logging.info("Moving files...")
    for file_path in file_paths:
        move_file(file_path, output_folder, no_move)
    logging.info("Finished moving files.")
    return
def move_file(from_path, output_folder, no_move=False):
    """Move a file to a specified folder, or copy it if no_move is True"""
    # Figure out the filename
    filename = os.path.basename(from_path)
    # Build the output path. Note: duplicates that share a basename will
    # collide at this path.
    output_path = os.path.join(output_folder, filename)
    try:
        # Ensure the destination folder exists
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        if no_move is True:
            logging.info("Copying " + from_path + " to " + output_path)
            shutil.copy2(from_path, output_path)
        elif no_move is False:
            logging.info("Moving " + from_path + " to " + output_path)
            shutil.move(from_path, output_path)
        else:
            raise ValueError("no_move must be True or False")
    except IOError as err:
        logging.error("Error copying/moving files!")
        logging.debug(repr(locals()))
        logging.exception(err)
    return
def main():
    input_folder = "h:\\derpibooru_dl\\download\\combined_downloads"
    output_folder = "duplicates"
    pickle_path = os.path.join("debug", "found_duplicates.pickle")
    move_duplicates(input_folder, output_folder, pickle_path, no_move=True)
    # To replay a previously saved scan instead of re-hashing everything:
    #move_from_pickle(pickle_path, output_folder, no_move=False)
    return
if __name__ == '__main__':
    # Set up logging before anything else runs
    setup_logging(os.path.join("debug", "derpibooru_move_duplicate_files_log.txt"))
    try:
        main()
    except Exception as err:
        # Log unhandled exceptions so crashes leave a trace in the log file
        logging.critical("Unhandled exception!")
        logging.critical(str(type(err)))
        logging.exception(err)
    logging.info("Program finished.")
    #raw_input("Press return to close")
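# Typical workflow, as a sketch of how the pieces fit together: run
# move_duplicates() with no_move=True to hash everything, save the duplicate
# list to debug/found_duplicates.pickle, and copy the matches into
# output_folder for review; once the results look right, replay the saved
# list with move_from_pickle() and no_move=False to actually move the files.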