Here's a python script I wrote some time ago that does the same.
I had a customer with a digital camera who kept copying the same files to her hard disk over and over again; she didn't quite seem to understand that you need to delete the pictures from the camera after you've copied them ...
Communicating with her was difficult not just because of her complete cluelessness when it came to computers, but also because she was far removed from being intelligible because of the heavy Flemish accent and lack of dentures.
Anyway, I needed to clean up the mess, which is how this script was created.
I only used this on Windows by the way, don't think I've tried using it on BSD or the likes, it should work though.
Code:
#!/usr/bin/env python
#
# Copyright (c) 2009, Martin Tournoij <mtournoij@aragorn.nl>
#
# Aragorn Computers & Automatisering
# http://www.aragorn.nl/
#
# Check for duplicate entries and remove them based on SHA256 hash.
#
import getopt
import hashlib
import os
import pprint
import sys
# Make the sibling `aragorn' directory importable when it exists next to the
# current working directory.  Automagic slash/backslash conversion doesn't
# work with pythonpath, so the separator is picked by hand per platform.
if os.path.isdir('../aragorn'):
    on_windows = sys.platform.startswith('win')
    sys.path.append('..\\aragorn' if on_windows else '../aragorn')
import aragorn
def Usage():
    """
    Print the command-line usage summary to standard output.

    The program name shown on the first line is taken from sys.argv[0].
    """
    # Each print has exactly one parenthesised argument, so the output is the
    # same whether print is a statement (Python 2) or a function (Python 3).
    print("%s [-hpt]" % sys.argv[0])
    print("")
    print("\t-h\tHelp")
    print("\t-p\tPath to dir to check for duplicates.")
    print("\t-t\tPath to use as 'trash bin' do not use a subdir of -p")
    print("")
def GetTree(dir, prev, dlist, flist, error, size, verbose=None):
    """
    Get list of files/dirs recursively.

    dir     -- root directory of the scan.
    prev    -- path, relative to dir, of the directory to list ('' for the
               root itself).
    dlist   -- list, extended in place with the relative path of every
               directory found.
    flist   -- list, extended in place with the relative path of every file
               whose size could be read.
    error   -- list, extended in place with [relative path, exception] pairs
               for directories that could not be listed and files whose size
               could not be read.
    size    -- one-element list; size[0] is increased by the size in bytes
               of every file added to flist.
    verbose -- when true, print a line for every directory and file visited.

    Returns the tuple (dlist, flist, error, size); these are the same
    objects that were passed in.
    """
    try:
        # Sorted so the order of flist does not depend on the order the
        # operating system happens to return directory entries in.
        entries = sorted(os.listdir(os.path.join(dir, prev)))
    except OSError:
        # Report the directory that failed.  No entry path exists yet when
        # the listing itself fails, so `prev' is the only meaningful name.
        error.append([prev, sys.exc_info()[1]])
        print("Error adding directory `%s'" % prev)
        return dlist, flist, error, size

    for f in entries:
        path = os.path.join(prev, f)
        full = os.path.join(dir, path)

        if os.path.isdir(full):
            if verbose:
                print("Adding directory `%s'" % path)
            dlist.append(path)
            GetTree(dir, path, dlist, flist, error, size, verbose)
            continue

        try:
            size[0] += os.path.getsize(full)
        except OSError:
            if verbose:
                print("Error adding file `%s'" % path)
            error.append([path, sys.exc_info()[1]])
        else:
            if verbose:
                print("Adding file `%s'" % path)
            flist.append(path)

    return dlist, flist, error, size
# Script entry point: parse options, hash every file below -p, and move all
# but the first file of each group of identical hashes into the -t directory.
if __name__ == '__main__':
    try:
        options, arguments = getopt.getopt(sys.argv[1:], 'hp:t:')
    except getopt.GetoptError:
        # str() of a GetoptError is its message; printing the exception
        # avoids tuple-unpacking it, which only works on Python 2.
        print(sys.exc_info()[1])
        print("")
        Usage()
        # NOTE(review): presumably MyExit terminates the process; otherwise
        # `options' is undefined below -- confirm against the aragorn module.
        aragorn.MyExit(1)

    optDict = {
        'path': 'c:/images/',
        'trash': 'c:/trash/'
    }

    for opt, arg in options:
        if opt == '-h':
            Usage()
            aragorn.MyExit(0)
        elif opt == '-p':
            optDict['path'] = arg
        elif opt == '-t':
            optDict['trash'] = arg

    if not os.path.exists(optDict['path']):
        print("Dir to check `%s' does not exist." % optDict['path'])
        aragorn.MyExit(1)

    aragorn.MakeDir(optDict['trash'])

    dlist, flist, error, size = GetTree(optDict['path'], '', [], [], [], [0])

    # Group the relative file names by hash; every list with more than one
    # entry is a set of duplicates.
    # NOTE(review): assumes aragorn.SHA256 takes a file path and returns a
    # hashable digest of that file's contents -- confirm.
    hashdict = {}
    for fname in flist:
        sha = aragorn.SHA256(os.path.join(optDict['path'], fname))
        hashdict.setdefault(sha, []).append(fname)

    pprint.pprint(hashdict)

    for samefiles in hashdict.values():
        # Keep first item, move rest to trash
        for f in samefiles[1:]:
            src = os.path.join(optDict['path'], f)
            dst = os.path.join(optDict['trash'], f)
            try:
                # `f' may contain sub-directories that do not exist in the
                # trash dir yet; os.rename does not create them itself.
                dstdir = os.path.dirname(dst)
                if not os.path.isdir(dstdir):
                    os.makedirs(dstdir)
                os.rename(src, dst)
            except OSError:
                print("Error renaming `%s' to `%s'" % (src, dst))