View Single Post
  #2   (View Single Post)  
Old 24th April 2010
Carpetsmoker's Avatar
Carpetsmoker Carpetsmoker is offline
Real Name: Martin
Tcpdump Spy
 
Join Date: Apr 2008
Location: Netherlands
Posts: 2,243
Default

Here's a Python script I wrote some time ago that does the same.

I had a customer with a digital camera who kept copying the same files to her hard disk over and over again; she didn't quite seem to understand that you need to delete the pictures from the camera after you've copied them ...
Communicating with her was difficult not just because of her complete cluelessness when it came to computers, but also because she was far removed from being intelligible because of the heavy Flemish accent and lack of dentures.

Anyway, I needed to clean up the mess, which is how this script was created.

I only used this on Windows, by the way; I don't think I've tried it on BSD or the like, but it should work.

Code:
#!/usr/bin/env python
#
# Copyright (c) 2009, Martin Tournoij <mtournoij@aragorn.nl>
#
# Aragorn Computers & Automatisering
# http://www.aragorn.nl/
#
# Check for duplicate entries and remove them based on SHA256 hash.
#

import getopt
import hashlib
import os
import pprint
import sys

# Slashes in sys.path entries are not converted automatically, so the
# separator that matches the running platform is spelled out by hand.
if os.path.isdir('../aragorn'):
	sys.path.append(
		'..\\aragorn' if sys.platform.startswith('win') else '../aragorn'
	)

import aragorn

def Usage():
	"""
	Print the command line help to stdout.

	The program name shown is taken from sys.argv[0].
	"""
	# A single parenthesised argument prints the same text whether `print`
	# is the Python 2 statement or the Python 3 function.
	print("%s [-hpt]" % sys.argv[0])
	print("")
	print("\t-h\tHelp")
	print("\t-p\tPath to dir to check for duplicates.")
	print("\t-t\tPath to use as 'trash bin' do not use a subdir of -p")
	print("")

def GetTree(dir, prev, dlist, flist, error, size, verbose=None):
	"""
	Get list of files/dirs recursively

	Lists os.path.join(dir, prev), descends into every sub-directory and
	fills the caller-supplied accumulators in place.

	dir     -- root directory of the scan.
	prev    -- directory to list, relative to dir ('' for the root itself).
	dlist   -- list that receives the relative path of each sub-directory.
	flist   -- list that receives the relative path of each non-directory
	           entry whose size could be read.
	error   -- list that receives a [relative path, exception] pair for
	           each entry or directory that could not be processed.
	size    -- one-element list; size[0] is increased by each file's size.
	verbose -- when true, print a line for every entry added or skipped.

	Returns the same dlist, flist, error and size objects as a tuple.
	"""
	try:
		for f in os.listdir(os.path.join(dir, prev)):
			path = os.path.join(prev, f)
			if os.path.isdir(os.path.join(dir, path)):
				if verbose:
					print("Adding directory `%s'" % path)
				dlist.append(path)
				GetTree(dir, path, dlist, flist, error, size, verbose)
			else:
				try:
					size[0] += os.path.getsize(os.path.join(dir, path))
					if verbose:
						print("Adding file `%s'" % path)
					flist.append(path)
				# `Exception` keeps the scan best-effort without also
				# swallowing KeyboardInterrupt / SystemExit.
				except Exception:
					if verbose:
						print("Error adding file `%s'" % path)
					error.append([path, sys.exc_info()[1]])
	except Exception:
		# Report the directory being listed (`prev`): `path` is still
		# unbound when os.listdir() itself fails, and otherwise names the
		# last entry handled rather than the directory that failed.
		error.append([prev, sys.exc_info()[1]])
		print("Error adding directory `%s'" % prev)

	return dlist, flist, error, size

if __name__ == '__main__':
	# Script entry point: group every file below -p by the digest returned
	# by aragorn.SHA256, keep one file per digest and move the remaining
	# copies into the -t trash dir. Everything lives under this guard so
	# importing the module never touches the file system.
	try:
		options, arguments = getopt.getopt(sys.argv[1:], 'hp:t:')
	except getopt.GetoptError:
		# GetoptError exposes its text as .msg; no need to unpack the
		# exception instance like a tuple.
		print(sys.exc_info()[1].msg)
		print("")
		Usage()
		# NOTE(review): presumably MyExit terminates the process; if it can
		# return, `options` is unbound below -- confirm.
		aragorn.MyExit(1)

	optDict = {
		'path': 'c:/images/',
		'trash': 'c:/trash/'
	}

	for opt, arg in options:
		if opt == '-h':
			Usage()
			aragorn.MyExit(0)
		if opt == '-p':
			optDict['path'] = arg
		if opt == '-t':
			optDict['trash'] = arg

	if not os.path.exists(optDict['path']):
		print("Dir to check `%s' does not exist." % optDict['path'])
		aragorn.MyExit(1)

	aragorn.MakeDir(optDict['trash'])

	dlist, flist, error, size = GetTree(optDict['path'], '', [], [], [], [0])

	# Map digest -> relative paths of all files sharing that digest.
	hashdict = {}
	for fname in flist:
		sha = aragorn.SHA256(os.path.join(optDict['path'], fname))
		hashdict.setdefault(sha, []).append(fname)

	pprint.pprint(hashdict)

	for samefiles in hashdict.values():
		# Keep first item, move rest to trash
		for f in samefiles[1:]:
			src = os.path.join(optDict['path'], f)
			dst = os.path.join(optDict['trash'], f)

			try:
				# `f` may include sub-directories; os.rename() does not
				# create them, so mirror the layout inside the trash dir.
				dstdir = os.path.dirname(dst)
				if dstdir and not os.path.isdir(dstdir):
					os.makedirs(dstdir)
				os.rename(src, dst)
			except OSError:
				print("Error renaming `%s' to `%s'" % (src, dst))
__________________
UNIX was not designed to stop you from doing stupid things, because that would also stop you from doing clever things.
Reply With Quote