#!/usr/bin/env python
#
# find-duplicates
# by Keith Gaughan
#
# Finds and lists any duplicate files in the given directories.
#
# Copyright (c) Keith Gaughan, 2008.
# All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# This license is subject to the laws and courts of the Republic of Ireland.
#
from __future__ import with_statement
import sys
import os
import hashlib
import getopt
import filecmp
# Command-line synopsis; the single %s placeholder receives the program name.
# -m requires an argument (crc or md5), and when no paths are given the
# current directory is scanned.
USAGE = "Usage: %s [-h] [-m crc|md5] [path ...]"
class crc(object):
    """
    Wraps up zlib.crc32 to make it suitable for use as a faster but less
    accurate alternative to the hashlib.* classes.

    Only part of the hashlib interface is provided: an optional initial
    block passed to the constructor, update(), digest() and hexdigest().
    """

    def __init__(self, initial=None):
        """Start a new checksum, optionally feeding it a first block."""
        self.crc = 0
        if initial is not None:
            self.update(initial)

    def update(self, block):
        """Fold another block of data into the running checksum."""
        import zlib
        self.crc = zlib.crc32(block, self.crc)

    def hexdigest(self):
        """Return the checksum as an upper-case hexadecimal string."""
        return "%X" % self.digest()

    def digest(self):
        """
        Return the checksum as a non-negative integer.

        Unlike the hashlib classes this is an integer rather than a byte
        string.
        """
        # zlib.crc32 may return a signed value on some Python versions;
        # masking gives the same unsigned 32-bit result everywhere.
        return self.crc & 0xffffffff
def all_files(*tops):
    """
    Walk each of the given directories in turn and yield the path of every
    regular file found beneath it.
    """
    for root in tops:
        for parent, _subdirs, names in os.walk(root):
            candidates = (os.path.join(parent, name) for name in names)
            for candidate in candidates:
                # Skip anything os.walk listed that is not a regular file.
                if not os.path.isfile(candidate):
                    continue
                yield candidate
def digest(file, method=hashlib.md5):
    """
    Compute a digest of the contents of the file at the given path.

    method is a hashlib-style factory: calling it with no arguments must
    give an object with update() and digest() methods. The value returned
    is whatever that object's digest() method returns.
    """
    hasher = method()
    # Binary mode so the bytes hashed are exactly the bytes on disk, with no
    # newline translation or text decoding.
    with open(file, "rb") as f:
        # Feed the file through in fixed-size pieces rather than reading it
        # all into memory at once.
        while True:
            block = f.read(65536)
            if not block:
                break
            hasher.update(block)
    return hasher.digest()
def true_duplicates(files):
    """
    Partition the given files by content and yield each set of two or more
    files whose contents are identical.

    The first remaining file is compared byte-for-byte against all the
    others; the ones that match form a set, and the ones that do not are
    carried over to the next round.
    """
    remaining = files
    while len(remaining) > 1:
        reference = remaining[0]
        matches = [reference]
        leftovers = []
        for candidate in remaining[1:]:
            # shallow=False forces a comparison of the actual contents.
            same = filecmp.cmp(reference, candidate, False)
            bucket = matches if same else leftovers
            bucket.append(candidate)
        if len(matches) > 1:
            yield matches
        remaining = leftovers
def group_by(groups, grouper, min_size=1):
    """
    Breaks each of the groups into smaller subgroups.

    Every item of a group is passed to grouper, and items for which it
    returns equal values end up in the same subgroup. Items from different
    input groups are never merged. Each subgroup with at least min_size
    items is yielded as a list.
    """
    for group in groups:
        subgroups = {}
        for item in group:
            # setdefault creates the list on first sight of a key, avoiding
            # a separate membership test.
            subgroups.setdefault(grouper(item), []).append(item)
        for members in subgroups.values():
            if len(members) >= min_size:
                yield members
def usage(message=None):
    """
    Print the usage summary and exit.

    Without a message the summary is written to stdout and the exit code is
    0. With a message, the message and then the summary are written to
    stderr and the exit code is 2.
    """
    if message:
        fh = sys.stderr
        exit_code = 2
        # message may be an exception object, hence the str().
        fh.write(str(message) + "\n")
    else:
        fh = sys.stdout
        exit_code = 0
    name = os.path.basename(sys.argv[0])
    fh.write(USAGE % (name,) + "\n")
    sys.exit(exit_code)
def main():
    """
    Command-line entry point.

    Parses the options (-h for help, -m to choose the grouping method),
    then narrows the files under the given paths (the current directory if
    none are given) first by size and then by digest, and finally by a full
    content comparison. Each set of duplicates is printed one path per
    line, with a blank line between sets.
    """
    try:
        opts, paths = getopt.getopt(sys.argv[1:], "hm:")
    except getopt.GetoptError:
        # Fetch the exception via sys.exc_info() so that the handler does
        # not depend on version-specific "except" binding syntax.
        # usage() exits, so execution never continues past this call.
        usage(sys.exc_info()[1])

    methods = {"crc": crc, "md5": hashlib.md5}
    method = crc
    for o, a in opts:
        if o == "-m":
            if a not in methods:
                usage("Unknown grouping method: %s" % (a,))
            method = methods[a]
        elif o == "-h":
            usage()
        else:
            usage("Unknown option: %s%s" % (o, a))

    if not paths:
        paths = ["."]

    # Start with a single group holding every file, then split it by
    # progressively more expensive criteria, dropping any subgroup that is
    # left with fewer than two files.
    groups = [all_files(*paths)]
    for grouper in [os.path.getsize, lambda path: digest(path, method)]:
        groups = group_by(groups, grouper, 2)

    first = True
    for group in groups:
        for files in sorted(true_duplicates(group)):
            # Blank line between sets, but not before the first one.
            if first:
                first = False
            else:
                sys.stdout.write("\n")
            for path in files:
                sys.stdout.write("%s\n" % (path,))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()