Here's a function that can delete duplicate files either from a single directory or recursively (i.e. duplicate files from different directories will be removed).
-- I've left in a lot of testing code; if you want to use it, test it carefully first to make sure my tests didn't miss anything.
-- It might use a lot of memory if your $path has a huge number of files under it.
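-- The $restart argument should be left at its default (true) when you call the function. It exists so that the static variables are cleared at the start of each top-level call; the function passes false for it when it calls itself recursively, so that results accumulate across subdirectories.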
-- It probably needs work (at the least, removal of the echo lines).
-- As the function stands, there's no telling which copy of a duplicate will be kept and which will be deleted (or from which directory). That depends on the order in which directory entries are read.
-- Allowance isn't made for file/directory permissions, so make sure first that they're correct.
-- Not meant for frequent use on an active server.
/**
 * Delete files whose contents duplicate a file that was already seen.
 *
 * Every non-directory entry under $path is hashed with MD5. The first file
 * with a given hash is kept; any later file with the same hash is unlinked
 * unless its base name is listed in $exempt. Which copy counts as "first"
 * depends on the order in which the directory entries are read.
 *
 * Progress is echoed as HTML (paths are HTML-escaped).
 *
 * Note: is_dir() resolves symbolic links, so with $recurse enabled a
 * symlinked directory is descended into like a real one.
 *
 * @param string $path    Directory to scan; a trailing separator is optional.
 * @param bool   $recurse When true, subdirectories are scanned as well and
 *                        duplicates are detected across directories.
 * @param array  $exempt  Base names (not full paths) of files that must
 *                        never be deleted.
 * @param bool   $restart Internal use: leave at the default. The function
 *                        passes false when recursing so that the static
 *                        state is kept instead of being reset.
 *
 * @return array|false false when $path is empty, is not a directory, or
 *                     cannot be opened on a top-level call; otherwise an
 *                     array with the keys 'ct' (number of files deleted),
 *                     'removed' (paths deleted) and 'not_removed' (paths
 *                     that were kept, skipped, or could not be deleted).
 */
function rm_dup_files($path, $recurse = false, $exempt = array(), $restart = true)
{
    // Hashes seen so far (hash => true) and the accumulated result. Both are
    // static so that the recursive calls share them with the top-level call.
    static $md5_arr = array();
    static $ret_arr = array();

    $path = (string) $path;
    // An empty path must not be turned into the filesystem root by the
    // separator normalisation below.
    $path_given = ($path !== '');
    $last_char = substr($path, -1);
    if ($path_given && $last_char !== '/' && $last_char !== '\\') {
        $path .= DIRECTORY_SEPARATOR;
    }

    if ($restart) {
        $md5_arr = array();
        $ret_arr = array('ct' => 0, 'removed' => array(), 'not_removed' => array());
        if (!$path_given || !is_dir($path)) {
            echo '!is_dir(' . htmlspecialchars($path, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . ')<br />';
            return false;
        }
    }

    $dir = dir($path);
    if (!$dir) {
        // The directory could not be opened (e.g. permissions). Report an
        // error for a top-level call; a recursive call just skips it.
        return $restart ? false : $ret_arr;
    }

    // Directory entries are always strings, so compare against string forms
    // of the exempt names with a strict check. This keeps integer entries in
    // $exempt working while preventing numeric-looking names such as '1e3'
    // and '1000' from matching each other.
    $exempt_names = array_map('strval', $exempt);
    $shown_path = htmlspecialchars($path, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    while (false !== ($entry = $dir->read())) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }

        $full = $path . $entry;
        $shown = htmlspecialchars($full, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

        if (is_dir($full)) {
            if ($recurse) {
                echo '<span style="color:blue">' . $shown_path . '</span><br />';
                echo '<span style="color:green">recursing ' . $shown . '</span><br />';
                rm_dup_files($full, true, $exempt, false);
            }
            continue;
        }

        // md5_file() hashes the file without loading it into memory in one
        // piece. An entry that cannot be read (bad permissions, dangling
        // symlink) is skipped: hashing a failed read would make it look
        // identical to an empty file and get one of the two deleted.
        $md5 = is_readable($full) ? md5_file($full) : false;
        if ($md5 === false) {
            echo $shown . ' => unreadable, skipped<br />';
            $ret_arr['not_removed'][] = $full;
            continue;
        }

        echo $shown . ' => ' . $md5;
        if (isset($md5_arr[$md5]) && !in_array($entry, $exempt_names, true)) {
            // Only report the file as removed when the deletion succeeded.
            if (unlink($full)) {
                $ret_arr['ct']++;
                $ret_arr['removed'][] = $full;
                echo ' => ' . '<span style="color:red">removed</span>';
            } else {
                $ret_arr['not_removed'][] = $full;
            }
        } else {
            $md5_arr[$md5] = true;
            $ret_arr['not_removed'][] = $full;
        }
        echo '<br />';
    }
    $dir->close();

    return $ret_arr;
}
Some test code:
// Example usage. '/path' is a placeholder for the directory to clean up.
$path = '/path';
// Base names that must survive even if they duplicate another file.
$exempt_files = array('file_a', 'file_2', 'file_iii');

// First run: recurse into subdirectories, protecting the exempt names.
$remove_result = rm_dup_files($path, true, $exempt_files);
if ($remove_result) {
    echo '<pre>' . print_r($remove_result, true) . '</pre>';
} else {
    echo 'error';
}

// Second run: top-level directory only, nothing exempt.
$remove_result = rm_dup_files($path);
if ($remove_result) {
    echo '<pre>' . print_r($remove_result, true) . '</pre>';
} else {
    echo 'error';
}