| 1 | /* $Id: kDeDup.c 3296 2019-01-22 21:29:08Z bird $ */
|
|---|
| 2 | /** @file
|
|---|
| 3 | * kDeDup - Utility that finds duplicate files, optionally hardlinking them.
|
|---|
| 4 | */
|
|---|
| 5 |
|
|---|
| 6 | /*
|
|---|
| 7 | * Copyright (c) 2016 knut st. osmundsen <bird-kBuild-spamx@anduin.net>
|
|---|
| 8 | *
|
|---|
| 9 | * This file is part of kBuild.
|
|---|
| 10 | *
|
|---|
| 11 | * kBuild is free software; you can redistribute it and/or modify
|
|---|
| 12 | * it under the terms of the GNU General Public License as published by
|
|---|
| 13 | * the Free Software Foundation; either version 2 of the License, or
|
|---|
| 14 | * (at your option) any later version.
|
|---|
| 15 | *
|
|---|
| 16 | * kBuild is distributed in the hope that it will be useful,
|
|---|
| 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|---|
| 19 | * GNU General Public License for more details.
|
|---|
| 20 | *
|
|---|
| 21 | * You should have received a copy of the GNU General Public License
|
|---|
| 22 | * along with kBuild; if not, write to the Free Software
|
|---|
| 23 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|---|
| 24 | *
|
|---|
| 25 | */
|
|---|
| 26 |
|
|---|
| 27 | /*******************************************************************************
|
|---|
| 28 | * Header Files *
|
|---|
| 29 | *******************************************************************************/
|
|---|
| 30 | #include <k/kTypes.h>
|
|---|
| 31 | #include <errno.h>
|
|---|
| 32 | #include <string.h>
|
|---|
| 33 | #include <stdio.h>
|
|---|
| 34 | #include <wchar.h>
|
|---|
| 35 | #if K_OS != K_OS_WINDOWS
|
|---|
| 36 | # include <stdlib.h>
|
|---|
| 37 | # include <unistd.h>
|
|---|
| 38 | # include <sys/fcntl.h>
|
|---|
| 39 | # include <sys/stat.h>
|
|---|
| 40 | #endif
|
|---|
| 41 |
|
|---|
| 42 | #include "md5.h"
|
|---|
| 43 | //#include "sha2.h"
|
|---|
| 44 |
|
|---|
| 45 | #if K_OS == K_OS_WINDOWS
|
|---|
| 46 | # include "nt/ntstuff.h"
|
|---|
| 47 | # include "nt/ntstat.h"
|
|---|
| 48 | # include "nt/fts-nt.h"
|
|---|
| 49 | # include "nt/nthlp.h"
|
|---|
| 50 | # include "nt/ntunlink.h"
|
|---|
| 51 | #else
|
|---|
| 52 | # include "fts.h"
|
|---|
| 53 | #endif
|
|---|
| 54 |
|
|---|
| 55 |
|
|---|
| 56 | /*********************************************************************************************************************************
|
|---|
| 57 | * Structures and Typedefs *
|
|---|
| 58 | *********************************************************************************************************************************/
|
|---|
| 59 | /**
|
|---|
| 60 | * The key is made up of two cryptographic hashes, collisions are
|
|---|
| 61 | * highly unlikely (once SHA2 is implemented).
|
|---|
| 62 | */
|
|---|
| 63 | typedef struct KDUPFILENODEKEY
|
|---|
| 64 | {
|
|---|
| 65 | /** The MD5 digest of the file. */
|
|---|
| 66 | KU8 abMd5[16];
|
|---|
| 67 | /** The 256-bit SHA-2 digest of the file. */
|
|---|
| 68 | KU8 abSha2[32];
|
|---|
| 69 | } KDUPFILENODEKEY;
|
|---|
| 70 | /** Pointer to a file node.*/
|
|---|
| 71 | typedef struct KDUPFILENODE *PKDUPFILENODE;
|
|---|
| 72 | /**
|
|---|
| 73 | * Hash tree node.
|
|---|
| 74 | */
|
|---|
| 75 | typedef struct KDUPFILENODE
|
|---|
| 76 | {
|
|---|
| 77 | /** The is made up of two hashes. */
|
|---|
| 78 | KDUPFILENODEKEY mKey;
|
|---|
| 79 | /** Left branch. */
|
|---|
| 80 | PKDUPFILENODE mpLeft;
|
|---|
| 81 | /** Right branch. */
|
|---|
| 82 | PKDUPFILENODE mpRight;
|
|---|
| 83 | /** Tree height (hmm). */
|
|---|
| 84 | KU8 mHeight;
|
|---|
| 85 |
|
|---|
| 86 | /** The inode number. */
|
|---|
| 87 | KU64 uInode;
|
|---|
| 88 | /** The device number. */
|
|---|
| 89 | KU64 uDev;
|
|---|
| 90 |
|
|---|
| 91 | /** Pointer to next hard linked node (same inode and udev values). */
|
|---|
| 92 | PKDUPFILENODE pNextHardLink;
|
|---|
| 93 | /** Pointer to next duplicate node. */
|
|---|
| 94 | PKDUPFILENODE pNextDup;
|
|---|
| 95 | /** Pointer to next duplicate node on the global list. */
|
|---|
| 96 | PKDUPFILENODE pNextGlobalDup;
|
|---|
| 97 |
|
|---|
| 98 | /** The path to this file (variable size). */
|
|---|
| 99 | #if K_OS == K_OS_WINDOWS
|
|---|
| 100 | wchar_t wszPath[1];
|
|---|
| 101 | #else
|
|---|
| 102 | char szPath[1];
|
|---|
| 103 | #endif
|
|---|
| 104 | } KDUPFILENODE;
|
|---|
| 105 |
|
|---|
| 106 | #if K_OS == K_OS_WINDOWS
|
|---|
| 107 | # define PATH_PRI "ls"
|
|---|
| 108 | # define PATH_MEMB wszPath
|
|---|
| 109 | # define FTS_ACCPATH fts_wcsaccpath
|
|---|
| 110 | #else
|
|---|
| 111 | # define PATH_PRI "s"
|
|---|
| 112 | # define PATH_MEMB szPath
|
|---|
| 113 | # define FTS_ACCPATH fts_accpath
|
|---|
| 114 | #endif
|
|---|
| 115 |
|
|---|
| 116 | /*#define KAVL_EQUAL_ALLOWED*/
|
|---|
| 117 | #define KAVL_CHECK_FOR_EQUAL_INSERT
|
|---|
| 118 | #define KAVL_MAX_STACK 32
|
|---|
| 119 | /*#define KAVL_RANGE */
|
|---|
| 120 | /*#define KAVL_OFFSET */
|
|---|
| 121 | /*#define KAVL_STD_KEY_COMP*/
|
|---|
| 122 | #define KAVLKEY KDUPFILENODEKEY
|
|---|
| 123 | #define KAVLNODE KDUPFILENODE
|
|---|
| 124 | #define KAVL_FN(name) kDupFileTree_ ## name
|
|---|
| 125 | #define KAVL_TYPE(prefix,name) prefix ## KDUPFILENODE ## name
|
|---|
| 126 | #define KAVL_INT(name) KDUPFILENODEINT ## name
|
|---|
| 127 | #define KAVL_DECL(rettype) static rettype
|
|---|
| 128 | #define KAVL_G(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) > 0 )
|
|---|
| 129 | #define KAVL_E(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) == 0 )
|
|---|
| 130 | #define KAVL_NE(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) != 0 )
|
|---|
| 131 |
|
|---|
| 132 | #define register
|
|---|
| 133 | #include <k/kAvlTmpl/kAvlBase.h>
|
|---|
| 134 | //#include <k/kAvlTmpl/kAvlDoWithAll.h>
|
|---|
| 135 | //#include <k/kAvlTmpl/kAvlEnum.h> - busted
|
|---|
| 136 | #include <k/kAvlTmpl/kAvlGet.h>
|
|---|
| 137 | //#include <k/kAvlTmpl/kAvlGetBestFit.h>
|
|---|
| 138 | //#include <k/kAvlTmpl/kAvlGetWithParent.h>
|
|---|
| 139 | //#include <k/kAvlTmpl/kAvlRemove2.h>
|
|---|
| 140 | //#include <k/kAvlTmpl/kAvlRemoveBestFit.h>
|
|---|
| 141 | #include <k/kAvlTmpl/kAvlUndef.h>
|
|---|
| 142 | #undef register
|
|---|
| 143 |
|
|---|
| 144 |
|
|---|
| 145 | /** Pointer to a size tree node. */
|
|---|
| 146 | typedef struct KDUPSIZENODE *PKDUPSIZENODE;
|
|---|
| 147 | /**
|
|---|
| 148 | * Size tree node.
|
|---|
| 149 | */
|
|---|
| 150 | typedef struct KDUPSIZENODE
|
|---|
| 151 | {
|
|---|
| 152 | /** The file size. */
|
|---|
| 153 | KU64 mKey;
|
|---|
| 154 | /** Left branch. */
|
|---|
| 155 | PKDUPSIZENODE mpLeft;
|
|---|
| 156 | /** Right branch. */
|
|---|
| 157 | PKDUPSIZENODE mpRight;
|
|---|
| 158 | /** Tree height (hmm). */
|
|---|
| 159 | KU8 mHeight;
|
|---|
| 160 | /** Number of files. */
|
|---|
| 161 | KU32 cFiles;
|
|---|
| 162 | /** Tree with same sized files.
|
|---|
| 163 | * When cFiles is 1 the root node does not have hashes calculated yet. */
|
|---|
| 164 | KDUPFILENODEROOT FileRoot;
|
|---|
| 165 | } KDUPSIZENODE;
|
|---|
| 166 |
|
|---|
| 167 | /*#define KAVL_EQUAL_ALLOWED*/
|
|---|
| 168 | #define KAVL_CHECK_FOR_EQUAL_INSERT
|
|---|
| 169 | #define KAVL_MAX_STACK 32
|
|---|
| 170 | /*#define KAVL_RANGE */
|
|---|
| 171 | /*#define KAVL_OFFSET */
|
|---|
| 172 | #define KAVL_STD_KEY_COMP
|
|---|
| 173 | #define KAVLKEY KU64
|
|---|
| 174 | #define KAVLNODE KDUPSIZENODE
|
|---|
| 175 | #define KAVL_FN(name) kDupSizeTree_ ## name
|
|---|
| 176 | #define KAVL_TYPE(prefix,name) prefix ## KDUPSIZENODE ## name
|
|---|
| 177 | #define KAVL_INT(name) KDUPSIZENODEINT ## name
|
|---|
| 178 | #define KAVL_DECL(rettype) static rettype
|
|---|
| 179 |
|
|---|
| 180 | #include <k/kAvlTmpl/kAvlBase.h>
|
|---|
| 181 | //#include <k/kAvlTmpl/kAvlDoWithAll.h>
|
|---|
| 182 | //#include <k/kAvlTmpl/kAvlEnum.h> - busted
|
|---|
| 183 | #include <k/kAvlTmpl/kAvlGet.h>
|
|---|
| 184 | //#include <k/kAvlTmpl/kAvlGetBestFit.h>
|
|---|
| 185 | //#include <k/kAvlTmpl/kAvlGetWithParent.h>
|
|---|
| 186 | //#include <k/kAvlTmpl/kAvlRemove2.h>
|
|---|
| 187 | //#include <k/kAvlTmpl/kAvlRemoveBestFit.h>
|
|---|
| 188 | #include <k/kAvlTmpl/kAvlUndef.h>
|
|---|
| 189 |
|
|---|
| 190 |
|
|---|
| 191 | /*********************************************************************************************************************************
|
|---|
| 192 | * Global Variables *
|
|---|
| 193 | *********************************************************************************************************************************/
|
|---|
| 194 | /** The verbosity level. */
|
|---|
| 195 | static unsigned g_cVerbosity = 0;
|
|---|
| 196 |
|
|---|
| 197 | /** Whether to recurse into subdirectories. */
|
|---|
| 198 | static KBOOL g_fRecursive = K_FALSE;
|
|---|
| 199 | /** Whether to recurse into symlinked subdirectories. */
|
|---|
| 200 | static KBOOL g_fRecursiveViaSymlinks = K_FALSE;
|
|---|
| 201 | /** Whether to follow symbolicly linked files. */
|
|---|
| 202 | static KBOOL g_fFollowSymlinkedFiles = K_TRUE;
|
|---|
| 203 |
|
|---|
| 204 | /** Minimum file size to care about. */
|
|---|
| 205 | static KU64 g_cbMinFileSize = 1;
|
|---|
| 206 | /** Maximum file size to care about. */
|
|---|
| 207 | static KU64 g_cbMaxFileSize = KU64_MAX;
|
|---|
| 208 |
|
|---|
| 209 | /** The root of the size tree. */
|
|---|
| 210 | static KDUPSIZENODEROOT g_SizeRoot;
|
|---|
| 211 |
|
|---|
| 212 | /** Global list of duplicate file with duplicates.
|
|---|
| 213 | * @remarks This only contains the files in the hash tree, not the ones on
|
|---|
| 214 | * the KDUPFILENODE::pNextDup list. */
|
|---|
| 215 | static PKDUPFILENODE g_pDuplicateHead = NULL;
|
|---|
| 216 | /** Where to insert the next file with duplicates. */
|
|---|
| 217 | static PKDUPFILENODE *g_ppNextDuplicate = &g_pDuplicateHead;
|
|---|
| 218 |
|
|---|
| 219 | /** Number of files we're tracking. */
|
|---|
| 220 | static KU64 g_cFiles = 0;
|
|---|
| 221 | /** Number of hardlinked files or files entered more than once. */
|
|---|
| 222 | static KU64 g_cHardlinked = 0;
|
|---|
| 223 | /** Number of duplicates files (not hardlinked). */
|
|---|
| 224 | static KU64 g_cDuplicates = 0;
|
|---|
| 225 | /** Number of duplicates files that can be hardlinked. */
|
|---|
| 226 | static KU64 g_cDuplicatesSaved = 0;
|
|---|
| 227 | /** Size that could be saved if the duplicates were hardlinked. */
|
|---|
| 228 | static KU64 g_cbDuplicatesSaved = 0;
|
|---|
| 229 |
|
|---|
| 230 |
|
|---|
| 231 |
|
|---|
| 232 | /**
|
|---|
| 233 | * Wrapper around malloc() that complains when out of memory.
|
|---|
| 234 | *
|
|---|
| 235 | * @returns Pointer to allocated memory
|
|---|
| 236 | * @param cb The size of the memory to allocate.
|
|---|
| 237 | */
|
|---|
| 238 | static void *kDupAlloc(KSIZE cb)
|
|---|
| 239 | {
|
|---|
| 240 | void *pvRet = malloc(cb);
|
|---|
| 241 | if (pvRet)
|
|---|
| 242 | return pvRet;
|
|---|
| 243 | fprintf(stderr, "kDeDup: error: out of memory! (cb=%#zx)\n", cb);
|
|---|
| 244 | return NULL;
|
|---|
| 245 | }
|
|---|
| 246 |
|
|---|
| 247 | /** Wrapper around free() for symmetry. */
|
|---|
| 248 | #define kDupFree(ptr) free(ptr)
|
|---|
| 249 |
|
|---|
| 250 | #if K_OS != K_OS_WINDOWS
|
|---|
| 251 | /** Wrapper around read() that hides EINTR and such. */
|
|---|
| 252 | static ssize_t kDupReadFile(int fd, void *pvBuf, size_t cbToRead)
|
|---|
| 253 | {
|
|---|
| 254 | ssize_t cbRet;
|
|---|
| 255 | do
|
|---|
| 256 | cbRet = read(fd, pvBuf, cbToRead);
|
|---|
| 257 | while (cbRet < 0 && errno == EINTR);
|
|---|
| 258 | if (cbRet > 0 && (size_t)cbRet != cbToRead)
|
|---|
| 259 | {
|
|---|
| 260 | for (;;)
|
|---|
| 261 | {
|
|---|
| 262 | size_t cbLeft = cbToRead - (size_t)cbRet;
|
|---|
| 263 | ssize_t cbPart;
|
|---|
| 264 | do
|
|---|
| 265 | cbPart = read(fd, (KU8 *)pvBuf + (size_t)cbRet, cbLeft);
|
|---|
| 266 | while (cbPart < 0 && errno == EINTR);
|
|---|
| 267 | if (cbPart <= 0)
|
|---|
| 268 | break;
|
|---|
| 269 | cbRet += cbPart;
|
|---|
| 270 | }
|
|---|
| 271 | }
|
|---|
| 272 | return cbRet;
|
|---|
| 273 | }
|
|---|
| 274 | #endif
|
|---|
| 275 |
|
|---|
| 276 |
|
|---|
| 277 | static void kDupHashFile(PKDUPFILENODE pFileNode, FTSENT *pFtsEnt)
|
|---|
| 278 | {
|
|---|
| 279 | KSIZE i;
|
|---|
| 280 | PKDUPFILENODE *ppHash;
|
|---|
| 281 |
|
|---|
| 282 | /*
|
|---|
| 283 | * Open the file.
|
|---|
| 284 | */
|
|---|
| 285 | #if K_OS == K_OS_WINDOWS
|
|---|
| 286 | HANDLE hFile;
|
|---|
| 287 | if (pFtsEnt && pFtsEnt->fts_parent && pFtsEnt->fts_parent->fts_dirfd != INVALID_HANDLE_VALUE)
|
|---|
| 288 | hFile = birdOpenFileExW(pFtsEnt->fts_parent->fts_dirfd, pFtsEnt->fts_wcsname,
|
|---|
| 289 | FILE_READ_DATA | SYNCHRONIZE,
|
|---|
| 290 | FILE_ATTRIBUTE_NORMAL,
|
|---|
| 291 | FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
|
|---|
| 292 | FILE_OPEN,
|
|---|
| 293 | FILE_NON_DIRECTORY_FILE | FILE_OPEN_FOR_BACKUP_INTENT | FILE_SYNCHRONOUS_IO_NONALERT,
|
|---|
| 294 | OBJ_CASE_INSENSITIVE);
|
|---|
| 295 | else
|
|---|
| 296 | hFile = birdOpenFileExW(NULL, pFileNode->wszPath,
|
|---|
| 297 | FILE_READ_DATA | SYNCHRONIZE,
|
|---|
| 298 | FILE_ATTRIBUTE_NORMAL,
|
|---|
| 299 | FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
|
|---|
| 300 | FILE_OPEN,
|
|---|
| 301 | FILE_NON_DIRECTORY_FILE | FILE_OPEN_FOR_BACKUP_INTENT | FILE_SYNCHRONOUS_IO_NONALERT,
|
|---|
| 302 | OBJ_CASE_INSENSITIVE);
|
|---|
| 303 | if (hFile != INVALID_HANDLE_VALUE)
|
|---|
| 304 | #else /* K_OS != K_OS_WINDOWS */
|
|---|
| 305 | # ifdef O_BINARY
|
|---|
| 306 | int fd = open(pFileNode->szPath, O_RDONLY | O_BINARY);
|
|---|
| 307 | # else
|
|---|
| 308 | int fd = open(pFileNode->szPath, O_RDONLY);
|
|---|
| 309 | # endif
|
|---|
| 310 | if (fd >= 0)
|
|---|
| 311 | #endif /* K_OS != K_OS_WINDOWS */
|
|---|
| 312 | {
|
|---|
| 313 | /*
|
|---|
| 314 | * Init the hash calculation contexts.
|
|---|
| 315 | */
|
|---|
| 316 | struct MD5Context Md5Ctx;
|
|---|
| 317 | //SHA256CONTEXT Sha256Ctx;
|
|---|
| 318 | MD5Init(&Md5Ctx);
|
|---|
| 319 | //Sha256Init(&Sha256Ctx);
|
|---|
| 320 |
|
|---|
| 321 | /*
|
|---|
| 322 | * Process the file chunk by chunk.
|
|---|
| 323 | *
|
|---|
| 324 | * We could complicate this by memory mapping medium sized files, but
|
|---|
| 325 | * those kind of complications can wait.
|
|---|
| 326 | */
|
|---|
| 327 | for (;;)
|
|---|
| 328 | {
|
|---|
| 329 | static KU8 s_abBuffer[2*1024*1024];
|
|---|
| 330 | #if K_OS == K_OS_WINDOWS
|
|---|
| 331 | MY_NTSTATUS rcNt;
|
|---|
| 332 | MY_IO_STATUS_BLOCK Ios;
|
|---|
| 333 | Ios.Information = -1;
|
|---|
| 334 | Ios.u.Status = -1;
|
|---|
| 335 | rcNt = g_pfnNtReadFile(hFile, NULL /*hEvent*/, NULL /*pfnApc*/, NULL /*pvApcCtx*/,
|
|---|
| 336 | &Ios, s_abBuffer, sizeof(s_abBuffer), NULL /*poffFile*/, NULL /*puKey*/);
|
|---|
| 337 | if (MY_NT_SUCCESS(rcNt))
|
|---|
| 338 | {
|
|---|
| 339 | MD5Update(&Md5Ctx, s_abBuffer, (unsigned)Ios.Information);
|
|---|
| 340 | //SHA256Update(&Sha256Ctx, s_abBuffer, Ios.Information);
|
|---|
| 341 | }
|
|---|
| 342 | else if (rcNt != STATUS_END_OF_FILE)
|
|---|
| 343 | {
|
|---|
| 344 | fprintf(stderr, "kDeDup: warning: Error reading '%ls': %#x\n", pFileNode->wszPath, rcNt);
|
|---|
| 345 | break;
|
|---|
| 346 | }
|
|---|
| 347 |
|
|---|
| 348 | /* Check for end of file. */
|
|---|
| 349 | if ( rcNt == STATUS_END_OF_FILE
|
|---|
| 350 | || Ios.Information < sizeof(s_abBuffer))
|
|---|
| 351 | {
|
|---|
| 352 | MD5Final(pFileNode->mKey.abMd5, &Md5Ctx);
|
|---|
| 353 | //Sha256Final(pFileNode->mKey.abSha2, &Sha256Ctx);
|
|---|
| 354 |
|
|---|
| 355 | birdCloseFile(hFile);
|
|---|
| 356 | return;
|
|---|
| 357 | }
|
|---|
| 358 | #else /* K_OS != K_OS_WINDOWS */
|
|---|
| 359 | ssize_t cbRead = kDupReadFile(fd, s_abBuffer, sizeof(s_abBuffer));
|
|---|
| 360 | if (cbRead > 0)
|
|---|
| 361 | {
|
|---|
| 362 | MD5Update(&Md5Ctx, s_abBuffer, (unsigned)cbRead);
|
|---|
| 363 | //SHA256Update(&Sha256Ctx, s_abBuffer, (unsigned)cbRead);
|
|---|
| 364 | }
|
|---|
| 365 | else if (cbRead == 0)
|
|---|
| 366 | {
|
|---|
| 367 | MD5Final(pFileNode->mKey.abMd5, &Md5Ctx);
|
|---|
| 368 | //Sha256Final(pFileNode->mKey.abSha2, &Sha256Ctx);
|
|---|
| 369 | close(fd);
|
|---|
| 370 | return;
|
|---|
| 371 | }
|
|---|
| 372 | else
|
|---|
| 373 | {
|
|---|
| 374 | fprintf(stderr, "kDeDup: warning: Error reading '%s': %s (%d)\n", pFileNode->szPath, strerror(errno), errno);
|
|---|
| 375 | break;
|
|---|
| 376 | }
|
|---|
| 377 | #endif /* K_OS != K_OS_WINDOWS */
|
|---|
| 378 | }
|
|---|
| 379 |
|
|---|
| 380 | #if K_OS == K_OS_WINDOWS
|
|---|
| 381 | birdCloseFile(hFile);
|
|---|
| 382 | #else
|
|---|
| 383 | close(fd);
|
|---|
| 384 | #endif
|
|---|
| 385 | }
|
|---|
| 386 | else
|
|---|
| 387 | fprintf(stderr, "kDeDup: warning: Failed to open '%" PATH_PRI "': %s (%d)\n",
|
|---|
| 388 | pFileNode->PATH_MEMB, strerror(errno), errno);
|
|---|
| 389 |
|
|---|
| 390 | /*
|
|---|
| 391 | * Hashing failed. We fake the digests by repeating the node pointer value
|
|---|
| 392 | * again and again, holding a collision with both SHA2 and MD5 with similar
|
|---|
| 393 | * digest pattern for highly unlikely.
|
|---|
| 394 | */
|
|---|
| 395 | ppHash = (PKDUPFILENODE *)&pFileNode->mKey;
|
|---|
| 396 | i = sizeof(pFileNode->mKey) / sizeof(*ppHash);
|
|---|
| 397 | while (i-- > 0)
|
|---|
| 398 | *ppHash++ = pFileNode;
|
|---|
| 399 | }
|
|---|
| 400 |
|
|---|
| 401 |
|
|---|
| 402 | /**
|
|---|
| 403 | * Deal with one file, adding it to the tree if it matches the criteria.
|
|---|
| 404 | *
|
|---|
| 405 | * @returns 0 on success, non-zero on failure.
|
|---|
| 406 | * @param pFtsEnt The FTS entry for the file.
|
|---|
| 407 | */
|
|---|
| 408 | static int kDupDoFile(FTSENT *pFtsEnt)
|
|---|
| 409 | {
|
|---|
| 410 | KU64 cbFile;
|
|---|
| 411 | #if K_OS == K_OS_WINDOWS
|
|---|
| 412 | struct stat const *pStat = &pFtsEnt->fts_stat;
|
|---|
| 413 | #else
|
|---|
| 414 | struct stat const *pStat = pFtsEnt->fts_statp;
|
|---|
| 415 | #endif
|
|---|
| 416 |
|
|---|
| 417 | if (g_cVerbosity >= 2)
|
|---|
| 418 | printf("debug: kDupDoFile(%" PATH_PRI ")\n", pFtsEnt->FTS_ACCPATH);
|
|---|
| 419 |
|
|---|
| 420 | /*
|
|---|
| 421 | * Check that it's within the size range.
|
|---|
| 422 | */
|
|---|
| 423 | cbFile = pStat->st_size;
|
|---|
| 424 | if ( cbFile >= g_cbMinFileSize
|
|---|
| 425 | && cbFile <= g_cbMaxFileSize)
|
|---|
| 426 | {
|
|---|
| 427 | /*
|
|---|
| 428 | * Start out treating this like a unique file with a unique size, i.e.
|
|---|
| 429 | * allocate all the structures we might possibly need.
|
|---|
| 430 | */
|
|---|
| 431 | #if K_OS == K_OS_WINDOWS
|
|---|
| 432 | size_t cbAccessPath = (wcslen(pFtsEnt->fts_wcsaccpath) + 1) * sizeof(wchar_t);
|
|---|
| 433 | #else
|
|---|
| 434 | size_t cbAccessPath = strlen(pFtsEnt->fts_accpath) + 1;
|
|---|
| 435 | #endif
|
|---|
| 436 | PKDUPFILENODE pFileNode = (PKDUPFILENODE)kDupAlloc(sizeof(*pFileNode) + cbAccessPath);
|
|---|
| 437 | PKDUPSIZENODE pSizeNode = (PKDUPSIZENODE)kDupAlloc(sizeof(*pSizeNode));
|
|---|
| 438 | if (!pFileNode || !pSizeNode)
|
|---|
| 439 | return 3;
|
|---|
| 440 | g_cFiles++;
|
|---|
| 441 |
|
|---|
| 442 | memset(&pFileNode->mKey, 0, sizeof(pFileNode->mKey));
|
|---|
| 443 | pFileNode->pNextHardLink = NULL;
|
|---|
| 444 | pFileNode->pNextDup = NULL;
|
|---|
| 445 | pFileNode->pNextGlobalDup = NULL;
|
|---|
| 446 | pFileNode->uDev = pStat->st_dev;
|
|---|
| 447 | pFileNode->uInode = pStat->st_ino;
|
|---|
| 448 | memcpy(pFileNode->PATH_MEMB, pFtsEnt->FTS_ACCPATH, cbAccessPath);
|
|---|
| 449 |
|
|---|
| 450 | pSizeNode->mKey = cbFile;
|
|---|
| 451 | pSizeNode->cFiles = 1;
|
|---|
| 452 | kDupFileTree_Init(&pSizeNode->FileRoot);
|
|---|
| 453 | kDupFileTree_Insert(&pSizeNode->FileRoot, pFileNode);
|
|---|
| 454 |
|
|---|
| 455 | /*
|
|---|
| 456 | * Try insert it.
|
|---|
| 457 | */
|
|---|
| 458 | if (kDupSizeTree_Insert(&g_SizeRoot, pSizeNode))
|
|---|
| 459 | { /* unique size, nothing more to do for now. */ }
|
|---|
| 460 | else
|
|---|
| 461 | {
|
|---|
| 462 | /*
|
|---|
| 463 | * More than one file with this size. We may need to hash the
|
|---|
| 464 | * hash the file we encountered with this size, if this is the
|
|---|
| 465 | * second one. In that case we should check for hardlinked or
|
|---|
| 466 | * double entering of the file first as well.
|
|---|
| 467 | */
|
|---|
| 468 | kDupFree(pSizeNode);
|
|---|
| 469 | pSizeNode = kDupSizeTree_Get(&g_SizeRoot, cbFile);
|
|---|
| 470 | if (pSizeNode->cFiles == 1)
|
|---|
| 471 | {
|
|---|
| 472 | PKDUPFILENODE pFirstFileNode = pSizeNode->FileRoot.mpRoot;
|
|---|
| 473 | if ( pFirstFileNode->uInode == pFileNode->uInode
|
|---|
| 474 | && pFileNode->uInode != 0
|
|---|
| 475 | && pFirstFileNode->uDev == pFileNode->uDev)
|
|---|
| 476 | {
|
|---|
| 477 | pFileNode->pNextHardLink = pFirstFileNode->pNextHardLink;
|
|---|
| 478 | pFirstFileNode->pNextHardLink = pFileNode;
|
|---|
| 479 | if (g_cVerbosity >= 1)
|
|---|
| 480 | printf("Found hardlinked: '%" PATH_PRI "' -> '%" PATH_PRI "' (ino:%#" KX64_PRI " dev:%#" KX64_PRI ")\n",
|
|---|
| 481 | pFileNode->PATH_MEMB, pFirstFileNode->PATH_MEMB, pFileNode->uInode, pFileNode->uDev);
|
|---|
| 482 | g_cHardlinked += 1;
|
|---|
| 483 | return 0;
|
|---|
| 484 | }
|
|---|
| 485 |
|
|---|
| 486 | kDupHashFile(pFirstFileNode, NULL);
|
|---|
| 487 | }
|
|---|
| 488 | kDupHashFile(pFileNode, pFtsEnt);
|
|---|
| 489 |
|
|---|
| 490 | if (kDupFileTree_Insert(&pSizeNode->FileRoot, pFileNode))
|
|---|
| 491 | { /* great, unique content */ }
|
|---|
| 492 | else
|
|---|
| 493 | {
|
|---|
| 494 | /*
|
|---|
| 495 | * Duplicate content. Could be hardlinked or a duplicate entry.
|
|---|
| 496 | */
|
|---|
| 497 | PKDUPFILENODE pDupFileNode = kDupFileTree_Get(&pSizeNode->FileRoot, pFileNode->mKey);
|
|---|
| 498 | if ( pDupFileNode->uInode == pFileNode->uInode
|
|---|
| 499 | && pFileNode->uInode != 0
|
|---|
| 500 | && pDupFileNode->uDev == pFileNode->uDev)
|
|---|
| 501 | {
|
|---|
| 502 | pFileNode->pNextHardLink = pDupFileNode->pNextHardLink;
|
|---|
| 503 | pDupFileNode->pNextHardLink = pFileNode;
|
|---|
| 504 | if (g_cVerbosity >= 1)
|
|---|
| 505 | printf("Found hardlinked: '%" PATH_PRI "' -> '%" PATH_PRI "' (ino:%#" KX64_PRI " dev:%#" KX64_PRI ")\n",
|
|---|
| 506 | pFileNode->PATH_MEMB, pDupFileNode->PATH_MEMB, pFileNode->uInode, pFileNode->uDev);
|
|---|
| 507 | g_cHardlinked += 1;
|
|---|
| 508 | }
|
|---|
| 509 | else
|
|---|
| 510 | {
|
|---|
| 511 | KBOOL fDifferentDev;
|
|---|
| 512 |
|
|---|
| 513 | /* Genuinly duplicate (or inode numbers are busted). */
|
|---|
| 514 | if (!pDupFileNode->pNextDup)
|
|---|
| 515 | {
|
|---|
| 516 | *g_ppNextDuplicate = pDupFileNode;
|
|---|
| 517 | g_ppNextDuplicate = &pDupFileNode->pNextGlobalDup;
|
|---|
| 518 | }
|
|---|
| 519 |
|
|---|
| 520 | /* The list is sorted by device to better facility hardlinking later. */
|
|---|
| 521 | while ( (fDifferentDev = pDupFileNode->uDev != pFileNode->uDev)
|
|---|
| 522 | && pDupFileNode->pNextDup)
|
|---|
| 523 | pDupFileNode = pDupFileNode->pNextDup;
|
|---|
| 524 |
|
|---|
| 525 | pFileNode->pNextDup = pDupFileNode->pNextDup;
|
|---|
| 526 | pDupFileNode->pNextDup = pFileNode;
|
|---|
| 527 |
|
|---|
| 528 | g_cDuplicates += 1;
|
|---|
| 529 | if (!fDifferentDev)
|
|---|
| 530 | {
|
|---|
| 531 | g_cDuplicatesSaved += 1;
|
|---|
| 532 | #if K_OS == K_OS_WINDOWS
|
|---|
| 533 | g_cbDuplicatesSaved += pStat->st_blocks * BIRD_STAT_BLOCK_SIZE;
|
|---|
| 534 | #else
|
|---|
| 535 | g_cbDuplicatesSaved += pStat->st_size;
|
|---|
| 536 | #endif
|
|---|
| 537 | if (g_cVerbosity >= 1)
|
|---|
| 538 | printf("Found duplicate: '%" PATH_PRI "' <-> '%" PATH_PRI "'\n",
|
|---|
| 539 | pFileNode->PATH_MEMB, pDupFileNode->PATH_MEMB);
|
|---|
| 540 | }
|
|---|
| 541 | else if (g_cVerbosity >= 1)
|
|---|
| 542 | printf("Found duplicate: '%" PATH_PRI "' <-> '%" PATH_PRI "' (devices differ).\n",
|
|---|
| 543 | pFileNode->PATH_MEMB, pDupFileNode->PATH_MEMB);
|
|---|
| 544 | }
|
|---|
| 545 | }
|
|---|
| 546 | }
|
|---|
| 547 | }
|
|---|
| 548 | else if (g_cVerbosity >= 1)
|
|---|
| 549 | printf("Skipping '%" PATH_PRI "' because %" KU64_PRI " bytes is outside the size range.\n", pFtsEnt->FTS_ACCPATH, cbFile);
|
|---|
| 550 | return 0;
|
|---|
| 551 | }
|
|---|
| 552 |
|
|---|
| 553 |
|
|---|
| 554 | /**
|
|---|
| 555 | * Process the non-option arguments, creating the file tree.
|
|---|
| 556 | *
|
|---|
| 557 | * @returns 0 on success, non-zero on failure.
|
|---|
| 558 | * @param papwszFtsArgs The input in argv style.
|
|---|
| 559 | * @param fFtsOptions The FTS options.
|
|---|
| 560 | */
|
|---|
| 561 | #if K_OS == K_OS_WINDOWS
|
|---|
| 562 | static int kDupReadAll(wchar_t **papwszFtsArgs, unsigned fFtsOptions)
|
|---|
| 563 | #else
|
|---|
| 564 | static int kDupReadAll(char **papszFtsArgs, unsigned fFtsOptions)
|
|---|
| 565 | #endif
|
|---|
| 566 | {
|
|---|
| 567 | int rcExit = 0;
|
|---|
| 568 | #if K_OS == K_OS_WINDOWS
|
|---|
| 569 | FTS *pFts = nt_fts_openw(papwszFtsArgs, fFtsOptions, NULL /*pfnCompare*/);
|
|---|
| 570 | #else
|
|---|
| 571 | FTS *pFts = fts_open(papszFtsArgs, fFtsOptions, NULL /*pfnCompare*/);
|
|---|
| 572 | #endif
|
|---|
| 573 | if (pFts != NULL)
|
|---|
| 574 | {
|
|---|
| 575 | for (;;)
|
|---|
| 576 | {
|
|---|
| 577 | #if K_OS == K_OS_WINDOWS
|
|---|
| 578 | FTSENT *pFtsEnt = nt_fts_read(pFts);
|
|---|
| 579 | #else
|
|---|
| 580 | FTSENT *pFtsEnt = fts_read(pFts);
|
|---|
| 581 | #endif
|
|---|
| 582 | if (pFtsEnt)
|
|---|
| 583 | {
|
|---|
| 584 | switch (pFtsEnt->fts_info)
|
|---|
| 585 | {
|
|---|
| 586 | case FTS_F:
|
|---|
| 587 | rcExit = kDupDoFile(pFtsEnt);
|
|---|
| 588 | if (rcExit == 0)
|
|---|
| 589 | continue;
|
|---|
| 590 | break;
|
|---|
| 591 |
|
|---|
| 592 | case FTS_D:
|
|---|
| 593 | if ( g_fRecursive
|
|---|
| 594 | || pFtsEnt->fts_level == FTS_ROOTLEVEL) /* enumerate dirs on the command line */
|
|---|
| 595 | continue;
|
|---|
| 596 | #if K_OS == K_OS_WINDOWS
|
|---|
| 597 | rcExit = nt_fts_set(pFts, pFtsEnt, FTS_SKIP);
|
|---|
| 598 | #else
|
|---|
| 599 | rcExit = fts_set(pFts, pFtsEnt, FTS_SKIP);
|
|---|
| 600 | #endif
|
|---|
| 601 | if (rcExit == 0)
|
|---|
| 602 | continue;
|
|---|
| 603 | fprintf(stderr, "kDeDup: internal error: nt_fts_set failed!\n");
|
|---|
| 604 | rcExit = 1;
|
|---|
| 605 | break;
|
|---|
| 606 |
|
|---|
| 607 | case FTS_DP:
|
|---|
| 608 | /* nothing to do here. */
|
|---|
| 609 | break;
|
|---|
| 610 |
|
|---|
| 611 | case FTS_SL:
|
|---|
| 612 | {
|
|---|
| 613 | #if K_OS == K_OS_WINDOWS
|
|---|
| 614 | /* The nice thing on windows is that we already know whether it's a
|
|---|
| 615 | directory or file when encountering the symbolic link. */
|
|---|
| 616 | if ( (pFtsEnt->fts_stat.st_isdirsymlink ? g_fRecursiveViaSymlinks : g_fFollowSymlinkedFiles)
|
|---|
| 617 | && pFtsEnt->fts_number == 0)
|
|---|
| 618 | #else
|
|---|
| 619 | struct stat St;
|
|---|
| 620 | if ( pFtsEnt->fts_number == 0
|
|---|
| 621 | && ( (g_fRecursiveViaSymlinks && g_fFollowSymlinkedFiles)
|
|---|
| 622 | || ( stat(pFtsEnt->fts_accpath, &St) == 0
|
|---|
| 623 | && (S_ISDIR(St.st_mode) ? g_fRecursiveViaSymlinks : g_fFollowSymlinkedFiles))))
|
|---|
| 624 | #endif
|
|---|
| 625 | {
|
|---|
| 626 | pFtsEnt->fts_number++;
|
|---|
| 627 | #if K_OS == K_OS_WINDOWS
|
|---|
| 628 | rcExit = nt_fts_set(pFts, pFtsEnt, FTS_FOLLOW);
|
|---|
| 629 | #else
|
|---|
| 630 | rcExit = fts_set(pFts, pFtsEnt, FTS_FOLLOW);
|
|---|
| 631 | #endif
|
|---|
| 632 | if (rcExit == 0)
|
|---|
| 633 | continue;
|
|---|
| 634 | fprintf(stderr, "kDeDup: internal error: nt_fts_set failed!\n");
|
|---|
| 635 | rcExit = 1;
|
|---|
| 636 | }
|
|---|
| 637 | break;
|
|---|
| 638 | }
|
|---|
| 639 |
|
|---|
| 640 | case FTS_DC:
|
|---|
| 641 | fprintf(stderr, "kDeDup: warning: Ignoring cycle '%" PATH_PRI "'!\n", pFtsEnt->FTS_ACCPATH);
|
|---|
| 642 | continue;
|
|---|
| 643 |
|
|---|
| 644 | case FTS_NS:
|
|---|
| 645 | fprintf(stderr, "kDeDup: warning: Failed to stat '%" PATH_PRI "': %s (%d)\n",
|
|---|
| 646 | pFtsEnt->FTS_ACCPATH, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno);
|
|---|
| 647 | continue;
|
|---|
| 648 |
|
|---|
| 649 | case FTS_DNR:
|
|---|
| 650 | fprintf(stderr, "kDeDup: error: Error reading directory '%" PATH_PRI "': %s (%d)\n",
|
|---|
| 651 | pFtsEnt->FTS_ACCPATH, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno);
|
|---|
| 652 | rcExit = 1;
|
|---|
| 653 | break;
|
|---|
| 654 |
|
|---|
| 655 | case FTS_ERR:
|
|---|
| 656 | fprintf(stderr, "kDeDup: error: Error on '%" PATH_PRI "': %s (%d)\n",
|
|---|
| 657 | pFtsEnt->FTS_ACCPATH, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno);
|
|---|
| 658 | rcExit = 1;
|
|---|
| 659 | break;
|
|---|
| 660 |
|
|---|
| 661 |
|
|---|
| 662 | /* ignore */
|
|---|
| 663 | case FTS_SLNONE:
|
|---|
| 664 | case FTS_DEFAULT:
|
|---|
| 665 | break;
|
|---|
| 666 |
|
|---|
| 667 | /* Not supposed to get here. */
|
|---|
| 668 | default:
|
|---|
| 669 | fprintf(stderr, "kDeDup: internal error: fts_info=%d - '%" PATH_PRI "'\n",
|
|---|
| 670 | pFtsEnt->fts_info, pFtsEnt->FTS_ACCPATH);
|
|---|
| 671 | rcExit = 1;
|
|---|
| 672 | break;
|
|---|
| 673 | }
|
|---|
| 674 | }
|
|---|
| 675 | else if (errno == 0)
|
|---|
| 676 | break;
|
|---|
| 677 | else
|
|---|
| 678 | {
|
|---|
| 679 | fprintf(stderr, "kDeDup: error: nt_fts_read failed: %s (%d)\n", strerror(errno), errno);
|
|---|
| 680 | rcExit = 1;
|
|---|
| 681 | break;
|
|---|
| 682 | }
|
|---|
| 683 | }
|
|---|
| 684 |
|
|---|
| 685 | #if K_OS == K_OS_WINDOWS
|
|---|
| 686 | if (nt_fts_close(pFts) != 0)
|
|---|
| 687 | #else
|
|---|
| 688 | if (fts_close(pFts) != 0)
|
|---|
| 689 | #endif
|
|---|
| 690 | {
|
|---|
| 691 | fprintf(stderr, "kDeDup: error: nt_fts_close failed: %s (%d)\n", strerror(errno), errno);
|
|---|
| 692 | rcExit = 1;
|
|---|
| 693 | }
|
|---|
| 694 | }
|
|---|
| 695 | else
|
|---|
| 696 | {
|
|---|
| 697 | fprintf(stderr, "kDeDup: error: nt_fts_openw failed: %s (%d)\n", strerror(errno), errno);
|
|---|
| 698 | rcExit = 1;
|
|---|
| 699 | }
|
|---|
| 700 |
|
|---|
| 701 | return rcExit;
|
|---|
| 702 | }
|
|---|
| 703 |
|
|---|
| 704 |
|
|---|
| 705 | /**
|
|---|
| 706 | * Compares the content of the two files.
|
|---|
| 707 | *
|
|---|
| 708 | * @returns 0 if equal, 1 if not equal, -1 on open/read error.
|
|---|
| 709 | * @param pFile1 The first file.
|
|---|
| 710 | * @param pFile2 The second file.
|
|---|
| 711 | */
|
|---|
| 712 | static int kDupCompareFiles(PKDUPFILENODE pFile1, PKDUPFILENODE pFile2)
|
|---|
| 713 | {
|
|---|
| 714 | #if K_OS == K_OS_WINDOWS
|
|---|
| 715 | int rcRet = 0;
|
|---|
| 716 | K_NOREF(pFile1);
|
|---|
| 717 | K_NOREF(pFile2);
|
|---|
| 718 | /** @todo compare files. */
|
|---|
| 719 | #else
|
|---|
| 720 | int rcRet = -1;
|
|---|
| 721 | # ifdef O_BINARY
|
|---|
| 722 | int fOpen = O_RDONLY | O_BINARY;
|
|---|
| 723 | # else
|
|---|
| 724 | int fOpen = O_RDONLY;
|
|---|
| 725 | # endif
|
|---|
| 726 | /*
|
|---|
| 727 | * Open the two files.
|
|---|
| 728 | */
|
|---|
| 729 | int fd1 = open(pFile1->szPath, fOpen);
|
|---|
| 730 | if (fd1 >= 0)
|
|---|
| 731 | {
|
|---|
| 732 | int fd2 = open(pFile2->szPath, fOpen);
|
|---|
| 733 | if (fd1 >= 0)
|
|---|
| 734 | {
|
|---|
| 735 | /*
|
|---|
| 736 | * Read and compare all the data.
|
|---|
| 737 | */
|
|---|
| 738 | static KU8 s_abBuf1[2*1024*1024];
|
|---|
| 739 | static KU8 s_abBuf2[2*1024*1024];
|
|---|
| 740 | KU64 off = 0;
|
|---|
| 741 | for (;;)
|
|---|
| 742 | {
|
|---|
| 743 | ssize_t cb1 = kDupReadFile(fd1, s_abBuf1, sizeof(s_abBuf1));
|
|---|
| 744 | ssize_t cb2 = kDupReadFile(fd2, s_abBuf2, sizeof(s_abBuf2));
|
|---|
| 745 | if (cb1 < 0 || cb2 < 0)
|
|---|
| 746 | {
|
|---|
| 747 | if (cb1 < 0)
|
|---|
| 748 | fprintf(stderr, "kDeDup: error: reading from '%s': %s (%d)\n", pFile1->szPath, strerror(errno), errno);
|
|---|
| 749 | if (cb2 < 0)
|
|---|
| 750 | fprintf(stderr, "kDeDup: error: reading from '%s': %s (%d)\n", pFile2->szPath, strerror(errno), errno);
|
|---|
| 751 | break;
|
|---|
| 752 | }
|
|---|
| 753 | if (cb1 != cb2)
|
|---|
| 754 | {
|
|---|
| 755 | fprintf(stderr, "kDeDup: warning: '%s' now differs from '%s' in size...\n", pFile1->szPath, pFile2->szPath);
|
|---|
| 756 | rcRet = 1;
|
|---|
| 757 | break;
|
|---|
| 758 | }
|
|---|
| 759 | if (cb1 == 0)
|
|---|
| 760 | {
|
|---|
| 761 | rcRet = 0;
|
|---|
| 762 | break;
|
|---|
| 763 | }
|
|---|
| 764 | if (memcmp(s_abBuf1, s_abBuf2, cb1) != 0)
|
|---|
| 765 | {
|
|---|
| 766 | fprintf(stderr, "kDeDup: warning: hash collision: '%s' differs from '%s' (" KX64_PRI " LB %#x)\n",
|
|---|
| 767 | pFile1->szPath, pFile2->szPath, off, (unsigned)cb1);
|
|---|
| 768 | rcRet = 1;
|
|---|
| 769 | break;
|
|---|
| 770 | }
|
|---|
| 771 | off += cb1;
|
|---|
| 772 | }
|
|---|
| 773 |
|
|---|
| 774 | close(fd2);
|
|---|
| 775 | }
|
|---|
| 776 | close(fd1);
|
|---|
| 777 | }
|
|---|
| 778 | #endif
|
|---|
| 779 | return rcRet;
|
|---|
| 780 | }
|
|---|
| 781 |
|
|---|
| 782 |
|
|---|
| 783 | /**
|
|---|
| 784 | * Hardlink duplicates.
|
|---|
| 785 | */
|
|---|
| 786 | static int kDupHardlinkDuplicates(void)
|
|---|
| 787 | {
|
|---|
| 788 | int rcExit = 0;
|
|---|
| 789 | PKDUPFILENODE pFileNode;
|
|---|
| 790 | for (pFileNode = g_pDuplicateHead; pFileNode != NULL; pFileNode = pFileNode->pNextGlobalDup)
|
|---|
| 791 | {
|
|---|
| 792 | PKDUPFILENODE pTargetFile = pFileNode;
|
|---|
| 793 | PKDUPFILENODE pDupFile;
|
|---|
| 794 | for (pDupFile = pFileNode->pNextDup; pDupFile != NULL; pDupFile = pDupFile->pNextDup)
|
|---|
| 795 | {
|
|---|
| 796 | /*
|
|---|
| 797 | * Can only hard link if the files are on the same device.
|
|---|
| 798 | */
|
|---|
| 799 | if (pDupFile->uDev == pTargetFile->uDev)
|
|---|
| 800 | {
|
|---|
| 801 | if (kDupCompareFiles(pDupFile, pTargetFile) == 0)
|
|---|
| 802 | {
|
|---|
| 803 | /*
|
|---|
| 804 | * Start by renaming the orinal file before we try create the hard link.
|
|---|
| 805 | */
|
|---|
| 806 | #if K_OS == K_OS_WINDOWS
|
|---|
| 807 | static const wchar_t s_wszBackupSuffix[] = L".kDepBackup";
|
|---|
| 808 | wchar_t wszBackup[0x4000];
|
|---|
| 809 | size_t cwcPath = wcslen(pDupFile->wszPath);
|
|---|
| 810 | if (cwcPath + sizeof(s_wszBackupSuffix) / sizeof(wchar_t) < K_ELEMENTS(wszBackup))
|
|---|
| 811 | {
|
|---|
| 812 | memcpy(wszBackup, pDupFile->wszPath, cwcPath * sizeof(wchar_t));
|
|---|
| 813 | memcpy(&wszBackup[cwcPath], s_wszBackupSuffix, sizeof(s_wszBackupSuffix));
|
|---|
| 814 | if (MoveFileW(pDupFile->wszPath, wszBackup))
|
|---|
| 815 | {
|
|---|
| 816 | if (CreateHardLinkW(pDupFile->wszPath, pTargetFile->wszPath, NULL))
|
|---|
| 817 | {
|
|---|
| 818 | if (birdUnlinkForcedW(wszBackup) == 0)
|
|---|
| 819 | {
|
|---|
| 820 | if (g_cVerbosity >= 1)
|
|---|
| 821 | printf("Hardlinked '%ls' to '%ls'.\n", pDupFile->wszPath, pTargetFile->wszPath);
|
|---|
| 822 | }
|
|---|
| 823 | else
|
|---|
| 824 | {
|
|---|
| 825 | fprintf(stderr, "kDeDup: fatal: failed to delete '%ls' after hardlinking: %s (%d)\n",
|
|---|
| 826 | wszBackup, strerror(errno), errno);
|
|---|
| 827 | return 8;
|
|---|
| 828 | }
|
|---|
| 829 | }
|
|---|
| 830 | else
|
|---|
| 831 | {
|
|---|
| 832 | fprintf(stderr, "kDeDup: error: failed to hard link '%ls' to '%ls': %u\n",
|
|---|
| 833 | pDupFile->wszPath, wszBackup, GetLastError());
|
|---|
| 834 | if (!MoveFileW(wszBackup, pDupFile->wszPath))
|
|---|
| 835 | {
|
|---|
| 836 | fprintf(stderr, "kDeDup: fatal: Restore '%ls' to '%ls' after hardlinking failed: %u\n",
|
|---|
| 837 | wszBackup, pDupFile->wszPath, GetLastError());
|
|---|
| 838 | return 8;
|
|---|
| 839 | }
|
|---|
| 840 | rcExit = 1;
|
|---|
| 841 | }
|
|---|
| 842 | }
|
|---|
| 843 | else
|
|---|
| 844 | {
|
|---|
| 845 | fprintf(stderr, "kDeDup: error: failed to rename '%ls' to '%ls': %u\n",
|
|---|
| 846 | pDupFile->wszPath, wszBackup, GetLastError());
|
|---|
| 847 | rcExit = 1;
|
|---|
| 848 | }
|
|---|
| 849 | }
|
|---|
| 850 | else
|
|---|
| 851 | {
|
|---|
| 852 | fprintf(stderr, "kDeDup: error: too long backup path: '%ls'\n", pDupFile->wszPath);
|
|---|
| 853 | rcExit = 1;
|
|---|
| 854 | }
|
|---|
| 855 | #else /* K_OS != K_OS_WINDOWS */
|
|---|
| 856 | static const char s_szBackupSuffix[] = ".kDepBackup";
|
|---|
| 857 | char szBackup[0x4000];
|
|---|
| 858 | size_t cchPath = strlen(pDupFile->szPath);
|
|---|
| 859 | if (cchPath + sizeof(s_szBackupSuffix) < sizeof(szBackup))
|
|---|
| 860 | {
|
|---|
| 861 | struct stat StTmp;
|
|---|
| 862 | memcpy(szBackup, pDupFile->szPath, cchPath);
|
|---|
| 863 | memcpy(&szBackup[cchPath], s_szBackupSuffix, sizeof(s_szBackupSuffix));
|
|---|
| 864 | if (stat(szBackup, &StTmp) != 0)
|
|---|
| 865 | {
|
|---|
| 866 | if (rename(pDupFile->szPath, szBackup) == 0)
|
|---|
| 867 | {
|
|---|
| 868 | if (link(pTargetFile->szPath, pDupFile->szPath) == 0)
|
|---|
| 869 | {
|
|---|
| 870 | if (unlink(szBackup) == 0)
|
|---|
| 871 | {
|
|---|
| 872 | if (g_cVerbosity >= 1)
|
|---|
| 873 | printf("Hardlinked '%s' to '%s'.\n", pDupFile->szPath, pTargetFile->szPath);
|
|---|
| 874 | }
|
|---|
| 875 | else
|
|---|
| 876 | {
|
|---|
| 877 | fprintf(stderr, "kDeDup: fatal: failed to delete '%s' after hardlinking: %s (%d)\n",
|
|---|
| 878 | szBackup, strerror(errno), errno);
|
|---|
| 879 | return 8;
|
|---|
| 880 | }
|
|---|
| 881 | }
|
|---|
| 882 | else
|
|---|
| 883 | {
|
|---|
| 884 | fprintf(stderr, "kDeDup: error: failed to hard link '%s' to '%s': %s (%d)\n",
|
|---|
| 885 | pDupFile->szPath, szBackup, strerror(errno), errno);
|
|---|
| 886 | if (rename(szBackup, pDupFile->szPath) != 0)
|
|---|
| 887 | {
|
|---|
| 888 | fprintf(stderr, "kDeDup: fatal: Restore '%s' to '%s' after hardlinking failed: %s (%d)\n",
|
|---|
| 889 | szBackup, pDupFile->szPath, strerror(errno), errno);
|
|---|
| 890 | return 8;
|
|---|
| 891 | }
|
|---|
| 892 | rcExit = 1;
|
|---|
| 893 | }
|
|---|
| 894 | }
|
|---|
| 895 | else
|
|---|
| 896 | {
|
|---|
| 897 | fprintf(stderr, "kDeDup: error: failed to rename '%s' to '%s': %s (%d)\n",
|
|---|
| 898 | pDupFile->szPath, szBackup, strerror(errno), errno);
|
|---|
| 899 | rcExit = 1;
|
|---|
| 900 | }
|
|---|
| 901 | }
|
|---|
| 902 | else
|
|---|
| 903 | {
|
|---|
| 904 | fprintf(stderr, "kDeDup: error: failed to rename '%s' to '%s': file already exist (st_mode=%#x)\n",
|
|---|
| 905 | pDupFile->szPath, szBackup, StTmp.st_mode);
|
|---|
| 906 | rcExit = 1;
|
|---|
| 907 | }
|
|---|
| 908 | }
|
|---|
| 909 | else
|
|---|
| 910 | {
|
|---|
| 911 | fprintf(stderr, "kDeDup: error: too long backup path: '%s'\n", pDupFile->szPath);
|
|---|
| 912 | rcExit = 1;
|
|---|
| 913 | }
|
|---|
| 914 | #endif /* K_OS != K_OS_WINDOWS */
|
|---|
| 915 | }
|
|---|
| 916 | }
|
|---|
| 917 | /*
|
|---|
| 918 | * Since the list is sorted by uDev, we now change the target file.
|
|---|
| 919 | */
|
|---|
| 920 | else
|
|---|
| 921 | pTargetFile = pDupFile;
|
|---|
| 922 | }
|
|---|
| 923 | }
|
|---|
| 924 | return rcExit;
|
|---|
| 925 | }
|
|---|
| 926 |
|
|---|
| 927 |
|
|---|
| 928 | static int usage(const char *pszName, FILE *pOut)
|
|---|
| 929 | {
|
|---|
| 930 | fprintf(pOut,
|
|---|
| 931 | "usage: %s [options] <path1> [path2 [..]]\n"
|
|---|
| 932 | "usage: %s <-V|--version>\n"
|
|---|
| 933 | "usage: %s <-h|--help>\n"
|
|---|
| 934 | , pszName, pszName, pszName);
|
|---|
| 935 | fprintf(pOut,
|
|---|
| 936 | "\n"
|
|---|
| 937 | "Options:\n"
|
|---|
| 938 | " -H, --dereference-command-line, --no-dereference-command-line\n"
|
|---|
| 939 | " Follow symbolic links on the command line.\n"
|
|---|
| 940 | " -L, --dereference\n"
|
|---|
| 941 | " Follow symbolic links while scanning directories.\n"
|
|---|
| 942 | " -P, --no-dereference\n"
|
|---|
| 943 | " Do not follow symbolic links while scanning directories.\n"
|
|---|
| 944 | " -r, --recursive\n"
|
|---|
| 945 | " Recurse into subdirectories, but do not follow links to them.\n"
|
|---|
| 946 | " -R, --recursive-dereference\n"
|
|---|
| 947 | " Same as -r, but also follow into symlinked subdirectories.\n"
|
|---|
| 948 | " -x, --one-file-system\n"
|
|---|
| 949 | " Do not consider other file system (volumes), either down thru a\n"
|
|---|
| 950 | " mount point or via a symbolic link to a directory.\n"
|
|---|
| 951 | " --no-one-file-system, --cross-file-systems\n"
|
|---|
| 952 | " Reverses the effect of --one-file-system.\n"
|
|---|
| 953 | " -q, --quiet, -v,--verbose\n"
|
|---|
| 954 | " Controls the output level.\n"
|
|---|
| 955 | " --hardlink-duplicates\n"
|
|---|
| 956 | " Hardlink duplicate files to remove duplicates and save space. By default\n"
|
|---|
| 957 | " no action is taken and only analysis is done.\n"
|
|---|
| 958 | );
|
|---|
| 959 | return 0;
|
|---|
| 960 | }
|
|---|
| 961 |
|
|---|
| 962 |
|
|---|
| 963 | #if K_OS == K_OS_WINDOWS
|
|---|
| 964 | int wmain(int argc, wchar_t **argv)
|
|---|
| 965 | #else
|
|---|
| 966 | int main(int argc, char **argv)
|
|---|
| 967 | #endif
|
|---|
| 968 | {
|
|---|
| 969 | int rcExit;
|
|---|
| 970 |
|
|---|
| 971 | /*
|
|---|
| 972 | * Process parameters. Position.
|
|---|
| 973 | */
|
|---|
| 974 | unsigned cFtsArgs = 0;
|
|---|
| 975 | #if K_OS == K_OS_WINDOWS
|
|---|
| 976 | wchar_t **papwszFtsArgs = (wchar_t **)calloc(argc + 1, sizeof(wchar_t *));
|
|---|
| 977 | unsigned fFtsOptions = FTS_NOCHDIR | FTS_NO_ANSI;
|
|---|
| 978 | #else
|
|---|
| 979 | char **papszFtsArgs = (char **)calloc(argc + 1, sizeof(char *));
|
|---|
| 980 | unsigned fFtsOptions = FTS_NOCHDIR;
|
|---|
| 981 | #endif
|
|---|
| 982 | KBOOL fEndOfOptions = K_FALSE;
|
|---|
| 983 | KBOOL fHardlinkDups = K_FALSE;
|
|---|
| 984 | int i;
|
|---|
| 985 | for (i = 1; i < argc; i++)
|
|---|
| 986 | {
|
|---|
| 987 | #if K_OS == K_OS_WINDOWS
|
|---|
| 988 | wchar_t *pwszArg = argv[i];
|
|---|
| 989 | if ( *pwszArg == '-'
|
|---|
| 990 | && !fEndOfOptions)
|
|---|
| 991 | #else
|
|---|
| 992 | char *pszArg = argv[i];
|
|---|
| 993 | if ( *pszArg == '-'
|
|---|
| 994 | && !fEndOfOptions)
|
|---|
| 995 | #endif
|
|---|
| 996 | {
|
|---|
| 997 | #if K_OS != K_OS_WINDOWS
|
|---|
| 998 | wchar_t wszOpt[1024] = { 0 };
|
|---|
| 999 | wchar_t *pwszArg = wszOpt;
|
|---|
| 1000 | mbsrtowcs(wszOpt, (const char **)&pszArg, 1024 - 1, NULL);
|
|---|
| 1001 | #endif
|
|---|
| 1002 | wchar_t wcOpt = *++pwszArg;
|
|---|
| 1003 | pwszArg++;
|
|---|
| 1004 | if (wcOpt == '-')
|
|---|
| 1005 | {
|
|---|
| 1006 | /* Translate long options. */
|
|---|
| 1007 | if (wcscmp(pwszArg, L"help") == 0)
|
|---|
| 1008 | wcOpt = 'h';
|
|---|
| 1009 | else if (wcscmp(pwszArg, L"version") == 0)
|
|---|
| 1010 | wcOpt = 'V';
|
|---|
| 1011 | else if (wcscmp(pwszArg, L"recursive") == 0)
|
|---|
| 1012 | wcOpt = 'r';
|
|---|
| 1013 | else if (wcscmp(pwszArg, L"dereference-recursive") == 0)
|
|---|
| 1014 | wcOpt = 'R';
|
|---|
| 1015 | else if (wcscmp(pwszArg, L"dereference") == 0)
|
|---|
| 1016 | wcOpt = 'L';
|
|---|
| 1017 | else if (wcscmp(pwszArg, L"dereference-command-line") == 0)
|
|---|
| 1018 | wcOpt = 'H';
|
|---|
| 1019 | else if (wcscmp(pwszArg, L"one-file-system") == 0)
|
|---|
| 1020 | wcOpt = 'x';
|
|---|
| 1021 | /* Process long options. */
|
|---|
| 1022 | else if (*pwszArg == '\0')
|
|---|
| 1023 | {
|
|---|
| 1024 | fEndOfOptions = K_TRUE;
|
|---|
| 1025 | continue;
|
|---|
| 1026 | }
|
|---|
| 1027 | else if (wcscmp(pwszArg, L"no-recursive") == 0)
|
|---|
| 1028 | {
|
|---|
| 1029 | g_fRecursive = g_fRecursiveViaSymlinks = K_FALSE;
|
|---|
| 1030 | continue;
|
|---|
| 1031 | }
|
|---|
| 1032 | else if (wcscmp(pwszArg, L"no-dereference-command-line") == 0)
|
|---|
| 1033 | {
|
|---|
| 1034 | fFtsOptions &= ~FTS_COMFOLLOW;
|
|---|
| 1035 | continue;
|
|---|
| 1036 | }
|
|---|
| 1037 | else if ( wcscmp(pwszArg, L"no-one-file-system") == 0
|
|---|
| 1038 | || wcscmp(pwszArg, L"cross-file-systems") == 0)
|
|---|
| 1039 | {
|
|---|
| 1040 | fFtsOptions &= ~FTS_XDEV;
|
|---|
| 1041 | continue;
|
|---|
| 1042 | }
|
|---|
| 1043 | else if (wcscmp(pwszArg, L"hardlink-duplicates") == 0)
|
|---|
| 1044 | {
|
|---|
| 1045 | fHardlinkDups = K_TRUE;
|
|---|
| 1046 | continue;
|
|---|
| 1047 | }
|
|---|
| 1048 | else
|
|---|
| 1049 | {
|
|---|
| 1050 | fprintf(stderr, "kDeDup: syntax error: Unknown option '--%ls'\n", pwszArg);
|
|---|
| 1051 | return 2;
|
|---|
| 1052 | }
|
|---|
| 1053 | }
|
|---|
| 1054 |
|
|---|
| 1055 | /* Process one or more short options. */
|
|---|
| 1056 | do
|
|---|
| 1057 | {
|
|---|
| 1058 | switch (wcOpt)
|
|---|
| 1059 | {
|
|---|
| 1060 | case 'r': /* --recursive */
|
|---|
| 1061 | g_fRecursive = K_TRUE;
|
|---|
| 1062 | break;
|
|---|
| 1063 |
|
|---|
| 1064 | case 'R': /* --dereference-recursive */
|
|---|
| 1065 | g_fRecursive = g_fRecursiveViaSymlinks = K_TRUE;
|
|---|
| 1066 | break;
|
|---|
| 1067 |
|
|---|
| 1068 | case 'H': /* --dereference-command-line */
|
|---|
| 1069 | fFtsOptions |= FTS_COMFOLLOW;
|
|---|
| 1070 | break;
|
|---|
| 1071 |
|
|---|
| 1072 | case 'L': /* --dereference*/
|
|---|
| 1073 | g_fFollowSymlinkedFiles = K_TRUE;
|
|---|
| 1074 | break;
|
|---|
| 1075 |
|
|---|
| 1076 | case 'x': /* --one-file-system*/
|
|---|
| 1077 | fFtsOptions |= FTS_XDEV;
|
|---|
| 1078 | break;
|
|---|
| 1079 |
|
|---|
| 1080 | case 'q':
|
|---|
| 1081 | g_cVerbosity = 0;
|
|---|
| 1082 | break;
|
|---|
| 1083 |
|
|---|
| 1084 | case 'v':
|
|---|
| 1085 | g_cVerbosity++;
|
|---|
| 1086 | break;
|
|---|
| 1087 |
|
|---|
| 1088 |
|
|---|
| 1089 | case 'h':
|
|---|
| 1090 | case '?':
|
|---|
| 1091 | return usage("kDeDup", stdout);
|
|---|
| 1092 |
|
|---|
| 1093 | case 'V':
|
|---|
| 1094 | printf("0.0.1\n");
|
|---|
| 1095 | return 0;
|
|---|
| 1096 |
|
|---|
| 1097 | default:
|
|---|
| 1098 | #if K_OS == K_OS_WINDOWS
|
|---|
| 1099 | fprintf(stderr, "kDeDup: syntax error: Unknown option '-%lc'\n", wcOpt);
|
|---|
| 1100 | #else
|
|---|
| 1101 | fprintf(stderr, "kDeDup: syntax error: Unknown option '-%c'\n", (int)wcOpt);
|
|---|
| 1102 | #endif
|
|---|
| 1103 | return 2;
|
|---|
| 1104 | }
|
|---|
| 1105 |
|
|---|
| 1106 | wcOpt = *pwszArg++;
|
|---|
| 1107 | } while (wcOpt != '\0');
|
|---|
| 1108 | }
|
|---|
| 1109 | else
|
|---|
| 1110 | {
|
|---|
| 1111 | /*
|
|---|
| 1112 | * Append non-option arguments to the FTS argument vector.
|
|---|
| 1113 | */
|
|---|
| 1114 | #if K_OS == K_OS_WINDOWS
|
|---|
| 1115 | papwszFtsArgs[cFtsArgs] = pwszArg;
|
|---|
| 1116 | #else
|
|---|
| 1117 | papszFtsArgs[cFtsArgs] = pszArg;
|
|---|
| 1118 | #endif
|
|---|
| 1119 | cFtsArgs++;
|
|---|
| 1120 | }
|
|---|
| 1121 | }
|
|---|
| 1122 |
|
|---|
| 1123 | /*
|
|---|
| 1124 | * Do the FTS processing.
|
|---|
| 1125 | */
|
|---|
| 1126 | kDupSizeTree_Init(&g_SizeRoot);
|
|---|
| 1127 | #if K_OS == K_OS_WINDOWS
|
|---|
| 1128 | rcExit = kDupReadAll(papwszFtsArgs, fFtsOptions);
|
|---|
| 1129 | #else
|
|---|
| 1130 | rcExit = kDupReadAll(papszFtsArgs, fFtsOptions);
|
|---|
| 1131 | #endif
|
|---|
| 1132 | if (rcExit == 0)
|
|---|
| 1133 | {
|
|---|
| 1134 | /*
|
|---|
| 1135 | * Display the result.
|
|---|
| 1136 | */
|
|---|
| 1137 | printf("Found %" KU64_PRI " duplicate files, out which %" KU64_PRI " can be hardlinked saving %" KU64_PRI " bytes\n",
|
|---|
| 1138 | g_cDuplicates, g_cDuplicatesSaved, g_cbDuplicatesSaved);
|
|---|
| 1139 |
|
|---|
| 1140 | if (fHardlinkDups)
|
|---|
| 1141 | rcExit = kDupHardlinkDuplicates();
|
|---|
| 1142 | }
|
|---|
| 1143 |
|
|---|
| 1144 | K_NOREF(kDupFileTree_Remove);
|
|---|
| 1145 | K_NOREF(kDupSizeTree_Remove);
|
|---|
| 1146 | return rcExit;
|
|---|
| 1147 | }
|
|---|
| 1148 |
|
|---|