diff --git a/fdupes.changes b/fdupes.changes index 4012f13..7966cd1 100644 --- a/fdupes.changes +++ b/fdupes.changes @@ -1,3 +1,15 @@ +------------------------------------------------------------------- +Fri Apr 1 19:50:32 UTC 2022 - Stefan Brüns + +- Fixes for the new wrapper: + * Order duplicates by name, to get a reproducible file set + (boo#1197484). + * Remove redundant order parameter from fdupes invocation. + * Modernize code, significantly reduce allocations. + * Exit immediately when mandatory parameters are missing. + * Remove obsolete buildroot parameter + * Add some tests for the wrapper + ------------------------------------------------------------------- Tue Mar 15 07:41:35 UTC 2022 - Stephan Kulow diff --git a/fdupes.spec b/fdupes.spec index 07be3a5..a916486 100644 --- a/fdupes.spec +++ b/fdupes.spec @@ -53,6 +53,26 @@ install -D -m755 fdupes_wrapper %{buildroot}/usr/lib/rpm/fdupes_wrapper ./%{name} --recurse testdir ./%{name} --size testdir +# Check wrapper +PATH=`pwd`:$PATH +(cd testdir; md5sum ./* ./*/* > ../testdir.md5 || true) +for operation in '-n' '-s' ' '; do + cp -R testdir "testdir${operation}" + ./fdupes_wrapper ${operation} "testdir${operation}" + (cd "testdir${operation}"; md5sum --check ../testdir.md5) +done +# Check order does not depend on creation order - x should be target +mkdir testdir_order +for t in "a b x" "x a b" "a x b"; do + pushd testdir_order + for f in $t ; do cp ../testdir.md5 $f; done + ../fdupes_wrapper -s ./ + test -h ./a + test -h ./b + rm * + popd +done + %files %doc CHANGES %{_bindir}/%{name} diff --git a/fdupes_wrapper.cpp b/fdupes_wrapper.cpp index 1ef481a..7bd8216 100644 --- a/fdupes_wrapper.cpp +++ b/fdupes_wrapper.cpp @@ -4,19 +4,16 @@ * * Copyright 2022 Jiri Slaby * 2022 Stephan Kulow + * 2022 Stefan Brüns * * SPDX-License-Identifier: MIT */ #include -#include #include -#include -#include #include #include #include -#include #include #include #include @@ -24,8 +21,22 @@ using namespace std; -typedef std::map> dups_map; -typedef std::pair nlink_pair; +struct file_entry +{ + ino_t inode; + nlink_t link_count; + string path; + + file_entry(ino_t i, nlink_t n, string&& p) + : inode(i), link_count(n), path(move(p)) {} +}; +using dup_set = vector; + +enum class Operation { + Symlink, + Hardlink, + DryRun, +}; vector split_paths(const string& path) { @@ -42,7 +53,7 @@ vector split_paths(const string& path) return paths; } -string merge_paths(vector paths) +string merge_paths(const vector& paths) { string path; for (const auto& s : paths) { @@ -79,33 +90,18 @@ string relative(const string& p1, const string& p2) return merge_paths(paths); } -bool cmp_nlink(const nlink_pair& a, const nlink_pair& b) -{ - return a.second > b.second; -} - -void sort_by_count(const dups_map& in, std::vector& out) -{ - out.clear(); - std::list nlinks; - for (auto it = in.cbegin(); it != in.cend(); ++it) { - nlinks.push_back(std::make_pair(it->first, it->second.size())); - } - nlinks.sort(cmp_nlink); - for (auto it = nlinks.cbegin(); it != nlinks.cend(); ++it) { - out.push_back(it->first); - } -} - -void link_file(const std::string& file, const std::string& target, bool symlink) +void link_file(const std::string& file, const std::string& target, Operation op) { std::cout << "Linking " << file << " -> " << target << std::endl; + if (op == Operation::DryRun) + return; + if (unlink(file.c_str())) { std::cerr << "Removing '" << file << "' failed." << std::endl; exit(1); } int ret; - if (symlink) { + if (op == Operation::Symlink) { ret = ::symlink(target.c_str(), file.c_str()); } else { ret = link(target.c_str(), file.c_str()); @@ -116,44 +112,65 @@ void link_file(const std::string& file, const std::string& target, bool symlink) } } -std::string target_for_link(string target, const std::string &file, bool symlink) +std::string target_for_link(string target, const std::string &file, Operation op) { - if (!symlink) // hardlinks don't care + if (op == Operation::Hardlink) // hardlinks don't care return target; - + return relative(file, target); } -void handle_dups(const dups_map& dups, const std::string& buildroot, bool symlink) +void handle_dups(dup_set& dups, Operation op) { - // all are hardlinks to the same data - if (dups.size() < 2) - return; - std::vector sorted; - sort_by_count(dups, sorted); - auto inodes = sorted.begin(); - std::string target = dups.at(*inodes).front(); - - for (++inodes; inodes != sorted.end(); ++inodes) { - const std::vector files = dups.at(*inodes); - for (auto it = files.begin(); it != files.end(); ++it) { - link_file(*it, target_for_link(target, *it, symlink), symlink); + // calculate number of hardlinked duplicates found, for each file + // this may be different than the st_nlink value + std::sort(dups.begin(), dups.end(), [](const file_entry& a, const file_entry& b) { + return a.inode < b.inode; + }); + auto first = dups.begin(); + while (first != dups.end()) { + auto r = equal_range(first, dups.end(), *first, [](const file_entry& a, const file_entry& b) { + return a.inode < b.inode; + }); + for (auto i = r.first; i != r.second; ++i) { + i->link_count = std::distance(r.first, r.second); } + first = r.second; + } + + // use the file with most hardlinks as target + // in case of ties, sort by name to get a stable order for reproducible builds + std::sort(dups.begin(), dups.end(), [](const file_entry& a, const file_entry& b) { + if (a.link_count == b.link_count) + return a.path > b.path; + return a.link_count > b.link_count; + }); + + const string& target = dups[0].path; + + for (const file_entry& e : dups) { + // skip duplicates hardlinked to first entry + if (e.inode == dups[0].inode) + continue; + + link_file(e.path, target_for_link(target, e.path, op), op); } } int main(int argc, char** argv) { - bool symlink = false; + Operation op = Operation::Hardlink; std::vector roots; - std::string buildroot; while (1) { - int result = getopt(argc, argv, "sb:"); + int result = getopt(argc, argv, "sn"); if (result == -1) break; /* end of list */ switch (result) { case 's': - symlink = true; + op = Operation::Symlink; + break; + case 'n': + op = Operation::DryRun; break; default: /* unknown */ break; @@ -170,17 +187,17 @@ int main(int argc, char** argv) if (roots.empty()) { std::cerr << "Missing directory argument."; + return 1; } /* fdupes options used: -q: hide progress indicator -p: don't consider files with different owner/group or permission bits as duplicates -n: exclude zero-length files from consideration - -o name: output order of duplicates -r: follow subdirectories -H: also report hard links as duplicates */ - std::string command = "fdupes -q -p -r -n -o name"; - if (!symlink) { + std::string command = "fdupes -q -p -r -n"; + if (op != Operation::Symlink) { /* if we create symlinks, avoid looking at hard links being duplicated. This way fdupes is faster and won't break them up anyway */ command += " -H"; @@ -192,12 +209,14 @@ int main(int argc, char** argv) if (!pipe) { throw std::runtime_error("popen() failed!"); } - std::array buffer; - dups_map dups; + std::vector buffer; + buffer.resize(MAXPATHLEN); + + dup_set dups; while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { std::string line = buffer.data(); if (line.length() < 2) { - handle_dups(dups, buildroot, symlink); + handle_dups(dups, op); dups.clear(); continue; } @@ -212,7 +231,7 @@ int main(int argc, char** argv) std::cerr << "Stat on '" << buffer.data() << "' failed" << std::endl; return 1; } - dups[sb.st_ino].push_back(line); + dups.emplace_back(sb.st_ino, 0, std::move(line)); } pclose(pipe);