From da1a4eb97b1ef0e674d30151385faffb91f0803aa46c87896c6f2cc4163b6451 Mon Sep 17 00:00:00 2001 From: Peter Simons Date: Mon, 14 Mar 2022 09:15:34 +0000 Subject: [PATCH 1/3] Accepting request 961567 from home:coolo:branches:utilities - Simplify macros.fdupes with a call to a C++ program that does the same within a fraction of a second what the shell loop did in many seconds (bsc#1195709) OBS-URL: https://build.opensuse.org/request/show/961567 OBS-URL: https://build.opensuse.org/package/show/utilities/fdupes?expand=0&rev=23 --- fdupes.changes | 7 +++ fdupes.spec | 12 +++- fdupes_wrapper.cpp | 145 +++++++++++++++++++++++++++++++++++++++++++++ macros.fdupes | 22 +------ 4 files changed, 162 insertions(+), 24 deletions(-) create mode 100644 fdupes_wrapper.cpp diff --git a/fdupes.changes b/fdupes.changes index db1e37f..6c1f2b9 100644 --- a/fdupes.changes +++ b/fdupes.changes @@ -1,3 +1,10 @@ +------------------------------------------------------------------- +Sat Mar 12 08:17:37 UTC 2022 - Stephan Kulow + +- Simplify macros.fdupes with a call to a C++ program that does + the same within a fraction of a second what the shell loop did + in many seconds (bsc#1195709) + ------------------------------------------------------------------- Sun Aug 16 16:59:45 UTC 2020 - Dirk Mueller diff --git a/fdupes.spec b/fdupes.spec index 4fb6764..07be3a5 100644 --- a/fdupes.spec +++ b/fdupes.spec @@ -1,7 +1,7 @@ # # spec file for package fdupes # -# Copyright (c) 2016 SUSE LINUX GmbH, Nuernberg, Germany. +# Copyright (c) 2022 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -12,9 +12,10 @@ # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. -# Please submit bugfixes or comments via http://bugs.opensuse.org/ +# Please submit bugfixes or comments via https://bugs.opensuse.org/ # + %{?!_rpmmacrodir:%define _rpmmacrodir /usr/lib/rpm/macros.d} Name: fdupes @@ -23,9 +24,11 @@ Release: 0 Summary: Tool to identify or delete duplicate files License: MIT Group: Productivity/Archiving/Compression -Url: https://github.com/adrianlopezroche/fdupes +URL: https://github.com/adrianlopezroche/fdupes Source0: https://github.com/adrianlopezroche/fdupes/releases/download/v%{version}/fdupes-%{version}.tar.gz Source1: macros.fdupes +Source2: fdupes_wrapper.cpp +BuildRequires: gcc-c++ %description FDUPES is a program for identifying or deleting duplicate files @@ -37,10 +40,12 @@ residing within specified directories. %build %configure --without-ncurses %make_build +g++ $RPM_OPT_FLAGS %{S:2} -o fdupes_wrapper %install %make_install install -D -m644 %{SOURCE1} %{buildroot}%{_rpmmacrodir}/macros.%{name} +install -D -m755 fdupes_wrapper %{buildroot}/usr/lib/rpm/fdupes_wrapper %check ./%{name} testdir @@ -53,5 +58,6 @@ install -D -m644 %{SOURCE1} %{buildroot}%{_rpmmacrodir}/macros.%{name} %{_bindir}/%{name} %{_mandir}/man1/%{name}.1* %{_rpmmacrodir}/macros.%{name} +/usr/lib/rpm/fdupes_wrapper %changelog diff --git a/fdupes_wrapper.cpp b/fdupes_wrapper.cpp new file mode 100644 index 0000000..0e79721 --- /dev/null +++ b/fdupes_wrapper.cpp @@ -0,0 +1,145 @@ +/* + * A little helper to wrap around fdupes and create hard/soft links of the + * dups found. Used in openSUSE rpm. + * + * Copyright 2022 Jiri Slaby + * 2022 Stephan Kulow + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef std::map> dups_map; +typedef std::pair nlink_pair; + +bool cmp_nlink(const nlink_pair& a, const nlink_pair& b) +{ + return a.second > b.second; +} + +void sort_by_count(const dups_map& in, std::vector& out) +{ + out.clear(); + std::list nlinks; + for (auto it = in.cbegin(); it != in.cend(); ++it) { + nlinks.push_back(std::make_pair(it->first, it->second.size())); + } + nlinks.sort(cmp_nlink); + for (auto it = nlinks.cbegin(); it != nlinks.cend(); ++it) { + out.push_back(it->first); + } +} + +void link_file(const std::string& file, const std::string& target, bool symlink) +{ + std::cout << "Linking " << file << " -> " << target << std::endl; + if (unlink(file.c_str())) { + std::cerr << "Removing '" << file << "' failed." << std::endl; + exit(1); + } + int ret; + if (symlink) { + ret = ::symlink(target.c_str(), file.c_str()); + } else { + ret = link(target.c_str(), file.c_str()); + } + if (ret) { + std::cerr << "Linking '" << file << "' failed." << std::endl; + exit(1); + } +} + +void handle_dups(const dups_map& dups, bool symlink) +{ + // all are hardlinks to the same data + if (dups.size() < 2) + return; + std::vector sorted; + sort_by_count(dups, sorted); + auto inodes = sorted.begin(); + std::string target = dups.at(*inodes).front(); + + for (++inodes; inodes != sorted.end(); ++inodes) { + const std::vector files = dups.at(*inodes); + for (auto it = files.begin(); it != files.end(); ++it) { + link_file(*it, target, symlink); + } + } +} + +int main(int argc, char** argv) +{ + bool symlink = false; + std::string root; + while (1) { + int result = getopt(argc, argv, "s"); + if (result == -1) + break; /* end of list */ + switch (result) { + case 's': + symlink = true; + break; + default: /* unknown */ + break; + } + } + if (optind < argc) { + root = argv[optind++]; + } else { + std::cerr << "Missing directory argument."; + } + if (optind < argc) { + std::cerr << "Too many arguments."; + return 1; + } + /* fdupes options used: + -q: hide progress indicator + -p: don't consider files with different owner/group or permission bits as duplicates + -n: exclude zero-length files from consideration + -o name: output order of duplicates + -r: follow subdirectories + -H: also report hard links as duplicates + */ + std::string command = "fdupes -q -p -n -o name -r -H '" + root + "'"; + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + throw std::runtime_error("popen() failed!"); + } + std::array buffer; + dups_map dups; + while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { + std::string line = buffer.data(); + if (line.length() < 2) { + handle_dups(dups, symlink); + dups.clear(); + continue; + } + if (line.back() != '\n') { + std::cerr << "Too long lines? '" << line << "'" << std::endl; + return 1; + } + line.pop_back(); + + struct stat sb; + if (stat(line.c_str(), &sb)) { + std::cerr << "Stat on '" << buffer.data() << "' failed" << std::endl; + return 1; + } + dups[sb.st_ino].push_back(line); + } + pclose(pipe); + + return 0; +} diff --git a/macros.fdupes b/macros.fdupes index fc9b3ad..be95009 100644 --- a/macros.fdupes +++ b/macros.fdupes @@ -1,21 +1 @@ -%fdupes(s) \ - _target=""; \ - _symlinks=0; \ - %{-s:_symlinks=1;} \ - fdupes -q -p -n -H -o name -r %1 | \ - while read _file; do \ - if test -z "$_target" ; then \ - _target="$_file"; \ - else \ - if test -z "$_file" ; then \ - _target=""; \ - continue ; \ - fi ; \ - if test "$_symlinks" = 1; then \ - ln -sf "${_target#%{buildroot}}" "$_file"; \ - else \ - ln -f "$_target" "$_file"; \ - fi ;\ - fi ; \ - done \ -%{nil} +%fdupes /usr/lib/rpm/fdupes_wrapper From 760afc07b9ced37d0cd154d63a535318558bf9a864d22449d3ba134eceb27a34 Mon Sep 17 00:00:00 2001 From: Peter Simons Date: Mon, 14 Mar 2022 14:38:27 +0000 Subject: [PATCH 2/3] Accepting request 961692 from home:coolo:branches:utilities - Handle symlinks (-s argument) correctly OBS-URL: https://build.opensuse.org/request/show/961692 OBS-URL: https://build.opensuse.org/package/show/utilities/fdupes?expand=0&rev=24 --- fdupes.changes | 5 +++++ fdupes_wrapper.cpp | 31 +++++++++++++++++++++++++++---- macros.fdupes | 2 +- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/fdupes.changes b/fdupes.changes index 6c1f2b9..89cd3df 100644 --- a/fdupes.changes +++ b/fdupes.changes @@ -1,3 +1,8 @@ +------------------------------------------------------------------- +Mon Mar 14 13:44:54 UTC 2022 - Stephan Kulow + +- Handle symlinks (-s argument) correctly + ------------------------------------------------------------------- Sat Mar 12 08:17:37 UTC 2022 - Stephan Kulow diff --git a/fdupes_wrapper.cpp b/fdupes_wrapper.cpp index 0e79721..9972152 100644 --- a/fdupes_wrapper.cpp +++ b/fdupes_wrapper.cpp @@ -61,7 +61,7 @@ void link_file(const std::string& file, const std::string& target, bool symlink) } } -void handle_dups(const dups_map& dups, bool symlink) +void handle_dups(const dups_map& dups, const std::string& buildroot, bool symlink) { // all are hardlinks to the same data if (dups.size() < 2) @@ -70,6 +70,9 @@ void handle_dups(const dups_map& dups, bool symlink) sort_by_count(dups, sorted); auto inodes = sorted.begin(); std::string target = dups.at(*inodes).front(); + if (symlink) { + target.replace(0, buildroot.length(), ""); + } for (++inodes; inodes != sorted.end(); ++inodes) { const std::vector files = dups.at(*inodes); @@ -83,18 +86,32 @@ int main(int argc, char** argv) { bool symlink = false; std::string root; + std::string buildroot; while (1) { - int result = getopt(argc, argv, "s"); + int result = getopt(argc, argv, "sb:"); if (result == -1) break; /* end of list */ switch (result) { case 's': symlink = true; break; + case 'b': + buildroot = optarg; + break; default: /* unknown */ break; } } + if (buildroot.empty()) { + if (symlink) { + std::cerr << "Missing -b argument to remove bootroot from symlink targets"; + return 1; + } + // eliminate final slash from directory argument + if (buildroot.back() == '/') { + buildroot.pop_back(); + } + } if (optind < argc) { root = argv[optind++]; } else { @@ -112,7 +129,13 @@ int main(int argc, char** argv) -r: follow subdirectories -H: also report hard links as duplicates */ - std::string command = "fdupes -q -p -n -o name -r -H '" + root + "'"; + std::string command = "fdupes -q -p -n -o name"; + if (!symlink) { + /* if we create symlinks, avoid looking at hard links being duplicated. This way + fdupes is faster and won't break them up anyway */ + command += " -H"; + } + command += " -r '" + root + "'"; FILE* pipe = popen(command.c_str(), "r"); if (!pipe) { throw std::runtime_error("popen() failed!"); @@ -122,7 +145,7 @@ int main(int argc, char** argv) while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { std::string line = buffer.data(); if (line.length() < 2) { - handle_dups(dups, symlink); + handle_dups(dups, buildroot, symlink); dups.clear(); continue; } diff --git a/macros.fdupes b/macros.fdupes index be95009..fd514d7 100644 --- a/macros.fdupes +++ b/macros.fdupes @@ -1 +1 @@ -%fdupes /usr/lib/rpm/fdupes_wrapper +%fdupes /usr/lib/rpm/fdupes_wrapper -b %{buildroot} From 34728dc6e5b868a7b6b64bcc4e1a3a94c1af03845a8698589212005c2d41c0cb Mon Sep 17 00:00:00 2001 From: Peter Simons Date: Tue, 15 Mar 2022 08:17:48 +0000 Subject: [PATCH 3/3] Accepting request 961811 from home:coolo:branches:utilities This time I branched all staging failures to make sure it's the last one - A more correct approach to creating symlinks (old bug actually): Do not link the files as given by fdupes, but turn them into relative links (it works by chance if given a buildroot, but fails if running on a subdirectory) - Support multiple directories given (as glob to the macro) OBS-URL: https://build.opensuse.org/request/show/961811 OBS-URL: https://build.opensuse.org/package/show/utilities/fdupes?expand=0&rev=25 --- fdupes.changes | 9 ++++ fdupes_wrapper.cpp | 102 ++++++++++++++++++++++++++++++++++----------- macros.fdupes | 2 +- 3 files changed, 87 insertions(+), 26 deletions(-) diff --git a/fdupes.changes b/fdupes.changes index 89cd3df..4012f13 100644 --- a/fdupes.changes +++ b/fdupes.changes @@ -1,3 +1,12 @@ +------------------------------------------------------------------- +Tue Mar 15 07:41:35 UTC 2022 - Stephan Kulow + +- A more correct approach to creating symlinks (old bug actually): + Do not link the files as given by fdupes, but turn them into + relative links (it works by chance if given a buildroot, but + fails if running on a subdirectory) +- Support multiple directories given (as glob to the macro) + ------------------------------------------------------------------- Mon Mar 14 13:44:54 UTC 2022 - Stephan Kulow diff --git a/fdupes_wrapper.cpp b/fdupes_wrapper.cpp index 9972152..1ef481a 100644 --- a/fdupes_wrapper.cpp +++ b/fdupes_wrapper.cpp @@ -20,10 +20,65 @@ #include #include #include +#include + +using namespace std; typedef std::map> dups_map; typedef std::pair nlink_pair; +vector split_paths(const string& path) +{ + string token; + vector paths; + stringstream ss(path); + while (getline(ss, token, '/')) { + if (token == "..") { + paths.pop_back(); + } else if (token != "." || ss.eof()) { + paths.push_back(token); + } + } + return paths; +} + +string merge_paths(vector paths) +{ + string path; + for (const auto& s : paths) { + if (s.empty()) + continue; + if (!path.empty()) + path += "/"; + path += s; + } + + return path; +} + +string relative(const string& p1, const string& p2) +{ + vector paths1 = split_paths(p1); + paths1.pop_back(); + vector paths2 = split_paths(p2); + vector paths; + vector::const_iterator it1 = paths1.begin(); + vector::const_iterator it2 = paths2.begin(); + // first remove the common parts + while (it1 != paths1.end() && *it1 == *it2) { + it1++; + it2++; + } + for (; it1 != paths1.end(); ++it1) { + paths.push_back(".."); + } + for (; it2 != paths2.end(); ++it2) { + paths.push_back(*it2); + } + + return merge_paths(paths); +} + bool cmp_nlink(const nlink_pair& a, const nlink_pair& b) { return a.second > b.second; @@ -61,6 +116,14 @@ void link_file(const std::string& file, const std::string& target, bool symlink) } } +std::string target_for_link(string target, const std::string &file, bool symlink) +{ + if (!symlink) // hardlinks don't care + return target; + + return relative(file, target); +} + void handle_dups(const dups_map& dups, const std::string& buildroot, bool symlink) { // all are hardlinks to the same data @@ -70,14 +133,11 @@ void handle_dups(const dups_map& dups, const std::string& buildroot, bool symlin sort_by_count(dups, sorted); auto inodes = sorted.begin(); std::string target = dups.at(*inodes).front(); - if (symlink) { - target.replace(0, buildroot.length(), ""); - } for (++inodes; inodes != sorted.end(); ++inodes) { const std::vector files = dups.at(*inodes); for (auto it = files.begin(); it != files.end(); ++it) { - link_file(*it, target, symlink); + link_file(*it, target_for_link(target, *it, symlink), symlink); } } } @@ -85,7 +145,7 @@ void handle_dups(const dups_map& dups, const std::string& buildroot, bool symlin int main(int argc, char** argv) { bool symlink = false; - std::string root; + std::vector roots; std::string buildroot; while (1) { int result = getopt(argc, argv, "sb:"); @@ -95,32 +155,22 @@ int main(int argc, char** argv) case 's': symlink = true; break; - case 'b': - buildroot = optarg; - break; default: /* unknown */ break; } } - if (buildroot.empty()) { - if (symlink) { - std::cerr << "Missing -b argument to remove bootroot from symlink targets"; - return 1; - } - // eliminate final slash from directory argument - if (buildroot.back() == '/') { - buildroot.pop_back(); + while (optind < argc) { + std::string root = argv[optind++]; + if (root.front() != '/') { + char buffer[PATH_MAX]; + root = std::string(getcwd(buffer, PATH_MAX)) + '/' + root; } + roots.push_back(root); } - if (optind < argc) { - root = argv[optind++]; - } else { + + if (roots.empty()) { std::cerr << "Missing directory argument."; } - if (optind < argc) { - std::cerr << "Too many arguments."; - return 1; - } /* fdupes options used: -q: hide progress indicator -p: don't consider files with different owner/group or permission bits as duplicates @@ -129,13 +179,15 @@ int main(int argc, char** argv) -r: follow subdirectories -H: also report hard links as duplicates */ - std::string command = "fdupes -q -p -n -o name"; + std::string command = "fdupes -q -p -r -n -o name"; if (!symlink) { /* if we create symlinks, avoid looking at hard links being duplicated. This way fdupes is faster and won't break them up anyway */ command += " -H"; } - command += " -r '" + root + "'"; + for (auto it = roots.begin(); it != roots.end(); ++it) { + command += " '" + *it + "'"; + } FILE* pipe = popen(command.c_str(), "r"); if (!pipe) { throw std::runtime_error("popen() failed!"); diff --git a/macros.fdupes b/macros.fdupes index fd514d7..be95009 100644 --- a/macros.fdupes +++ b/macros.fdupes @@ -1 +1 @@ -%fdupes /usr/lib/rpm/fdupes_wrapper -b %{buildroot} +%fdupes /usr/lib/rpm/fdupes_wrapper