From ea465a6a1597a1ff50f55c41909b9a6f30b653f5e46b6d1633b1d7b96bb8c5fc Mon Sep 17 00:00:00 2001
From: Dirk Mueller
Date: Mon, 14 Mar 2022 22:07:30 +0000
Subject: [PATCH] - add jdupes_wrapper to speedup symlinks - add new
 %suse_symlink_dupes and %suse_hardlink_dupes macros which are more
 descriptive than a %jdupes or %fdupes

OBS-URL: https://build.opensuse.org/package/show/utilities/jdupes?expand=0&rev=4
---
 jdupes.changes     |   7 ++
 jdupes.spec        |   9 ++-
 jdupes_wrapper.cpp | 218 +++++++++++++++++++++++++++++++++++++++++++++
 macros.jdupes      |  24 ++-----
 4 files changed, 236 insertions(+), 22 deletions(-)
 create mode 100644 jdupes_wrapper.cpp

diff --git a/jdupes.changes b/jdupes.changes
index bd7e644..f01fad3 100644
--- a/jdupes.changes
+++ b/jdupes.changes
@@ -1,3 +1,10 @@
+-------------------------------------------------------------------
+Mon Mar 14 22:06:26 UTC 2022 - Dirk Müller
+
+- add jdupes_wrapper to speedup symlinks
+- add new %suse_symlink_dupes and %suse_hardlink_dupes macros
+  which are more descriptive than a %jdupes or %fdupes
+
 -------------------------------------------------------------------
 Wed Mar  9 21:54:15 UTC 2022 - Dirk Müller
 
diff --git a/jdupes.spec b/jdupes.spec
index 5df0299..1824bb7 100644
--- a/jdupes.spec
+++ b/jdupes.spec
@@ -1,7 +1,7 @@
 #
 # spec file for package jdupes
 #
-# Copyright (c) 2021 SUSE LLC
+# Copyright (c) 2022 SUSE LLC
 # Copyright (c) 2019-2020 Malcolm J Lewis
 #
 # All modifications and additions to the file contributed by third parties
@@ -26,6 +26,8 @@ Group: Productivity/File utilities
 URL: https://github.com/jbruchon/jdupes
 Source0: https://github.com/jbruchon/jdupes/archive/refs/tags/v%{version}.tar.gz
 Source1: macros.jdupes
+Source2: jdupes_wrapper.cpp
+BuildRequires: gcc-c++
 
 %description
 A program for identifying and taking actions upon duplicate files.
@@ -41,11 +43,13 @@ programs.
 %build
 make %{?_smp_mflags} \
   ENABLE_DEDUPE=1 \
-  STATIC_DEDUPE_H=1 \
+  STATIC_DEDUPE_H=1
+g++ %{optflags} -O2 -Wall %{SOURCE2} -o jdupes_wrapper
 
 %install
 make DESTDIR=%{buildroot} PREFIX=%{_prefix} install
 install -D -m644 %{SOURCE1} %{buildroot}%{_rpmmacrodir}/macros.%{name}
+install -D -m755 jdupes_wrapper %{buildroot}/usr/lib/rpm/jdupes_wrapper
 
 %check
 ./jdupes -q -r testdir
@@ -56,5 +60,6 @@ install -D -m644 %{SOURCE1} %{buildroot}%{_rpmmacrodir}/macros.%{name}
 %{_bindir}/%{name}
 %{_mandir}/man1/%{name}.1%{?ext_man}
 %{_rpmmacrodir}/macros.%{name}
+/usr/lib/rpm/jdupes_wrapper
 
 %changelog
diff --git a/jdupes_wrapper.cpp b/jdupes_wrapper.cpp
new file mode 100644
index 0000000..a78ed49
--- /dev/null
+++ b/jdupes_wrapper.cpp
@@ -0,0 +1,218 @@
+/*
+ * A little helper to wrap around jdupes and create hard/soft links of the
+ * dups found. Used in openSUSE rpm.
+ *
+ * Copyright 2022 Jiri Slaby
+ *           2022 Stephan Kulow
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <algorithm>
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+// inode -> every path jdupes reported for that inode
+typedef std::map<ino_t, std::vector<std::string>> dups_map;
+// inode -> number of reported paths sharing it
+typedef std::pair<ino_t, size_t> nlink_pair;
+
+// Orders inodes with more reported paths first.
+bool cmp_nlink(const nlink_pair& a, const nlink_pair& b)
+{
+    return a.second > b.second;
+}
+
+// Fills `out` with the inodes of `in`, the one with the most paths first.
+void sort_by_count(const dups_map& in, std::vector<ino_t>& out)
+{
+    std::vector<nlink_pair> nlinks;
+    nlinks.reserve(in.size());
+    for (const auto& entry : in) {
+        nlinks.push_back(std::make_pair(entry.first, entry.second.size()));
+    }
+    // stable, so inodes with equal counts keep the map's iteration order
+    std::stable_sort(nlinks.begin(), nlinks.end(), cmp_nlink);
+    out.clear();
+    out.reserve(nlinks.size());
+    for (const auto& entry : nlinks) {
+        out.push_back(entry.first);
+    }
+}
+
+// Replaces `file` by a symbolic or hard link to `target`; exits on failure.
+void link_file(const std::string& file, const std::string& target, bool symlink)
+{
+    std::cout << "Linking " << file << " -> " << target << std::endl;
+    if (unlink(file.c_str())) {
+        std::cerr << "Removing '" << file << "' failed." << std::endl;
+        exit(1);
+    }
+    const int ret = symlink ? ::symlink(target.c_str(), file.c_str())
+                            : link(target.c_str(), file.c_str());
+    if (ret) {
+        std::cerr << "Linking '" << file << "' failed." << std::endl;
+        exit(1);
+    }
+}
+
+// Links every file of one duplicate set to the first path of the inode that
+// already has the most paths.
+void handle_dups(const dups_map& dups, const std::string& buildroot, bool symlink)
+{
+    // a single inode means all paths are hardlinks to the same data
+    if (dups.size() < 2)
+        return;
+    std::vector<ino_t> sorted;
+    sort_by_count(dups, sorted);
+    auto inodes = sorted.begin();
+    std::string target = dups.at(*inodes).front();
+    // symlink targets must not contain the buildroot, so drop that prefix,
+    // but only if the path really lives below it
+    const std::string::size_type len = buildroot.length();
+    if (symlink && target.length() > len && target[len] == '/'
+        && target.compare(0, len, buildroot) == 0) {
+        target.erase(0, len);
+    }
+
+    for (++inodes; inodes != sorted.end(); ++inodes) {
+        for (const auto& file : dups.at(*inodes)) {
+            link_file(file, target, symlink);
+        }
+    }
+}
+
+// Quotes `arg` for /bin/sh so that embedded single quotes cannot end the word.
+static std::string shell_quote(const std::string& arg)
+{
+    std::string quoted = "'";
+    for (const char c : arg) {
+        if (c == '\'') {
+            quoted += "'\\''";
+        } else {
+            quoted += c;
+        }
+    }
+    quoted += "'";
+    return quoted;
+}
+
+// Reads one line of any length from `stream` into `line`, without the
+// trailing newline. Returns false once nothing more can be read.
+static bool read_line(FILE* stream, std::string& line)
+{
+    std::array<char, 4096> chunk;
+    line.clear();
+    while (fgets(chunk.data(), static_cast<int>(chunk.size()), stream) != nullptr) {
+        line += chunk.data();
+        if (!line.empty() && line.back() == '\n') {
+            line.pop_back();
+            return true;
+        }
+    }
+    return !line.empty();
+}
+
+int main(int argc, char** argv)
+{
+    bool symlink = false;
+    std::string root;
+    std::string buildroot;
+    while (1) {
+        int result = getopt(argc, argv, "sb:");
+        if (result == -1)
+            break; /* end of list */
+        switch (result) {
+        case 's':
+            symlink = true;
+            break;
+        case 'b':
+            buildroot = optarg;
+            break;
+        default: /* unknown */
+            break;
+        }
+    }
+    if (buildroot.empty()) {
+        if (symlink) {
+            std::cerr << "Missing -b argument to remove buildroot from symlink targets" << std::endl;
+            return 1;
+        }
+    } else {
+        // eliminate final slashes from the buildroot, but keep a lone "/"
+        while (buildroot.length() > 1 && buildroot.back() == '/') {
+            buildroot.pop_back();
+        }
+    }
+    if (optind < argc) {
+        root = argv[optind++];
+    } else {
+        std::cerr << "Missing directory argument." << std::endl;
+        return 1;
+    }
+    if (optind < argc) {
+        std::cerr << "Too many arguments." << std::endl;
+        return 1;
+    }
+    /* jdupes options used:
+       -q: hide progress indicator
+       -p: don't consider files with different owner/group or permission bits as duplicates
+       -o name: output order of duplicates
+       -r: follow subdirectories
+       -H: also report hard links as duplicates
+     */
+    std::string command = "jdupes -q -p -o name";
+    if (!symlink) {
+        /* if we create symlinks, avoid looking at hard links being duplicated. This way
+           jdupes is faster and won't break them up anyway */
+        command += " -H";
+    }
+    command += " -r " + shell_quote(root);
+    FILE* pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        std::cerr << "popen() failed!" << std::endl;
+        return 1;
+    }
+    dups_map dups;
+    std::string line;
+    bool failed = false;
+    while (read_line(pipe, line)) {
+        if (line.empty()) {
+            // a blank line ends one set of duplicates
+            handle_dups(dups, buildroot, symlink);
+            dups.clear();
+            continue;
+        }
+        struct stat sb;
+        if (stat(line.c_str(), &sb)) {
+            std::cerr << "Stat on '" << line << "' failed" << std::endl;
+            failed = true;
+            break;
+        }
+        dups[sb.st_ino].push_back(line);
+    }
+    if (!failed) {
+        // also process a last set that was not terminated by a blank line
+        handle_dups(dups, buildroot, symlink);
+    }
+    // always reap jdupes, also on the error path
+    const int status = pclose(pipe);
+    if (failed) {
+        return 1;
+    }
+    if (status != 0) {
+        std::cerr << "Warning: jdupes did not exit cleanly (status " << status << ")" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/macros.jdupes b/macros.jdupes
index 60dc360..2dc7c5b 100644
--- a/macros.jdupes
+++ b/macros.jdupes
@@ -1,20 +1,4 @@
-%jdupes(s) \
-  _target=""; _symlinks=0; \
-  %{-s:_symlinks=1;} \
-  if test "$_symlinks" = 1; then \
-    jdupes -q -p -H -o name -r %1 | \
-    while read _file; do \
-      if test -z "$_target" ; then \
-        _target="$_file"; \
-      else \
-        if test -z "$_file" ; then \
-          _target=""; \
-          continue ; \
-        fi ; \
-        ln -sf "${_target#%{buildroot}}" "$_file"; \
-      fi ; \
-    done ; \
-  else \
-    jdupes -q -p -H -o name -L -r %1 ; \
-  fi ; \
-%{nil}
+%suse_symlink_dupes /usr/lib/rpm/jdupes_wrapper -b %{buildroot} -s
+%suse_hardlink_dupes jdupes -q -p -H -o name -L -r
+
+%fdupes /usr/lib/rpm/jdupes_wrapper -b %{buildroot}