SHA256
1
0
forked from pool/fdupes
Dominique Leuenberger 2022-05-04 13:10:10 +00:00 committed by Git OBS Bridge
commit cdce730b1f
3 changed files with 105 additions and 54 deletions

View File

@ -1,3 +1,15 @@
-------------------------------------------------------------------
Fri Apr 1 19:50:32 UTC 2022 - Stefan Brüns <stefan.bruens@rwth-aachen.de>
- Fixes for the new wrapper:
* Order duplicates by name, to get a reproducible file set
(boo#1197484).
* Remove redundant order parameter from fdupes invocation.
* Modernize code, significantly reduce allocations.
* Exit immediately when mandatory parameters are missing.
* Remove obsolete buildroot parameter
* Add some tests for the wrapper
------------------------------------------------------------------- -------------------------------------------------------------------
Tue Mar 15 07:41:35 UTC 2022 - Stephan Kulow <coolo@suse.com> Tue Mar 15 07:41:35 UTC 2022 - Stephan Kulow <coolo@suse.com>

View File

@ -53,6 +53,26 @@ install -D -m755 fdupes_wrapper %{buildroot}/usr/lib/rpm/fdupes_wrapper
./%{name} --recurse testdir ./%{name} --recurse testdir
./%{name} --size testdir ./%{name} --size testdir
# Check wrapper
PATH=`pwd`:$PATH
(cd testdir; md5sum ./* ./*/* > ../testdir.md5 || true)
for operation in '-n' '-s' ' '; do
cp -R testdir "testdir${operation}"
./fdupes_wrapper ${operation} "testdir${operation}"
(cd "testdir${operation}"; md5sum --check ../testdir.md5)
done
# Check order does not depend on creation order - x should be target
mkdir testdir_order
for t in "a b x" "x a b" "a x b"; do
pushd testdir_order
for f in $t ; do cp ../testdir.md5 $f; done
../fdupes_wrapper -s ./
test -h ./a
test -h ./b
rm *
popd
done
%files %files
%doc CHANGES %doc CHANGES
%{_bindir}/%{name} %{_bindir}/%{name}

View File

@ -4,19 +4,16 @@
* *
* Copyright 2022 Jiri Slaby <jslaby@suse.cz> * Copyright 2022 Jiri Slaby <jslaby@suse.cz>
* 2022 Stephan Kulow <coolo@suse.de> * 2022 Stephan Kulow <coolo@suse.de>
* 2022 Stefan Brüns <stefan.bruens@rwth-aachen.de>
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
*/ */
#include <algorithm> #include <algorithm>
#include <array>
#include <iostream> #include <iostream>
#include <list>
#include <map>
#include <string> #include <string>
#include <sys/param.h> #include <sys/param.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <tuple>
#include <unistd.h> #include <unistd.h>
#include <utility> #include <utility>
#include <vector> #include <vector>
@ -24,8 +21,22 @@
using namespace std; using namespace std;
typedef std::map<ino_t, std::vector<std::string>> dups_map; struct file_entry
typedef std::pair<ino_t, size_t> nlink_pair; {
ino_t inode;
nlink_t link_count;
string path;
file_entry(ino_t i, nlink_t n, string&& p)
: inode(i), link_count(n), path(move(p)) {}
};
using dup_set = vector<file_entry>;
enum class Operation {
Symlink,
Hardlink,
DryRun,
};
vector<string> split_paths(const string& path) vector<string> split_paths(const string& path)
{ {
@ -42,7 +53,7 @@ vector<string> split_paths(const string& path)
return paths; return paths;
} }
string merge_paths(vector<string> paths) string merge_paths(const vector<string>& paths)
{ {
string path; string path;
for (const auto& s : paths) { for (const auto& s : paths) {
@ -79,33 +90,18 @@ string relative(const string& p1, const string& p2)
return merge_paths(paths); return merge_paths(paths);
} }
bool cmp_nlink(const nlink_pair& a, const nlink_pair& b) void link_file(const std::string& file, const std::string& target, Operation op)
{
return a.second > b.second;
}
void sort_by_count(const dups_map& in, std::vector<ino_t>& out)
{
out.clear();
std::list<nlink_pair> nlinks;
for (auto it = in.cbegin(); it != in.cend(); ++it) {
nlinks.push_back(std::make_pair(it->first, it->second.size()));
}
nlinks.sort(cmp_nlink);
for (auto it = nlinks.cbegin(); it != nlinks.cend(); ++it) {
out.push_back(it->first);
}
}
void link_file(const std::string& file, const std::string& target, bool symlink)
{ {
std::cout << "Linking " << file << " -> " << target << std::endl; std::cout << "Linking " << file << " -> " << target << std::endl;
if (op == Operation::DryRun)
return;
if (unlink(file.c_str())) { if (unlink(file.c_str())) {
std::cerr << "Removing '" << file << "' failed." << std::endl; std::cerr << "Removing '" << file << "' failed." << std::endl;
exit(1); exit(1);
} }
int ret; int ret;
if (symlink) { if (op == Operation::Symlink) {
ret = ::symlink(target.c_str(), file.c_str()); ret = ::symlink(target.c_str(), file.c_str());
} else { } else {
ret = link(target.c_str(), file.c_str()); ret = link(target.c_str(), file.c_str());
@ -116,44 +112,65 @@ void link_file(const std::string& file, const std::string& target, bool symlink)
} }
} }
std::string target_for_link(string target, const std::string &file, bool symlink) std::string target_for_link(string target, const std::string &file, Operation op)
{ {
if (!symlink) // hardlinks don't care if (op == Operation::Hardlink) // hardlinks don't care
return target; return target;
return relative(file, target); return relative(file, target);
} }
void handle_dups(const dups_map& dups, const std::string& buildroot, bool symlink) void handle_dups(dup_set& dups, Operation op)
{ {
// all are hardlinks to the same data // calculate number of hardlinked duplicates found, for each file
if (dups.size() < 2) // this may be different than the st_nlink value
return; std::sort(dups.begin(), dups.end(), [](const file_entry& a, const file_entry& b) {
std::vector<ino_t> sorted; return a.inode < b.inode;
sort_by_count(dups, sorted); });
auto inodes = sorted.begin(); auto first = dups.begin();
std::string target = dups.at(*inodes).front(); while (first != dups.end()) {
auto r = equal_range(first, dups.end(), *first, [](const file_entry& a, const file_entry& b) {
for (++inodes; inodes != sorted.end(); ++inodes) { return a.inode < b.inode;
const std::vector<std::string> files = dups.at(*inodes); });
for (auto it = files.begin(); it != files.end(); ++it) { for (auto i = r.first; i != r.second; ++i) {
link_file(*it, target_for_link(target, *it, symlink), symlink); i->link_count = std::distance(r.first, r.second);
} }
first = r.second;
}
// use the file with most hardlinks as target
// in case of ties, sort by name to get a stable order for reproducible builds
std::sort(dups.begin(), dups.end(), [](const file_entry& a, const file_entry& b) {
if (a.link_count == b.link_count)
return a.path > b.path;
return a.link_count > b.link_count;
});
const string& target = dups[0].path;
for (const file_entry& e : dups) {
// skip duplicates hardlinked to first entry
if (e.inode == dups[0].inode)
continue;
link_file(e.path, target_for_link(target, e.path, op), op);
} }
} }
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
bool symlink = false; Operation op = Operation::Hardlink;
std::vector<std::string> roots; std::vector<std::string> roots;
std::string buildroot;
while (1) { while (1) {
int result = getopt(argc, argv, "sb:"); int result = getopt(argc, argv, "sn");
if (result == -1) if (result == -1)
break; /* end of list */ break; /* end of list */
switch (result) { switch (result) {
case 's': case 's':
symlink = true; op = Operation::Symlink;
break;
case 'n':
op = Operation::DryRun;
break; break;
default: /* unknown */ default: /* unknown */
break; break;
@ -170,17 +187,17 @@ int main(int argc, char** argv)
if (roots.empty()) { if (roots.empty()) {
std::cerr << "Missing directory argument."; std::cerr << "Missing directory argument.";
return 1;
} }
/* fdupes options used: /* fdupes options used:
-q: hide progress indicator -q: hide progress indicator
-p: don't consider files with different owner/group or permission bits as duplicates -p: don't consider files with different owner/group or permission bits as duplicates
-n: exclude zero-length files from consideration -n: exclude zero-length files from consideration
-o name: output order of duplicates
-r: follow subdirectories -r: follow subdirectories
-H: also report hard links as duplicates -H: also report hard links as duplicates
*/ */
std::string command = "fdupes -q -p -r -n -o name"; std::string command = "fdupes -q -p -r -n";
if (!symlink) { if (op != Operation::Symlink) {
/* if we create symlinks, avoid looking at hard links being duplicated. This way /* if we create symlinks, avoid looking at hard links being duplicated. This way
fdupes is faster and won't break them up anyway */ fdupes is faster and won't break them up anyway */
command += " -H"; command += " -H";
@ -192,12 +209,14 @@ int main(int argc, char** argv)
if (!pipe) { if (!pipe) {
throw std::runtime_error("popen() failed!"); throw std::runtime_error("popen() failed!");
} }
std::array<char, MAXPATHLEN> buffer; std::vector<char> buffer;
dups_map dups; buffer.resize(MAXPATHLEN);
dup_set dups;
while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
std::string line = buffer.data(); std::string line = buffer.data();
if (line.length() < 2) { if (line.length() < 2) {
handle_dups(dups, buildroot, symlink); handle_dups(dups, op);
dups.clear(); dups.clear();
continue; continue;
} }
@ -212,7 +231,7 @@ int main(int argc, char** argv)
std::cerr << "Stat on '" << buffer.data() << "' failed" << std::endl; std::cerr << "Stat on '" << buffer.data() << "' failed" << std::endl;
return 1; return 1;
} }
dups[sb.st_ino].push_back(line); dups.emplace_back(sb.st_ino, 0, std::move(line));
} }
pclose(pipe); pclose(pipe);