diff --git a/0001-Cache-compiled-regexps-between-magic-matches.patch b/0001-Cache-compiled-regexps-between-magic-matches.patch deleted file mode 100644 index 473a0f4..0000000 diff --git a/file-5.41-cache-regexps.patch b/file-5.41-cache-regexps.patch new file mode 100644 index 0000000..45b95ef --- /dev/null +++ b/file-5.41-cache-regexps.patch @@ -0,0 +1,355 @@ +From 1957db8212e9c74e5d626de3023e49d0bb502052 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Dirk=20M=C3=BCller?= +Date: Fri, 11 Mar 2022 23:51:55 +0100 +Subject: [PATCH] Cache compiled regexps between magic matches + +regcomp() is relatively expensive compared to regexec() for matching, +so it helps to only compile once and then reuse the compiled version +for future matches of the same magic. + +When doing the equivalent of `find | xargs file` this provides a massive +speedup, by a factor of 2 to 4 depending on how heavily the magic +relies on regexps. + +The memory overhead is moderate (~200 kB) and regexps are compiled +lazily, so it adds no significant overhead to single-match use cases. +--- + src/apprentice.c | 27 +++++++++++++++++++---- + src/file.h | 40 ++++++++++++++++++----------------- + src/softmagic.c | 62 +++++++++++++++++++++++++++--------------------- + 3 files changed, 82 insertions(+), 47 deletions(-) + +Index: file-5.41/src/apprentice.c +=================================================================== +--- file-5.41.orig/src/apprentice.c ++++ file-5.41/src/apprentice.c +@@ -427,7 +427,14 @@ add_mlist(struct mlist *mlp, struct magi + ml->map = idx == 0 ? 
map : NULL; + ml->magic = map->magic[idx]; + ml->nmagic = map->nmagic[idx]; +- ++ ml->magic_rxcomp = NULL; ++ if (ml->nmagic) { ++ ml->magic_rxcomp = CAST(file_regex_t**, calloc(ml->nmagic, sizeof(file_regex_t*))); ++ if (ml->magic_rxcomp == NULL) { ++ free(ml); ++ return -1; ++ } ++ } + mlp->prev->next = ml; + ml->prev = mlp->prev; + ml->next = mlp; +@@ -612,8 +619,19 @@ mlist_free_all(struct magic_set *ms) + private void + mlist_free_one(struct mlist *ml) + { ++ size_t i; ++ + if (ml->map) + apprentice_unmap(CAST(struct magic_map *, ml->map)); ++ ++ for (i = 0; i < ml->nmagic; ++i) { ++ if (ml->magic_rxcomp[i]) { ++ file_regfree(ml->magic_rxcomp[i]); ++ free(ml->magic_rxcomp[i]); ++ } ++ } ++ free(ml->magic_rxcomp); ++ ml->magic_rxcomp = NULL; + free(ml); + } + +@@ -3489,16 +3507,17 @@ file_magicfind(struct magic_set *ms, con + + for (ml = mlist->next; ml != mlist; ml = ml->next) { + struct magic *ma = ml->magic; +- uint32_t nma = ml->nmagic; +- for (i = 0; i < nma; i++) { ++ for (i = 0; i < ml->nmagic; i++) { + if (ma[i].type != FILE_NAME) + continue; + if (strcmp(ma[i].value.s, name) == 0) { + v->magic = &ma[i]; +- for (j = i + 1; j < nma; j++) ++ for (j = i + 1; j < ml->nmagic; j++) + if (ma[j].cont_level == 0) + break; + v->nmagic = j - i; ++ /* the cache must start at the same entry as v->magic */ ++ v->magic_rxcomp = &ml->magic_rxcomp[i]; + return 0; + } + } +Index: file-5.41/src/file.h +=================================================================== +--- file-5.41.orig/src/file.h ++++ file-5.41/src/file.h +@@ -88,6 +88,10 @@ + /* Do this here and now, because struct stat gets re-defined on solaris */ + #include <sys/stat.h> + #include <stdarg.h> ++#include <locale.h> ++#if defined(HAVE_XLOCALE_H) ++#include <xlocale.h> ++#endif + + #define ENABLE_CONDITIONALS + +@@ -167,6 +171,19 @@ + #define FILE_COMPILE 2 + #define FILE_LIST 3 + ++typedef struct { ++ const char *pat; ++#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) ++#define USE_C_LOCALE ++ locale_t old_lc_ctype; ++ locale_t c_lc_ctype; ++#else ++ char *old_lc_ctype; 
++#endif ++ int rc; ++ regex_t rx; ++} file_regex_t; ++ + struct buffer { + int fd; + struct stat st; +@@ -397,9 +414,10 @@ struct magic { + + /* list of magic entries */ + struct mlist { +- struct magic *magic; /* array of magic entries */ +- uint32_t nmagic; /* number of entries in array */ +- void *map; /* internal resources used by entry */ ++ struct magic *magic; /* array of magic entries */ ++ file_regex_t **magic_rxcomp; /* array of compiled regexps */ ++ size_t nmagic; /* number of entries in array */ ++ void *map; /* internal resources used by entry */ + struct mlist *next, *prev; + }; + +@@ -568,23 +586,7 @@ protected void buffer_init(struct buffer + protected void buffer_fini(struct buffer *); + protected int buffer_fill(const struct buffer *); + +-#include <locale.h> +-#if defined(HAVE_XLOCALE_H) +-#include <xlocale.h> +-#endif + +-typedef struct { +- const char *pat; +-#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) +-#define USE_C_LOCALE +- locale_t old_lc_ctype; +- locale_t c_lc_ctype; +-#else +- char *old_lc_ctype; +-#endif +- int rc; +- regex_t rx; +-} file_regex_t; + + protected int file_regcomp(file_regex_t *, const char *, int); + protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *, +Index: file-5.41/src/softmagic.c +=================================================================== +--- file-5.41.orig/src/softmagic.c ++++ file-5.41/src/softmagic.c +@@ -43,7 +43,7 @@ FILE_RCSID("@(#)$File: softmagic.c,v 1.3 + #include <time.h> + #include "der.h" + +-private int match(struct magic_set *, struct magic *, uint32_t, ++private int match(struct magic_set *, struct magic *, file_regex_t **, uint32_t, + const struct buffer *, size_t, int, int, int, uint16_t *, + uint16_t *, int *, int *, int *, int *); + private int mget(struct magic_set *, struct magic *, const struct buffer *, +@@ -52,7 +52,7 @@ private int mget(struct magic_set *, str + uint16_t *, int *, int *, int *, int *); + private int msetoffset(struct magic_set *, 
struct magic *, struct buffer *, + const struct buffer *, size_t, unsigned int); +-private int magiccheck(struct magic_set *, struct magic *); ++private int magiccheck(struct magic_set *, struct magic *, file_regex_t **); + private int32_t mprint(struct magic_set *, struct magic *); + private int moffset(struct magic_set *, struct magic *, const struct buffer *, + int32_t *); +@@ -131,7 +131,7 @@ file_softmagic(struct magic_set *ms, con + } + + for (ml = ms->mlist[0]->next; ml != ms->mlist[0]; ml = ml->next) +- if ((rv = match(ms, ml->magic, ml->nmagic, b, 0, mode, ++ if ((rv = match(ms, ml->magic, ml->magic_rxcomp, ml->nmagic, b, 0, mode, + text, 0, indir_count, name_count, + &printed_something, &need_separator, NULL, NULL)) != 0) + return rv; +@@ -191,7 +191,7 @@ file_fmtcheck(struct magic_set *ms, cons + * so that higher-level continuations are processed. + */ + private int +-match(struct magic_set *ms, struct magic *magic, uint32_t nmagic, ++match(struct magic_set *ms, struct magic *magic, file_regex_t **magic_rxcomp, uint32_t nmagic, + const struct buffer *b, size_t offset, int mode, int text, + int flip, uint16_t *indir_count, uint16_t *name_count, + int *printed_something, int *need_separator, int *returnval, +@@ -220,6 +220,7 @@ match(struct magic_set *ms, struct magic + for (magindex = 0; magindex < nmagic; magindex++) { + int flush = 0; + struct magic *m = &magic[magindex]; ++ file_regex_t** m_rxcomp = &magic_rxcomp[magindex]; + + if (m->type != FILE_NAME) + if ((IS_STRING(m->type) && +@@ -257,7 +258,7 @@ flush: + *returnval = 1; + } + +- switch (magiccheck(ms, m)) { ++ switch (magiccheck(ms, m, m_rxcomp)) { + case -1: + return -1; + case 0: +@@ -318,6 +319,7 @@ flush: + while (magindex + 1 < nmagic && + magic[magindex + 1].cont_level != 0) { + m = &magic[++magindex]; ++ m_rxcomp = &magic_rxcomp[magindex]; + ms->line = m->lineno; /* for messages */ + + if (cont_level < m->cont_level) +@@ -371,7 +373,7 @@ flush: + break; + } + +- switch (flush ? 
1 : magiccheck(ms, m)) { ++ switch (flush ? 1 : magiccheck(ms, m, m_rxcomp)) { + case -1: + return -1; + case 0: +@@ -655,7 +657,7 @@ mprint(struct magic_set *ms, struct magi + + if (m->str_flags & STRING_TRIM) + str = file_strtrim(str); +- ++ + if (file_printf(ms, F(ms, desc, "%s"), + file_printable(ms, sbuf, sizeof(sbuf), str, + sizeof(p->s) - (str - p->s))) == -1) +@@ -770,7 +772,7 @@ mprint(struct magic_set *ms, struct magi + return -1; + } + scp = (m->str_flags & STRING_TRIM) ? file_strtrim(cp) : cp; +- ++ + rval = file_printf(ms, F(ms, desc, "%s"), file_printable(ms, + sbuf, sizeof(sbuf), scp, ms->search.rm_len)); + free(cp); +@@ -1822,7 +1824,7 @@ mget(struct magic_set *ms, struct magic + for (mlp = ms->mlist[0]->next; mlp != ms->mlist[0]; + mlp = mlp->next) + { +- if ((rv = match(ms, mlp->magic, mlp->nmagic, &bb, 0, ++ if ((rv = match(ms, mlp->magic, mlp->magic_rxcomp, mlp->nmagic, &bb, 0, + BINTEST, text, 0, indir_count, name_count, + printed_something, need_separator, NULL, + NULL)) != 0) +@@ -1875,7 +1877,7 @@ mget(struct magic_set *ms, struct magic + nfound_match = 0; + (*name_count)++; + eoffset = ms->eoffset; +- rv = match(ms, ml.magic, ml.nmagic, b, offset + o, ++ rv = match(ms, ml.magic, ml.magic_rxcomp, ml.nmagic, b, offset + o, + mode, text, flip, indir_count, name_count, + printed_something, need_separator, returnval, + &nfound_match); +@@ -1999,7 +2001,7 @@ file_strncmp16(const char *a, const char + } + + private int +-magiccheck(struct magic_set *ms, struct magic *m) ++magiccheck(struct magic_set *ms, struct magic *m, file_regex_t** m_cache) + { + uint64_t l = m->value.q; + uint64_t v; +@@ -2182,27 +2184,40 @@ magiccheck(struct magic_set *ms, struct + } + case FILE_REGEX: { + int rc; +- file_regex_t rx; ++ file_regex_t *rx = *m_cache; + const char *search; + + if (ms->search.s == NULL) + return 0; + ++ if (rx == NULL) { ++ rx = CAST(file_regex_t*, malloc(sizeof(*rx))); ++ if (rx == NULL) { ++ file_error(ms, errno, ++ "can't allocate %" SIZE_T_FORMAT "u bytes", ++ sizeof(*rx)); ++ return -1; ++ } ++ rc = file_regcomp(rx, m->value.s, ++ 
REG_EXTENDED|REG_NEWLINE| ++ ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); ++ if (rc) { ++ file_regerror(rx, rc, ms); ++ file_regfree(rx); ++ free(rx); ++ return -1; ++ } ++ /* cache only successfully compiled regexps */ ++ *m_cache = rx; ++ } + l = 0; +- rc = file_regcomp(&rx, m->value.s, +- REG_EXTENDED|REG_NEWLINE| +- ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); +- if (rc) { +- file_regerror(&rx, rc, ms); +- v = CAST(uint64_t, -1); +- } else { ++ { + regmatch_t pmatch; + size_t slen = ms->search.s_len; + char *copy; + if (slen != 0) { + copy = CAST(char *, malloc(slen)); + if (copy == NULL) { +- file_regfree(&rx); + file_error(ms, errno, + "can't allocate %" SIZE_T_FORMAT "u bytes", + slen); +@@ -2215,14 +2230,14 @@ magiccheck(struct magic_set *ms, struct + search = CCAST(char *, ""); + copy = NULL; + } +- rc = file_regexec(&rx, RCAST(const char *, search), ++ rc = file_regexec(rx, RCAST(const char *, search), + 1, &pmatch, 0); + free(copy); + switch (rc) { + case 0: + ms->search.s += CAST(int, pmatch.rm_so); + ms->search.offset += CAST(size_t, pmatch.rm_so); +- ms->search.rm_len = CAST(size_t, ++ ms->search.rm_len = CAST(size_t, + pmatch.rm_eo - pmatch.rm_so); + v = 0; + break; +@@ -2232,12 +2247,11 @@ magiccheck(struct magic_set *ms, struct + break; + + default: +- file_regerror(&rx, rc, ms); ++ file_regerror(rx, rc, ms); + v = CAST(uint64_t, -1); + break; + } + } +- file_regfree(&rx); + if (v == CAST(uint64_t, -1)) + return -1; + break; diff --git a/file.changes b/file.changes index cd2448e..2634daf 100644 --- a/file.changes +++ b/file.changes @@ -1,3 +1,8 @@ +------------------------------------------------------------------- +Sat Mar 19 18:00:32 UTC 2022 - Dirk Müller + +- add file-5.41-cache-regexps.patch to cache regexp lookups + ------------------------------------------------------------------- Thu Feb 24 10:05:17 UTC 2022 - Dr. 
Werner Fink diff --git a/file.spec b/file.spec index d63dcfc..929af31 100644 --- a/file.spec +++ b/file.spec @@ -62,6 +62,7 @@ Patch31: file-5.19-biorad.dif Patch32: file-5.19-clicfs.dif Patch34: file-5.23-endian.patch Patch37: file-secure_getenv.patch +Patch38: file-5.41-cache-regexps.patch Patch39: file-5.28-btrfs-image.dif # Upstream commits as patches BuildRoot: %{_tmppath}/%{name}-%{version}-build @@ -125,6 +126,7 @@ to develop applications that require the magic "file" interface. %patch32 -p0 -b .clicfs %patch34 -p0 -b .endian %patch37 -p1 -b .getenv +%patch38 -p1 -b .regexp %patch39 -p1 -b .btrfs %patch -b .0 test -s src/magic.h.in || cp -p src/magic.h src/magic.h.in diff --git a/python-magic.changes b/python-magic.changes index cda6113..5b4b11d 100644 --- a/python-magic.changes +++ b/python-magic.changes @@ -1,3 +1,8 @@ +------------------------------------------------------------------- +Sat Mar 19 18:01:52 UTC 2022 - Dirk Müller + +- spec-cleaner run + ------------------------------------------------------------------- Tue Oct 19 09:55:47 UTC 2021 - Dr. Werner Fink diff --git a/python-magic.spec b/python-magic.spec index 50d5b29..0d8c005 100644 --- a/python-magic.spec +++ b/python-magic.spec @@ -19,25 +19,23 @@ # PyPI package name is file-magic. 
Version is taken from setup.py %define file_magic_version 0.3.0 %{?!python_module:%define python_module() python-%{**} python3-%{**}} - +%global _miscdir %{_datadir}/misc Name: python-magic -BuildRequires: %{python_module setuptools} -BuildRequires: findutils -BuildRequires: libtool -BuildRequires: python-rpm-macros -BuildRequires: zlib-devel -URL: http://www.darwinsys.com/file/ Version: 5.41 Release: 0 Summary: Python module to use libmagic License: BSD-3-Clause AND BSD-4-Clause Group: Development/Languages/Python -%{expand:%(sed -n -e '/^Source0\?:/,/^BuildRoot:/p' <%{_sourcedir}/file.spec)} +URL: https://www.darwinsys.com/file/ Source99: file.spec +BuildRequires: %{python_module setuptools} +BuildRequires: findutils +BuildRequires: libtool +BuildRequires: python-rpm-macros +BuildRequires: zlib-devel Requires: libmagic1 Provides: python-file-magic = %{file_magic_version} -%global _miscdir %{_datadir}/misc - +%{expand:%(sed -n -e '/^Source0\?:/,/^BuildRoot:/p' <%{_sourcedir}/file.spec)} %python_subpackages %description @@ -58,7 +56,6 @@ pushd python popd %files %{python_files} -%defattr(-,root,root) %doc python/README python/example.py %{python_sitelib}/magic.py* %pycache_only %{python_sitelib}/__pycache__