From 47aed56704bc41e35b729b172e7142971a4c85a8e0f8c5462862ff3db68b509e Mon Sep 17 00:00:00 2001 From: Dirk Mueller Date: Mon, 14 Mar 2022 10:44:02 +0000 Subject: [PATCH 1/5] OBS-URL: https://build.opensuse.org/package/show/Base:System/file?expand=0&rev=235 --- ...mpiled-regexps-between-magic-matches.patch | 346 ++++++++++++++++++ file.spec | 4 +- 2 files changed, 349 insertions(+), 1 deletion(-) create mode 100644 0001-Cache-compiled-regexps-between-magic-matches.patch diff --git a/0001-Cache-compiled-regexps-between-magic-matches.patch b/0001-Cache-compiled-regexps-between-magic-matches.patch new file mode 100644 index 0000000..098119d --- /dev/null +++ b/0001-Cache-compiled-regexps-between-magic-matches.patch @@ -0,0 +1,346 @@ +From 1957db8212e9c74e5d626de3023e49d0bb502052 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Dirk=20M=C3=BCller?= +Date: Fri, 11 Mar 2022 23:51:55 +0100 +Subject: [PATCH] Cache compiled regexps between magic matches + +regcomp() is relatively expensive compared to regexec() for matching, +so it helps to only compile once and then reuse the compiled version +for future matches of the same magic. + +when doing equivalent of `find | xargs file` this provides a massive +speedup, between factor 2 and 4 depending on how heavy the magic +is on regexp usage. + +The memory overhead is mediocre (~ 200kb ) and it compiles regexps +lazy, so it doesn't add significant overhead to single match usecases. +--- + src/apprentice.c | 26 +++++++++++++++++++---- + src/file.h | 40 ++++++++++++++++++----------------- + src/softmagic.c | 54 +++++++++++++++++++++++++++--------------------- + 3 files changed, 73 insertions(+), 47 deletions(-) + +Index: file-5.41/src/apprentice.c +=================================================================== +--- file-5.41.orig/src/apprentice.c ++++ file-5.41/src/apprentice.c +@@ -425,7 +425,14 @@ add_mlist(struct mlist *mlp, struct magi + ml->map = idx == 0 ? map : NULL; + ml->magic = map->magic[idx]; + ml->nmagic = map->nmagic[idx]; +- ++ ml->magic_rxcomp = NULL; ++ if (ml->nmagic) { ++ ml->magic_rxcomp = CAST(file_regex_t**, calloc(ml->nmagic, sizeof(file_regex_t*))); ++ if (ml->magic_rxcomp == NULL) { ++ free(ml); ++ return -1; ++ } ++ } + mlp->prev->next = ml; + ml->prev = mlp->prev; + ml->next = mlp; +@@ -610,8 +617,19 @@ mlist_free_all(struct magic_set *ms) + private void + mlist_free_one(struct mlist *ml) + { ++ size_t i; ++ + if (ml->map) + apprentice_unmap(CAST(struct magic_map *, ml->map)); ++ ++ for (i = 0; i < ml->nmagic; ++i) { ++ if (ml->magic_rxcomp[i]) { ++ file_regfree(ml->magic_rxcomp[i]); ++ free(ml->magic_rxcomp[i]); ++ } ++ } ++ free(ml->magic_rxcomp); ++ ml->magic_rxcomp = NULL; + free(ml); + } + +@@ -3548,16 +3566,16 @@ file_magicfind(struct magic_set *ms, con + + for (ml = mlist->next; ml != mlist; ml = ml->next) { + struct magic *ma = ml->magic; +- uint32_t nma = ml->nmagic; +- for (i = 0; i < nma; i++) { ++ for (i = 0; i < ml->nmagic; i++) { + if (ma[i].type != FILE_NAME) + continue; + if (strcmp(ma[i].value.s, name) == 0) { + v->magic = &ma[i]; +- for (j = i + 1; j < nma; j++) ++ for (j = i + 1; j < ml->nmagic; j++) + if (ma[j].cont_level == 0) + break; + v->nmagic = j - i; ++ v->magic_rxcomp = ml->magic_rxcomp; + return 0; + } + } +Index: file-5.41/src/file.h +=================================================================== +--- file-5.41.orig/src/file.h ++++ file-5.41/src/file.h +@@ -88,6 +88,10 @@ + /* Do this here and now, because struct stat gets re-defined on solaris */ + #include + #include ++#include ++#if defined(HAVE_XLOCALE_H) ++#include ++#endif + + #define ENABLE_CONDITIONALS + +@@ -167,6 +171,19 @@ + #define FILE_COMPILE 2 + #define FILE_LIST 3 + ++typedef struct { ++ const char *pat; ++#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) ++#define USE_C_LOCALE ++ locale_t old_lc_ctype; ++ locale_t c_lc_ctype; ++#else ++ char *old_lc_ctype; ++#endif ++ int rc; ++ regex_t rx; ++} file_regex_t; ++ + struct buffer { + int fd; + struct stat st; +@@ -397,9 +414,10 @@ struct magic { + + /* list of magic entries */ + struct mlist { +- struct magic *magic; /* array of magic entries */ +- uint32_t nmagic; /* number of entries in array */ +- void *map; /* internal resources used by entry */ ++ struct magic *magic; /* array of magic entries */ ++ file_regex_t **magic_rxcomp; /* array of compiled regexps */ ++ size_t nmagic; /* number of entries in array */ ++ void *map; /* internal resources used by entry */ + struct mlist *next, *prev; + }; + +@@ -568,23 +586,7 @@ protected void buffer_init(struct buffer + protected void buffer_fini(struct buffer *); + protected int buffer_fill(const struct buffer *); + +-#include +-#if defined(HAVE_XLOCALE_H) +-#include +-#endif + +-typedef struct { +- const char *pat; +-#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) +-#define USE_C_LOCALE +- locale_t old_lc_ctype; +- locale_t c_lc_ctype; +-#else +- char *old_lc_ctype; +-#endif +- int rc; +- regex_t rx; +-} file_regex_t; + + protected int file_regcomp(file_regex_t *, const char *, int); + protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *, +Index: file-5.41/src/softmagic.c +=================================================================== +--- file-5.41.orig/src/softmagic.c ++++ file-5.41/src/softmagic.c +@@ -43,7 +43,7 @@ FILE_RCSID("@(#)$File: softmagic.c,v 1.3 + #include + #include "der.h" + +-private int match(struct magic_set *, struct magic *, uint32_t, ++private int match(struct magic_set *, struct magic *, file_regex_t **, uint32_t, + const struct buffer *, size_t, int, int, int, uint16_t *, + uint16_t *, int *, int *, int *, int *); + private int mget(struct magic_set *, struct magic *, const struct buffer *, +@@ -52,7 +52,7 @@ private int mget(struct magic_set *, str + uint16_t *, int *, int *, int *, int *); + private int msetoffset(struct magic_set *, struct magic *, struct buffer *, + const struct buffer *, size_t, unsigned int); +-private int magiccheck(struct magic_set *, struct magic *); ++private int magiccheck(struct magic_set *, struct magic *, file_regex_t **); + private int32_t mprint(struct magic_set *, struct magic *); + private int moffset(struct magic_set *, struct magic *, const struct buffer *, + int32_t *); +@@ -131,7 +131,7 @@ file_softmagic(struct magic_set *ms, con + } + + for (ml = ms->mlist[0]->next; ml != ms->mlist[0]; ml = ml->next) +- if ((rv = match(ms, ml->magic, ml->nmagic, b, 0, mode, ++ if ((rv = match(ms, ml->magic, ml->magic_rxcomp, ml->nmagic, b, 0, mode, + text, 0, indir_count, name_count, + &printed_something, &need_separator, NULL, NULL)) != 0) + return rv; +@@ -191,7 +191,7 @@ file_fmtcheck(struct magic_set *ms, cons + * so that higher-level continuations are processed. + */ + private int +-match(struct magic_set *ms, struct magic *magic, uint32_t nmagic, ++match(struct magic_set *ms, struct magic *magic, file_regex_t **magic_rxcomp, uint32_t nmagic, + const struct buffer *b, size_t offset, int mode, int text, + int flip, uint16_t *indir_count, uint16_t *name_count, + int *printed_something, int *need_separator, int *returnval, +@@ -220,6 +220,7 @@ match(struct magic_set *ms, struct magic + for (magindex = 0; magindex < nmagic; magindex++) { + int flush = 0; + struct magic *m = &magic[magindex]; ++ file_regex_t** m_rxcomp = &magic_rxcomp[magindex]; + + if (m->type != FILE_NAME) + if ((IS_STRING(m->type) && +@@ -257,7 +258,7 @@ flush: + *returnval = 1; + } + +- switch (magiccheck(ms, m)) { ++ switch (magiccheck(ms, m, m_rxcomp)) { + case -1: + return -1; + case 0: +@@ -318,6 +319,7 @@ flush: + while (magindex + 1 < nmagic && + magic[magindex + 1].cont_level != 0) { + m = &magic[++magindex]; ++ m_rxcomp = &magic_rxcomp[magindex]; + ms->line = m->lineno; /* for messages */ + + if (cont_level < m->cont_level) +@@ -371,7 +373,7 @@ flush: + break; + } + +- switch (flush ? 1 : magiccheck(ms, m)) { ++ switch (flush ? 1 : magiccheck(ms, m, m_rxcomp)) { + case -1: + return -1; + case 0: +@@ -655,7 +657,7 @@ mprint(struct magic_set *ms, struct magi + + if (m->str_flags & STRING_TRIM) + str = file_strtrim(str); +- ++ + if (file_printf(ms, F(ms, desc, "%s"), + file_printable(ms, sbuf, sizeof(sbuf), str, + sizeof(p->s) - (str - p->s))) == -1) +@@ -770,7 +772,7 @@ mprint(struct magic_set *ms, struct magi + return -1; + } + scp = (m->str_flags & STRING_TRIM) ? file_strtrim(cp) : cp; +- ++ + rval = file_printf(ms, F(ms, desc, "%s"), file_printable(ms, + sbuf, sizeof(sbuf), scp, ms->search.rm_len)); + free(cp); +@@ -1822,7 +1824,7 @@ mget(struct magic_set *ms, struct magic + for (mlp = ms->mlist[0]->next; mlp != ms->mlist[0]; + mlp = mlp->next) + { +- if ((rv = match(ms, mlp->magic, mlp->nmagic, &bb, 0, ++ if ((rv = match(ms, mlp->magic, mlp->magic_rxcomp, mlp->nmagic, &bb, 0, + BINTEST, text, 0, indir_count, name_count, + printed_something, need_separator, NULL, + NULL)) != 0) +@@ -1875,7 +1877,7 @@ mget(struct magic_set *ms, struct magic + nfound_match = 0; + (*name_count)++; + eoffset = ms->eoffset; +- rv = match(ms, ml.magic, ml.nmagic, b, offset + o, ++ rv = match(ms, ml.magic, ml.magic_rxcomp, ml.nmagic, b, offset + o, + mode, text, flip, indir_count, name_count, + printed_something, need_separator, returnval, + &nfound_match); +@@ -1999,7 +2001,7 @@ file_strncmp16(const char *a, const char + } + + private int +-magiccheck(struct magic_set *ms, struct magic *m) ++magiccheck(struct magic_set *ms, struct magic *m, file_regex_t** m_cache) + { + uint64_t l = m->value.q; + uint64_t v; +@@ -2182,27 +2184,32 @@ magiccheck(struct magic_set *ms, struct + } + case FILE_REGEX: { + int rc; +- file_regex_t rx; ++ file_regex_t *rx = *m_cache; + const char *search; + + if (ms->search.s == NULL) + return 0; + ++ if (rx == NULL) { ++ rx = *m_cache = CAST(file_regex_t*, malloc(sizeof(file_regex_t))); ++ rc = file_regcomp(rx, m->value.s, ++ REG_EXTENDED|REG_NEWLINE| ++ ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); ++ if (rc) { ++ file_regerror(rx, rc, ms); ++ file_regfree(rx); ++ v = CAST(uint64_t, -1); ++ break; ++ } ++ } + l = 0; +- rc = file_regcomp(&rx, m->value.s, +- REG_EXTENDED|REG_NEWLINE| +- ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); +- if (rc) { +- file_regerror(&rx, rc, ms); +- v = CAST(uint64_t, -1); +- } else { ++ { + regmatch_t pmatch; + size_t slen = ms->search.s_len; + char *copy; + if (slen != 0) { + copy = CAST(char *, malloc(slen)); + if (copy == NULL) { +- file_regfree(&rx); + file_error(ms, errno, + "can't allocate %" SIZE_T_FORMAT "u bytes", + slen); +@@ -2215,14 +2222,14 @@ magiccheck(struct magic_set *ms, struct + search = CCAST(char *, ""); + copy = NULL; + } +- rc = file_regexec(&rx, RCAST(const char *, search), ++ rc = file_regexec(rx, RCAST(const char *, search), + 1, &pmatch, 0); + free(copy); + switch (rc) { + case 0: + ms->search.s += CAST(int, pmatch.rm_so); + ms->search.offset += CAST(size_t, pmatch.rm_so); +- ms->search.rm_len = CAST(size_t, ++ ms->search.rm_len = CAST(size_t, + pmatch.rm_eo - pmatch.rm_so); + v = 0; + break; +@@ -2232,12 +2239,11 @@ magiccheck(struct magic_set *ms, struct + break; + + default: +- file_regerror(&rx, rc, ms); ++ file_regerror(rx, rc, ms); + v = CAST(uint64_t, -1); + break; + } + } +- file_regfree(&rx); + if (v == CAST(uint64_t, -1)) + return -1; + break; diff --git a/file.spec b/file.spec index d63dcfc..b6549e3 100644 --- a/file.spec +++ b/file.spec @@ -45,6 +45,7 @@ Source4: ftp://ftp.astron.com/pub/file/file-%{version}.tar.gz.asc Source5: file.keyring Patch: file-5.41.dif Patch1: file-5.19-misc.dif +Patch2: 0001-Cache-compiled-regexps-between-magic-matches.patch Patch4: file-4.24-autoconf.dif Patch5: file-5.14-tex.dif Patch7: file-4.20-ssd.dif @@ -108,6 +109,7 @@ to develop applications that require the magic "file" interface. %prep %setup -q -n file-%{version} %patch1 -p0 -b .misc +%patch2 -p1 -b .cache %patch4 -p0 -b .conf %patch5 -p0 -b .tex %patch7 -p0 -b .ssd @@ -138,7 +140,7 @@ rm -f ltcf-c.sh ltconfig ltmain.sh autoreconf -fiv export CFLAGS="%{optflags} -DHOWMANY=69632 -fPIE $(pkg-config libseccomp --cflags)" %configure --disable-silent-rules --datadir=%{_miscdir} \ - --disable-static \ + --disable-static --disable-libseccomp \ --enable-fsect-man5 make %{?_smp_mflags} pkgdatadir='$(datadir)' LDFLAGS="-pie" From 9797584eba3ec1c1558a1b69491fdc4e06d6bb9ca05985ccffb25598d2a3a5fe Mon Sep 17 00:00:00 2001 From: Dirk Mueller Date: Sat, 19 Mar 2022 17:59:14 +0000 Subject: [PATCH 2/5] revert accidental commit OBS-URL: https://build.opensuse.org/package/show/Base:System/file?expand=0&rev=236 --- ...mpiled-regexps-between-magic-matches.patch | 346 ------------------ file.spec | 4 +- 2 files changed, 1 insertion(+), 349 deletions(-) diff --git a/0001-Cache-compiled-regexps-between-magic-matches.patch b/0001-Cache-compiled-regexps-between-magic-matches.patch index 098119d..473a0f4 100644 --- a/0001-Cache-compiled-regexps-between-magic-matches.patch +++ b/0001-Cache-compiled-regexps-between-magic-matches.patch @@ -1,346 +0,0 @@ -From 1957db8212e9c74e5d626de3023e49d0bb502052 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Dirk=20M=C3=BCller?= -Date: Fri, 11 Mar 2022 23:51:55 +0100 -Subject: [PATCH] Cache compiled regexps between magic matches - -regcomp() is relatively expensive compared to regexec() for matching, -so it helps to only compile once and then reuse the compiled version -for future matches of the same magic. - -when doing equivalent of `find | xargs file` this provides a massive -speedup, between factor 2 and 4 depending on how heavy the magic -is on regexp usage. - -The memory overhead is mediocre (~ 200kb ) and it compiles regexps -lazy, so it doesn't add significant overhead to single match usecases. ---- - src/apprentice.c | 26 +++++++++++++++++++---- - src/file.h | 40 ++++++++++++++++++----------------- - src/softmagic.c | 54 +++++++++++++++++++++++++++--------------------- - 3 files changed, 73 insertions(+), 47 deletions(-) - -Index: file-5.41/src/apprentice.c -=================================================================== ---- file-5.41.orig/src/apprentice.c -+++ file-5.41/src/apprentice.c -@@ -425,7 +425,14 @@ add_mlist(struct mlist *mlp, struct magi - ml->map = idx == 0 ? map : NULL; - ml->magic = map->magic[idx]; - ml->nmagic = map->nmagic[idx]; -- -+ ml->magic_rxcomp = NULL; -+ if (ml->nmagic) { -+ ml->magic_rxcomp = CAST(file_regex_t**, calloc(ml->nmagic, sizeof(file_regex_t*))); -+ if (ml->magic_rxcomp == NULL) { -+ free(ml); -+ return -1; -+ } -+ } - mlp->prev->next = ml; - ml->prev = mlp->prev; - ml->next = mlp; -@@ -610,8 +617,19 @@ mlist_free_all(struct magic_set *ms) - private void - mlist_free_one(struct mlist *ml) - { -+ size_t i; -+ - if (ml->map) - apprentice_unmap(CAST(struct magic_map *, ml->map)); -+ -+ for (i = 0; i < ml->nmagic; ++i) { -+ if (ml->magic_rxcomp[i]) { -+ file_regfree(ml->magic_rxcomp[i]); -+ free(ml->magic_rxcomp[i]); -+ } -+ } -+ free(ml->magic_rxcomp); -+ ml->magic_rxcomp = NULL; - free(ml); - } - -@@ -3548,16 +3566,16 @@ file_magicfind(struct magic_set *ms, con - - for (ml = mlist->next; ml != mlist; ml = ml->next) { - struct magic *ma = ml->magic; -- uint32_t nma = ml->nmagic; -- for (i = 0; i < nma; i++) { -+ for (i = 0; i < ml->nmagic; i++) { - if (ma[i].type != FILE_NAME) - continue; - if (strcmp(ma[i].value.s, name) == 0) { - v->magic = &ma[i]; -- for (j = i + 1; j < nma; j++) -+ for (j = i + 1; j < ml->nmagic; j++) - if (ma[j].cont_level == 0) - break; - v->nmagic = j - i; -+ v->magic_rxcomp = ml->magic_rxcomp; - return 0; - } - } -Index: file-5.41/src/file.h -=================================================================== ---- file-5.41.orig/src/file.h -+++ file-5.41/src/file.h -@@ -88,6 +88,10 @@ - /* Do this here and now, because struct stat gets re-defined on solaris */ - #include - #include -+#include -+#if defined(HAVE_XLOCALE_H) -+#include -+#endif - - #define ENABLE_CONDITIONALS - -@@ -167,6 +171,19 @@ - #define FILE_COMPILE 2 - #define FILE_LIST 3 - -+typedef struct { -+ const char *pat; -+#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) -+#define USE_C_LOCALE -+ locale_t old_lc_ctype; -+ locale_t c_lc_ctype; -+#else -+ char *old_lc_ctype; -+#endif -+ int rc; -+ regex_t rx; -+} file_regex_t; -+ - struct buffer { - int fd; - struct stat st; -@@ -397,9 +414,10 @@ struct magic { - - /* list of magic entries */ - struct mlist { -- struct magic *magic; /* array of magic entries */ -- uint32_t nmagic; /* number of entries in array */ -- void *map; /* internal resources used by entry */ -+ struct magic *magic; /* array of magic entries */ -+ file_regex_t **magic_rxcomp; /* array of compiled regexps */ -+ size_t nmagic; /* number of entries in array */ -+ void *map; /* internal resources used by entry */ - struct mlist *next, *prev; - }; - -@@ -568,23 +586,7 @@ protected void buffer_init(struct buffer - protected void buffer_fini(struct buffer *); - protected int buffer_fill(const struct buffer *); - --#include --#if defined(HAVE_XLOCALE_H) --#include --#endif - --typedef struct { -- const char *pat; --#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) --#define USE_C_LOCALE -- locale_t old_lc_ctype; -- locale_t c_lc_ctype; --#else -- char *old_lc_ctype; --#endif -- int rc; -- regex_t rx; --} file_regex_t; - - protected int file_regcomp(file_regex_t *, const char *, int); - protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *, -Index: file-5.41/src/softmagic.c -=================================================================== ---- file-5.41.orig/src/softmagic.c -+++ file-5.41/src/softmagic.c -@@ -43,7 +43,7 @@ FILE_RCSID("@(#)$File: softmagic.c,v 1.3 - #include - #include "der.h" - --private int match(struct magic_set *, struct magic *, uint32_t, -+private int match(struct magic_set *, struct magic *, file_regex_t **, uint32_t, - const struct buffer *, size_t, int, int, int, uint16_t *, - uint16_t *, int *, int *, int *, int *); - private int mget(struct magic_set *, struct magic *, const struct buffer *, -@@ -52,7 +52,7 @@ private int mget(struct magic_set *, str - uint16_t *, int *, int *, int *, int *); - private int msetoffset(struct magic_set *, struct magic *, struct buffer *, - const struct buffer *, size_t, unsigned int); --private int magiccheck(struct magic_set *, struct magic *); -+private int magiccheck(struct magic_set *, struct magic *, file_regex_t **); - private int32_t mprint(struct magic_set *, struct magic *); - private int moffset(struct magic_set *, struct magic *, const struct buffer *, - int32_t *); -@@ -131,7 +131,7 @@ file_softmagic(struct magic_set *ms, con - } - - for (ml = ms->mlist[0]->next; ml != ms->mlist[0]; ml = ml->next) -- if ((rv = match(ms, ml->magic, ml->nmagic, b, 0, mode, -+ if ((rv = match(ms, ml->magic, ml->magic_rxcomp, ml->nmagic, b, 0, mode, - text, 0, indir_count, name_count, - &printed_something, &need_separator, NULL, NULL)) != 0) - return rv; -@@ -191,7 +191,7 @@ file_fmtcheck(struct magic_set *ms, cons - * so that higher-level continuations are processed. - */ - private int --match(struct magic_set *ms, struct magic *magic, uint32_t nmagic, -+match(struct magic_set *ms, struct magic *magic, file_regex_t **magic_rxcomp, uint32_t nmagic, - const struct buffer *b, size_t offset, int mode, int text, - int flip, uint16_t *indir_count, uint16_t *name_count, - int *printed_something, int *need_separator, int *returnval, -@@ -220,6 +220,7 @@ match(struct magic_set *ms, struct magic - for (magindex = 0; magindex < nmagic; magindex++) { - int flush = 0; - struct magic *m = &magic[magindex]; -+ file_regex_t** m_rxcomp = &magic_rxcomp[magindex]; - - if (m->type != FILE_NAME) - if ((IS_STRING(m->type) && -@@ -257,7 +258,7 @@ flush: - *returnval = 1; - } - -- switch (magiccheck(ms, m)) { -+ switch (magiccheck(ms, m, m_rxcomp)) { - case -1: - return -1; - case 0: -@@ -318,6 +319,7 @@ flush: - while (magindex + 1 < nmagic && - magic[magindex + 1].cont_level != 0) { - m = &magic[++magindex]; -+ m_rxcomp = &magic_rxcomp[magindex]; - ms->line = m->lineno; /* for messages */ - - if (cont_level < m->cont_level) -@@ -371,7 +373,7 @@ flush: - break; - } - -- switch (flush ? 1 : magiccheck(ms, m)) { -+ switch (flush ? 1 : magiccheck(ms, m, m_rxcomp)) { - case -1: - return -1; - case 0: -@@ -655,7 +657,7 @@ mprint(struct magic_set *ms, struct magi - - if (m->str_flags & STRING_TRIM) - str = file_strtrim(str); -- -+ - if (file_printf(ms, F(ms, desc, "%s"), - file_printable(ms, sbuf, sizeof(sbuf), str, - sizeof(p->s) - (str - p->s))) == -1) -@@ -770,7 +772,7 @@ mprint(struct magic_set *ms, struct magi - return -1; - } - scp = (m->str_flags & STRING_TRIM) ? file_strtrim(cp) : cp; -- -+ - rval = file_printf(ms, F(ms, desc, "%s"), file_printable(ms, - sbuf, sizeof(sbuf), scp, ms->search.rm_len)); - free(cp); -@@ -1822,7 +1824,7 @@ mget(struct magic_set *ms, struct magic - for (mlp = ms->mlist[0]->next; mlp != ms->mlist[0]; - mlp = mlp->next) - { -- if ((rv = match(ms, mlp->magic, mlp->nmagic, &bb, 0, -+ if ((rv = match(ms, mlp->magic, mlp->magic_rxcomp, mlp->nmagic, &bb, 0, - BINTEST, text, 0, indir_count, name_count, - printed_something, need_separator, NULL, - NULL)) != 0) -@@ -1875,7 +1877,7 @@ mget(struct magic_set *ms, struct magic - nfound_match = 0; - (*name_count)++; - eoffset = ms->eoffset; -- rv = match(ms, ml.magic, ml.nmagic, b, offset + o, -+ rv = match(ms, ml.magic, ml.magic_rxcomp, ml.nmagic, b, offset + o, - mode, text, flip, indir_count, name_count, - printed_something, need_separator, returnval, - &nfound_match); -@@ -1999,7 +2001,7 @@ file_strncmp16(const char *a, const char - } - - private int --magiccheck(struct magic_set *ms, struct magic *m) -+magiccheck(struct magic_set *ms, struct magic *m, file_regex_t** m_cache) - { - uint64_t l = m->value.q; - uint64_t v; -@@ -2182,27 +2184,32 @@ magiccheck(struct magic_set *ms, struct - } - case FILE_REGEX: { - int rc; -- file_regex_t rx; -+ file_regex_t *rx = *m_cache; - const char *search; - - if (ms->search.s == NULL) - return 0; - -+ if (rx == NULL) { -+ rx = *m_cache = CAST(file_regex_t*, malloc(sizeof(file_regex_t))); -+ rc = file_regcomp(rx, m->value.s, -+ REG_EXTENDED|REG_NEWLINE| -+ ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); -+ if (rc) { -+ file_regerror(rx, rc, ms); -+ file_regfree(rx); -+ v = CAST(uint64_t, -1); -+ break; -+ } -+ } - l = 0; -- rc = file_regcomp(&rx, m->value.s, -- REG_EXTENDED|REG_NEWLINE| -- ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); -- if (rc) { -- file_regerror(&rx, rc, ms); -- v = CAST(uint64_t, -1); -- } else { -+ { - regmatch_t pmatch; - size_t slen = ms->search.s_len; - char *copy; - if (slen != 0) { - copy = CAST(char *, malloc(slen)); - if (copy == NULL) { -- file_regfree(&rx); - file_error(ms, errno, - "can't allocate %" SIZE_T_FORMAT "u bytes", - slen); -@@ -2215,14 +2222,14 @@ magiccheck(struct magic_set *ms, struct - search = CCAST(char *, ""); - copy = NULL; - } -- rc = file_regexec(&rx, RCAST(const char *, search), -+ rc = file_regexec(rx, RCAST(const char *, search), - 1, &pmatch, 0); - free(copy); - switch (rc) { - case 0: - ms->search.s += CAST(int, pmatch.rm_so); - ms->search.offset += CAST(size_t, pmatch.rm_so); -- ms->search.rm_len = CAST(size_t, -+ ms->search.rm_len = CAST(size_t, - pmatch.rm_eo - pmatch.rm_so); - v = 0; - break; -@@ -2232,12 +2239,11 @@ magiccheck(struct magic_set *ms, struct - break; - - default: -- file_regerror(&rx, rc, ms); -+ file_regerror(rx, rc, ms); - v = CAST(uint64_t, -1); - break; - } - } -- file_regfree(&rx); - if (v == CAST(uint64_t, -1)) - return -1; - break; diff --git a/file.spec b/file.spec index b6549e3..d63dcfc 100644 --- a/file.spec +++ b/file.spec @@ -45,7 +45,6 @@ Source4: ftp://ftp.astron.com/pub/file/file-%{version}.tar.gz.asc Source5: file.keyring Patch: file-5.41.dif Patch1: file-5.19-misc.dif -Patch2: 0001-Cache-compiled-regexps-between-magic-matches.patch Patch4: file-4.24-autoconf.dif Patch5: file-5.14-tex.dif Patch7: file-4.20-ssd.dif @@ -109,7 +108,6 @@ to develop applications that require the magic "file" interface. %prep %setup -q -n file-%{version} %patch1 -p0 -b .misc -%patch2 -p1 -b .cache %patch4 -p0 -b .conf %patch5 -p0 -b .tex %patch7 -p0 -b .ssd @@ -140,7 +138,7 @@ rm -f ltcf-c.sh ltconfig ltmain.sh autoreconf -fiv export CFLAGS="%{optflags} -DHOWMANY=69632 -fPIE $(pkg-config libseccomp --cflags)" %configure --disable-silent-rules --datadir=%{_miscdir} \ - --disable-static --disable-libseccomp \ + --disable-static \ --enable-fsect-man5 make %{?_smp_mflags} pkgdatadir='$(datadir)' LDFLAGS="-pie" From 207614c41ab37ec0424d2157449e2a374ad49846c5caae6f12c41f61c56365fe Mon Sep 17 00:00:00 2001 From: "Dr. Werner Fink" Date: Mon, 21 Mar 2022 09:18:32 +0000 Subject: [PATCH 3/5] Accepting request 963483 from home:dirkmueller:Factory - add file-5.41-cache-regexps.patch to cache regexp lookups - spec-cleaner run OBS-URL: https://build.opensuse.org/request/show/963483 OBS-URL: https://build.opensuse.org/package/show/Base:System/file?expand=0&rev=237 --- ...mpiled-regexps-between-magic-matches.patch | 0 file-5.41-cache-regexps.patch | 346 ++++++++++++++++++ file.changes | 5 + file.spec | 2 + python-magic.changes | 5 + python-magic.spec | 19 +- 6 files changed, 366 insertions(+), 11 deletions(-) delete mode 100644 0001-Cache-compiled-regexps-between-magic-matches.patch create mode 100644 file-5.41-cache-regexps.patch diff --git a/0001-Cache-compiled-regexps-between-magic-matches.patch b/0001-Cache-compiled-regexps-between-magic-matches.patch deleted file mode 100644 index 473a0f4..0000000 diff --git a/file-5.41-cache-regexps.patch b/file-5.41-cache-regexps.patch new file mode 100644 index 0000000..45b95ef --- /dev/null +++ b/file-5.41-cache-regexps.patch @@ -0,0 +1,346 @@ +From 1957db8212e9c74e5d626de3023e49d0bb502052 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Dirk=20M=C3=BCller?= +Date: Fri, 11 Mar 2022 23:51:55 +0100 +Subject: [PATCH] Cache compiled regexps between magic matches + +regcomp() is relatively expensive compared to regexec() for matching, +so it helps to only compile once and then reuse the compiled version +for future matches of the same magic. + +when doing equivalent of `find | xargs file` this provides a massive +speedup, between factor 2 and 4 depending on how heavy the magic +is on regexp usage. + +The memory overhead is mediocre (~ 200kb ) and it compiles regexps +lazy, so it doesn't add significant overhead to single match usecases. +--- + src/apprentice.c | 26 +++++++++++++++++++---- + src/file.h | 40 ++++++++++++++++++----------------- + src/softmagic.c | 54 +++++++++++++++++++++++++++--------------------- + 3 files changed, 73 insertions(+), 47 deletions(-) + +Index: file-5.41/src/apprentice.c +=================================================================== +--- file-5.41.orig/src/apprentice.c ++++ file-5.41/src/apprentice.c +@@ -427,7 +427,14 @@ add_mlist(struct mlist *mlp, struct magi + ml->map = idx == 0 ? map : NULL; + ml->magic = map->magic[idx]; + ml->nmagic = map->nmagic[idx]; +- ++ ml->magic_rxcomp = NULL; ++ if (ml->nmagic) { ++ ml->magic_rxcomp = CAST(file_regex_t**, calloc(ml->nmagic, sizeof(file_regex_t*))); ++ if (ml->magic_rxcomp == NULL) { ++ free(ml); ++ return -1; ++ } ++ } + mlp->prev->next = ml; + ml->prev = mlp->prev; + ml->next = mlp; +@@ -612,8 +619,19 @@ mlist_free_all(struct magic_set *ms) + private void + mlist_free_one(struct mlist *ml) + { ++ size_t i; ++ + if (ml->map) + apprentice_unmap(CAST(struct magic_map *, ml->map)); ++ ++ for (i = 0; i < ml->nmagic; ++i) { ++ if (ml->magic_rxcomp[i]) { ++ file_regfree(ml->magic_rxcomp[i]); ++ free(ml->magic_rxcomp[i]); ++ } ++ } ++ free(ml->magic_rxcomp); ++ ml->magic_rxcomp = NULL; + free(ml); + } + +@@ -3489,16 +3507,16 @@ file_magicfind(struct magic_set *ms, con + + for (ml = mlist->next; ml != mlist; ml = ml->next) { + struct magic *ma = ml->magic; +- uint32_t nma = ml->nmagic; +- for (i = 0; i < nma; i++) { ++ for (i = 0; i < ml->nmagic; i++) { + if (ma[i].type != FILE_NAME) + continue; + if (strcmp(ma[i].value.s, name) == 0) { + v->magic = &ma[i]; +- for (j = i + 1; j < nma; j++) ++ for (j = i + 1; j < ml->nmagic; j++) + if (ma[j].cont_level == 0) + break; + v->nmagic = j - i; ++ v->magic_rxcomp = ml->magic_rxcomp; + return 0; + } + } +Index: file-5.41/src/file.h +=================================================================== +--- file-5.41.orig/src/file.h ++++ file-5.41/src/file.h +@@ -88,6 +88,10 @@ + /* Do this here and now, because struct stat gets re-defined on solaris */ + #include + #include ++#include ++#if defined(HAVE_XLOCALE_H) ++#include ++#endif + + #define ENABLE_CONDITIONALS + +@@ -167,6 +171,19 @@ + #define FILE_COMPILE 2 + #define FILE_LIST 3 + ++typedef struct { ++ const char *pat; ++#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) ++#define USE_C_LOCALE ++ locale_t old_lc_ctype; ++ locale_t c_lc_ctype; ++#else ++ char *old_lc_ctype; ++#endif ++ int rc; ++ regex_t rx; ++} file_regex_t; ++ + struct buffer { + int fd; + struct stat st; +@@ -397,9 +414,10 @@ struct magic { + + /* list of magic entries */ + struct mlist { +- struct magic *magic; /* array of magic entries */ +- uint32_t nmagic; /* number of entries in array */ +- void *map; /* internal resources used by entry */ ++ struct magic *magic; /* array of magic entries */ ++ file_regex_t **magic_rxcomp; /* array of compiled regexps */ ++ size_t nmagic; /* number of entries in array */ ++ void *map; /* internal resources used by entry */ + struct mlist *next, *prev; + }; + +@@ -568,23 +586,7 @@ protected void buffer_init(struct buffer + protected void buffer_fini(struct buffer *); + protected int buffer_fill(const struct buffer *); + +-#include +-#if defined(HAVE_XLOCALE_H) +-#include +-#endif + +-typedef struct { +- const char *pat; +-#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) +-#define USE_C_LOCALE +- locale_t old_lc_ctype; +- locale_t c_lc_ctype; +-#else +- char *old_lc_ctype; +-#endif +- int rc; +- regex_t rx; +-} file_regex_t; + + protected int file_regcomp(file_regex_t *, const char *, int); + protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *, +Index: file-5.41/src/softmagic.c +=================================================================== +--- file-5.41.orig/src/softmagic.c ++++ file-5.41/src/softmagic.c +@@ -43,7 +43,7 @@ FILE_RCSID("@(#)$File: softmagic.c,v 1.3 + #include + #include "der.h" + +-private int match(struct magic_set *, struct magic *, uint32_t, ++private int match(struct magic_set *, struct magic *, file_regex_t **, uint32_t, + const struct buffer *, size_t, int, int, int, uint16_t *, + uint16_t *, int *, int *, int *, int *); + private int mget(struct magic_set *, struct magic *, const struct buffer *, +@@ -52,7 +52,7 @@ private int mget(struct magic_set *, str + uint16_t *, int *, int *, int *, int *); + private int msetoffset(struct magic_set *, struct magic *, struct buffer *, + const struct buffer *, size_t, unsigned int); +-private int magiccheck(struct magic_set *, struct magic *); ++private int magiccheck(struct magic_set *, struct magic *, file_regex_t **); + private int32_t mprint(struct magic_set *, struct magic *); + private int moffset(struct magic_set *, struct magic *, const struct buffer *, + int32_t *); +@@ -131,7 +131,7 @@ file_softmagic(struct magic_set *ms, con + } + + for (ml = ms->mlist[0]->next; ml != ms->mlist[0]; ml = ml->next) +- if ((rv = match(ms, ml->magic, ml->nmagic, b, 0, mode, ++ if ((rv = match(ms, ml->magic, ml->magic_rxcomp, ml->nmagic, b, 0, mode, + text, 0, indir_count, name_count, + &printed_something, &need_separator, NULL, NULL)) != 0) + return rv; +@@ -191,7 +191,7 @@ file_fmtcheck(struct magic_set *ms, cons + * so that higher-level continuations are processed. + */ + private int +-match(struct magic_set *ms, struct magic *magic, uint32_t nmagic, ++match(struct magic_set *ms, struct magic *magic, file_regex_t **magic_rxcomp, uint32_t nmagic, + const struct buffer *b, size_t offset, int mode, int text, + int flip, uint16_t *indir_count, uint16_t *name_count, + int *printed_something, int *need_separator, int *returnval, +@@ -220,6 +220,7 @@ match(struct magic_set *ms, struct magic + for (magindex = 0; magindex < nmagic; magindex++) { + int flush = 0; + struct magic *m = &magic[magindex]; ++ file_regex_t** m_rxcomp = &magic_rxcomp[magindex]; + + if (m->type != FILE_NAME) + if ((IS_STRING(m->type) && +@@ -257,7 +258,7 @@ flush: + *returnval = 1; + } + +- switch (magiccheck(ms, m)) { ++ switch (magiccheck(ms, m, m_rxcomp)) { + case -1: + return -1; + case 0: +@@ -318,6 +319,7 @@ flush: + while (magindex + 1 < nmagic && + magic[magindex + 1].cont_level != 0) { + m = &magic[++magindex]; ++ m_rxcomp = &magic_rxcomp[magindex]; + ms->line = m->lineno; /* for messages */ + + if (cont_level < m->cont_level) +@@ -371,7 +373,7 @@ flush: + break; + } + +- switch (flush ? 1 : magiccheck(ms, m)) { ++ switch (flush ? 1 : magiccheck(ms, m, m_rxcomp)) { + case -1: + return -1; + case 0: +@@ -655,7 +657,7 @@ mprint(struct magic_set *ms, struct magi + + if (m->str_flags & STRING_TRIM) + str = file_strtrim(str); +- ++ + if (file_printf(ms, F(ms, desc, "%s"), + file_printable(ms, sbuf, sizeof(sbuf), str, + sizeof(p->s) - (str - p->s))) == -1) +@@ -770,7 +772,7 @@ mprint(struct magic_set *ms, struct magi + return -1; + } + scp = (m->str_flags & STRING_TRIM) ? file_strtrim(cp) : cp; +- ++ + rval = file_printf(ms, F(ms, desc, "%s"), file_printable(ms, + sbuf, sizeof(sbuf), scp, ms->search.rm_len)); + free(cp); +@@ -1822,7 +1824,7 @@ mget(struct magic_set *ms, struct magic + for (mlp = ms->mlist[0]->next; mlp != ms->mlist[0]; + mlp = mlp->next) + { +- if ((rv = match(ms, mlp->magic, mlp->nmagic, &bb, 0, ++ if ((rv = match(ms, mlp->magic, mlp->magic_rxcomp, mlp->nmagic, &bb, 0, + BINTEST, text, 0, indir_count, name_count, + printed_something, need_separator, NULL, + NULL)) != 0) +@@ -1875,7 +1877,7 @@ mget(struct magic_set *ms, struct magic + nfound_match = 0; + (*name_count)++; + eoffset = ms->eoffset; +- rv = match(ms, ml.magic, ml.nmagic, b, offset + o, ++ rv = match(ms, ml.magic, ml.magic_rxcomp, ml.nmagic, b, offset + o, + mode, text, flip, indir_count, name_count, + printed_something, need_separator, returnval, + &nfound_match); +@@ -1999,7 +2001,7 @@ file_strncmp16(const char *a, const char + } + + private int +-magiccheck(struct magic_set *ms, struct magic *m) ++magiccheck(struct magic_set *ms, struct magic *m, file_regex_t** m_cache) + { + uint64_t l = m->value.q; + uint64_t v; +@@ -2182,27 +2184,32 @@ magiccheck(struct magic_set *ms, struct + } + case FILE_REGEX: { + int rc; +- file_regex_t rx; ++ file_regex_t *rx = *m_cache; + const char *search; + + if (ms->search.s == NULL) + return 0; + ++ if (rx == NULL) { ++ rx = *m_cache = CAST(file_regex_t*, malloc(sizeof(file_regex_t))); ++ rc = file_regcomp(rx, m->value.s, ++ REG_EXTENDED|REG_NEWLINE| ++ ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); ++ if (rc) { ++ file_regerror(rx, rc, ms); ++ file_regfree(rx); ++ v = CAST(uint64_t, -1); ++ break; ++ } ++ } + l = 0; +- rc = file_regcomp(&rx, m->value.s, +- REG_EXTENDED|REG_NEWLINE| +- ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); +- if (rc) { +- file_regerror(&rx, rc, ms); +- v = CAST(uint64_t, -1); +- } else { ++ { + regmatch_t pmatch; + size_t slen = ms->search.s_len; + char *copy; + if (slen != 0) { + copy = CAST(char *, malloc(slen)); + if (copy == NULL) { +- file_regfree(&rx); + file_error(ms, errno, + "can't allocate %" SIZE_T_FORMAT "u bytes", + slen); +@@ -2215,14 +2222,14 @@ magiccheck(struct magic_set *ms, struct + search = CCAST(char *, ""); + copy = NULL; + } +- rc = file_regexec(&rx, RCAST(const char *, search), ++ rc = file_regexec(rx, RCAST(const char *, search), + 1, &pmatch, 0); + free(copy); + switch (rc) { + case 0: + ms->search.s += CAST(int, pmatch.rm_so); + ms->search.offset += CAST(size_t, pmatch.rm_so); +- ms->search.rm_len = CAST(size_t, ++ ms->search.rm_len = CAST(size_t, + pmatch.rm_eo - pmatch.rm_so); + v = 0; + break; +@@ -2232,12 +2239,11 @@ magiccheck(struct magic_set *ms, struct + break; + + default: +- file_regerror(&rx, rc, ms); ++ file_regerror(rx, rc, ms); + v = CAST(uint64_t, -1); + break; + } + } +- file_regfree(&rx); + if (v == CAST(uint64_t, -1)) + return -1; + break; diff --git a/file.changes b/file.changes index cd2448e..2634daf 100644 --- a/file.changes +++ b/file.changes @@ -1,3 +1,8 @@ +------------------------------------------------------------------- +Sat Mar 19 18:00:32 UTC 2022 - Dirk Müller + +- add file-5.41-cache-regexps.patch to cache regexp lookups + ------------------------------------------------------------------- Thu Feb 24 10:05:17 UTC 2022 - Dr. Werner Fink diff --git a/file.spec b/file.spec index d63dcfc..929af31 100644 --- a/file.spec +++ b/file.spec @@ -62,6 +62,7 @@ Patch31: file-5.19-biorad.dif Patch32: file-5.19-clicfs.dif Patch34: file-5.23-endian.patch Patch37: file-secure_getenv.patch +Patch38: file-5.41-cache-regexps.patch Patch39: file-5.28-btrfs-image.dif # Upstream commits as patches BuildRoot: %{_tmppath}/%{name}-%{version}-build @@ -125,6 +126,7 @@ to develop applications that require the magic "file" interface. %patch32 -p0 -b .clicfs %patch34 -p0 -b .endian %patch37 -p1 -b .getenv +%patch38 -p1 -b .regexp %patch39 -p1 -b .btrfs %patch -b .0 test -s src/magic.h.in || cp -p src/magic.h src/magic.h.in diff --git a/python-magic.changes b/python-magic.changes index cda6113..5b4b11d 100644 --- a/python-magic.changes +++ b/python-magic.changes @@ -1,3 +1,8 @@ +------------------------------------------------------------------- +Sat Mar 19 18:01:52 UTC 2022 - Dirk Müller + +- spec-cleaner run + ------------------------------------------------------------------- Tue Oct 19 09:55:47 UTC 2021 - Dr. Werner Fink diff --git a/python-magic.spec b/python-magic.spec index 50d5b29..0d8c005 100644 --- a/python-magic.spec +++ b/python-magic.spec @@ -19,25 +19,23 @@ # PyPI package name is file-magic. Version is taken from setup.py %define file_magic_version 0.3.0 %{?!python_module:%define python_module() python-%{**} python3-%{**}} - +%global _miscdir %{_datadir}/misc Name: python-magic -BuildRequires: %{python_module setuptools} -BuildRequires: findutils -BuildRequires: libtool -BuildRequires: python-rpm-macros -BuildRequires: zlib-devel -URL: http://www.darwinsys.com/file/ Version: 5.41 Release: 0 Summary: Python module to use libmagic License: BSD-3-Clause AND BSD-4-Clause Group: Development/Languages/Python -%{expand:%(sed -n -e '/^Source0\?:/,/^BuildRoot:/p' <%{_sourcedir}/file.spec)} +URL: https://www.darwinsys.com/file/ Source99: file.spec +BuildRequires: %{python_module setuptools} +BuildRequires: findutils +BuildRequires: libtool +BuildRequires: python-rpm-macros +BuildRequires: zlib-devel Requires: libmagic1 Provides: python-file-magic = %{file_magic_version} -%global _miscdir %{_datadir}/misc - +%{expand:%(sed -n -e '/^Source0\?:/,/^BuildRoot:/p' <%{_sourcedir}/file.spec)} %python_subpackages %description @@ -58,7 +56,6 @@ pushd python popd %files %{python_files} -%defattr(-,root,root) %doc python/README python/example.py %{python_sitelib}/magic.py* %pycache_only %{python_sitelib}/__pycache__ From 47e53e9dcf052b4038f13fa4734db0a8a692115875486062a5457d91a955776d Mon Sep 17 00:00:00 2001 From: "Dr. Werner Fink" Date: Wed, 23 Mar 2022 09:44:25 +0000 Subject: [PATCH 4/5] Accepting request 964197 from home:dirkmueller:Factory - add file-5.41-cache-regexps-locale-restore.patch to restore previous locale handling behavior OBS-URL: https://build.opensuse.org/request/show/964197 OBS-URL: https://build.opensuse.org/package/show/Base:System/file?expand=0&rev=238 --- file-5.41-cache-regexps-locale-restore.patch | 101 +++++++++++++++++++ file.changes | 6 ++ file.spec | 2 + 3 files changed, 109 insertions(+) create mode 100644 file-5.41-cache-regexps-locale-restore.patch diff --git a/file-5.41-cache-regexps-locale-restore.patch b/file-5.41-cache-regexps-locale-restore.patch new file mode 100644 index 0000000..3012443 --- /dev/null +++ b/file-5.41-cache-regexps-locale-restore.patch @@ -0,0 +1,101 @@ +From c25329eabeaba048cb6ef1448d1ee040c62c415f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Dirk=20M=C3=BCller?= +Date: Tue, 22 Mar 2022 22:28:14 +0100 +Subject: [PATCH] Restore locale handling after regex caching + +file_regcomp/file_regfree had the side effect of setting and restoring +locale C_TYPE to C to have predictable regex matching. With regcomp +caching file_regfree has been changed to be only called at destruction +time, which means the library changed the locale setting for anything +else as well. Restore old behavior by splitting save/restore into +separate functions which are surrounding regcomp() and regexec() only. +--- + src/funcs.c | 39 ++++++++++++++++++++++++++++++++------- + 1 file changed, 32 insertions(+), 7 deletions(-) + +diff --git a/src/funcs.c b/src/funcs.c +index dcfd352d..7ecaff33 100644 +--- a/src/funcs.c ++++ b/src/funcs.c +@@ -658,35 +658,62 @@ out: + return rv; + } + ++static void ++file_reg_set_ctype(file_regex_t *rx) ++{ ++#ifdef USE_C_LOCALE ++ rx->old_lc_ctype = uselocale(rx->c_lc_ctype); ++ assert(rx->old_lc_ctype != NULL); ++#else ++ (void)setlocale(LC_CTYPE, "C"); ++#endif ++} ++ ++static void ++file_reg_restore_ctype(file_regex_t *rx) ++{ ++#ifdef USE_C_LOCALE ++ (void)uselocale(rx->old_lc_ctype); ++#else ++ (void)setlocale(LC_CTYPE, rx->old_lc_ctype); ++#endif ++} ++ + protected int + file_regcomp(file_regex_t *rx, const char *pat, int flags) + { + #ifdef USE_C_LOCALE + rx->c_lc_ctype = newlocale(LC_CTYPE_MASK, "C", 0); + assert(rx->c_lc_ctype != NULL); +- rx->old_lc_ctype = uselocale(rx->c_lc_ctype); +- assert(rx->old_lc_ctype != NULL); + #else + rx->old_lc_ctype = setlocale(LC_CTYPE, NULL); + assert(rx->old_lc_ctype != NULL); + rx->old_lc_ctype = strdup(rx->old_lc_ctype); + assert(rx->old_lc_ctype != NULL); +- (void)setlocale(LC_CTYPE, "C"); + #endif + rx->pat = pat; + +- return rx->rc = regcomp(&rx->rx, pat, flags); ++ file_reg_set_ctype(rx); ++ rx->rc = regcomp(&rx->rx, pat, flags); ++ file_reg_restore_ctype(rx); ++ ++ return rx->rc; + } + + protected int + file_regexec(file_regex_t *rx, const char *str, size_t nmatch, + regmatch_t* pmatch, int eflags) + { ++ int rc; + assert(rx->rc == 0); + /* XXX: force initialization because glibc does not always do this */ + if (nmatch != 0) + memset(pmatch, 0, nmatch * sizeof(*pmatch)); +- return regexec(&rx->rx, str, nmatch, pmatch, eflags); ++ file_reg_set_ctype(rx); ++ rc = regexec(&rx->rx, str, nmatch, pmatch, eflags); ++ file_reg_restore_ctype(rx); ++ ++ return rc; + } + + protected void +@@ -695,10 +722,8 @@ file_regfree(file_regex_t *rx) + if (rx->rc == 0) + regfree(&rx->rx); + #ifdef USE_C_LOCALE +- (void)uselocale(rx->old_lc_ctype); + freelocale(rx->c_lc_ctype); + #else +- (void)setlocale(LC_CTYPE, rx->old_lc_ctype); + free(rx->old_lc_ctype); + #endif + } +-- +2.35.1 + diff --git a/file.changes b/file.changes index 2634daf..8603a43 100644 --- a/file.changes +++ b/file.changes @@ -1,3 +1,9 @@ +------------------------------------------------------------------- +Wed Mar 23 09:02:37 UTC 2022 - Dirk Müller + +- add file-5.41-cache-regexps-locale-restore.patch to restore + previous locale handling behavior + ------------------------------------------------------------------- Sat Mar 19 18:00:32 UTC 2022 - Dirk Müller diff --git a/file.spec b/file.spec index 929af31..03e484d 100644 --- a/file.spec +++ b/file.spec @@ -64,6 +64,7 @@ Patch34: file-5.23-endian.patch Patch37: file-secure_getenv.patch Patch38: file-5.41-cache-regexps.patch Patch39: file-5.28-btrfs-image.dif +Patch40: file-5.41-cache-regexps-locale-restore.patch # Upstream commits as patches BuildRoot: %{_tmppath}/%{name}-%{version}-build %global _sysconfdir /etc @@ -128,6 +129,7 @@ to develop applications that require the magic "file" interface. %patch37 -p1 -b .getenv %patch38 -p1 -b .regexp %patch39 -p1 -b .btrfs +%patch40 -p1 -b .locale %patch -b .0 test -s src/magic.h.in || cp -p src/magic.h src/magic.h.in rm -fv src/magic.h From 03ac71daa888d285013672281c492099aa0e773727461dcb21ee3c49ca09a2b1 Mon Sep 17 00:00:00 2001 From: Dirk Mueller Date: Thu, 24 Mar 2022 19:18:12 +0000 Subject: [PATCH 5/5] OBS-URL: https://build.opensuse.org/package/show/Base:System/file?expand=0&rev=239 --- file-5.41-cache-regexps-locale-restore.patch | 255 ++++++++++++++----- 1 file changed, 195 insertions(+), 60 deletions(-) diff --git a/file-5.41-cache-regexps-locale-restore.patch b/file-5.41-cache-regexps-locale-restore.patch index 3012443..65120b6 100644 --- a/file-5.41-cache-regexps-locale-restore.patch +++ b/file-5.41-cache-regexps-locale-restore.patch @@ -1,101 +1,236 @@ -From c25329eabeaba048cb6ef1448d1ee040c62c415f Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Dirk=20M=C3=BCller?= -Date: Tue, 22 Mar 2022 22:28:14 +0100 -Subject: [PATCH] Restore locale handling after regex caching +From 7d438e28c16773e28a3707935c8e5d9927a515a7 Mon Sep 17 00:00:00 2001 +From: Christos Zoulas +Date: Sat, 19 Mar 2022 19:52:09 +0000 +Subject: [PATCH] Now that we are cacheing regex's we cannot assume that we + always do regcomp->regexec->regfree, so this causes memory corruption (and + increased memory use with all the locale copies) in xlocale systems. Instead + save and restore locales in regcomp and regexec as needed. -file_regcomp/file_regfree had the side effect of setting and restoring -locale C_TYPE to C to have predictable regex matching. With regcomp -caching file_regfree has been changed to be only called at destruction -time, which means the library changed the locale setting for anything -else as well. Restore old behavior by splitting save/restore into -separate functions which are surrounding regcomp() and regexec() only. --- - src/funcs.c | 39 ++++++++++++++++++++++++++++++++------- - 1 file changed, 32 insertions(+), 7 deletions(-) + src/apprentice.c | 13 +++++++++-- + src/file.h | 18 +++++++-------- + src/funcs.c | 57 +++++++++++++++++++++++++++++------------------- + src/softmagic.c | 11 +++++----- + 4 files changed, 59 insertions(+), 40 deletions(-) -diff --git a/src/funcs.c b/src/funcs.c -index dcfd352d..7ecaff33 100644 ---- a/src/funcs.c -+++ b/src/funcs.c -@@ -658,35 +658,62 @@ out: - return rv; +Index: file-5.41/src/apprentice.c +=================================================================== +--- file-5.41.orig/src/apprentice.c ++++ file-5.41/src/apprentice.c +@@ -516,6 +516,9 @@ file_ms_free(struct magic_set *ms) + free(ms->o.pbuf); + free(ms->o.buf); + free(ms->c.li); ++#ifdef USE_C_LOCALE ++ freelocale(ms->c_lc_ctype); ++#endif + free(ms); } -+static void -+file_reg_set_ctype(file_regex_t *rx) -+{ +@@ -555,6 +558,10 @@ file_ms_alloc(int flags) + ms->regex_max = FILE_REGEX_MAX; + ms->bytes_max = FILE_BYTES_MAX; + ms->encoding_max = FILE_ENCODING_MAX; +#ifdef USE_C_LOCALE -+ rx->old_lc_ctype = uselocale(rx->c_lc_ctype); -+ assert(rx->old_lc_ctype != NULL); -+#else -+ (void)setlocale(LC_CTYPE, "C"); ++ ms->c_lc_ctype = newlocale(LC_CTYPE_MASK, "C", 0); ++ assert(ms->c_lc_ctype != NULL); +#endif -+} -+ -+static void -+file_reg_restore_ctype(file_regex_t *rx) -+{ -+#ifdef USE_C_LOCALE -+ (void)uselocale(rx->old_lc_ctype); -+#else -+ (void)setlocale(LC_CTYPE, rx->old_lc_ctype); + return ms; + free: + free(ms); +@@ -628,6 +635,7 @@ mlist_free_one(struct mlist *ml) + if (ml->magic_rxcomp[i]) { + file_regfree(ml->magic_rxcomp[i]); + free(ml->magic_rxcomp[i]); ++ ml->magic_rxcomp[i] = NULL; + } + } + free(ml->magic_rxcomp); +@@ -2741,7 +2749,8 @@ getvalue(struct magic_set *ms, struct ma + } + if (m->type == FILE_REGEX) { + file_regex_t rx; +- int rc = file_regcomp(&rx, m->value.s, REG_EXTENDED); ++ int rc = file_regcomp(ms, &rx, m->value.s, ++ REG_EXTENDED); + if (rc) { + if (ms->flags & MAGIC_CHECK) + file_regerror(&rx, rc, ms); +Index: file-5.41/src/file.h +=================================================================== +--- file-5.41.orig/src/file.h ++++ file-5.41/src/file.h +@@ -173,13 +173,6 @@ + + typedef struct { + const char *pat; +-#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) +-#define USE_C_LOCALE +- locale_t old_lc_ctype; +- locale_t c_lc_ctype; +-#else +- char *old_lc_ctype; +-#endif + int rc; + regex_t rx; + } file_regex_t; +@@ -495,6 +488,10 @@ struct magic_set { + #define FILE_NAME_MAX 50 + #define FILE_REGEX_MAX 8192 + #define FILE_ENCODING_MAX (64 * 1024) ++#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) ++#define USE_C_LOCALE ++ locale_t c_lc_ctype; +#endif -+} -+ + }; + + /* Type for Unicode characters */ +@@ -588,9 +585,10 @@ protected int buffer_fill(const struct b + + + +-protected int file_regcomp(file_regex_t *, const char *, int); +-protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *, ++protected int file_regcomp(struct magic_set *, file_regex_t *, const char *, + int); ++protected int file_regexec(struct magic_set *, file_regex_t *, const char *, ++ size_t, regmatch_t *, int); + protected void file_regfree(file_regex_t *); + protected void file_regerror(file_regex_t *, int, struct magic_set *); + +Index: file-5.41/src/funcs.c +=================================================================== +--- file-5.41.orig/src/funcs.c ++++ file-5.41/src/funcs.c +@@ -634,13 +634,13 @@ file_replace(struct magic_set *ms, const + file_regex_t rx; + int rc, rv = -1; + +- rc = file_regcomp(&rx, pat, REG_EXTENDED); ++ rc = file_regcomp(ms, &rx, pat, REG_EXTENDED); + if (rc) { + file_regerror(&rx, rc, ms); + } else { + regmatch_t rm; + int nm = 0; +- while (file_regexec(&rx, ms->o.buf, 1, &rm, 0) == 0) { ++ while (file_regexec(ms, &rx, ms->o.buf, 1, &rm, 0) == 0) { + ms->o.buf[rm.rm_so] = '\0'; + if (file_printf(ms, "%s%s", rep, + rm.rm_eo != 0 ? ms->o.buf + rm.rm_eo : "") == -1) +@@ -655,34 +655,52 @@ out: + } + protected int - file_regcomp(file_regex_t *rx, const char *pat, int flags) +-file_regcomp(file_regex_t *rx, const char *pat, int flags) ++file_regcomp(struct magic_set *ms, file_regex_t *rx, const char *pat, int flags) { #ifdef USE_C_LOCALE - rx->c_lc_ctype = newlocale(LC_CTYPE_MASK, "C", 0); - assert(rx->c_lc_ctype != NULL); +- rx->c_lc_ctype = newlocale(LC_CTYPE_MASK, "C", 0); +- assert(rx->c_lc_ctype != NULL); - rx->old_lc_ctype = uselocale(rx->c_lc_ctype); - assert(rx->old_lc_ctype != NULL); ++ locale_t old = uselocale(ms->c_lc_ctype); ++ assert(old != NULL); #else - rx->old_lc_ctype = setlocale(LC_CTYPE, NULL); - assert(rx->old_lc_ctype != NULL); - rx->old_lc_ctype = strdup(rx->old_lc_ctype); - assert(rx->old_lc_ctype != NULL); -- (void)setlocale(LC_CTYPE, "C"); +- rx->old_lc_ctype = setlocale(LC_CTYPE, NULL); +- assert(rx->old_lc_ctype != NULL); +- rx->old_lc_ctype = strdup(rx->old_lc_ctype); +- assert(rx->old_lc_ctype != NULL); ++ char old[1024]; ++ strlcpy(old, setlocale(LC_CTYPE, NULL), sizeof(old)); + (void)setlocale(LC_CTYPE, "C"); #endif rx->pat = pat; - return rx->rc = regcomp(&rx->rx, pat, flags); -+ file_reg_set_ctype(rx); + rx->rc = regcomp(&rx->rx, pat, flags); -+ file_reg_restore_ctype(rx); + ++#ifdef USE_C_LOCALE ++ uselocale(old); ++#else ++ (void)setlocale(LC_CTYPE, old); ++#endif + return rx->rc; } protected int - file_regexec(file_regex_t *rx, const char *str, size_t nmatch, - regmatch_t* pmatch, int eflags) +-file_regexec(file_regex_t *rx, const char *str, size_t nmatch, +- regmatch_t* pmatch, int eflags) ++file_regexec(struct magic_set *ms, file_regex_t *rx, const char *str, ++ size_t nmatch, regmatch_t* pmatch, int eflags) { ++#ifdef USE_C_LOCALE ++ locale_t old = uselocale(ms->c_lc_ctype); ++ assert(old != NULL); ++#else ++ char old[1024]; ++ strlcpy(old, setlocale(LC_CTYPE, NULL), sizeof(old)); ++ (void)setlocale(LC_CTYPE, "C"); ++#endif + int rc; assert(rx->rc == 0); /* XXX: force initialization because glibc does not always do this */ if (nmatch != 0) memset(pmatch, 0, nmatch * sizeof(*pmatch)); - return regexec(&rx->rx, str, nmatch, pmatch, eflags); -+ file_reg_set_ctype(rx); + rc = regexec(&rx->rx, str, nmatch, pmatch, eflags); -+ file_reg_restore_ctype(rx); -+ ++#ifdef USE_C_LOCALE ++ uselocale(old); ++#else ++ (void)setlocale(LC_CTYPE, old); ++#endif + return rc; } protected void -@@ -695,10 +722,8 @@ file_regfree(file_regex_t *rx) +@@ -690,13 +708,6 @@ file_regfree(file_regex_t *rx) + { if (rx->rc == 0) regfree(&rx->rx); - #ifdef USE_C_LOCALE +-#ifdef USE_C_LOCALE - (void)uselocale(rx->old_lc_ctype); - freelocale(rx->c_lc_ctype); - #else +- freelocale(rx->c_lc_ctype); +-#else - (void)setlocale(LC_CTYPE, rx->old_lc_ctype); - free(rx->old_lc_ctype); - #endif +- free(rx->old_lc_ctype); +-#endif } --- -2.35.1 - + + protected void +Index: file-5.41/src/softmagic.c +=================================================================== +--- file-5.41.orig/src/softmagic.c ++++ file-5.41/src/softmagic.c +@@ -479,11 +479,11 @@ check_fmt(struct magic_set *ms, const ch + if (strchr(fmt, '%') == NULL) + return 0; + +- rc = file_regcomp(&rx, "%[-0-9\\.]*s", REG_EXTENDED|REG_NOSUB); ++ rc = file_regcomp(ms, &rx, "%[-0-9\\.]*s", REG_EXTENDED|REG_NOSUB); + if (rc) { + file_regerror(&rx, rc, ms); + } else { +- rc = file_regexec(&rx, fmt, 0, 0, 0); ++ rc = file_regexec(ms, &rx, fmt, 0, 0, 0); + rv = !rc; + } + file_regfree(&rx); +@@ -2192,7 +2192,7 @@ magiccheck(struct magic_set *ms, struct + + if (rx == NULL) { + rx = *m_cache = CAST(file_regex_t*, malloc(sizeof(file_regex_t))); +- rc = file_regcomp(rx, m->value.s, ++ rc = file_regcomp(ms, rx, m->value.s, + REG_EXTENDED|REG_NEWLINE| + ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); + if (rc) { +@@ -2222,7 +2222,7 @@ magiccheck(struct magic_set *ms, struct + search = CCAST(char *, ""); + copy = NULL; + } +- rc = file_regexec(rx, RCAST(const char *, search), ++ rc = file_regexec(ms, rx, RCAST(const char *, search), + 1, &pmatch, 0); + free(copy); + switch (rc) {