From 1957db8212e9c74e5d626de3023e49d0bb502052 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dirk=20M=C3=BCller?= Date: Fri, 11 Mar 2022 23:51:55 +0100 Subject: [PATCH] Cache compiled regexps between magic matches regcomp() is relatively expensive compared to regexec() for matching, so it helps to only compile once and then reuse the compiled version for future matches of the same magic. When doing the equivalent of `find | xargs file` this provides a massive speedup, between a factor of 2 and 4 depending on how heavy the magic is on regexp usage. The memory overhead is moderate (~200 KB) and it compiles regexps lazily, so it doesn't add significant overhead to single-match use cases. --- src/apprentice.c | 26 +++++++++++++++++++---- src/file.h | 40 ++++++++++++++++++----------------- src/softmagic.c | 54 +++++++++++++++++++++++++++--------------------- 3 files changed, 73 insertions(+), 47 deletions(-) Index: file-5.41/src/apprentice.c =================================================================== --- file-5.41.orig/src/apprentice.c +++ file-5.41/src/apprentice.c @@ -427,7 +427,14 @@ add_mlist(struct mlist *mlp, struct magi ml->map = idx == 0 ? 
map : NULL; ml->magic = map->magic[idx]; ml->nmagic = map->nmagic[idx]; - + ml->magic_rxcomp = NULL; + if (ml->nmagic) { + ml->magic_rxcomp = CAST(file_regex_t**, calloc(ml->nmagic, sizeof(file_regex_t*))); + if (ml->magic_rxcomp == NULL) { + free(ml); + return -1; + } + } mlp->prev->next = ml; ml->prev = mlp->prev; ml->next = mlp; @@ -612,8 +619,19 @@ mlist_free_all(struct magic_set *ms) private void mlist_free_one(struct mlist *ml) { + size_t i; + if (ml->map) apprentice_unmap(CAST(struct magic_map *, ml->map)); + + for (i = 0; i < ml->nmagic; ++i) { + if (ml->magic_rxcomp[i]) { + file_regfree(ml->magic_rxcomp[i]); + free(ml->magic_rxcomp[i]); + } + } + free(ml->magic_rxcomp); + ml->magic_rxcomp = NULL; free(ml); } @@ -3489,16 +3507,16 @@ file_magicfind(struct magic_set *ms, con for (ml = mlist->next; ml != mlist; ml = ml->next) { struct magic *ma = ml->magic; - uint32_t nma = ml->nmagic; - for (i = 0; i < nma; i++) { + for (i = 0; i < ml->nmagic; i++) { if (ma[i].type != FILE_NAME) continue; if (strcmp(ma[i].value.s, name) == 0) { v->magic = &ma[i]; - for (j = i + 1; j < nma; j++) + for (j = i + 1; j < ml->nmagic; j++) if (ma[j].cont_level == 0) break; v->nmagic = j - i; + v->magic_rxcomp = &(ml->magic_rxcomp[i]); return 0; } } Index: file-5.41/src/file.h =================================================================== --- file-5.41.orig/src/file.h +++ file-5.41/src/file.h @@ -88,6 +88,10 @@ /* Do this here and now, because struct stat gets re-defined on solaris */ #include #include +#include +#if defined(HAVE_XLOCALE_H) +#include +#endif #define ENABLE_CONDITIONALS @@ -167,6 +171,19 @@ #define FILE_COMPILE 2 #define FILE_LIST 3 +typedef struct { + const char *pat; +#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) +#define USE_C_LOCALE + locale_t old_lc_ctype; + locale_t c_lc_ctype; +#else + char *old_lc_ctype; +#endif + int rc; + regex_t rx; +} file_regex_t; + struct buffer { int fd; struct stat st; @@ -397,9 +414,10 @@ 
struct magic { /* list of magic entries */ struct mlist { - struct magic *magic; /* array of magic entries */ - uint32_t nmagic; /* number of entries in array */ - void *map; /* internal resources used by entry */ + struct magic *magic; /* array of magic entries */ + file_regex_t **magic_rxcomp; /* array of compiled regexps */ + size_t nmagic; /* number of entries in array */ + void *map; /* internal resources used by entry */ struct mlist *next, *prev; }; @@ -568,23 +586,7 @@ protected void buffer_init(struct buffer protected void buffer_fini(struct buffer *); protected int buffer_fill(const struct buffer *); -#include -#if defined(HAVE_XLOCALE_H) -#include -#endif -typedef struct { - const char *pat; -#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE) -#define USE_C_LOCALE - locale_t old_lc_ctype; - locale_t c_lc_ctype; -#else - char *old_lc_ctype; -#endif - int rc; - regex_t rx; -} file_regex_t; protected int file_regcomp(file_regex_t *, const char *, int); protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *, Index: file-5.41/src/softmagic.c =================================================================== --- file-5.41.orig/src/softmagic.c +++ file-5.41/src/softmagic.c @@ -43,7 +43,7 @@ FILE_RCSID("@(#)$File: softmagic.c,v 1.3 #include #include "der.h" -private int match(struct magic_set *, struct magic *, uint32_t, +private int match(struct magic_set *, struct magic *, file_regex_t **, uint32_t, const struct buffer *, size_t, int, int, int, uint16_t *, uint16_t *, int *, int *, int *, int *); private int mget(struct magic_set *, struct magic *, const struct buffer *, @@ -52,7 +52,7 @@ private int mget(struct magic_set *, str uint16_t *, int *, int *, int *, int *); private int msetoffset(struct magic_set *, struct magic *, struct buffer *, const struct buffer *, size_t, unsigned int); -private int magiccheck(struct magic_set *, struct magic *); +private int magiccheck(struct magic_set *, struct magic 
*, file_regex_t **); private int32_t mprint(struct magic_set *, struct magic *); private int moffset(struct magic_set *, struct magic *, const struct buffer *, int32_t *); @@ -131,7 +131,7 @@ file_softmagic(struct magic_set *ms, con } for (ml = ms->mlist[0]->next; ml != ms->mlist[0]; ml = ml->next) - if ((rv = match(ms, ml->magic, ml->nmagic, b, 0, mode, + if ((rv = match(ms, ml->magic, ml->magic_rxcomp, ml->nmagic, b, 0, mode, text, 0, indir_count, name_count, &printed_something, &need_separator, NULL, NULL)) != 0) return rv; @@ -191,7 +191,7 @@ file_fmtcheck(struct magic_set *ms, cons * so that higher-level continuations are processed. */ private int -match(struct magic_set *ms, struct magic *magic, uint32_t nmagic, +match(struct magic_set *ms, struct magic *magic, file_regex_t **magic_rxcomp, uint32_t nmagic, const struct buffer *b, size_t offset, int mode, int text, int flip, uint16_t *indir_count, uint16_t *name_count, int *printed_something, int *need_separator, int *returnval, @@ -220,6 +220,7 @@ match(struct magic_set *ms, struct magic for (magindex = 0; magindex < nmagic; magindex++) { int flush = 0; struct magic *m = &magic[magindex]; + file_regex_t** m_rxcomp = &magic_rxcomp[magindex]; if (m->type != FILE_NAME) if ((IS_STRING(m->type) && @@ -257,7 +258,7 @@ flush: *returnval = 1; } - switch (magiccheck(ms, m)) { + switch (magiccheck(ms, m, m_rxcomp)) { case -1: return -1; case 0: @@ -318,6 +319,7 @@ flush: while (magindex + 1 < nmagic && magic[magindex + 1].cont_level != 0) { m = &magic[++magindex]; + m_rxcomp = &magic_rxcomp[magindex]; ms->line = m->lineno; /* for messages */ if (cont_level < m->cont_level) @@ -371,7 +373,7 @@ flush: break; } - switch (flush ? 1 : magiccheck(ms, m)) { + switch (flush ? 
1 : magiccheck(ms, m, m_rxcomp)) { case -1: return -1; case 0: @@ -655,7 +657,7 @@ mprint(struct magic_set *ms, struct magi if (m->str_flags & STRING_TRIM) str = file_strtrim(str); - + if (file_printf(ms, F(ms, desc, "%s"), file_printable(ms, sbuf, sizeof(sbuf), str, sizeof(p->s) - (str - p->s))) == -1) @@ -770,7 +772,7 @@ mprint(struct magic_set *ms, struct magi return -1; } scp = (m->str_flags & STRING_TRIM) ? file_strtrim(cp) : cp; - + rval = file_printf(ms, F(ms, desc, "%s"), file_printable(ms, sbuf, sizeof(sbuf), scp, ms->search.rm_len)); free(cp); @@ -1822,7 +1824,7 @@ mget(struct magic_set *ms, struct magic for (mlp = ms->mlist[0]->next; mlp != ms->mlist[0]; mlp = mlp->next) { - if ((rv = match(ms, mlp->magic, mlp->nmagic, &bb, 0, + if ((rv = match(ms, mlp->magic, mlp->magic_rxcomp, mlp->nmagic, &bb, 0, BINTEST, text, 0, indir_count, name_count, printed_something, need_separator, NULL, NULL)) != 0) @@ -1875,7 +1877,7 @@ mget(struct magic_set *ms, struct magic nfound_match = 0; (*name_count)++; eoffset = ms->eoffset; - rv = match(ms, ml.magic, ml.nmagic, b, offset + o, + rv = match(ms, ml.magic, ml.magic_rxcomp, ml.nmagic, b, offset + o, mode, text, flip, indir_count, name_count, printed_something, need_separator, returnval, &nfound_match); @@ -1999,7 +2001,7 @@ file_strncmp16(const char *a, const char } private int -magiccheck(struct magic_set *ms, struct magic *m) +magiccheck(struct magic_set *ms, struct magic *m, file_regex_t** m_cache) { uint64_t l = m->value.q; uint64_t v; @@ -2182,27 +2184,32 @@ magiccheck(struct magic_set *ms, struct } case FILE_REGEX: { int rc; - file_regex_t rx; + file_regex_t *rx = *m_cache; const char *search; if (ms->search.s == NULL) return 0; + if (rx == NULL) { + rx = *m_cache = CAST(file_regex_t*, malloc(sizeof(file_regex_t))); + rc = file_regcomp(rx, m->value.s, + REG_EXTENDED|REG_NEWLINE| + ((m->str_flags & STRING_IGNORE_CASE) ? 
REG_ICASE : 0)); + if (rc) { + file_regerror(rx, rc, ms); + file_regfree(rx); + v = CAST(uint64_t, -1); + break; + } + } l = 0; - rc = file_regcomp(&rx, m->value.s, - REG_EXTENDED|REG_NEWLINE| - ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0)); - if (rc) { - file_regerror(&rx, rc, ms); - v = CAST(uint64_t, -1); - } else { + { regmatch_t pmatch; size_t slen = ms->search.s_len; char *copy; if (slen != 0) { copy = CAST(char *, malloc(slen)); if (copy == NULL) { - file_regfree(&rx); file_error(ms, errno, "can't allocate %" SIZE_T_FORMAT "u bytes", slen); @@ -2215,14 +2222,14 @@ magiccheck(struct magic_set *ms, struct search = CCAST(char *, ""); copy = NULL; } - rc = file_regexec(&rx, RCAST(const char *, search), + rc = file_regexec(rx, RCAST(const char *, search), 1, &pmatch, 0); free(copy); switch (rc) { case 0: ms->search.s += CAST(int, pmatch.rm_so); ms->search.offset += CAST(size_t, pmatch.rm_so); - ms->search.rm_len = CAST(size_t, + ms->search.rm_len = CAST(size_t, pmatch.rm_eo - pmatch.rm_so); v = 0; break; @@ -2232,12 +2239,11 @@ magiccheck(struct magic_set *ms, struct break; default: - file_regerror(&rx, rc, ms); + file_regerror(rx, rc, ms); v = CAST(uint64_t, -1); break; } } - file_regfree(&rx); if (v == CAST(uint64_t, -1)) return -1; break;