file/file-5.41-cache-regexps.patch

347 lines
11 KiB
Diff
Raw Normal View History

From 1957db8212e9c74e5d626de3023e49d0bb502052 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dirk=20M=C3=BCller?= <dirk@dmllr.de>
Date: Fri, 11 Mar 2022 23:51:55 +0100
Subject: [PATCH] Cache compiled regexps between magic matches
regcomp() is relatively expensive compared to regexec() for matching,
so it helps to only compile once and then reuse the compiled version
for future matches of the same magic.
When doing the equivalent of `find | xargs file`, this provides a massive
speedup, between a factor of 2 and 4, depending on how heavily the magic
relies on regexps.
The memory overhead is moderate (~200 KB), and regexps are compiled
lazily, so it doesn't add significant overhead to single-match use cases.
---
src/apprentice.c | 26 +++++++++++++++++++----
src/file.h | 40 ++++++++++++++++++-----------------
src/softmagic.c | 54 +++++++++++++++++++++++++++---------------------
3 files changed, 73 insertions(+), 47 deletions(-)
Index: file-5.41/src/apprentice.c
===================================================================
--- file-5.41.orig/src/apprentice.c
+++ file-5.41/src/apprentice.c
@@ -427,7 +427,14 @@ add_mlist(struct mlist *mlp, struct magi
ml->map = idx == 0 ? map : NULL;
ml->magic = map->magic[idx];
ml->nmagic = map->nmagic[idx];
-
+ ml->magic_rxcomp = NULL; /* stays NULL when this list has no entries */
+ if (ml->nmagic) {
+ ml->magic_rxcomp = CAST(file_regex_t**, calloc(ml->nmagic, sizeof(file_regex_t*))); /* one zeroed (NULL) slot per magic entry */
+ if (ml->magic_rxcomp == NULL) {
+ free(ml); /* allocation failed: release ml and report failure */
+ return -1;
+ }
+ }
mlp->prev->next = ml;
ml->prev = mlp->prev;
ml->next = mlp;
@@ -612,8 +619,19 @@ mlist_free_all(struct magic_set *ms)
private void
mlist_free_one(struct mlist *ml)
{
+ size_t i;
+
if (ml->map)
apprentice_unmap(CAST(struct magic_map *, ml->map));
+ /* Release each cached regex, then the cache array, before freeing ml. */
+ for (i = 0; i < ml->nmagic; ++i) {
+ if (ml->magic_rxcomp[i]) { /* only slots that were actually filled */
+ file_regfree(ml->magic_rxcomp[i]);
+ free(ml->magic_rxcomp[i]); /* the file_regex_t itself is heap-allocated */
+ }
+ }
+ free(ml->magic_rxcomp);
+ ml->magic_rxcomp = NULL;
free(ml);
}
@@ -3489,16 +3507,16 @@ file_magicfind(struct magic_set *ms, con
for (ml = mlist->next; ml != mlist; ml = ml->next) {
struct magic *ma = ml->magic;
- uint32_t nma = ml->nmagic;
- for (i = 0; i < nma; i++) {
+ for (i = 0; i < ml->nmagic; i++) {
if (ma[i].type != FILE_NAME)
continue;
if (strcmp(ma[i].value.s, name) == 0) {
v->magic = &ma[i];
- for (j = i + 1; j < nma; j++)
+ for (j = i + 1; j < ml->nmagic; j++)
if (ma[j].cont_level == 0)
break;
v->nmagic = j - i;
+ v->magic_rxcomp = &ml->magic_rxcomp[i]; /* v->magic starts at entry i, so the regex cache must start at slot i too */
return 0;
}
}
Index: file-5.41/src/file.h
===================================================================
--- file-5.41.orig/src/file.h
+++ file-5.41/src/file.h
@@ -88,6 +88,10 @@
/* Do this here and now, because struct stat gets re-defined on solaris */
#include <sys/stat.h>
#include <stdarg.h>
+#include <locale.h>
+#if defined(HAVE_XLOCALE_H)
+#include <xlocale.h>
+#endif
#define ENABLE_CONDITIONALS
@@ -167,6 +171,19 @@
#define FILE_COMPILE 2
#define FILE_LIST 3
+typedef struct {
+ const char *pat; /* NOTE(review): presumably the pattern text handed to file_regcomp() -- confirm */
+#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE)
+#define USE_C_LOCALE
+ locale_t old_lc_ctype; /* NOTE(review): looks like a saved LC_CTYPE locale handle -- confirm */
+ locale_t c_lc_ctype;
+#else
+ char *old_lc_ctype;
+#endif
+ int rc; /* NOTE(review): presumably the regcomp() status -- confirm */
+ regex_t rx; /* POSIX regex object */
+} file_regex_t;
+
struct buffer {
int fd;
struct stat st;
@@ -397,9 +414,10 @@ struct magic {
/* list of magic entries */
struct mlist {
- struct magic *magic; /* array of magic entries */
- uint32_t nmagic; /* number of entries in array */
- void *map; /* internal resources used by entry */
+ struct magic *magic; /* array of magic entries */
+ file_regex_t **magic_rxcomp; /* per-entry cache of compiled regexps; a slot stays NULL until compiled */
+ size_t nmagic; /* number of entries in array */
+ void *map; /* internal resources used by entry */
struct mlist *next, *prev;
};
@@ -568,23 +586,7 @@ protected void buffer_init(struct buffer
protected void buffer_fini(struct buffer *);
protected int buffer_fill(const struct buffer *);
-#include <locale.h>
-#if defined(HAVE_XLOCALE_H)
-#include <xlocale.h>
-#endif
-typedef struct {
- const char *pat;
-#if defined(HAVE_NEWLOCALE) && defined(HAVE_USELOCALE) && defined(HAVE_FREELOCALE)
-#define USE_C_LOCALE
- locale_t old_lc_ctype;
- locale_t c_lc_ctype;
-#else
- char *old_lc_ctype;
-#endif
- int rc;
- regex_t rx;
-} file_regex_t;
protected int file_regcomp(file_regex_t *, const char *, int);
protected int file_regexec(file_regex_t *, const char *, size_t, regmatch_t *,
Index: file-5.41/src/softmagic.c
===================================================================
--- file-5.41.orig/src/softmagic.c
+++ file-5.41/src/softmagic.c
@@ -43,7 +43,7 @@ FILE_RCSID("@(#)$File: softmagic.c,v 1.3
#include <time.h>
#include "der.h"
-private int match(struct magic_set *, struct magic *, uint32_t,
+private int match(struct magic_set *, struct magic *, file_regex_t **, uint32_t,
const struct buffer *, size_t, int, int, int, uint16_t *,
uint16_t *, int *, int *, int *, int *);
private int mget(struct magic_set *, struct magic *, const struct buffer *,
@@ -52,7 +52,7 @@ private int mget(struct magic_set *, str
uint16_t *, int *, int *, int *, int *);
private int msetoffset(struct magic_set *, struct magic *, struct buffer *,
const struct buffer *, size_t, unsigned int);
-private int magiccheck(struct magic_set *, struct magic *);
+private int magiccheck(struct magic_set *, struct magic *, file_regex_t **);
private int32_t mprint(struct magic_set *, struct magic *);
private int moffset(struct magic_set *, struct magic *, const struct buffer *,
int32_t *);
@@ -131,7 +131,7 @@ file_softmagic(struct magic_set *ms, con
}
for (ml = ms->mlist[0]->next; ml != ms->mlist[0]; ml = ml->next)
- if ((rv = match(ms, ml->magic, ml->nmagic, b, 0, mode,
+ if ((rv = match(ms, ml->magic, ml->magic_rxcomp, ml->nmagic, b, 0, mode,
text, 0, indir_count, name_count,
&printed_something, &need_separator, NULL, NULL)) != 0)
return rv;
@@ -191,7 +191,7 @@ file_fmtcheck(struct magic_set *ms, cons
* so that higher-level continuations are processed.
*/
private int
-match(struct magic_set *ms, struct magic *magic, uint32_t nmagic,
+match(struct magic_set *ms, struct magic *magic, file_regex_t **magic_rxcomp, uint32_t nmagic,
const struct buffer *b, size_t offset, int mode, int text,
int flip, uint16_t *indir_count, uint16_t *name_count,
int *printed_something, int *need_separator, int *returnval,
@@ -220,6 +220,7 @@ match(struct magic_set *ms, struct magic
for (magindex = 0; magindex < nmagic; magindex++) {
int flush = 0;
struct magic *m = &magic[magindex];
+ file_regex_t** m_rxcomp = &magic_rxcomp[magindex];
if (m->type != FILE_NAME)
if ((IS_STRING(m->type) &&
@@ -257,7 +258,7 @@ flush:
*returnval = 1;
}
- switch (magiccheck(ms, m)) {
+ switch (magiccheck(ms, m, m_rxcomp)) {
case -1:
return -1;
case 0:
@@ -318,6 +319,7 @@ flush:
while (magindex + 1 < nmagic &&
magic[magindex + 1].cont_level != 0) {
m = &magic[++magindex];
+ m_rxcomp = &magic_rxcomp[magindex];
ms->line = m->lineno; /* for messages */
if (cont_level < m->cont_level)
@@ -371,7 +373,7 @@ flush:
break;
}
- switch (flush ? 1 : magiccheck(ms, m)) {
+ switch (flush ? 1 : magiccheck(ms, m, m_rxcomp)) {
case -1:
return -1;
case 0:
@@ -655,7 +657,7 @@ mprint(struct magic_set *ms, struct magi
if (m->str_flags & STRING_TRIM)
str = file_strtrim(str);
-
+
if (file_printf(ms, F(ms, desc, "%s"),
file_printable(ms, sbuf, sizeof(sbuf), str,
sizeof(p->s) - (str - p->s))) == -1)
@@ -770,7 +772,7 @@ mprint(struct magic_set *ms, struct magi
return -1;
}
scp = (m->str_flags & STRING_TRIM) ? file_strtrim(cp) : cp;
-
+
rval = file_printf(ms, F(ms, desc, "%s"), file_printable(ms,
sbuf, sizeof(sbuf), scp, ms->search.rm_len));
free(cp);
@@ -1822,7 +1824,7 @@ mget(struct magic_set *ms, struct magic
for (mlp = ms->mlist[0]->next; mlp != ms->mlist[0];
mlp = mlp->next)
{
- if ((rv = match(ms, mlp->magic, mlp->nmagic, &bb, 0,
+ if ((rv = match(ms, mlp->magic, mlp->magic_rxcomp, mlp->nmagic, &bb, 0,
BINTEST, text, 0, indir_count, name_count,
printed_something, need_separator, NULL,
NULL)) != 0)
@@ -1875,7 +1877,7 @@ mget(struct magic_set *ms, struct magic
nfound_match = 0;
(*name_count)++;
eoffset = ms->eoffset;
- rv = match(ms, ml.magic, ml.nmagic, b, offset + o,
+ rv = match(ms, ml.magic, ml.magic_rxcomp, ml.nmagic, b, offset + o,
mode, text, flip, indir_count, name_count,
printed_something, need_separator, returnval,
&nfound_match);
@@ -1999,7 +2001,7 @@ file_strncmp16(const char *a, const char
}
private int
-magiccheck(struct magic_set *ms, struct magic *m)
+magiccheck(struct magic_set *ms, struct magic *m, file_regex_t** m_cache)
{
uint64_t l = m->value.q;
uint64_t v;
@@ -2182,27 +2184,44 @@ magiccheck(struct magic_set *ms, struct
}
case FILE_REGEX: {
int rc;
- file_regex_t rx;
+ file_regex_t *rx = *m_cache;
const char *search;
if (ms->search.s == NULL)
return 0;
+ /*
+ * Compile the pattern on first use only; a successfully compiled
+ * regex is stored in *m_cache and reused by later calls.
+ */
+ if (rx == NULL) {
+ rx = CAST(file_regex_t*, malloc(sizeof(*rx)));
+ if (rx == NULL) {
+ file_error(ms, errno,
+ "can't allocate %" SIZE_T_FORMAT "u bytes",
+ sizeof(*rx));
+ return -1;
+ }
+ rc = file_regcomp(rx, m->value.s,
+ REG_EXTENDED|REG_NEWLINE|
+ ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0));
+ if (rc) {
+ file_regerror(rx, rc, ms);
+ file_regfree(rx);
+ /* Never cache a failed compile: free it and report the error. */
+ free(rx);
+ return -1;
+ }
+ *m_cache = rx;
+ }
l = 0;
- rc = file_regcomp(&rx, m->value.s,
- REG_EXTENDED|REG_NEWLINE|
- ((m->str_flags & STRING_IGNORE_CASE) ? REG_ICASE : 0));
- if (rc) {
- file_regerror(&rx, rc, ms);
- v = CAST(uint64_t, -1);
- } else {
+ {
regmatch_t pmatch;
size_t slen = ms->search.s_len;
char *copy;
if (slen != 0) {
copy = CAST(char *, malloc(slen));
if (copy == NULL) {
- file_regfree(&rx);
file_error(ms, errno,
"can't allocate %" SIZE_T_FORMAT "u bytes",
slen);
@@ -2215,14 +2234,14 @@ magiccheck(struct magic_set *ms, struct
search = CCAST(char *, "");
copy = NULL;
}
- rc = file_regexec(&rx, RCAST(const char *, search),
+ rc = file_regexec(rx, RCAST(const char *, search),
1, &pmatch, 0);
free(copy);
switch (rc) {
case 0:
ms->search.s += CAST(int, pmatch.rm_so);
ms->search.offset += CAST(size_t, pmatch.rm_so);
- ms->search.rm_len = CAST(size_t,
+ ms->search.rm_len = CAST(size_t,
pmatch.rm_eo - pmatch.rm_so);
v = 0;
break;
@@ -2232,12 +2251,11 @@ magiccheck(struct magic_set *ms, struct
break;
default:
- file_regerror(&rx, rc, ms);
+ file_regerror(rx, rc, ms);
v = CAST(uint64_t, -1);
break;
}
}
- file_regfree(&rx);
if (v == CAST(uint64_t, -1))
return -1;
break;