From 54b41e87156bc823d5938749d71c4c57adc75b1b Mon Sep 17 00:00:00 2001 From: Kevin Boone Date: Thu, 30 Jun 2022 15:41:59 +0100 Subject: [PATCH] Fixed handling of URL-encoded spine hrefs --- README.md | 1 + src/epub2txt.c | 3 ++- src/util.c | 44 +++++++++++++++++++++++++++++++++++++++++++- src/util.h | 6 +++++- src/xhtml.c | 5 +++++ 5 files changed, 56 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4fbcafc..546242f 100644 --- a/README.md +++ b/README.md @@ -244,6 +244,7 @@ covered. Date | Change -----|------- +?, Jun 2022 | Fixed handling of URL-encoded spine href's 2.06, Jun 2022 | Fixed bug in invoking unzip 2.05, Apr 2022 | Fixed bug with empty metadata tags 2.04, Apr 2022 | Improved handling of UTF-8 BOMs diff --git a/src/epub2txt.c b/src/epub2txt.c index 7e9f4a1..72e0504 100644 --- a/src/epub2txt.c +++ b/src/epub2txt.c @@ -312,7 +312,8 @@ List *epub2txt_get_items (const char *opf, char **error) char *val2 = r3->attributes[p].value; if (strcmp (name2, "href") == 0) { - list_append (ret, strdup (val2)); + char *decoded_val2 = decode_url (val2); + list_append (ret, decoded_val2); } } } diff --git a/src/util.c b/src/util.c index 853343c..16e7431 100644 --- a/src/util.c +++ b/src/util.c @@ -1,12 +1,14 @@ /*============================================================================ epub2txt v2 util.c - Copyright (c)2022 Marco Bonelli, GPL v3.0 + Copyright (c)2022 Marco Bonelli, Kevin Boone, GPL v3.0 ============================================================================*/ #include #include #include +#include +#include #include #include #include "util.h" @@ -16,6 +18,7 @@ run_command Run an helper command through fork + execvp, wait for it to finish and return its status. Log execvp errors, and abort execution if abort_on_error is TRUE. 
/*==========================================================================
  decode_url_hex_value
  Return the numeric value (0-15) of one hexadecimal digit. The caller
  must already have validated the character with isxdigit().
*==========================================================================*/
static int decode_url_hex_value (unsigned char c)
  {
  if (c >= '0' && c <= '9')
    {
    return c - '0';
    }
  if (c >= 'a' && c <= 'f')
    {
    return c - 'a' + 10;
    }
  return c - 'A' + 10;
  }

/*==========================================================================
  decode_url
  Decode %xx escapes in URL-type strings, and turn '+' into a space.

  A '%' that is not followed by two hexadecimal digits is copied through
  unchanged. An escape of "%00" decodes to a NUL byte, so the result,
  read as a C string, ends at that point.

  Returns a newly-allocated, NUL-terminated string that is never longer
  than the input; the caller must free it. Returns NULL if url is NULL
  or if memory cannot be allocated.
  (Kevin Boone)
*==========================================================================*/
char *decode_url (const char *url)
  {
  if (url == NULL)
    {
    return NULL;
    }

  /* Each escape shrinks three input bytes to one and every other byte is
     copied one-for-one, so strlen (url) + 1 bytes is always enough. */
  char *ret = malloc (strlen (url) + 1);
  if (ret == NULL)
    {
    return NULL;
    }

  size_t len = 0;
  while (*url != '\0')
    {
    /* The casts to unsigned char matter: passing a negative plain char to
       isxdigit() is undefined. isxdigit() is false for the terminator, so
       short-circuit evaluation never reads past the end of the string. */
    if (*url == '%'
        && isxdigit ((unsigned char) url[1])
        && isxdigit ((unsigned char) url[2]))
      {
      int hi = decode_url_hex_value ((unsigned char) url[1]);
      int lo = decode_url_hex_value ((unsigned char) url[2]);
      ret[len++] = (char) (16 * hi + lo);
      url += 3;
      }
    else if (*url == '+')
      {
      /* '+' stands for a space. Consume exactly one input byte here; the
         branch must not fall through to the plain-copy case, or the space
         is overwritten and a trailing '+' walks off the end of the input.
         NOTE(review): RFC 3986 only gives '+' this meaning in form-encoded
         query strings -- confirm it is wanted for spine hrefs. */
      ret[len++] = ' ';
      url++;
      }
    else
      {
      ret[len++] = *url++;
      }
    }
  ret[len] = '\0';

  return ret;
  }
*/ +char *decode_url (const char *url); diff --git a/src/xhtml.c b/src/xhtml.c index 1338882..fbfceae 100644 --- a/src/xhtml.c +++ b/src/xhtml.c @@ -530,6 +530,8 @@ WString *xhtml_transform_char (uint32_t c, BOOL to_ascii) ============================================================================*/ WString *xhtml_translate_entity (const WString *entity) { + /* Program flow in this function is very ugly, and prone to memory + leaks when modified. The whole thing needs to be rewritten */ char out[20]; IN char *in = wstring_to_utf8 (entity); @@ -569,8 +571,11 @@ WString *xhtml_translate_entity (const WString *entity) WString *ret = wstring_create_empty(); wstring_append_c (ret, (uint32_t)v); OUT + free (s); + free (in); return ret; } + free (s); } else {