From c3c931f5608da23a196b216edfcc2af5b626fc9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sa=C3=BAl=20Ibarra=20Corretg=C3=A9?= Date: Mon, 4 Aug 2025 10:27:07 +0200 Subject: [PATCH] Refactor channel destruction logic - Use ares_queue_wait_empty to wait for queries to be complete before destruction - Make sure NO queries are cancelled as side effects on __del__ - Start the destruction thread early, as soon as a channel is created Cancelling pending queries while in a query callback seemingly causes heap corruption or double-free bugs, so delay the operation until no Python code if using the channel anymore, that is, the destructor thread. Fixes: https://github.com/aio-libs/aiodns/issues/175 Fixes: https://github.com/saghul/pycares/issues/248 --- docs/channel.rst | 13 ------ src/_cffi_src/build_cares.py | 78 +++++++++++++++++++++++------------- src/pycares/__init__.py | 61 ++++++++++++---------------- 3 files changed, 76 insertions(+), 76 deletions(-) Index: pycares-4.9.0/docs/channel.rst =================================================================== --- pycares-4.9.0.orig/docs/channel.rst +++ pycares-4.9.0/docs/channel.rst @@ -77,25 +77,6 @@ While channels will attempt automatic cleanup during garbage collection, explicit closing is safer as it gives you control over when resources are released. - .. warning:: - The channel destruction mechanism has a limited throughput of 60 channels per minute - (one channel per second) to ensure thread safety and prevent use-after-free errors - in c-ares. This means: - - - Avoid creating transient channels for individual queries - - Reuse channel instances whenever possible - - For applications with high query volume, use a single long-lived channel - - If you must create multiple channels, consider pooling them - - Creating and destroying channels rapidly will result in a backlog as the destruction - queue processes channels sequentially with a 1-second delay between each. - - The Channel class supports the context manager protocol for automatic cleanup:: - - with pycares.Channel() as channel: - channel.query('example.com', pycares.QUERY_TYPE_A, callback) - # Channel is automatically closed when exiting the context - .. py:method:: getaddrinfo(host, port, callback, family=0, type=0, proto=0, flags=0) :param string host: Hostname to resolve. Index: pycares-4.9.0/src/_cffi_src/build_cares.py =================================================================== --- pycares-4.9.0.orig/src/_cffi_src/build_cares.py +++ pycares-4.9.0/src/_cffi_src/build_cares.py @@ -90,34 +90,6 @@ struct sockaddr_in6 { typedef int... ares_socket_t; typedef int... ares_socklen_t; -#define ARES_SUCCESS ... - -#define ARES_ENODATA ... -#define ARES_EFORMERR ... -#define ARES_ESERVFAIL ... -#define ARES_ENOTFOUND ... -#define ARES_ENOTIMP ... -#define ARES_EREFUSED ... -#define ARES_EBADQUERY ... -#define ARES_EBADNAME ... -#define ARES_EBADFAMILY ... -#define ARES_EBADRESP ... -#define ARES_ECONNREFUSED ... -#define ARES_ETIMEOUT ... -#define ARES_EOF ... -#define ARES_EFILE ... -#define ARES_ENOMEM ... -#define ARES_EDESTRUCTION ... -#define ARES_EBADSTR ... -#define ARES_EBADFLAGS ... -#define ARES_ENONAME ... -#define ARES_EBADHINTS ... -#define ARES_ENOTINITIALIZED ... -#define ARES_ELOADIPHLPAPI ... -#define ARES_EADDRGETNETWORKPARAMS ... -#define ARES_ECANCELLED ... -#define ARES_ESERVICE ... - #define ARES_FLAG_USEVC ... #define ARES_FLAG_PRIMARY ... #define ARES_FLAG_IGNTC ... @@ -229,6 +201,54 @@ struct ares_server_failover_options { size_t retry_delay; }; +typedef enum { + ARES_SUCCESS = 0, + + /* Server error codes (ARES_ENODATA indicates no relevant answer) */ + ARES_ENODATA = 1, + ARES_EFORMERR = 2, + ARES_ESERVFAIL = 3, + ARES_ENOTFOUND = 4, + ARES_ENOTIMP = 5, + ARES_EREFUSED = 6, + + /* Locally generated error codes */ + ARES_EBADQUERY = 7, + ARES_EBADNAME = 8, + ARES_EBADFAMILY = 9, + ARES_EBADRESP = 10, + ARES_ECONNREFUSED = 11, + ARES_ETIMEOUT = 12, + ARES_EOF = 13, + ARES_EFILE = 14, + ARES_ENOMEM = 15, + ARES_EDESTRUCTION = 16, + ARES_EBADSTR = 17, + + /* ares_getnameinfo error codes */ + ARES_EBADFLAGS = 18, + + /* ares_getaddrinfo error codes */ + ARES_ENONAME = 19, + ARES_EBADHINTS = 20, + + /* Uninitialized library error code */ + ARES_ENOTINITIALIZED = 21, /* introduced in 1.7.0 */ + + /* ares_library_init error codes */ + ARES_ELOADIPHLPAPI = 22, /* introduced in 1.7.0 */ + ARES_EADDRGETNETWORKPARAMS = 23, /* introduced in 1.7.0 */ + + /* More error codes */ + ARES_ECANCELLED = 24, /* introduced in 1.7.0 */ + + /* More ares_getaddrinfo error codes */ + ARES_ESERVICE = 25, /* ares_getaddrinfo() was passed a text service name that + * is not recognized. introduced in 1.16.0 */ + + ARES_ENOSERVER = 26 /* No DNS servers were configured */ +} ares_status_t; + /*! Values for ARES_OPT_EVENT_THREAD */ typedef enum { /*! Default (best choice) event system */ @@ -597,6 +617,8 @@ const char *ares_inet_ntop(int af, const int ares_inet_pton(int af, const char *src, void *dst); ares_bool_t ares_threadsafety(void); + +ares_status_t ares_queue_wait_empty(ares_channel channel, int timeout_ms); """ CALLBACKS = """ Index: pycares-4.9.0/src/pycares/__init__.py =================================================================== --- pycares-4.9.0.orig/src/pycares/__init__.py +++ pycares-4.9.0/src/pycares/__init__.py @@ -12,10 +12,8 @@ from ._version import __version__ import socket import math import threading -import time import weakref from collections.abc import Callable, Iterable -from contextlib import suppress from typing import Any, Callable, Optional, Dict, Union from queue import SimpleQueue @@ -344,7 +342,7 @@ class _ChannelShutdownManager: def __init__(self) -> None: self._queue: SimpleQueue = SimpleQueue() self._thread: Optional[threading.Thread] = None - self._thread_started = False + self._start_lock = threading.Lock() def _run_safe_shutdown_loop(self) -> None: """Process channel destruction requests from the queue.""" @@ -352,16 +350,27 @@ class _ChannelShutdownManager: # Block forever until we get a channel to destroy channel = self._queue.get() - # Sleep for 1 second to ensure c-ares has finished processing - # Its important that c-ares is past this critcial section - # so we use a delay to ensure it has time to finish processing - # https://github.com/c-ares/c-ares/blob/4f42928848e8b73d322b15ecbe3e8d753bf8734e/src/lib/ares_process.c#L1422 - time.sleep(1.0) + # Cancel all pending queries - this will trigger callbacks with ARES_ECANCELLED + _lib.ares_cancel(channel[0]) + + # Wait for all queries to finish + _lib.ares_queue_wait_empty(channel[0], -1) # Destroy the channel if _lib is not None and channel is not None: _lib.ares_destroy(channel[0]) + def start(self) -> None: + """Start the background thread if not already started.""" + if self._thread is not None: + return + with self._start_lock: + if self._thread is not None: + # Started by another thread while waiting for the lock + return + self._thread = threading.Thread(target=self._run_safe_shutdown_loop, daemon=True) + self._thread.start() + def destroy_channel(self, channel) -> None: """ Schedule channel destruction on the background thread with a safety delay. @@ -369,17 +378,10 @@ class _ChannelShutdownManager: Thread Safety and Synchronization: This method uses SimpleQueue which is thread-safe for putting items from multiple threads. The background thread processes channels - sequentially with a 1-second delay before each destruction. + sequentially waiting for queries to end before each destruction. """ - # Put the channel in the queue self._queue.put(channel) - # Start the background thread if not already started - if not self._thread_started: - self._thread_started = True - self._thread = threading.Thread(target=self._run_safe_shutdown_loop, daemon=True) - self._thread.start() - # Global shutdown manager instance _shutdown_manager = _ChannelShutdownManager() @@ -516,11 +518,12 @@ class Channel: self.close() return False + # Ensure the shutdown thread is started + _shutdown_manager.start() + def __del__(self) -> None: """Ensure the channel is destroyed when the object is deleted.""" - if self._channel is not None: - # Schedule channel destruction using the global shutdown manager - self._schedule_destruction() + self.close() def _create_callback_handle(self, callback_data): """ @@ -764,24 +767,12 @@ class Channel: # Already destroyed return - # Cancel all pending queries - this will trigger callbacks with ARES_ECANCELLED - self.cancel() + # NB: don't cancel queries here, it may lead to problem if done from a + # query callback. # Schedule channel destruction - self._schedule_destruction() - - def _schedule_destruction(self) -> None: - """Schedule channel destruction using the global shutdown manager.""" - if self._channel is None: - return - channel = self._channel - self._channel = None - # Can't start threads during interpreter shutdown - # The channel will be cleaned up by the OS - # TODO: Change to PythonFinalizationError when Python 3.12 support is dropped - with suppress(RuntimeError): - _shutdown_manager.destroy_channel(channel) - + channel, self._channel = self._channel, None + _shutdown_manager.destroy_channel(channel) class AresResult: