python-hdf5storage/hdf5storage-pr134-numpy2.patch

From 9814bc28874a56757e16479186523b2b77d5c553 Mon Sep 17 00:00:00 2001
From: Jesse R Codling <codling@umich.edu>
Date: Wed, 14 Aug 2024 12:34:47 -0400
Subject: [PATCH 2/3] Numpy 2.0: Remove all np.unicode_ for np.str_

---
 doc/source/storage_format.rst           |  6 ++--
 pyproject.toml                          |  2 +-
 hdf5storage/Marshallers.py          | 20 +++++++------
 hdf5storage/__init__.py             |  6 ++--
 hdf5storage/utilities.py            | 40 ++++++++++++-------------
 tests/asserts.py                        | 14 ++++-----
 tests/make_randoms.py                   |  4 +--
 tests/test_dict_like_storage_methods.py |  6 ++--
 tests/test_str_conv_utils.py            |  8 ++---
 tests/test_string_utf16_conversion.py   |  4 +--
 tests/test_write_readback.py            |  6 ++--
 11 files changed, 59 insertions(+), 57 deletions(-)

Index: hdf5storage-0.1.19/tests/make_randoms.py
===================================================================
--- hdf5storage-0.1.19.orig/tests/make_randoms.py
+++ hdf5storage-0.1.19/tests/make_randoms.py
@@ -156,7 +156,7 @@ def random_numpy(shape, dtype, allow_nan
                 chars = random_str_some_unicode(length)
             else:
                 chars = random_str_ascii(length)
-            data[index] = np.unicode_(chars)
+            data[index] = np.str_(chars)
         return data
     elif dtype == 'object':
         data = np.zeros(shape=shape, dtype='object')
Index: hdf5storage-0.1.19/tests/test_string_utf16_conversion.py
===================================================================
--- hdf5storage-0.1.19.orig/tests/test_string_utf16_conversion.py
+++ hdf5storage-0.1.19/tests/test_string_utf16_conversion.py
@@ -44,12 +44,12 @@ import pytest
 # convert_numpy_str_to_utf16 option is set.
 #
 # * str
-# * numpy.unicode_ scalars
+# * numpy.str_ scalars

 if sys.hexversion < 0x3000000:
-    tps_tuple = (unicode, np.unicode_)
+    tps_tuple = (unicode, np.str_)
 else:
-    tps_tuple = (str, np.unicode_)
+    tps_tuple = (str, np.str_)


 @pytest.mark.parametrize("tp", tps_tuple)
Index: hdf5storage-0.1.19/hdf5storage/Marshallers.py
===================================================================
--- hdf5storage-0.1.19.orig/hdf5storage/Marshallers.py
+++ hdf5storage-0.1.19/hdf5storage/Marshallers.py
@@ -480,7 +480,7 @@ class NumpyScalarArrayMarshaller(TypeMar
                                       'MATLAB_int_decode',
                                       'MATLAB_fields'])
-        # As np.str_ is the unicode type string in Python 3 and the bare
-        # bytes string in Python 2, we have to use np.unicode_ which is
-        # or points to the unicode one in both versions.
+        # np.unicode_ was removed in NumPy 2.0. On Python 3 (the only
+        # Python that NumPy 2 supports), np.str_ is the unicode string
+        # type, so it is used here instead.
         self.types = [np.ndarray, np.matrix,
                       np.chararray, np.core.records.recarray,
@@ -489,7 +489,7 @@ class NumpyScalarArrayMarshaller(TypeMar
                       np.int8, np.int16, np.int32, np.int64,
                       np.float32, np.float64,
                       np.complex64, np.complex128,
-                      np.bytes_, np.unicode_, np.object_]
+                      np.bytes_, np.str_, np.object_]
         self._numpy_types = list(self.types)
         # Using Python 3 type strings.
         self.python_type_strings = ['numpy.ndarray', 'numpy.matrix',
@@ -525,7 +525,7 @@ class NumpyScalarArrayMarshaller(TypeMar
                                  np.complex64: 'single',
                                  np.complex128: 'double',
                                  np.bytes_: 'char',
-                                 np.unicode_: 'char',
+                                 np.str_: 'char',
                                  np.object_: 'cell'}

         # Make a dict to look up the opposite direction (given a matlab
@@ -542,7 +542,7 @@ class NumpyScalarArrayMarshaller(TypeMar
                                          'int64': np.int64,
                                          'single': np.float32,
                                          'double': np.float64,
-                                         'char': np.unicode_,
+                                         'char': np.str_,
                                          'cell': np.object_,
                                          'canonical empty': np.float64,
                                          'struct': np.object_}
@@ -601,18 +601,7 @@ class NumpyScalarArrayMarshaller(TypeMar
                     raise NotImplementedError( \
                         'Can''t write non-ASCII numpy.bytes_.')

-        # As of 2013-12-13, h5py cannot write numpy.str_ (UTF-32
-        # encoding) types (its numpy.unicode_ in Python 2, which is an
-        # alias for it in Python 3). If the option is set to try to
-        # convert them to UTF-16, then an attempt at the conversion is
-        # made. If no conversion is to be done, the conversion throws an
-        # exception (a UTF-32 character had no UTF-16 equivalent), or a
-        # UTF-32 character gets turned into a UTF-16 doublet (the
-        # increase in the number of columns will be by a factor more
-        # than the length of the strings); then it will be simply
-        # converted to uint32's byte for byte instead.
-
-        if data.dtype.type == np.unicode_:
+        if data.dtype.type == np.str_:
             new_data = None
             if options.convert_numpy_str_to_utf16:
                 try:
@@ -620,7 +609,7 @@ class NumpyScalarArrayMarshaller(TypeMar
                         data_to_store)
                 except:
                     pass
-            if new_data is None or (type(data_to_store) == np.unicode_ \
+            if new_data is None or (type(data_to_store) == np.str_ \
                     and len(data_to_store) != len(new_data)) \
                     or (isinstance(data_to_store, np.ndarray) \
                     and new_data.shape[-1] != data_to_store.shape[-1] \
@@ -1049,7 +1038,7 @@ class NumpyScalarArrayMarshaller(TypeMar
                 str_attrs[attr_name] = value
             elif isinstance(value, bytes):
                 str_attrs[attr_name] = value.decode()
-            elif isinstance(value, np.unicode_):
+            elif isinstance(value, np.str_):
                 str_attrs[attr_name] = str(value)
             elif isinstance(value, np.bytes_):
                 str_attrs[attr_name] = value.decode()
@@ -1313,7 +1302,7 @@ class NumpyScalarArrayMarshaller(TypeMar
             elif underlying_type.startswith('str') \
                     or matlab_class == 'char':
                 if underlying_type == 'str':
-                    data = np.unicode_('')
+                    data = np.str_('')
                 elif underlying_type.startswith('str'):
                     data = convert_to_numpy_str(data, \
                         length=int(underlying_type[3:])//32)
@@ -1344,7 +1333,7 @@ class NumpyScalarArrayMarshaller(TypeMar
                         data = data.flatten()[0]
                 elif underlying_type.startswith('str'):
                     if python_empty == 1:
-                        data = np.unicode_('')
+                        data = np.str_('')
                     elif isinstance(data, np.ndarray):
                         data = data.flatten()[0]
                 else:
@@ -1511,7 +1500,7 @@ class PythonStringMarshaller(NumpyScalar
         if (sys.hexversion >= 0x03000000 and isinstance(data, str)) \
                 or (sys.hexversion < 0x03000000 \
                 and isinstance(data, unicode)):
-            cdata = np.unicode_(data)
+            cdata = np.str_(data)
         else:
             cdata = np.bytes_(data)

Index: hdf5storage-0.1.19/hdf5storage/utilities.py
===================================================================
--- hdf5storage-0.1.19.orig/hdf5storage/utilities.py
+++ hdf5storage-0.1.19/hdf5storage/utilities.py
@@ -408,7 +408,7 @@ def convert_to_str(data):
     # assuming it is in UTF-8. Otherwise, data has to be returned as is.

     if isinstance(data, (np.ndarray, np.uint8, np.uint16, np.uint32,
-                  np.bytes_, np.unicode_)):
+                  np.bytes_, np.str_)):
         if data.dtype.name == 'uint8':
             return numpy_to_bytes(data.flatten()).decode('UTF-8')
         elif data.dtype.name == 'uint16':
@@ -477,7 +477,7 @@ def convert_to_numpy_str(data, length=No

     """
     # The method of conversion depends on its type.
-    if isinstance(data, np.unicode_) or (isinstance(data, np.ndarray) \
+    if isinstance(data, np.str_) or (isinstance(data, np.ndarray) \
             and data.dtype.char == 'U'):
         # It is already an np.str_ or array of them, so nothing needs to
         # be done.
@@ -486,16 +486,16 @@ def convert_to_numpy_str(data, length=No
            or (sys.hexversion < 0x03000000 \
            and isinstance(data, unicode)):
         # Easily converted through constructor.
-        return np.unicode_(data)
+        return np.str_(data)
     elif isinstance(data, (bytes, bytearray, np.bytes_)):
         # All of them can be decoded and then passed through the
         # constructor.
-        return np.unicode_(data.decode('UTF-8'))
+        return np.str_(data.decode('UTF-8'))
     elif isinstance(data, (np.uint8, np.uint16)):
         # They are single UTF-8 or UTF-16 scalars, and are easily
         # converted to a UTF-8 string and then passed through the
         # constructor.
-        return np.unicode_(convert_to_str(data))
+        return np.str_(convert_to_str(data))
     elif isinstance(data, np.uint32):
         # It is just the uint32 version of the character, so it just
         # needs to be have the dtype essentially changed by having its
@@ -507,7 +507,7 @@ def convert_to_numpy_str(data, length=No
         new_data = np.zeros(shape=data.shape,
                             dtype='U' + str(data.dtype.itemsize))
         for index, x in np.ndenumerate(data):
-            new_data[index] = np.unicode_(x.decode('UTF-8'))
+            new_data[index] = np.str_(x.decode('UTF-8'))
         return new_data
     elif isinstance(data, np.ndarray) \
             and data.dtype.name in ('uint8', 'uint16', 'uint32'):
@@ -559,7 +559,7 @@ def convert_to_numpy_str(data, length=No
                     dtype=new_data.dtype,
                     buffer=numpy_to_bytes(chunk))[()]
             else:
-                new_data[i] = np.unicode_(convert_to_str(chunk))
+                new_data[i] = np.str_(convert_to_str(chunk))

         # Only thing is left is to reshape it.
         return new_data.reshape(tuple(new_shape))
@@ -896,7 +896,7 @@ def get_attribute_string(target, name):
         return value
     elif isinstance(value, bytes):
         return value.decode()
-    elif isinstance(value, np.unicode_):
+    elif isinstance(value, np.str_):
         return str(value)
     elif isinstance(value, np.bytes_):
         return value.decode()