From 0ce5e32ce07d8c90645e12a3d15f2ff8af787140 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20Schr=C3=B6ter?=
Date: Fri, 5 Jan 2024 09:46:47 +0100
Subject: [PATCH] Sync from SUSE:ALP:Source:Standard:1.0 python-fastparquet
 revision 4b20351494976404f011c20f7d423c7b

---
 .gitattributes               |  23 ++
 fastparquet-2023.10.1.tar.gz |   3 +
 python-fastparquet.changes   | 509 +++++++++++++++++++++++++++++++++++
 python-fastparquet.spec      |  87 ++++++
 4 files changed, 622 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 fastparquet-2023.10.1.tar.gz
 create mode 100644 python-fastparquet.changes
 create mode 100644 python-fastparquet.spec

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..fecc750
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,23 @@
+## Default LFS
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.bsp filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.gem filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.jar filter=lfs diff=lfs merge=lfs -text
+*.lz filter=lfs diff=lfs merge=lfs -text
+*.lzma filter=lfs diff=lfs merge=lfs -text
+*.obscpio filter=lfs diff=lfs merge=lfs -text
+*.oxt filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.rpm filter=lfs diff=lfs merge=lfs -text
+*.tbz filter=lfs diff=lfs merge=lfs -text
+*.tbz2 filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.ttf filter=lfs diff=lfs merge=lfs -text
+*.txz filter=lfs diff=lfs merge=lfs -text
+*.whl filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
diff --git a/fastparquet-2023.10.1.tar.gz b/fastparquet-2023.10.1.tar.gz
new file mode 100644
index 0000000..ab02b6f
--- /dev/null
+++ b/fastparquet-2023.10.1.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3ba93992c772422c9abd2993bed1ad3ea4f50bacd6c05d9015f934b0db4aa11
+size 28904462
diff --git a/python-fastparquet.changes b/python-fastparquet.changes
new file mode 100644
index 0000000..2872b2e
--- /dev/null
+++ b/python-fastparquet.changes
@@ -0,0 +1,509 @@
+-------------------------------------------------------------------
+Tue Dec 5 12:23:32 UTC 2023 - Dirk Müller
+
+- update to 2023.10.0:
+  * Datetime units in empty() with tz (#893)
+  * Fewer inplace decompressions for V2 pages (#890)
+  * Allow writing categorical column with no categories (#888)
+  * Fixes for new numpy (#886)
+  * RLE bools and DELTA for v1 pages (#885, 883)
+
+-------------------------------------------------------------------
+Mon Sep 11 21:29:16 UTC 2023 - Dirk Müller
+
+- update to 2023.8.0:
+  * More general timestamp units (#874)
+  * ReadTheDocs V2 (#871)
+  * Better roundtrip dtypes (#861, 859)
+  * No convert when computing bytes-per-item for str (#858)
+
+-------------------------------------------------------------------
+Sat Jul 1 20:05:36 UTC 2023 - Arun Persaud
+
+- update to version 2023.7.0:
+  * Add test case for reading non-pandas parquet file (#870)
+  * Extra field when cloning ParquetFile (#866)
+
+-------------------------------------------------------------------
+Fri Apr 28 08:10:46 UTC 2023 - Dirk Müller
+
+- update to 2023.4.0:
+  * allow loading categoricals even if not so in the pandas metadata,
+    when a column is dict-encoded and we only have one row-group (#863)
+  * apply dtype to the column names series, even when selecting no
+    columns (#861, 859)
+  * don't make strings while estimating byte column size (#858)
+  * handle upstream deprecations (#857, 856)
+
+-------------------------------------------------------------------
+Thu Feb 9 15:55:08 UTC 2023 - Arun Persaud
+
+- update to version 2023.2.0:
+  * revert one-level set of filters (#852)
+  * full size dict for decoding V2 pages (#850)
+  * infer_object_encoding fix (#847)
+  * row filtering with V2 pages (#845)
+
+-------------------------------------------------------------------
+Wed Feb 8 18:25:03 UTC 2023 - Arun Persaud
+
+- specfile:
+  * remove fastparquet-pr835.patch, implemented upstream
+
+- update to version 2023.1.0:
+  * big improvement to write speed
+  * paging support for bigger row-groups
+  * pandas 2.0 support
+  * delta for big-endian architecture
+
+-------------------------------------------------------------------
+Mon Jan 2 20:38:49 UTC 2023 - Ben Greiner
+
+- Update to 2022.12.0
+  * check all int32 values before passing to thrift writer
+  * fix type of num_rows to i64 for big single file
+- Release 2022.11.0
+  * Switch to calver
+  * Speed up loading of nullable types
+  * Allow schema evolution by addition of columns
+  * Allow specifying dtypes of output
+  * update to scm versioning
+  * fixes to row filter, statistics and tests
+  * support pathlib.Paths
+  * JSON encoder options
+- Drop fastparquet-pr813-updatefixes.patch
+
+-------------------------------------------------------------------
+Fri Dec 23 09:18:39 UTC 2022 - Guillaume GARDET
+
+- Add patch to fix the test test_delta_from_def_2 on
+  aarch64, armv7 and ppc64le:
+  * fastparquet-pr835.patch
+
+-------------------------------------------------------------------
+Fri Oct 28 15:47:41 UTC 2022 - Ben Greiner
+
+- Update to 0.8.3
+  * improved key/value handling and rejection of bad types
+  * fix regression in consolidate_cats (caught in dask tests)
+- Release 0.8.2
+  * datetime indexes initialised to 0 to prevent overflow from
+    random memory
+  * case from csv_to_parquet where stats exists but has no nulls
+    entry
+  * define len and bool for ParquetFile
+  * maintain int types of optional data that came from pandas
+  * fix for delta encoding
+- Add fastparquet-pr813-updatefixes.patch gh#dask/fastparquet#813
+
+-------------------------------------------------------------------
+Tue Apr 26 11:02:27 UTC 2022 - Ben Greiner
+
+- Update to 0.8.1
+  * fix critical buffer overflow crash for large number of columns
+    and long column names
+  * metadata handling
+  * thrift int32 for list
+  * avoid error storing NaNs in column stats
+
+-------------------------------------------------------------------
+Sat Jan 29 21:36:38 UTC 2022 - Ben Greiner
+
+- Update to 0.8.0
+  * our own cythonic thrift implementation (drop thrift dependency)
+  * more in-place dataset editing and reordering
+  * python 3.10 support
+  * fixes for multi-index and pandas types
+- Clean test skips
+
+-------------------------------------------------------------------
+Sun Jan 16 13:34:53 UTC 2022 - Ben Greiner
+
+- Clean specfile from unused python36 conditionals
+- Require thrift 0.15.0 (+patch) for Python 3.10 compatibility
+  * gh#dask/fastparquet#514
+
+-------------------------------------------------------------------
+Sat Nov 27 20:34:53 UTC 2021 - Arun Persaud
+
+- update to version 0.7.2:
+  * Ability to remove row-groups in-place for multifile datasets
+  * Accept pandas nullable Float type
+  * allow empty strings and fix min/max when there is no data
+  * make writing statistics optional
+  * row selection in to_pandas()
+
+-------------------------------------------------------------------
+Sun Aug 8 15:13:55 UTC 2021 - Ben Greiner
+
+- Update to version 0.7.1
+  * Back compile for older versions of numpy
+  * Make pandas nullable types opt-out. The old behaviour (casting
+    to float) is still available with ParquetFile(...,
+    pandas_nulls=False).
+  * Fix time field regression: IsAdjustedToUTC will be False when
+    there is no timezone
+  * Micro improvements to the speed of ParquetFile creation by
+    using simple string ops instead of regex and regularising
+    filenames once at the start. Affects datasets with many files.
+- Release 0.7.0
+  * This version institutes major, breaking changes, listed here,
+    and incremental fixes and additions.
+  * Reading a directory without a _metadata summary file now works
+    by providing only the directory, instead of a list of
+    constituent files. This change also makes direct use of fsspec
+    filesystems, if given, to be able to load the footer metadata
+    areas of the files concurrently, if the storage backend
+    supports it, and not directly instantiating intermediate
+    ParquetFile instances.
+  * row-level filtering of the data. Whereas previously, only full
+    row-groups could be excluded on the basis of their parquet
+    metadata statistics (if present), filtering can now be done
+    within row-groups too. The syntax is the same as before,
+    allowing for multiple column expressions to be combined with
+    AND|OR, depending on the list structure. This mechanism
+    requires two passes: one to load the columns needed to create
+    the boolean mask, and another to load the columns actually
+    needed in the output. This will not be faster, and may be
+    slower, but in some cases can save significant memory
+    footprint, if a small fraction of rows are considered good and
+    the columns for the filter expression are not in the output.
+    Not currently supported for reading with DataPageV2.
+  * DELTA integer encoding (read-only): experimentally working,
+    but we only have one test file to verify against, since it is
+    not trivial to persuade Spark to produce files encoded this
+    way. DELTA can be an extremely compact representation for
+    slowly varying and/or monotonically increasing integers.
+  * nanosecond resolution times: the new extended "logical" types
+    system supports nanoseconds alongside the previous millis and
+    micros. We now emit these for the default pandas time type,
+    and produce full parquet schema including both "converted" and
+    "logical" type information. Note that all output has
+    isAdjustedToUTC=True, i.e., these are timestamps rather than
+    local time. The time-zone is stored in the metadata, as
+    before, and will be successfully recreated only in fastparquet
+    and (py)arrow. Otherwise, the times will appear to be UTC. For
+    compatibility with Spark, you may still want to use
+    times="int96" when writing.
+  * DataPageV2 writing: now we support both reading and writing.
+    For writing, it can be enabled with the environment variable
+    FASTPARQUET_DATAPAGE_V2 or the module global
+    fastparquet.writer.DATAPAGE_VERSION, and is off by default. It
+    will become the default in the future. In many cases, V2 will
+    result in better read performance, because the data and page
+    headers are encoded separately, so data can be directly read
+    into the output without additional allocation/copies.
+    This feature is considered experimental, but we believe it
+    works well for most use cases (i.e., our test suite) and should
+    be readable by all modern parquet frameworks including arrow
+    and spark.
+  * pandas nullable types: pandas supports "masked" extension
+    arrays for types that previously could not support NULL at
+    all: ints and bools. Fastparquet used to cast such columns to
+    float, so that we could represent NULLs as NaN; now we use the
+    new(er) masked types by default. This means faster reading of
+    such columns, as there is no conversion. If the metadata
+    guarantees that there are no nulls, we still use the
+    non-nullable variant unless the data was written with
+    fastparquet/pyarrow, and the metadata indicates that the
+    original datatype was nullable. We already handled writing of
+    nullable columns.
+
+-------------------------------------------------------------------
+Tue May 18 14:41:46 UTC 2021 - Ben Greiner
+
+- Update to version 0.6.3
+  * no release notes
+  * new requirement: cramjam instead of separate compression libs
+    and their bindings
+  * switch from numba to Cython
+
+-------------------------------------------------------------------
+Fri Feb 12 14:50:18 UTC 2021 - Dirk Müller
+
+- skip python 36 build
+
+-------------------------------------------------------------------
+Thu Feb 4 17:50:32 UTC 2021 - Jan Engelhardt
+
+- Use of "+=" in %check warrants bash as buildshell.
+
+-------------------------------------------------------------------
+Wed Feb 3 21:43:10 UTC 2021 - Ben Greiner
+
+- Skip the import without warning test gh#dask/fastparquet#558
+- Apply the Cepl-Strangelove-Parameter to pytest
+  (--import-mode append)
+
+-------------------------------------------------------------------
+Sat Jan 2 21:04:30 UTC 2021 - Benjamin Greiner
+
+- update to version 0.5
+  * no changelog
+- update test suite setup -- install the .test module
+
+-------------------------------------------------------------------
+Sat Jul 18 18:13:53 UTC 2020 - Arun Persaud
+
+- specfile:
+  * update requirements: version numbers and added packaging
+
+- update to version 0.4.1:
+  * nulls, fixes #504
+  * deps: Add missing dependency on packaging. (#502)
+
+-------------------------------------------------------------------
+Thu Jul 9 14:04:10 UTC 2020 - Marketa Calabkova
+
+- Update to 0.4.0
+  * Changed RangeIndex private methods to public ones
+  * Use the python executable used to run the code
+  * Add support for Python 3.8
+  * support for numba > 0.48
+- drop upstreamed patch use-python-exec.patch
+
+-------------------------------------------------------------------
+Mon Apr 6 06:54:36 UTC 2020 - Tomáš Chvátal
+
+- Add patch to use sys.executable and not call py2 binary directly:
+  * use-python-exec.patch
+
+-------------------------------------------------------------------
+Mon Apr 6 06:50:26 UTC 2020 - Tomáš Chvátal
+
+- Update to 0.3.3:
+  * no upstream changelog
+
+-------------------------------------------------------------------
+Fri Oct 25 17:50:50 UTC 2019 - Todd R
+
+- Drop broken python 2 support.
+- Testing fixes
+
+-------------------------------------------------------------------
+Sat Aug 3 15:10:41 UTC 2019 - Arun Persaud
+
+- update to version 0.3.2:
+  * Only calculate dataset stats once (#453)
+  * Fixes #436 (#452)
+  * Fix a crash if trying to read a file whose created_by value is
+    not set
+  * COMPAT: Fix for pandas DeprecationWarning (#446)
+  * Apply timezone to index (#439)
+  * Handle NaN partition values (#438)
+  * Pandas meta (#431)
+  * Only strip _metadata from end of file path (#430)
+  * Simple nesting fix (#428)
+  * Disallow bad tz on save, warn on load (#427)
+
+-------------------------------------------------------------------
+Tue Jul 30 14:23:21 UTC 2019 - Todd R
+
+- Fix spurious test failure
+
+-------------------------------------------------------------------
+Mon May 20 15:12:11 CEST 2019 - Matej Cepl
+
+- Clean up SPEC file.
+
+-------------------------------------------------------------------
+Tue Apr 30 14:28:46 UTC 2019 - Todd R
+
+- update to 0.3.1
+  * Add schema == (__eq__) and != (__ne__) methods and tests.
+  * Fix item iteration for decimals
+  * List missing columns in error message
+  * Fix tz being None case
+- Update to 0.3.0
+  * Squash some warnings and import failures
+  * Improvements to in and not in operators
+  * Fixes because pandas released
+
+-------------------------------------------------------------------
+Sat Jan 26 17:05:09 UTC 2019 - Arun Persaud
+
+- specfile:
+  * update copyright year
+
+- update to version 0.2.1:
+  * Compat for pandas 0.24.0 refactor (#390)
+  * Change OverflowError message when failing on large pages (#387)
+  * Allow for changes in dictionary while reading a row-group column
+    (#367)
+  * Correct pypi project names for compression libraries (#385)
+
+-------------------------------------------------------------------
+Thu Nov 22 22:47:24 UTC 2018 - Arun Persaud
+
+- update to version 0.2.0:
+  * Don't mutate column list input (#383) (#384)
+  * Add optional requirements to extras_require (#380)
+  * Fix "broken link to parquet-format page" (#377)
+  * Add .c file to repo
+  * Handle rows split across 2 pages in the case of a map (#369)
+  * Fixes 370 (#371)
+  * Handle multi-page maps (#368)
+  * Handle zero-column files. Closes #361. (#363)
+
+-------------------------------------------------------------------
+Sun Sep 30 16:22:56 UTC 2018 - Arun Persaud
+
+- specfile:
+  * update url
+  * make %files section more specific
+
+- update to version 0.1.6:
+  * Restrict what categories get passed through (#358)
+  * Deep digging for multi-indexes (#356)
+  * allow_empty is the default in >=zstandard-0.9 (#355)
+  * Remove setup_requires from setup.py (#345)
+  * Fixed error if a certain partition is empty, when writing a
+    partitioned (#347)
+  * Allow UTF8 column names to be read (#342)
+  * re-add test file
+  * Allow for NULL converted type (#340)
+  * Robust partition names (#336)
+  * Fix accidental multiindex
+  * Read multi indexes (#331)
+  * Allow reading from any file-like (#330)
+  * change `parquet-format` link to apache repo (#328)
+  * Remove extra space from api.py (#325)
+  * numba bool fun (#324)
+
+- changes from version 0.1.5:
+  * Fix _dtypes to be more efficient, to work with files with lots
+    of columns (#318)
+  * Buildfix (#313)
+  * Use LZ4 block compression for compatibility with parquet-cpp
+    (#314) (#315)
+  * Fix typo in ParquetFile docstring (#312)
+  * Remove annoying print() when reading file with CategoricalDtype
+    index (#311)
+  * Allow lists of multi-file data-sets (#309)
+  * Accelerate dataframe.empty for small/medium sizes (#307)
+  * Include dictionary page in column size (#306)
+  * Fix for selecting columns which were used for partitioning (#304)
+  * Remove occurrences of np.fromstring (#303)
+  * Add support for zstandard compression (#296)
+  * Int96time order (#298)
+
+- changes from version 0.1.4:
+  * Add handling of keyword arguments for compressor (#294)
+  * Fix setup.py duplication (#295)
+  * Integrate pytest with setup.py (#293)
+  * Get setup.py pytest to work. (#287)
+  * Add LZ4 support (#292)
+  * Update for forthcoming thrift release (#281)
+  * If timezones are in pandas metadata, assign columns as required
+    (#285)
+  * Pandas import (#284)
+  * Copy FMDs instead of mutate (#279)
+  * small fixes (#278)
+  * fixes to get benchmark to work (#276)
+  * backwards compat with Dask
+  * Fix test_time_millis on Windows (#275)
+  * join paths os-independently (#271)
+  * Adds int32 support for object encoding (#268)
+  * Fix a couple small typos in documentation (#267)
+  * Partition order should be sorted (#265)
+  * COMPAT: Update thrift (#264)
+  * Speedups result (#253)
+  * Remove thrift_copy
+  * Define `__copy__` on thrift structures
+  * Update rtd deps
+
+- changes from version 0.1.3:
+  * More care over append when partitioning multiple columns
+  * Sep for windows cats filtering
+  * Move pytest imports to tests/, remove requirement
+  * Special-case only zeros
+  * Cope with partition values like "07"
+  * fix for s3
+  * Fix for list of paths rooted in the current directory
+  * add test
+  * Explicit file opens
+  * update docstring
+  * Refactor partition interpretation
+  * py2 fix
+  * Error in test changed
+  * Better error messages when failed to convert on write
+
+- changes from version 0.1.2:
+  * Revert accidental removal of s3 import
+  * Move thrift things together, and make thrift serializer for
+    pickle
+  * COMPAT: for new pandas CategoricalDtype
+  * Fixup for backwards seeking.
+  * Fix some test failures
+  * Prototype version using thrift instead of thriftpy
+  * Not all mergers have cats
+  * Revert accidental deletion
+  * remove warnings
+  * Sort keys in json for metadata
+  * Check column chunks for categories sizes
+  * Account for partition dir names with numbers
+  * Fix map/list doc
+  * Catch more stats errors
+  * Prevent pandas auto-names being given to index
+
+- changes from version 0.1.1:
+  * Add workaround for single-value-partition
+  * update test
+  * Simplify and fix for py2
+  * Use thrift encoding on statistics strings
+  * remove redundant SNAPPY from supported compressions list
+  * Fix statistics
+  * lists again
+  * Always convert int96 to times
+  * Update docs
+  * attribute typo
+  * Fix definition level
+  * Add test, clean columns
+  * Allow optional->optional lists and maps
+  * Flatten schema to enable loading of non-repeated columns
+  * Remove extra file
+  * Fix py2
+  * Fix "in" filter to cope with strings that could be numbers
+  * Allow pip install without NumPy or Cython
+
+- changes from version 0.1.0:
+  * Add ParquetFile attribute documentation
+  * Fix tests
+  * Enable append to an empty dataset
+  * More warning words and check on partition_on
+  * Do not fail stats if there are no row-groups
+  * Fix "numpy_dtype" -> "numpy_type"
+  * "in" was checking range not exact membership of set
+  * If metadata gives index, put in columns
+  * Fix pytest warning
+  * Fail on ordering dict statistics
+  * Fix stats filter
+  * clean test
+  * Fix ImportWarning on Python 3.6+
+  * TEST: added updated test file for special strings used in filters
+  * fix links
+  * [README]: indicate dependency on LLVM 4.0.x.
+  * Filter stats had unfortunate converted_type check
+  * Ignore exceptions in val_to_num
+  * Also for TODAY
+  * Very special case for partition: NOW should be kept as string
+  * Allow partition_on; fix category nulls
+  * Remove old category key/values on writing
+  * Implement writing pandas metadata and auto-setting cats/index
+  * Pandas compatibility
+  * Test and fix for filter on single file
+  * Do not attempt to recurse into schema elements with zero children
+
+-------------------------------------------------------------------
+Thu Jun 7 20:41:31 UTC 2018 - jengelh@inai.de
+
+- Fix up grammar; replace future aims with what it does now.
+
+-------------------------------------------------------------------
+Thu May 3 14:07:08 UTC 2018 - toddrme2178@gmail.com
+
+- Use %license tag
+
+-------------------------------------------------------------------
+Thu May 25 12:19:26 UTC 2017 - toddrme2178@gmail.com
+
+- Initial version
diff --git a/python-fastparquet.spec b/python-fastparquet.spec
new file mode 100644
index 0000000..1582257
--- /dev/null
+++ b/python-fastparquet.spec
@@ -0,0 +1,87 @@
+#
+# spec file for package python-fastparquet
+#
+# Copyright (c) 2023 SUSE LLC
+#
+# All modifications and additions to the file contributed by third parties
+# remain the property of their copyright owners, unless otherwise agreed
+# upon. The license for this file, and modifications and additions to the
+# file, is the same license as for the pristine package itself (unless the
+# license for the pristine package is not an Open Source License, in which
+# case the license is the MIT License). An "Open Source License" is a
+# license that conforms to the Open Source Definition (Version 1.9)
+# published by the Open Source Initiative.
+
+# Please submit bugfixes or comments via https://bugs.opensuse.org/
+#
+
+
+%{?sle15_python_module_pythons}
+Name:           python-fastparquet
+Version:        2023.10.1
+Release:        0
+Summary:        Python support for Parquet file format
+License:        Apache-2.0
+URL:            https://github.com/dask/fastparquet/
+# Use the GitHub archive, because it contains the test modules and data; requires setting the version manually for setuptools_scm
+Source:         https://github.com/dask/fastparquet/archive/%{version}.tar.gz#/fastparquet-%{version}.tar.gz
+BuildRequires:  %{python_module Cython >= 0.29.23}
+BuildRequires:  %{python_module base >= 3.8}
+BuildRequires:  %{python_module cramjam >= 2.3.0}
+# version requirement not declared for runtime, but necessary for tests.
+BuildRequires:  %{python_module fsspec >= 2021.6.0}
+BuildRequires:  %{python_module numpy-devel >= 1.20.3}
+BuildRequires:  %{python_module packaging}
+BuildRequires:  %{python_module pandas >= 1.5.0}
+BuildRequires:  %{python_module pip}
+BuildRequires:  %{python_module pytest-asyncio}
+BuildRequires:  %{python_module pytest-xdist}
+BuildRequires:  %{python_module pytest}
+BuildRequires:  %{python_module python-lzo}
+BuildRequires:  %{python_module setuptools_scm > 1.5.4}
+BuildRequires:  %{python_module setuptools}
+BuildRequires:  %{python_module wheel}
+BuildRequires:  fdupes
+BuildRequires:  git-core
+BuildRequires:  python-rpm-macros
+Requires:       python-cramjam >= 2.3.0
+Requires:       python-fsspec
+Requires:       python-numpy >= 1.20.3
+Requires:       python-packaging
+Requires:       python-pandas >= 1.5.0
+Recommends:     python-python-lzo
+%python_subpackages
+
+%description
+This is a Python implementation of the parquet format
+for integrating it into python-based Big Data workflows.
+
+%prep
+%autosetup -p1 -n fastparquet-%{version}
+# remove pytest-runner from setup_requires
+sed -i "s/'pytest-runner',//" setup.py
+# this is not meant for setup.py
+sed -i "s/oldest-supported-numpy/numpy/" setup.py
+# the tests import the fastparquet.test module and we need to import from sitearch, so install it.
+sed -i -e "s/^\s*packages=\[/&'fastparquet.test', /" -e "/exclude_package_data/ d" setup.py
+
+%build
+export CFLAGS="%{optflags}"
+export SETUPTOOLS_SCM_PRETEND_VERSION=%{version}
+%pyproject_wheel
+
+%install
+%pyproject_install
+%python_expand rm -v %{buildroot}%{$python_sitearch}/fastparquet/{speedups,cencoding}.c
+%python_expand %fdupes %{buildroot}%{$python_sitearch}
+
+%check
+%pytest_arch --pyargs fastparquet --import-mode append -n auto
+
+%files %{python_files}
+%doc README.rst
+%license LICENSE
+%{python_sitearch}/fastparquet
+%{python_sitearch}/fastparquet-%{version}*-info
+
+%changelog
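As a quick orientation for what the packaged module provides (and roughly what the test suite driven by %check exercises), a minimal write/read round trip with fastparquet looks like the following sketch; the output path and sample data are arbitrary, and the SNAPPY codec is among those supplied by the cramjam dependency:

```python
import pandas as pd
from fastparquet import ParquetFile, write

# Build a tiny DataFrame and write it out as a Parquet file.
df = pd.DataFrame({"id": range(5), "value": [0.1, 0.2, 0.3, 0.4, 0.5]})
write("/tmp/demo.parquet", df, compression="SNAPPY")

# Read it back into pandas via ParquetFile.
print(ParquetFile("/tmp/demo.parquet").to_pandas())
```

The %pytest_arch line in %check is roughly the packaged equivalent of running pytest --pyargs fastparquet --import-mode append -n auto against the installed module, including the fastparquet.test subpackage that the %prep sed call re-adds to the package list.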
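The row-level filtering and to_pandas() row selection described in the 0.7.x changelog entries can be sketched as below; the dataset path is hypothetical, and the keyword names (filters, row_filter) follow the upstream fastparquet documentation for that API rather than anything guaranteed by this spec:

```python
from fastparquet import ParquetFile

# Open a directory dataset; since 0.7.0 a _metadata summary file is optional.
pf = ParquetFile("dataset_dir/")

# Load only two columns, keeping rows where "value" > 0.  With row_filter=True
# the filter is applied inside row-groups as well, not just per row-group.
df = pf.to_pandas(
    columns=["ts", "value"],
    filters=[("value", ">", 0)],
    row_filter=True,
)
```

As the 0.7.0 notes explain, this takes two passes (one to build the boolean mask from the filter columns, one to load the requested columns), trading speed for a smaller memory footprint.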