diff --git a/_service b/_service
new file mode 100644
index 0000000..dd5811d
--- /dev/null
+++ b/_service
@@ -0,0 +1,13 @@
+
+
+
+
+ tokenizers*.tar.gz
+ zst
+ bindings/python/Cargo.toml
+ tokenizers/Cargo.toml
+ false
+ true
+
+
+
diff --git a/python-tokenizers.changes b/python-tokenizers.changes
index a614c3f..2013696 100644
--- a/python-tokenizers.changes
+++ b/python-tokenizers.changes
@@ -1,3 +1,50 @@
+-------------------------------------------------------------------
+Tue Aug 20 07:27:42 UTC 2024 - Simon Lees
+
+- Fix testsuite on 15.6
+
+-------------------------------------------------------------------
+Sun Aug 18 16:49:56 UTC 2024 - Soc Virnyl Estela
+
+- Replace vendor tarball with zstd-compressed vendor tarball
+- Force gcc version on leap. Thanks @marv7000 for your zed.spec
+- Use `CARGO_*` environment variables to force generation of
+  full debuginfo and avoid stripping.
+- Enable cargo test in %check.
+- Update to version 0.20.0:
+  * remove enforcement of non special when adding tokens
+  * [BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder
+  * Make USED_PARALLELISM atomic
+  * Fixing for clippy 1.78
+  * feat(ci): add trufflehog secrets detection
+  * Switch from cached_download to hf_hub_download in tests
+  * Fix "dictionnary" typo
+  * make sure we don't warn on empty tokens
+  * Enable dropout = 0.0 as an equivalent to none in BPE
+  * Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) …
+  * Add bytelevel normalizer to fix decode when adding tokens to BPE
+  * Fix clippy + feature test management.
+  * Bump spm_precompiled to 0.1.3
+  * Add benchmark vs tiktoken
+  * Fixing the benchmark.
+  * Tiny improvement
+  * Enable fancy regex
+  * Fixing release CI strict (taken from safetensors).
+  * Adding some serialization testing around the wrapper.
+  * Add-legacy-tests
+  * Adding a few tests for decoder deserialization.
+  * Better serialization error
+  * Add test normalizers
+  * Improve decoder deserialization
+  * Using serde (serde_pyo3) to get str and repr easily.
+  * Merges cannot handle tokens containing spaces.
+  * Fix doc about split
+  * Support None to reset pre_tokenizers and normalizers, and index sequences
+  * Fix strip python type
+  * Tests + Deserialization improvement for normalizers.
+  * add deserialize for pre tokenizers
+  * Perf improvement 16% by removing offsets.
+
 -------------------------------------------------------------------
 Wed Jul  3 14:55:36 UTC 2024 - Christian Goll
 
diff --git a/python-tokenizers.spec b/python-tokenizers.spec
index 7aa2c4f..05ec0a1 100644
--- a/python-tokenizers.spec
+++ b/python-tokenizers.spec
@@ -15,26 +15,30 @@
 # Please submit bugfixes or comments via https://bugs.opensuse.org/
 #
 
+# Force the GCC version on Leap (suse_version below 1550).
+%if 0%{?suse_version} && 0%{?suse_version} < 1550
+%global force_gcc_version 13
+%endif
 
-%{?!python_module:%define python_module() python-%{**} python3-%{**}}
-
+%{?sle15_python_module_pythons}
 Name:           python-tokenizers
-Version:        0.19.1
+Version:        0.20.0
 Release:        0
 Summary:        Provides an implementation of today's most used tokenizers
 License:        Apache-2.0
 URL:            https://github.com/huggingface/tokenizers
 Source0:        https://github.com/huggingface/tokenizers/archive/refs/tags/v%{version}.tar.gz#/tokenizers-%{version}.tar.gz
-Source1:        vendor.tar.gz
+Source1:        vendor.tar.zst
 BuildRequires:  %{python_module devel}
 BuildRequires:  %{python_module maturin}
 BuildRequires:  %{python_module pip}
 BuildRequires:  %{python_module setuptools}
 BuildRequires:  cargo-packaging
-BuildRequires:  gcc-c++
+BuildRequires:  gcc%{?force_gcc_version}-c++
 BuildRequires:  fdupes
 BuildRequires:  python-rpm-macros
-BuildRequires:  python-rpm-macros
+BuildRequires:  zstd
+Requires:       %{python_module huggingface-hub}
 %python_subpackages
 
 %description
@@ -52,20 +56,37 @@ performance and versatility.
 needs.
 
 %prep
-%autosetup -p1 -n tokenizers-%{version}
-cd bindings/python
-tar xzf %{S:1}
+%autosetup -p1 -n tokenizers-%{version} -a1
 
 %build
-cd bindings/python
+# Build offline with full, unstripped debuginfo in the release profile.
+export CARGO_NET_OFFLINE=true
+export CARGO_PROFILE_RELEASE_DEBUG=full
+export CARGO_PROFILE_RELEASE_SPLIT_DEBUGINFO=off
+export CARGO_PROFILE_RELEASE_STRIP=false
+%if 0%{?force_gcc_version}
+export CC="gcc-%{?force_gcc_version}"
+export CXX="g++-%{?force_gcc_version}"
+%endif
+pushd bindings/python
 %pyproject_wheel
 
 %install
-cd bindings/python
+export CARGO_NET_OFFLINE=true
+export CARGO_PROFILE_RELEASE_DEBUG=full
+export CARGO_PROFILE_RELEASE_SPLIT_DEBUGINFO=off
+export CARGO_PROFILE_RELEASE_STRIP=false
+pushd bindings/python
 %pyproject_install
 %python_expand %fdupes %{buildroot}/%{$python_sitearch}/*
 
 %check
+%if 0%{?force_gcc_version}
+export CC="gcc-%{?force_gcc_version}"
+export CXX="g++-%{?force_gcc_version}"
+%endif
+# See https://doc.rust-lang.org/cargo/reference/config.html#hierarchical-structure
+%{cargo_test} --manifest-path ./tokenizers/Cargo.toml --lib
 
 %files %{python_files}
 %license LICENSE
diff --git a/tokenizers-0.19.1.tar.gz b/tokenizers-0.19.1.tar.gz
deleted file mode 100644
index 62d966d..0000000
--- a/tokenizers-0.19.1.tar.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:53f5e644148c14cf2c429f8eb321cc7f75e3092973ca6b0ced5b516214a081bf
-size 1521372
diff --git a/tokenizers-0.20.0.tar.gz b/tokenizers-0.20.0.tar.gz
new file mode 100644
index 0000000..3f95dbb
--- /dev/null
+++ b/tokenizers-0.20.0.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea027dbebbca61b28e1a4512eb447e513af3004bd268bcf139b51a384c073cb5
+size 1537041
diff --git a/vendor.tar.gz b/vendor.tar.gz
deleted file mode 100644
index 4a53db9..0000000
--- a/vendor.tar.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:75f3dc1f210fe1b404dc08ac2793b4610ad98e15c033ce573996b89f686647dc
-size 24387077
diff --git a/vendor.tar.zst b/vendor.tar.zst
new file mode 100644
index 0000000..d484629
--- /dev/null
+++ b/vendor.tar.zst
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e5695527d437259fb7ac111196a24a3137f982c4ace3dc7260e235814dd094c
+size 28934904