diff --git a/_service b/_service
new file mode 100644
index 0000000..dd5811d
--- /dev/null
+++ b/_service
@@ -0,0 +1,11 @@
+<services>
+  <service name="download_files" mode="manual"/>
+  <service name="cargo_vendor" mode="manual">
+    <param name="srctar">tokenizers*.tar.gz</param>
+    <param name="compression">zst</param>
+    <param name="cargotoml">bindings/python/Cargo.toml</param>
+    <param name="cargotoml">tokenizers/Cargo.toml</param>
+    <param name="update">false</param>
+    <param name="i-accept-risk">true</param>
+  </service>
+</services>
diff --git a/python-tokenizers.changes b/python-tokenizers.changes
index a614c3f..2013696 100644
--- a/python-tokenizers.changes
+++ b/python-tokenizers.changes
@@ -1,3 +1,50 @@
+-------------------------------------------------------------------
+Tue Aug 20 07:27:42 UTC 2024 - Simon Lees
+
+- Fix testsuite on 15.6
+
+-------------------------------------------------------------------
+Sun Aug 18 16:49:56 UTC 2024 - Soc Virnyl Estela
+
+- Replace vendor tarball to zstd compressed vendor tarball
+- Force gcc version on leap. Thanks @marv7000 for your zed.spec
+- Use `CARGO_*` environment variables to force generation of
+  full debuginfo and avoid stripping.
+- Enable cargo test in %check.
+- Update to version 0.20.0:
+ * remove enforcement of non special when adding tokens
+ * [BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder
+ * Make USED_PARALLELISM atomic
+ * Fixing for clippy 1.78
+ * feat(ci): add trufflehog secrets detection
+ * Switch from cached_download to hf_hub_download in tests
+ * Fix "dictionnary" typo
+ * make sure we don't warn on empty tokens
+ * Enable dropout = 0.0 as an equivalent to none in BPE
+ * Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) …
+ * Add bytelevel normalizer to fix decode when adding tokens to BPE
+ * Fix clippy + feature test management.
+ * Bump spm_precompiled to 0.1.3
+ * Add benchmark vs tiktoken
+ * Fixing the benchmark.
+ * Tiny improvement
+ * Enable fancy regex
+ * Fixing release CI strict (taken from safetensors).
+ * Adding some serialization testing around the wrapper.
+ * Add-legacy-tests
+ * Adding a few tests for decoder deserialization.
+ * Better serialization error
+ * Add test normalizers
+ * Improve decoder deserialization
+ * Using serde (serde_pyo3) to get str and repr easily.
+ * Merges cannot handle tokens containing spaces.
+ * Fix doc about split
+ * Support None to reset pre_tokenizers and normalizers, and index sequences
+ * Fix strip python type
+ * Tests + Deserialization improvement for normalizers.
+ * add deserialize for pre tokenizers
+ * Perf improvement 16% by removing offsets.
+
-------------------------------------------------------------------
Wed Jul 3 14:55:36 UTC 2024 - Christian Goll
diff --git a/python-tokenizers.spec b/python-tokenizers.spec
index 7aa2c4f..05ec0a1 100644
--- a/python-tokenizers.spec
+++ b/python-tokenizers.spec
@@ -15,26 +15,29 @@
# Please submit bugfixes or comments via https://bugs.opensuse.org/
#
+%if 0%{?suse_version} && 0%{?suse_version} < 1550
+%global force_gcc_version 13
+%endif
-%{?!python_module:%define python_module() python-%{**} python3-%{**}}
-
+%{?sle15_python_module_pythons}
Name: python-tokenizers
-Version: 0.19.1
+Version: 0.20.0
Release: 0
Summary: Provides an implementation of today's most used tokenizers
License: Apache-2.0
URL: https://github.com/huggingface/tokenizers
Source0: https://github.com/huggingface/tokenizers/archive/refs/tags/v%{version}.tar.gz#/tokenizers-%{version}.tar.gz
-Source1: vendor.tar.gz
+Source1: vendor.tar.zst
BuildRequires: %{python_module devel}
BuildRequires: %{python_module maturin}
BuildRequires: %{python_module pip}
BuildRequires: %{python_module setuptools}
BuildRequires: cargo-packaging
-BuildRequires: gcc-c++
+BuildRequires: gcc%{?force_gcc_version}-c++
BuildRequires: fdupes
BuildRequires: python-rpm-macros
-BuildRequires: python-rpm-macros
+BuildRequires: zstd
+Requires:       python-huggingface-hub
%python_subpackages
%description
@@ -52,20 +55,37 @@ performance and versatility.
needs.
%prep
-%autosetup -p1 -n tokenizers-%{version}
-cd bindings/python
-tar xzf %{S:1}
+%autosetup -p1 -n tokenizers-%{version} -a1
%build
-cd bindings/python
+export CARGO_NET_OFFLINE=true
+export CARGO_PROFILE_RELEASE_DEBUG=full
+export CARGO_PROFILE_RELEASE_SPLIT_DEBUGINFO=off
+export CARGO_PROFILE_RELEASE_STRIP=false
+%if 0%{?force_gcc_version}
+export CC="gcc-%{?force_gcc_version}"
+export CXX="g++-%{?force_gcc_version}"
+%endif
+pushd bindings/python
%pyproject_wheel
+popd
%install
-cd bindings/python
+export CARGO_NET_OFFLINE=true
+export CARGO_PROFILE_RELEASE_DEBUG=full
+export CARGO_PROFILE_RELEASE_SPLIT_DEBUGINFO=off
+export CARGO_PROFILE_RELEASE_STRIP=false
+pushd bindings/python
%pyproject_install
%python_expand %fdupes %{buildroot}/%{$python_sitearch}/*
%check
+%if 0%{?force_gcc_version}
+export CC="gcc-%{?force_gcc_version}"
+export CXX="g++-%{?force_gcc_version}"
+%endif
+# See https://doc.rust-lang.org/cargo/reference/config.html#hierarchical-structure
+%{cargo_test} --manifest-path ./tokenizers/Cargo.toml --lib
%files %{python_files}
%license LICENSE
diff --git a/tokenizers-0.19.1.tar.gz b/tokenizers-0.19.1.tar.gz
deleted file mode 100644
index 62d966d..0000000
--- a/tokenizers-0.19.1.tar.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:53f5e644148c14cf2c429f8eb321cc7f75e3092973ca6b0ced5b516214a081bf
-size 1521372
diff --git a/tokenizers-0.20.0.tar.gz b/tokenizers-0.20.0.tar.gz
new file mode 100644
index 0000000..3f95dbb
--- /dev/null
+++ b/tokenizers-0.20.0.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea027dbebbca61b28e1a4512eb447e513af3004bd268bcf139b51a384c073cb5
+size 1537041
diff --git a/vendor.tar.gz b/vendor.tar.gz
deleted file mode 100644
index 4a53db9..0000000
--- a/vendor.tar.gz
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:75f3dc1f210fe1b404dc08ac2793b4610ad98e15c033ce573996b89f686647dc
-size 24387077
diff --git a/vendor.tar.zst b/vendor.tar.zst
new file mode 100644
index 0000000..d484629
--- /dev/null
+++ b/vendor.tar.zst
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e5695527d437259fb7ac111196a24a3137f982c4ace3dc7260e235814dd094c
+size 28934904