From a9df2175f5b48af8e43f45cfb9b3db0d367261fc710df0174ac69d3e1ed3d0ff Mon Sep 17 00:00:00 2001
From: Christian Goll
Date: Tue, 23 Jul 2024 09:21:39 +0000
Subject: [PATCH] initial commit of rust-based python-tokenizers

OBS-URL: https://build.opensuse.org/package/show/science:machinelearning/python-tokenizers?expand=0&rev=1
---
 .gitattributes            | 23 +++++++++++
 .gitignore                |  1 +
 python-tokenizers.changes |  4 +++
 python-tokenizers.spec    | 74 ++++++++++++++++++++++++++++++++++++++
 tokenizers-0.19.1.tar.gz  |  3 ++
 vendor.tar.gz             |  3 ++
 6 files changed, 108 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 .gitignore
 create mode 100644 python-tokenizers.changes
 create mode 100644 python-tokenizers.spec
 create mode 100644 tokenizers-0.19.1.tar.gz
 create mode 100644 vendor.tar.gz

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..9b03811
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,23 @@
+## Default LFS
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.bsp filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.gem filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.jar filter=lfs diff=lfs merge=lfs -text
+*.lz filter=lfs diff=lfs merge=lfs -text
+*.lzma filter=lfs diff=lfs merge=lfs -text
+*.obscpio filter=lfs diff=lfs merge=lfs -text
+*.oxt filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.rpm filter=lfs diff=lfs merge=lfs -text
+*.tbz filter=lfs diff=lfs merge=lfs -text
+*.tbz2 filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.ttf filter=lfs diff=lfs merge=lfs -text
+*.txz filter=lfs diff=lfs merge=lfs -text
+*.whl filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..57affb6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.osc
diff --git a/python-tokenizers.changes b/python-tokenizers.changes
new file mode 100644
index 0000000..a614c3f
--- /dev/null
+++ b/python-tokenizers.changes
@@ -0,0 +1,4 @@
+-------------------------------------------------------------------
+Wed Jul  3 14:55:36 UTC 2024 - Christian Goll
+
+- initial commit of rust-based python-tokenizers
diff --git a/python-tokenizers.spec b/python-tokenizers.spec
new file mode 100644
index 0000000..7aa2c4f
--- /dev/null
+++ b/python-tokenizers.spec
@@ -0,0 +1,74 @@
+#
+# spec file for package python-tokenizers
+#
+# Copyright (c) 2024 SUSE LLC
+#
+# All modifications and additions to the file contributed by third parties
+# remain the property of their copyright owners, unless otherwise agreed
+# upon. The license for this file, and modifications and additions to the
+# file, is the same license as for the pristine package itself (unless the
+# license for the pristine package is not an Open Source License, in which
+# case the license is the MIT License). An "Open Source License" is a
+# license that conforms to the Open Source Definition (Version 1.9)
+# published by the Open Source Initiative.
+
+# Please submit bugfixes or comments via https://bugs.opensuse.org/
+#
+
+
+%{?!python_module:%define python_module() python-%{**} python3-%{**}}
+
+Name:           python-tokenizers
+Version:        0.19.1
+Release:        0
+Summary:        Implementation of today's most used tokenizers
+License:        Apache-2.0
+URL:            https://github.com/huggingface/tokenizers
+Source0:        https://github.com/huggingface/tokenizers/archive/refs/tags/v%{version}.tar.gz#/tokenizers-%{version}.tar.gz
+Source1:        vendor.tar.gz
+BuildRequires:  %{python_module devel}
+BuildRequires:  %{python_module maturin}
+BuildRequires:  %{python_module pip}
+BuildRequires:  %{python_module setuptools}
+BuildRequires:  cargo-packaging
+BuildRequires:  gcc-c++
+BuildRequires:  fdupes
+BuildRequires:  python-rpm-macros
+%python_subpackages
+
+%description
+Provides an implementation of today's most used tokenizers, with a focus on
+performance and versatility.
+* Train new vocabularies and tokenize, using today's most used tokenizers.
+* Extremely fast (both training and tokenization), thanks to the Rust
+  implementation. Takes less than 20 seconds to tokenize a GB of text on a
+  server's CPU.
+* Easy to use, but also extremely versatile.
+* Designed for research and production.
+* Normalization comes with alignments tracking. It's always possible to get the
+  part of the original sentence that corresponds to a given token.
+* Does all the pre-processing: Truncate, Pad, add the special tokens your model
+  needs.
+
+%prep
+%autosetup -p1 -n tokenizers-%{version}
+cd bindings/python
+tar xzf %{S:1}
+
+%build
+cd bindings/python
+%pyproject_wheel
+
+%install
+cd bindings/python
+%pyproject_install
+%python_expand %fdupes %{buildroot}%{$python_sitearch}
+
+%check
+
+%files %{python_files}
+%license LICENSE
+%doc README.md
+%{python_sitearch}/tokenizers*
+
+%changelog
diff --git a/tokenizers-0.19.1.tar.gz b/tokenizers-0.19.1.tar.gz
new file mode 100644
index 0000000..62d966d
--- /dev/null
+++ b/tokenizers-0.19.1.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53f5e644148c14cf2c429f8eb321cc7f75e3092973ca6b0ced5b516214a081bf
+size 1521372
diff --git a/vendor.tar.gz b/vendor.tar.gz
new file mode 100644
index 0000000..4a53db9
--- /dev/null
+++ b/vendor.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75f3dc1f210fe1b404dc08ac2793b4610ad98e15c033ce573996b89f686647dc
+size 24387077
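
A quick functional check of the built bindings, following the upstream
tokenizers quicktour: a minimal sketch, not shipped with the package, and
corpus.txt is a hypothetical stand-in for any plain-text training file.

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import BpeTrainer

    # Train a fresh BPE vocabulary from plain text (the "train new
    # vocabularies" bullet in %description).
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.train(files=["corpus.txt"], trainer=trainer)  # corpus.txt: any UTF-8 text

    # Encode a sentence; the offsets show the alignment tracking mentioned in
    # %description: each token maps back to a character span of the input.
    output = tokenizer.encode("Hello, y'all! How are you?")
    print(output.tokens)
    print(output.offsets)

Running this against the freshly built RPM (for example inside the osc build
chroot) exercises the compiled Rust extension end to end, covering both
training and encoding paths.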