From a9df2175f5b48af8e43f45cfb9b3db0d367261fc710df0174ac69d3e1ed3d0ff Mon Sep 17 00:00:00 2001
From: Christian Goll
Date: Tue, 23 Jul 2024 09:21:39 +0000
Subject: [PATCH] initial commit of rust-based python-tokenizers

OBS-URL: https://build.opensuse.org/package/show/science:machinelearning/python-tokenizers?expand=0&rev=1
---
 .gitattributes            | 23 +++++++++++
 .gitignore                |  1 +
 python-tokenizers.changes |  4 +++
 python-tokenizers.spec    | 74 ++++++++++++++++++++++++++++++++++++++
 tokenizers-0.19.1.tar.gz  |  3 ++
 vendor.tar.gz             |  3 ++
 6 files changed, 108 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 .gitignore
 create mode 100644 python-tokenizers.changes
 create mode 100644 python-tokenizers.spec
 create mode 100644 tokenizers-0.19.1.tar.gz
 create mode 100644 vendor.tar.gz

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..9b03811
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,23 @@
+## Default LFS
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.bsp filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.gem filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.jar filter=lfs diff=lfs merge=lfs -text
+*.lz filter=lfs diff=lfs merge=lfs -text
+*.lzma filter=lfs diff=lfs merge=lfs -text
+*.obscpio filter=lfs diff=lfs merge=lfs -text
+*.oxt filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.rpm filter=lfs diff=lfs merge=lfs -text
+*.tbz filter=lfs diff=lfs merge=lfs -text
+*.tbz2 filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.ttf filter=lfs diff=lfs merge=lfs -text
+*.txz filter=lfs diff=lfs merge=lfs -text
+*.whl filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..57affb6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.osc
diff --git a/python-tokenizers.changes b/python-tokenizers.changes
new file mode 100644
index 0000000..a614c3f
--- /dev/null
+++ b/python-tokenizers.changes
@@ -0,0 +1,4 @@
+-------------------------------------------------------------------
+Wed Jul  3 14:55:36 UTC 2024 - Christian Goll
+
+- initial commit of rust-based python-tokenizers
diff --git a/python-tokenizers.spec b/python-tokenizers.spec
new file mode 100644
index 0000000..7aa2c4f
--- /dev/null
+++ b/python-tokenizers.spec
@@ -0,0 +1,74 @@
+#
+# spec file for package python-tokenizers
+#
+# Copyright (c) 2024 SUSE LLC
+#
+# All modifications and additions to the file contributed by third parties
+# remain the property of their copyright owners, unless otherwise agreed
+# upon. The license for this file, and modifications and additions to the
+# file, is the same license as for the pristine package itself (unless the
+# license for the pristine package is not an Open Source License, in which
+# case the license is the MIT License). An "Open Source License" is a
+# license that conforms to the Open Source Definition (Version 1.9)
+# published by the Open Source Initiative.
+
+# Please submit bugfixes or comments via https://bugs.opensuse.org/
+#
+
+
+%{?!python_module:%define python_module() python-%{**} python3-%{**}}
+
+Name:           python-tokenizers
+Version:        0.19.1
+Release:        0
+Summary:        Implementation of today's most used tokenizers
+License:        Apache-2.0
+URL:            https://github.com/huggingface/tokenizers
+Source0:        https://github.com/huggingface/tokenizers/archive/refs/tags/v%{version}.tar.gz#/tokenizers-%{version}.tar.gz
+Source1:        vendor.tar.gz
+BuildRequires:  %{python_module devel}
+BuildRequires:  %{python_module maturin}
+BuildRequires:  %{python_module pip}
+BuildRequires:  %{python_module setuptools}
+BuildRequires:  cargo-packaging
+BuildRequires:  gcc-c++
+BuildRequires:  fdupes
+BuildRequires:  python-rpm-macros
+%python_subpackages
+
+%description
+Provides an implementation of today's most used tokenizers, with a focus on
+performance and versatility.
+* Train new vocabularies and tokenize, using today's most used tokenizers.
+* Extremely fast (both training and tokenization), thanks to the Rust
+  implementation. Takes less than 20 seconds to tokenize a GB of text on a
+  server's CPU.
+* Easy to use, but also extremely versatile.
+* Designed for research and production.
+* Normalization comes with alignments tracking. It's always possible to get the
+  part of the original sentence that corresponds to a given token.
+* Does all the pre-processing: Truncate, Pad, add the special tokens your model
+  needs.
+
+%prep
+%autosetup -p1 -n tokenizers-%{version}
+cd bindings/python
+tar xzf %{S:1}
+
+%build
+cd bindings/python
+%pyproject_wheel
+
+%install
+cd bindings/python
+%pyproject_install
+%python_expand %fdupes %{buildroot}%{$python_sitearch}
+
+%check
+
+%files %{python_files}
+%license LICENSE
+%doc README.md
+%{python_sitearch}/tokenizers*
+
+%changelog
diff --git a/tokenizers-0.19.1.tar.gz b/tokenizers-0.19.1.tar.gz
new file mode 100644
index 0000000..62d966d
--- /dev/null
+++ b/tokenizers-0.19.1.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53f5e644148c14cf2c429f8eb321cc7f75e3092973ca6b0ced5b516214a081bf
+size 1521372
diff --git a/vendor.tar.gz b/vendor.tar.gz
new file mode 100644
index 0000000..4a53db9
--- /dev/null
+++ b/vendor.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75f3dc1f210fe1b404dc08ac2793b4610ad98e15c033ce573996b89f686647dc
+size 24387077
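
A quick functional check of the built bindings, following the upstream
tokenizers quicktour: a minimal sketch, not shipped with the package, and
corpus.txt is a hypothetical stand-in for any plain-text training file.

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import BpeTrainer

    # Train a fresh BPE vocabulary from plain text (the "train new
    # vocabularies" bullet in %description).
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.train(files=["corpus.txt"], trainer=trainer)  # corpus.txt: any UTF-8 text

    # Encode a sentence; the offsets show the alignment tracking mentioned in
    # %description: each token maps back to a character span of the input.
    output = tokenizer.encode("Hello, y'all! How are you?")
    print(output.tokens)
    print(output.offsets)

Running this against the freshly built RPM (for example inside the osc build
chroot) exercises the compiled Rust extension end to end, covering both
training and encoding paths.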