From db41c60eaaecbbae039d051e336b6c479a5ffb8e Mon Sep 17 00:00:00 2001 From: Raphael Gaschignard Date: Thu, 21 Apr 2022 23:21:09 +0900 Subject: [PATCH] Preserve unicode when slugifying by default This change is motivated by the many many people using non-latin languages who experience the (rather baffling) behavior of outright stripping characters when generating slugs from stuff that doesn't fit into ASCII. We went through loads of pain as a programming community to get to nicely supporting unicode everywhere, it's time to take advantage of that fact and just let people have stuff appear in their native language as much as possible. --- CHANGELOG.rst | 29 ++++++++++++++++---- docs/faq.rst | 3 +- docs/getting_started.rst | 20 ++++++++++++++ taggit/migrations/0005_auto_20220424_2025.py | 20 ++++++++++++++ taggit/models.py | 11 ++++++-- tests/test_models.py | 27 ++++++++++++++++++ 6 files changed, 102 insertions(+), 8 deletions(-) create mode 100644 taggit/migrations/0005_auto_20220424_2025.py create mode 100644 tests/test_models.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3b681637..3aa4155a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,14 +4,33 @@ Changelog (Unreleased) ~~~~~~~~~~~~ -* Drop Django 2.2 support. +* **Backwards incompatible:** Tag slugification used to silently strip non-ASCII characters + from the tag name to make the slug. This leads to a lot of confusion for anyone using + languages with non-latin alphabets, as well as weird performance issues. + + Tag slugification will now, by default, maintain unicode characters as-is during + slugification. This will lead to less surprises, but might cause issues for you if you are + expecting all of your tag slugs to fit within a regex like ``[a-zA-Z0-9]`` (for example in + URL routing configurations). + + Generally speaking, this should not require action on your part as a library user, as + existing tag slugs are persisted in the database, and only new tags will receive the + enhanced unicode-compatible slug. + + If you wish to maintain the old stripping behavior, set the setting + ``TAGGIT_STRIP_UNICODE_WHEN_SLUGIFYING`` to ``True``. + + As a reminder, custom tag models can easily customize slugification behavior by overriding + the ``slugify`` method to your business needs. + +`` Drop Django 2.2 support. 2.1.0 (2022-01-24) ~~~~~~~~~~~~~~~~~~ -* Add Python 3.10 support. -* Add Django 4.0 support. -* Drop Django 3.1 support. +`` Add Python 3.10 support. +`` Add Django 4.0 support. +`` Drop Django 3.1 support. 2.0.0 (2021-11-14) @@ -23,7 +42,7 @@ Changelog - previously: ``item.tags.set("red", "blue")`` - now: ``item.tags.set(["red", "blue"])`` -* Fix issue where ``TagField`` would incorrectly report that a field has changed on empty values. +** Fix issue where ``TagField`` would incorrectly report that a field has changed on empty values. * Update Russian translation. * Add Persian translation * Fix issue for many languages where content types were not being properly translated. diff --git a/docs/faq.rst b/docs/faq.rst index d4cd3f10..f8f9b19a 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -13,7 +13,8 @@ Frequently Asked Questions One way to handle this is with post-generation hooks:: - class ProductFactory(DjangoModelFactory): + + class ProductFactory(DjangoModelFactory): # Rest of the stuff @post_generation diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 0589b2af..a641bb05 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -27,3 +27,23 @@ And then to any model you want tagging on do the following:: If you want ``django-taggit`` to be **CASE-INSENSITIVE** when looking up existing tags, you'll have to set ``TAGGIT_CASE_INSENSITIVE`` (in ``settings.py`` or wherever you have your Django settings) to ``True`` (``False`` by default):: TAGGIT_CASE_INSENSITIVE = True + + +Settings +-------- + +The following Django-level settings affect the behavior of the library + +* ``TAGGIT_CASE_INSENSITIVE`` + + When set to ``True``, tag lookups will be case insensitive. This defaults to ``False``. + +`` ``TAGGIT_STRIP_UNICODE_WHEN_SLUGIFYING`` + When this is set to ``True``, tag slugs will be limited to ASCII characters. In this case, if you also have ```unidecode`` installed, + then tag sluggification will transform a tag like ``あい うえお`` to ``ai-ueo``. + If you do not have ``unidecode`` installed, then you will usually be outright stripping unicode, meaning that something like ``helloあい`` will be slugified as ``hello``. + + This value defaults to ``False``, meaning that unicode is preserved in slugification. + + Because the behavior when ``True`` is set leads to situations where + slugs can be entirely stripped to an empty string, we recommend not activating this. diff --git a/taggit/migrations/0005_auto_20220424_2025.py b/taggit/migrations/0005_auto_20220424_2025.py new file mode 100644 index 00000000..c64659da --- /dev/null +++ b/taggit/migrations/0005_auto_20220424_2025.py @@ -0,0 +1,20 @@ +# Generated by Django 2.2.26 on 2022-04-24 20:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("taggit", "0004_alter_taggeditem_content_type_alter_taggeditem_tag"), + ] + + operations = [ + migrations.AlterField( + model_name="tag", + name="slug", + field=models.SlugField( + allow_unicode=True, max_length=100, unique=True, verbose_name="slug" + ), + ), + ] diff --git a/taggit/models.py b/taggit/models.py index 1d457607..05bca49f 100644 --- a/taggit/models.py +++ b/taggit/models.py @@ -1,3 +1,4 @@ +from django.conf import settings from django.contrib.contenttypes.fields import GenericForeignKey from django.contrib.contenttypes.models import ContentType from django.db import IntegrityError, models, router, transaction @@ -19,7 +20,10 @@ class TagBase(models.Model): verbose_name=pgettext_lazy("A tag name", "name"), unique=True, max_length=100 ) slug = models.SlugField( - verbose_name=pgettext_lazy("A tag slug", "slug"), unique=True, max_length=100 + verbose_name=pgettext_lazy("A tag slug", "slug"), + unique=True, + max_length=100, + allow_unicode=True, ) def __str__(self): @@ -71,7 +75,10 @@ def save(self, *args, **kwargs): return super().save(*args, **kwargs) def slugify(self, tag, i=None): - slug = slugify(unidecode(tag)) + if getattr(settings, "TAGGIT_STRIP_UNICODE_WHEN_SLUGIFYING", False): + slug = slugify(unidecode(tag)) + else: + slug = slugify(tag, allow_unicode=True) if i is not None: slug += "_%d" % i return slug diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 00000000..b469c33d --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,27 @@ +from django.test import TestCase, override_settings + +from tests.models import TestModel + + +class TestSlugification(TestCase): + def test_unicode_slugs(self): + """ + Confirm the preservation of unicode in slugification by default + """ + sample_obj = TestModel.objects.create() + # a unicode tag will be slugified for space reasons but + # unicode-ness will be kept by default + sample_obj.tags.add("あい うえお") + self.assertEqual([tag.slug for tag in sample_obj.tags.all()], ["あい-うえお"]) + + def test_old_slugs(self): + """ + Test that the setting that gives us the old slugification behavior + is in place + """ + with override_settings(TAGGIT_STRIP_UNICODE_WHEN_SLUGIFYING=True): + sample_obj = TestModel.objects.create() + # a unicode tag will be slugified for space reasons but + # unicode-ness will be kept by default + sample_obj.tags.add("あい うえお") + self.assertEqual([tag.slug for tag in sample_obj.tags.all()], [""])