From ec78492ef1ae8db70d6b85c03310fad27af6170d Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Thu, 3 Apr 2025 01:35:47 +0100 Subject: [PATCH 1/6] Optimise import time for ``string`` --- Lib/string.py | 64 +++++++++++-------- ...-04-03-01-35-02.gh-issue-118761.VQcj70.rst | 2 + 2 files changed, 41 insertions(+), 25 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-04-03-01-35-02.gh-issue-118761.VQcj70.rst diff --git a/Lib/string.py b/Lib/string.py index c4f05c7223ce8a..e3416135990179 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -49,29 +49,20 @@ def capwords(s, sep=None): #################################################################### -import re as _re -from collections import ChainMap as _ChainMap - +_sentinel_flags = object() _sentinel_dict = {} -class Template: - """A string class for supporting $-substitutions.""" - delimiter = '$' - # r'[a-z]' matches to non-ASCII letters when used with IGNORECASE, but - # without the ASCII flag. We can't add re.ASCII to flags because of - # backward compatibility. So we use the ?a local flag and [a-z] pattern. - # See https://bugs.python.org/issue31672 - idpattern = r'(?a:[_a-z][_a-z0-9]*)' - braceidpattern = None - flags = _re.IGNORECASE - - def __init_subclass__(cls): - super().__init_subclass__() - if 'pattern' in cls.__dict__: +class _TemplatePattern: + def __get__(self, instance, cls=None): + if cls is None: + return self + import re + if ('pattern' in cls.__dict__ + and not isinstance(cls.__dict__['pattern'], _TemplatePattern)): pattern = cls.pattern else: - delim = _re.escape(cls.delimiter) + delim = re.escape(cls.delimiter) id = cls.idpattern bid = cls.braceidpattern or cls.idpattern pattern = fr""" @@ -82,7 +73,32 @@ def __init_subclass__(cls): (?P<invalid>) # Other ill-formed delimiter exprs ) """ - cls.pattern = _re.compile(pattern, cls.flags | _re.VERBOSE) + if cls.flags is _sentinel_flags: + cls.flags = re.IGNORECASE + pattern = re.compile(pattern, cls.flags | re.VERBOSE) + # replace this descriptor with the compiled pattern + setattr(cls, 'pattern', pattern) + return pattern + + +class Template: + """A string class for supporting $-substitutions.""" + + delimiter = '$' + # r'[a-z]' matches to non-ASCII letters when used with IGNORECASE, but + # without the ASCII flag. We can't add re.ASCII to flags because of + # backward compatibility. So we use the ?a local flag and [a-z] pattern. + # See https://bugs.python.org/issue31672 + idpattern = r'(?a:[_a-z][_a-z0-9]*)' + braceidpattern = None + flags = _sentinel_flags # default: re.IGNORECASE + + # use a descriptor to be able to defer the import of `re`, for performance + pattern = _TemplatePattern() + + def __init_subclass__(cls): + super().__init_subclass__() + cls.pattern = _TemplatePattern() def __init__(self, template): self.template = template @@ -105,7 +121,8 @@ def substitute(self, mapping=_sentinel_dict, /, **kws): if mapping is _sentinel_dict: mapping = kws elif kws: - mapping = _ChainMap(kws, mapping) + from collections import ChainMap + mapping = ChainMap(kws, mapping) # Helper function for .sub() def convert(mo): # Check the most common path first. @@ -124,7 +141,8 @@ def safe_substitute(self, mapping=_sentinel_dict, /, **kws): if mapping is _sentinel_dict: mapping = kws elif kws: - mapping = _ChainMap(kws, mapping) + from collections import ChainMap + mapping = ChainMap(kws, mapping) # Helper function for .sub() def convert(mo): named = mo.group('named') or mo.group('braced') @@ -170,10 +188,6 @@ def get_identifiers(self): self.pattern) return ids -# Initialize Template.pattern. __init_subclass__() is automatically called -# only for subclasses, not for the Template class itself. -Template.__init_subclass__() - ######################################################################## # the Formatter class diff --git a/Misc/NEWS.d/next/Library/2025-04-03-01-35-02.gh-issue-118761.VQcj70.rst b/Misc/NEWS.d/next/Library/2025-04-03-01-35-02.gh-issue-118761.VQcj70.rst new file mode 100644 index 00000000000000..257ad7ece7d18a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-04-03-01-35-02.gh-issue-118761.VQcj70.rst @@ -0,0 +1,2 @@ +Improve import times by up to 27x for the :mod:`string` module. +Patch by Adam Turner. From 2b542c76445ec12ffaa4b6b869d6be8192d3e044 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Thu, 3 Apr 2025 02:52:55 +0100 Subject: [PATCH 2/6] Refactor to a common classmethod --- Lib/string.py | 55 +++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/Lib/string.py b/Lib/string.py index e3416135990179..1a62b699dc2ac6 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -49,36 +49,15 @@ def capwords(s, sep=None): #################################################################### -_sentinel_flags = object() _sentinel_dict = {} +_sentinel_flags = object() class _TemplatePattern: def __get__(self, instance, cls=None): if cls is None: return self - import re - if ('pattern' in cls.__dict__ - and not isinstance(cls.__dict__['pattern'], _TemplatePattern)): - pattern = cls.pattern - else: - delim = re.escape(cls.delimiter) - id = cls.idpattern - bid = cls.braceidpattern or cls.idpattern - pattern = fr""" - {delim}(?: - (?P<escaped>{delim}) | # Escape sequence of two delimiters - (?P<named>{id}) | # delimiter and a Python identifier - {{(?P<braced>{bid})}} | # delimiter and a braced identifier - (?P<invalid>) # Other ill-formed delimiter exprs - ) - """ - if cls.flags is _sentinel_flags: - cls.flags = re.IGNORECASE - pattern = re.compile(pattern, cls.flags | re.VERBOSE) - # replace this descriptor with the compiled pattern - setattr(cls, 'pattern', pattern) - return pattern + return cls._compile_pattern() class Template: @@ -93,12 +72,36 @@ class Template: braceidpattern = None flags = _sentinel_flags # default: re.IGNORECASE - # use a descriptor to be able to defer the import of `re`, for performance - pattern = _TemplatePattern() + pattern = _TemplatePattern() # use a descriptor to compile the pattern def __init_subclass__(cls): super().__init_subclass__() - cls.pattern = _TemplatePattern() + cls._compile_pattern() + + @classmethod + def _compile_pattern(cls): + import re # deferred import, for performance + + cls_pattern = cls.__dict__.get('pattern') + if cls_pattern and not isinstance(cls_pattern, _TemplatePattern): + # Prefer a pattern defined on the class. + pattern = cls_pattern + else: + delim = re.escape(cls.delimiter) + id = cls.idpattern + bid = cls.braceidpattern or cls.idpattern + pattern = fr""" + {delim}(?: + (?P<escaped>{delim}) | # Escape sequence of two delimiters + (?P<named>{id}) | # delimiter and a Python identifier + {{(?P<braced>{bid})}} | # delimiter and a braced identifier + (?P<invalid>) # Other ill-formed delimiter exprs + ) + """ + if cls.flags is _sentinel_flags: + cls.flags = re.IGNORECASE + pat = cls.pattern = re.compile(pattern, cls.flags | re.VERBOSE) + return pat def __init__(self, template): self.template = template From e28e43ff093256eade1fe965e6e72eb352c75477 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Thu, 3 Apr 2025 02:55:36 +0100 Subject: [PATCH 3/6] Add comment --- Lib/string.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/string.py b/Lib/string.py index 1a62b699dc2ac6..af762894667e18 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -57,6 +57,7 @@ class _TemplatePattern: def __get__(self, instance, cls=None): if cls is None: return self + # This descriptor is overwritten in ``_compile_pattern()``. return cls._compile_pattern() From 056dd07bb50834ee07601661ff00de6f6bcd72fd Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 6 Apr 2025 18:11:27 +0100 Subject: [PATCH 4/6] Use None as the sentinel --- Lib/string.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/string.py b/Lib/string.py index af762894667e18..c4fb69c4ec7a04 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -50,7 +50,6 @@ def capwords(s, sep=None): #################################################################### _sentinel_dict = {} -_sentinel_flags = object() class _TemplatePattern: @@ -71,7 +70,7 @@ class Template: # See https://bugs.python.org/issue31672 idpattern = r'(?a:[_a-z][_a-z0-9]*)' braceidpattern = None - flags = _sentinel_flags # default: re.IGNORECASE + flags = None # default: re.IGNORECASE pattern = _TemplatePattern() # use a descriptor to compile the pattern @@ -99,7 +98,7 @@ def _compile_pattern(cls): (?P<invalid>) # Other ill-formed delimiter exprs ) """ - if cls.flags is _sentinel_flags: + if cls.flags is None: cls.flags = re.IGNORECASE pat = cls.pattern = re.compile(pattern, cls.flags | re.VERBOSE) return pat From bb3605a7d89acb288cabdf5e29dffc11b2567251 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Mon, 7 Apr 2025 21:52:14 +0100 Subject: [PATCH 5/6] Make _TemplatePattern a singleton --- Lib/string.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Lib/string.py b/Lib/string.py index c4fb69c4ec7a04..9ba0769e93eee3 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -53,11 +53,12 @@ def capwords(s, sep=None): class _TemplatePattern: + # This descriptor is overwritten in ``Template._compile_pattern()``. def __get__(self, instance, cls=None): if cls is None: return self - # This descriptor is overwritten in ``_compile_pattern()``. return cls._compile_pattern() +_TemplatePattern = _TemplatePattern() class Template: @@ -72,7 +73,7 @@ class Template: braceidpattern = None flags = None # default: re.IGNORECASE - pattern = _TemplatePattern() # use a descriptor to compile the pattern + pattern = _TemplatePattern # use a descriptor to compile the pattern def __init_subclass__(cls): super().__init_subclass__() @@ -83,7 +84,7 @@ def _compile_pattern(cls): import re # deferred import, for performance cls_pattern = cls.__dict__.get('pattern') - if cls_pattern and not isinstance(cls_pattern, _TemplatePattern): + if cls_pattern is not None and cls_pattern is not _TemplatePattern: # Prefer a pattern defined on the class. pattern = cls_pattern else: From 6fb9b5d65c2822012efe47bf7158227bcfde4ce0 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+AA-Turner@users.noreply.github.com> Date: Tue, 8 Apr 2025 10:39:27 +0100 Subject: [PATCH 6/6] Serhiy's suggestion Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> --- Lib/string.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Lib/string.py b/Lib/string.py index 9ba0769e93eee3..eab5067c9b133e 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -83,11 +83,8 @@ def __init_subclass__(cls): def _compile_pattern(cls): import re # deferred import, for performance - cls_pattern = cls.__dict__.get('pattern') - if cls_pattern is not None and cls_pattern is not _TemplatePattern: - # Prefer a pattern defined on the class. - pattern = cls_pattern - else: + pattern = cls.__dict__.get('pattern', _TemplatePattern) + if pattern is _TemplatePattern: delim = re.escape(cls.delimiter) id = cls.idpattern bid = cls.braceidpattern or cls.idpattern