Skip to content

Commit

Permalink
GH-45295: [Python][CI] Make download_tzdata_on_windows more robust an…
Browse files Browse the repository at this point in the history
…d use tzdata package for tzinfo database on Windows for ORC (#45425)

### Rationale for this change

We have two Windows issues and this PR is addressing both:

1. PyArrow's `download_tzdata_on_windows` can fail due to TLS issues in certain CI environments.
2. The Python wheel test infrastructure needs a tzinfo database for ORC, and the automation that fetches it started failing because the upstream URL became invalid.

These two issues are separate; they are being solved in one PR simply because they appeared together during the 19.0.1 release process.

### What changes are included in this PR?

1. Makes `download_tzdata_on_windows` more robust to TLS errors by attempting to use `requests` if it's available and falling back to urllib otherwise.
2. Switches our Windows wheel test infrastructure to grab a tzinfo database from the tzdata package on PyPI instead of from a mirror URL. This should be much more stable for us over time.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.
* GitHub Issue: #45295

Lead-authored-by: Bryce Mecum <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Bryce Mecum <[email protected]>
  • Loading branch information
amoeba and jorisvandenbossche authored Feb 11, 2025
1 parent f1961ec commit f6e2cbe
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 18 deletions.
7 changes: 0 additions & 7 deletions ci/scripts/python_wheel_windows_test.bat
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,5 @@ py -0p
@REM Validate wheel contents
%PYTHON_CMD% C:\arrow\ci\scripts\python_wheel_validate_contents.py --path C:\arrow\python\repaired_wheels || exit /B 1

@rem Download IANA Timezone Database for ORC C++
curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B
mkdir %USERPROFILE%\Downloads\test\tzdata
arc unarchive tzdata.tar.xz %USERPROFILE%\Downloads\test\tzdata || exit /B
set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo
dir %TZDIR%

@REM Execute unittest
%PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1
16 changes: 16 additions & 0 deletions python/pyarrow/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,22 @@
set_timezone_db_path(tzdata_set_path)


# GH-45295: For ORC on Windows, when the TZDIR environment variable is not
# already set, try to point it at the "zoneinfo" directory inside the
# installed tzdata package.
#
# Note this is a different kind of database than what we allow to be set by
# `PYARROW_TZDATA_PATH` and passed to set_timezone_db_path.
if sys.platform == 'win32' and 'TZDIR' not in os.environ:
    from importlib import resources
    try:
        os.environ['TZDIR'] = os.path.join(resources.files('tzdata'), 'zoneinfo')
    except ModuleNotFoundError:
        # tzdata is optional: report it and leave TZDIR unset.
        print(
            'Package "tzdata" not found. Not setting TZDIR environment variable.'
        )


def pytest_addoption(parser):
# Create options to selectively enable test groups
def bool_env(name, default=None):
Expand Down
40 changes: 29 additions & 11 deletions python/pyarrow/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,20 @@ def _break_traceback_cycle_from_frame(frame):
refs = frame = this_frame = None


def _download_urllib(url, out_path):
    """
    Download ``url`` to ``out_path`` using only the standard library.

    Parameters
    ----------
    url : str
        Location to fetch; opened with ``urllib.request.urlopen``.
    out_path : str or path-like
        Destination file. It is created, or overwritten if it already
        exists, and written in binary mode.
    """
    import shutil
    from urllib.request import urlopen

    with urlopen(url) as response, open(out_path, 'wb') as f:
        # Stream the body in chunks instead of buffering all of it in memory
        # before writing.
        shutil.copyfileobj(response, f)


def _download_requests(url, out_path):
    """
    Download ``url`` to ``out_path`` using the optional ``requests`` package.

    Parameters
    ----------
    url : str
        Location to fetch with ``requests.get``.
    out_path : str or path-like
        Destination file. It is created, or overwritten if it already
        exists, and written in binary mode.

    Raises
    ------
    ImportError
        If ``requests`` is not installed.
    requests.HTTPError
        If the server answers with a 4xx or 5xx status.
    """
    import requests

    with requests.get(url) as response:
        # Fail on HTTP error statuses instead of saving the error page to
        # out_path, where it would only break later when the file is used.
        response.raise_for_status()
        with open(out_path, 'wb') as f:
            f.write(response.content)


def download_tzdata_on_windows():
r"""
Download and extract latest IANA timezone database into the
Expand All @@ -240,19 +254,23 @@ def download_tzdata_on_windows():

import tarfile

tzdata_url = "https://data.iana.org/time-zones/tzdata-latest.tar.gz"
tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata")
tzdata_compressed = os.path.join(tzdata_path, "tzdata.tar.gz")
tzdata_compressed_path = os.path.join(tzdata_path, "tzdata.tar.gz")
windows_zones_url = "https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml" # noqa
windows_zones_path = os.path.join(tzdata_path, "windowsZones.xml")
os.makedirs(tzdata_path, exist_ok=True)

from urllib.request import urlopen
with urlopen('https://data.iana.org/time-zones/tzdata-latest.tar.gz') as response:
with open(tzdata_compressed, 'wb') as f:
f.write(response.read())

assert os.path.exists(tzdata_compressed)
# Try to download the files with requests and fall back to urllib only when
# requests is not installed. This works around possible TLS issues in certain
# older environments (GH-45295)
try:
_download_requests(tzdata_url, tzdata_compressed_path)
_download_requests(windows_zones_url, windows_zones_path)
except ImportError:
_download_urllib(tzdata_url, tzdata_compressed_path)
_download_urllib(windows_zones_url, windows_zones_path)

tarfile.open(tzdata_compressed).extractall(tzdata_path)
assert os.path.exists(tzdata_compressed_path)
assert os.path.exists(windows_zones_path)

with urlopen('https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml') as response_zones: # noqa
with open(os.path.join(tzdata_path, "windowsZones.xml"), 'wb') as f:
f.write(response_zones.read())
tarfile.open(tzdata_compressed_path).extractall(tzdata_path)
1 change: 1 addition & 0 deletions python/requirements-wheel-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ hypothesis
pytest
pytz
pyuwsgi; sys.platform != 'win32' and python_version < '3.13'
requests; sys_platform == 'win32'
tzdata; sys_platform == 'win32'

# We generally test with the oldest numpy version that supports a given Python
Expand Down

0 comments on commit f6e2cbe

Please sign in to comment.