Skip to content

Commit

Permalink
Handle xml.gz sitemaps that are not actually zipped
Browse files Browse the repository at this point in the history
  • Loading branch information
eliasdabbas committed Dec 8, 2024
1 parent bdf9121 commit c1ffabb
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 3 deletions.
34 changes: 34 additions & 0 deletions tests/data/sitemap_testing/regular_sitemap.xml.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>

<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

<url>
<loc>http://www.example.com/</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>

<url>
<loc>http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc>
<changefreq>weekly</changefreq>
</url>

<url>
<loc>http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>
<lastmod>2004-12-23</lastmod>
<changefreq>weekly</changefreq>
</url>

<url>
<loc>http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>
<lastmod>2004-12-23T18:00:15+00:00</lastmod>
<priority>0.3</priority>
</url>

<url>
<loc>http://www.example.com/catalog?item=83&amp;desc=vacation_usa</loc>
<lastmod>2004-11-23</lastmod>
</url>

</urlset>
14 changes: 11 additions & 3 deletions tests/test_sitemaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import pandas as pd
import pytest

from advertools.sitemaps import sitemap_to_df, _build_request_headers, headers as DEFAULT_HEADERS
from advertools.sitemaps import _build_request_headers, sitemap_to_df
from advertools.sitemaps import headers as DEFAULT_HEADERS

gh_test_data_folder = "https://raw.githubusercontent.com/eliasdabbas/advertools/master/tests/data/sitemap_testing/"
offline_test_data_folder = "tests/data/sitemap_testing/"
Expand All @@ -20,6 +21,7 @@ def offline_path(filename):

regular_sitemap_url = full_path("regular_sitemap.xml")
zipped_sitemap_url = full_path("zipped_sitemap.xml.gz")
# Named *.xml.gz, but the fixture's content is plain, uncompressed XML.
# Built with offline_path, unlike the neighbouring URLs that use full_path.
zipped_butnot_sitemap_url = offline_path("regular_sitemap.xml.gz")
sitemap_index_url = full_path("sitemap_index.xml")
error_sitemap_url = full_path("error_sitemap.xml")
image_sitemap_url = full_path("image_sitemap.xml")
Expand All @@ -34,7 +36,7 @@ def test_build_request_headers():
assert isinstance(final_headers, dict)
assert final_headers == {
"user-agent": DEFAULT_HEADERS["User-Agent"],
"if-none-match": "ETAG_STRING"
"if-none-match": "ETAG_STRING",
}


Expand All @@ -49,7 +51,7 @@ def test_build_request_headers_override_default():
final_headers = _build_request_headers(user_headers)
assert final_headers == {
"user-agent": "example/agent",
"if-none-match": "ETAG_STRING"
"if-none-match": "ETAG_STRING",
}


Expand All @@ -65,6 +67,12 @@ def test_gz_sitemap():
assert len(result) == 5


def test_gz_declared_but_regular_sitemap():
    """A ``.xml.gz`` sitemap that holds plain XML still parses to five rows."""
    df = sitemap_to_df(zipped_butnot_sitemap_url)
    assert isinstance(df, pd.DataFrame)
    assert len(df) == 5


def test_sitemap_index():
result = sitemap_to_df(sitemap_index_url)
assert isinstance(result, pd.core.frame.DataFrame)
Expand Down

0 comments on commit c1ffabb

Please sign in to comment.