annotate.py: Fix bug for 'Binary files ... differ' patches
When there is a change to a binary file, Git shows it using the
"Binary files a/foo and b/foo differ" message in place of a diff;
unidiff therefore parses such an entry as a patched file with 0 hunks.
Make the compute_sizes_and_spreads() method of the AnnotatedPatchedFile
class handle this case correctly.
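
For illustration, a minimal sketch (assuming unidiff >= 0.7, which
recognizes "Binary files ... differ" entries) of how such a patch is
parsed into a patched file with zero hunks:

from unidiff import PatchSet

binary_patch = """\
diff --git a/foo.gz b/foo.gz
new file mode 100644
index 000000000..96211e432
Binary files /dev/null and b/foo.gz differ
"""

patch_set = PatchSet(binary_patch)
patched_file = patch_set[0]

# the binary change is parsed, but carries no hunks at all;
# indexing patched_file[-1] would raise the IndexError quoted below
assert patched_file.is_binary_file
assert len(patched_file) == 0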

Additionally, you can now find the number of changed binary files via
the 'n_binary_files' field in the compute_sizes_and_spreads() method
output.
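
Since compute_sizes_and_spreads() returns a collections.Counter, these
per-file results combine naturally when aggregated over a whole patch
set (as the new test below does); a minimal sketch of such aggregation,
assuming the patch-set level method simply sums the per-file counters:

from collections import Counter

# hypothetical per-file results: one binary file plus one ordinary text file
per_file_stats = [
    Counter({'n_files': 1, 'n_binary_files': 1}),           # binary file: no hunks
    Counter({'n_files': 1, 'n_hunks': 1, 'n_groups': 1}),   # text file with one hunk
]

total = sum(per_file_stats, Counter())

assert total['n_files'] == 2
assert total['n_binary_files'] == 1   # keys missing from a Counter count as 0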

Without this fix, you would get the following error (caught and
printed):

  File "python-diff-annotator/src/diffannotator/annotate.py", line 786, in compute_sizes_and_spreads
    (self.patched_file[-1].source_start + self.patched_file[-1].source_length - 1
     ~~~~~~~~~~~~~~~~~^^^^
  IndexError: list index out of range

Add test for this specific issue.
jnareb committed Oct 11, 2024
1 parent c53cbe6 commit a7fa3d0
Showing 3 changed files with 82 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/diffannotator/annotate.py
@@ -770,6 +770,9 @@ def compute_sizes_and_spreads(self) -> Counter:
not interrupted by context line (also called "chunks"),
as 'n_groups'
- number of modified files, as 'n_files' (always 1)
- number of modified binary files, as 'n_binary_files' (either 0 or 1);
for such files there is no information about "lines",
like the number of hunks, groups (chunks), etc.
- sum of distances in context lines between groups (chunks)
inside hunk, for all hunks in patched file, as 'spread_inner'
- sum of distances in lines between groups (chunks) for
@@ -779,6 +782,15 @@ def compute_sizes_and_spreads(self) -> Counter:
:return: Counter with different sizes and different spreads
of the given changed file
"""
# Handle the case where there are no hunks of changed lines,
# which happens for a change to a binary file, e.g.:
# Binary files /dev/null and b/foo.gz differ
if len(self.patched_file) == 0:
return Counter({
'n_files': 1,
'n_binary_files': 1,
})

result = Counter({
'n_files': 1,
'hunk_span_src':
24 changes: 24 additions & 0 deletions tests/test_annotate.py
@@ -484,6 +484,30 @@ def test_AnnotatedPatchedFile(line_type):
"AnnotatedHunk.process() with source and AnnotatedPatchedFile.hunk_tokens_for_type() give the same tokens"


def test_AnnotatedPatchSet_binary_files_differ():
# .......................................................................
# patch with binary files
file_path = 'tests/test_dataset_structured/scrapy-11/patches/9de6f1ca757b7f200d15e94840c9d431cf202276.diff'

patch_set = AnnotatedPatchSet.from_filename(file_path,
missing_ok=False, ignore_diff_parse_errors=False)

sizes_and_spreads = patch_set.compute_sizes_and_spreads()
#print(f"{sizes_and_spreads=}")
assert sizes_and_spreads['n_binary_files'] == 2, 'changes to 2 binary files'
assert sizes_and_spreads['n_files'] == 4, "changes to 4 files in total"

result = patch_set.process()
#print(f"{result=}")
for pm in ['+', '-']:
assert pm not in result['/dev/null'], \
f"no '{pm}' lines for /dev/null"
assert pm not in result['tests/sample_data/compressed/unexpected-eof-output.txt'], \
f"no '{pm}' lines for binary file with *.txt extension"
assert pm not in result['tests/sample_data/compressed/unexpected-eof.gz'], \
f"no '{pm}' lines for binary file with *.gz extension"


def test_Bug_from_dataset():
# code patch
file_path = Path('tests/test_dataset/tqdm-1/c0dcf39b046d1b4ff6de14ac99ad9a1b10487512.diff')
46 changes: 46 additions & 0 deletions tests/test_dataset_structured/scrapy-11/patches/9de6f1ca757b7f200d15e94840c9d431cf202276.diff
@@ -0,0 +1,46 @@
diff --git a/scrapy/utils/gz.py b/scrapy/utils/gz.py
index afc7ed128..73c2eb73b 100644
--- a/scrapy/utils/gz.py
+++ b/scrapy/utils/gz.py
@@ -43,7 +43,7 @@ def gunzip(data):
# contains the whole page content
if output or getattr(f, 'extrabuf', None):
try:
- output += f.extrabuf
+ output += f.extrabuf[-f.extrasize:]
finally:
break
else:
diff --git a/tests/sample_data/compressed/unexpected-eof-output.txt b/tests/sample_data/compressed/unexpected-eof-output.txt
new file mode 100644
index 000000000..3b201255f
Binary files /dev/null and b/tests/sample_data/compressed/unexpected-eof-output.txt differ
diff --git a/tests/sample_data/compressed/unexpected-eof.gz b/tests/sample_data/compressed/unexpected-eof.gz
new file mode 100644
index 000000000..96211e432
Binary files /dev/null and b/tests/sample_data/compressed/unexpected-eof.gz differ
diff --git a/tests/test_utils_gz.py b/tests/test_utils_gz.py
index 2b47bf8da..7148185f4 100644
--- a/tests/test_utils_gz.py
+++ b/tests/test_utils_gz.py
@@ -1,6 +1,8 @@
import unittest
from os.path import join

+from w3lib.encoding import html_to_unicode
+
from scrapy.utils.gz import gunzip, is_gzipped
from scrapy.http import Response, Headers
from tests import tests_datadir
@@ -66,3 +68,11 @@ class GunzipTest(unittest.TestCase):
hdrs = Headers({"Content-Type": "application/x-gzip;charset=utf-8"})
r1 = Response("http://www.example.com", headers=hdrs)
self.assertTrue(is_gzipped(r1))
+
+ def test_gunzip_illegal_eof(self):
+ with open(join(SAMPLEDIR, 'unexpected-eof.gz'), 'rb') as f:
+ text = html_to_unicode('charset=cp1252', gunzip(f.read()))[1]
+ with open(join(SAMPLEDIR, 'unexpected-eof-output.txt'), 'rb') as o:
+ expected_text = o.read().decode("utf-8")
+ self.assertEqual(len(text), len(expected_text))
+ self.assertEqual(text, expected_text)
