annotate.py: Fix bug for 'Binary files ... differ' patches
When there is a change to a binary file, Git shows it using the
"Binary files a/foo and b/foo differ" message in place of a diff;
unidiff therefore parses such an entry as a patched file with 0 hunks.
Make the compute_sizes_and_spreads() method of the AnnotatedPatchedFile
class handle this case correctly.
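
For illustration, a minimal sketch (assuming unidiff >= 0.7, which
recognizes "Binary files ... differ" entries) of how such a patch is
parsed into a patched file with zero hunks:

from unidiff import PatchSet

binary_patch = """\
diff --git a/foo.gz b/foo.gz
new file mode 100644
index 000000000..96211e432
Binary files /dev/null and b/foo.gz differ
"""

patch_set = PatchSet(binary_patch)
patched_file = patch_set[0]

# the binary change is parsed, but carries no hunks at all;
# indexing patched_file[-1] would raise the IndexError quoted below
assert patched_file.is_binary_file
assert len(patched_file) == 0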

Additionally, you can now find the number of changed binary files via
the 'n_binary_files' field in the compute_sizes_and_spreads() method
output.
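
Since compute_sizes_and_spreads() returns a collections.Counter, these
per-file results combine naturally when aggregated over a whole patch
set (as the new test below does); a minimal sketch of such aggregation,
assuming the patch-set level method simply sums the per-file counters:

from collections import Counter

# hypothetical per-file results: one binary file plus one ordinary text file
per_file_stats = [
    Counter({'n_files': 1, 'n_binary_files': 1}),           # binary file: no hunks
    Counter({'n_files': 1, 'n_hunks': 1, 'n_groups': 1}),   # text file with one hunk
]

total = sum(per_file_stats, Counter())

assert total['n_files'] == 2
assert total['n_binary_files'] == 1   # keys missing from a Counter count as 0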

Without this fix, you would get the following error (caught and
printed):

  File "python-diff-annotator/src/diffannotator/annotate.py", line 786, in compute_sizes_and_spreads
    (self.patched_file[-1].source_start + self.patched_file[-1].source_length - 1
     ~~~~~~~~~~~~~~~~~^^^^
  IndexError: list index out of range

Add test for this specific issue.
jnareb committed Oct 11, 2024
1 parent c53cbe6 commit a7fa3d0
Showing 3 changed files with 82 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/diffannotator/annotate.py
@@ -770,6 +770,9 @@ def compute_sizes_and_spreads(self) -> Counter:
not interrupted by context line (also called "chunks"),
as 'n_groups'
- number of modified files, as 'n_files' (always 1)
- number of modified binary files, as 'n_binary_files' (either 0 or 1);
for such files there is no information about "lines",
like the number of hunks, groups (chunks), etc.
- sum of distances in context lines between groups (chunks)
inside hunk, for all hunks in patched file, as 'spread_inner'
- sum of distances in lines between groups (chunks) for
@@ -779,6 +782,15 @@ def compute_sizes_and_spreads(self) -> Counter:
:return: Counter with different sizes and different spreads
of the given changed file
"""
# Handle the case where there are no hunks of changed lines,
# which happens for a change to a binary file, e.g.:
# Binary files /dev/null and b/foo.gz differ
if len(self.patched_file) == 0:
return Counter({
'n_files': 1,
'n_binary_files': 1,
})

result = Counter({
'n_files': 1,
'hunk_span_src':
24 changes: 24 additions & 0 deletions tests/test_annotate.py
@@ -484,6 +484,30 @@ def test_AnnotatedPatchedFile(line_type):
"AnnotatedHunk.process() with source and AnnotatedPatchedFile.hunk_tokens_for_type() give the same tokens"


def test_AnnotatedPatchSet_binary_files_differ():
# .......................................................................
# patch with binary files
file_path = 'tests/test_dataset_structured/scrapy-11/patches/9de6f1ca757b7f200d15e94840c9d431cf202276.diff'

patch_set = AnnotatedPatchSet.from_filename(file_path,
missing_ok=False, ignore_diff_parse_errors=False)

sizes_and_spreads = patch_set.compute_sizes_and_spreads()
#print(f"{sizes_and_spreads=}")
assert sizes_and_spreads['n_binary_files'] == 2, 'changes to 2 binary files'
assert sizes_and_spreads['n_files'] == 4, "changes to 4 files in total"

result = patch_set.process()
#print(f"{result=}")
for pm in ['+', '-']:
assert pm not in result['/dev/null'], \
f"no '{pm}' lines for /dev/null"
assert pm not in result['tests/sample_data/compressed/unexpected-eof-output.txt'], \
f"no '{pm}' lines for binary file with *.txt extension"
assert pm not in result['tests/sample_data/compressed/unexpected-eof.gz'], \
f"no '{pm}' lines for binary file with *.gz extension"


def test_Bug_from_dataset():
# code patch
file_path = Path('tests/test_dataset/tqdm-1/c0dcf39b046d1b4ff6de14ac99ad9a1b10487512.diff')
46 changes: 46 additions & 0 deletions tests/test_dataset_structured/scrapy-11/patches/9de6f1ca757b7f200d15e94840c9d431cf202276.diff
@@ -0,0 +1,46 @@
diff --git a/scrapy/utils/gz.py b/scrapy/utils/gz.py
index afc7ed128..73c2eb73b 100644
--- a/scrapy/utils/gz.py
+++ b/scrapy/utils/gz.py
@@ -43,7 +43,7 @@ def gunzip(data):
# contains the whole page content
if output or getattr(f, 'extrabuf', None):
try:
- output += f.extrabuf
+ output += f.extrabuf[-f.extrasize:]
finally:
break
else:
diff --git a/tests/sample_data/compressed/unexpected-eof-output.txt b/tests/sample_data/compressed/unexpected-eof-output.txt
new file mode 100644
index 000000000..3b201255f
Binary files /dev/null and b/tests/sample_data/compressed/unexpected-eof-output.txt differ
diff --git a/tests/sample_data/compressed/unexpected-eof.gz b/tests/sample_data/compressed/unexpected-eof.gz
new file mode 100644
index 000000000..96211e432
Binary files /dev/null and b/tests/sample_data/compressed/unexpected-eof.gz differ
diff --git a/tests/test_utils_gz.py b/tests/test_utils_gz.py
index 2b47bf8da..7148185f4 100644
--- a/tests/test_utils_gz.py
+++ b/tests/test_utils_gz.py
@@ -1,6 +1,8 @@
import unittest
from os.path import join

+from w3lib.encoding import html_to_unicode
+
from scrapy.utils.gz import gunzip, is_gzipped
from scrapy.http import Response, Headers
from tests import tests_datadir
@@ -66,3 +68,11 @@ class GunzipTest(unittest.TestCase):
hdrs = Headers({"Content-Type": "application/x-gzip;charset=utf-8"})
r1 = Response("http://www.example.com", headers=hdrs)
self.assertTrue(is_gzipped(r1))
+
+ def test_gunzip_illegal_eof(self):
+ with open(join(SAMPLEDIR, 'unexpected-eof.gz'), 'rb') as f:
+ text = html_to_unicode('charset=cp1252', gunzip(f.read()))[1]
+ with open(join(SAMPLEDIR, 'unexpected-eof-output.txt'), 'rb') as o:
+ expected_text = o.read().decode("utf-8")
+ self.assertEqual(len(text), len(expected_text))
+ self.assertEqual(text, expected_text)
