test_annotate.py: Test 'changes' vs 'diff_metadata' for multi-file diff

The exact patch file (*.diff) to be tested was selected based on examination of mismatch between -/+:count and n_rem, n_mod, n_add values found in notebooks/panel/01-timeline.ipynb, section "Patch size and spread, resampled". The fact that no discrepancies were found in the test (added to the tests/test_annotate.py::test_misc_patchsets_sizes_and_spreads test) most likely measn that the bug was not in annotate.py - thus removal of the task from TODO.md. (It might be the case that unification of the code computing of patch size and spread metrics in .process() fixed the issue instead, or that if fixed the issue for the gather_data.py too.)
ncusi · Oct 28, 2024 · 898b6cf · 898b6cf
1 parent 28cb96c
commit 898b6cf
Show file tree

Hide file tree

Showing 4 changed files with 180 additions and 3 deletions.
diff --git a/TODO.md b/TODO.md
@@ -128,9 +128,6 @@ The result of annotation is saved in JSON files, one per patch / commit.
     - [x] computing patch/diff size and spread, following
           _"[Dissection of a bug dataset: Anatomy of 395 patches from Defects4J][dissection-defects4j-paper]"_
           (and extending it) - independent implementation
-        - [ ] check if the **bug** of added, removed, and modified lines
-              not matching with the number of - and + lines in unified diff
-              is here, or with the `diff-gather-stats` step
         - [x] _patch size_ counting added ('+'), removed ('-'), and **modified** ('!') lines,
               with simplified changed lines detection:<br>
               "Lines are considered modified when sequences of removed lines are straight followed by added lines

diff --git a/tests/test_annotate.py b/tests/test_annotate.py
@@ -383,6 +383,43 @@ def test_misc_patchsets_sizes_and_spreads():
     #pprint(result)
     assert result['n_files'] == 12, "there were 12 changed files in patch"
 
+    file_path = 'tests/test_dataset/tensorflow/87de301db14745ab920d7e32b53d926236a4f2af.diff'
+    patch_set = AnnotatedPatchSet.from_filename(file_path, encoding='utf-8')
+    diff_metadata = patch_set.compute_sizes_and_spreads()
+    changes_data  = patch_set.process(sizes_and_spreads=False)['changes']
+
+    assert len(changes_data) == diff_metadata['n_files'] + diff_metadata['n_file_renames'], \
+        f"number of files matches between 'changes' and 'diff_metadata' for {file_path}"
+
+    # TODO: extract this common-ish code
+    total_m = total_p = 0
+    for file_name, file_data in changes_data.items():
+        for data_key, data_value in file_data.items():
+            if data_key == '-':
+                total_m += len(data_value)
+            elif data_key == '+':
+                total_p += len(data_value)
+
+        ## DEBUG
+        #print(f"{file_name!r}: {total_m=}, {total_p=}")
+
+    ## DEBUG
+    #print(f"TOTAL: {total_m=}, {total_p=}, {total_p+total_m=}")
+    #print(f"META:  "
+    #      f"'n_rem'={diff_metadata['n_rem']}, 2*'n_mod'={2*diff_metadata['n_mod']}, 'n_add'={diff_metadata['n_add']}")
+    #print(f"META:  "
+    #      f"'n_rem'+'n_mod'={diff_metadata['n_rem'] + diff_metadata['n_mod']}, ",
+    #      f"'n_mod'+'n_add'={diff_metadata['n_mod'] + diff_metadata['n_add']}")
+    #print(f"META:  "
+    #      f"'n_rem'+2*'n_mod'+'n_add'={diff_metadata['n_rem'] + 2*diff_metadata['n_mod'] + diff_metadata['n_add']}")
+
+    assert total_m == diff_metadata['n_rem'] + diff_metadata['n_mod'], \
+        f"number of '-' lines matches between 'changes' and 'diff_metadata' in {file_path}"
+    assert total_p == diff_metadata['n_add'] + diff_metadata['n_mod'], \
+        f"number of '+' lines matches between 'changes' and 'diff_metadata' in {file_path}"
+    assert total_m + total_p == diff_metadata['n_rem'] + 2*diff_metadata['n_mod'] + diff_metadata['n_add'], \
+        f"number of -/+ lines matches between 'changes' and 'diff_metadata' in {file_path}"
+
 
 @pytest.mark.parametrize("line_type", [unidiff.LINE_TYPE_REMOVED, unidiff.LINE_TYPE_ADDED])
 def test_AnnotatedPatchedFile(line_type):

diff --git a/tests/test_dataset/tensorflow/.gitattributes b/tests/test_dataset/tensorflow/.gitattributes
@@ -0,0 +1,3 @@
+# do not perform any end-of-line conversions;
+# *.diff files here need to keep LF as end of line, but have embedded CR
+*.diff -text
diff --git a/tests/test_dataset/tensorflow/87de301db14745ab920d7e32b53d926236a4f2af.diff b/tests/test_dataset/tensorflow/87de301db14745ab920d7e32b53d926236a4f2af.diff
@@ -0,0 +1,140 @@
+diff --git a/tensorflow/core/kernels/string_to_number_op.cc b/tensorflow/core/kernels/string_to_number_op.cc
+index d583e4e6bba27d..70dbd15c46cb34 100644
+--- a/tensorflow/core/kernels/string_to_number_op.cc
++++ b/tensorflow/core/kernels/string_to_number_op.cc
+@@ -49,43 +49,15 @@ class StringToNumberOp : public OpKernel {
+     auto output_flat = output_tensor->flat<OutputType>();
+
+     for (int i = 0; i < input_flat.size(); ++i) {
+-      Convert(input_flat(i), &output_flat(i), context);
++      OP_REQUIRES(
++          context,
++          strings::SafeStringToNumeric<OutputType>(input_flat(i).c_str(),
++                                                   &output_flat(i)),
++          errors::InvalidArgument(kErrorMessage, input_flat(i).c_str()));
+     }
+   }
+-
+- private:
+-  void Convert(const string& s, OutputType* output_data,
+-               OpKernelContext* context);
+ };
+
+-template <>
+-void StringToNumberOp<float>::Convert(const string& s, float* output_data,
+-                                      OpKernelContext* context) {
+-  OP_REQUIRES(context, strings::safe_strtof(s.c_str(), output_data),
+-              errors::InvalidArgument(kErrorMessage, s));
+-}
+-
+-template <>
+-void StringToNumberOp<double>::Convert(const string& s, double* output_data,
+-                                       OpKernelContext* context) {
+-  OP_REQUIRES(context, strings::safe_strtod(s.c_str(), output_data),
+-              errors::InvalidArgument(kErrorMessage, s));
+-}
+-
+-template <>
+-void StringToNumberOp<int32>::Convert(const string& s, int32* output_data,
+-                                      OpKernelContext* context) {
+-  OP_REQUIRES(context, strings::safe_strto32(s, output_data),
+-              errors::InvalidArgument(kErrorMessage, s));
+-}
+-
+-template <>
+-void StringToNumberOp<int64>::Convert(const string& s, int64* output_data,
+-                                      OpKernelContext* context) {
+-  OP_REQUIRES(context, strings::safe_strto64(s, output_data),
+-              errors::InvalidArgument(kErrorMessage, s));
+-}
+-
+ // Registers the currently supported output types.
+ #define REGISTER(type)                                           \
+   REGISTER_KERNEL_BUILDER(Name("StringToNumber")                 \
+diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
+index 31b6abbac682bf..3c45b902740199 100644
+--- a/tensorflow/core/lib/strings/numbers.h
++++ b/tensorflow/core/lib/strings/numbers.h
+@@ -122,6 +122,38 @@ bool safe_strtof(const char* str, float* value);
+ // Values may be rounded on over- and underflow.
+ bool safe_strtod(const char* str, double* value);
+
++inline bool ProtoParseNumeric(StringPiece s, int32* value) {
++  return safe_strto32(s, value);
++}
++
++inline bool ProtoParseNumeric(StringPiece s, uint32* value) {
++  return safe_strtou32(s, value);
++}
++
++inline bool ProtoParseNumeric(StringPiece s, int64* value) {
++  return safe_strto64(s, value);
++}
++
++inline bool ProtoParseNumeric(StringPiece s, uint64* value) {
++  return safe_strtou64(s, value);
++}
++
++inline bool ProtoParseNumeric(StringPiece s, float* value) {
++  return safe_strtof(s.ToString().c_str(), value);
++}
++
++inline bool ProtoParseNumeric(StringPiece s, double* value) {
++  return safe_strtod(s.ToString().c_str(), value);
++}
++
++// Convert strings to number of type T.
++// Leading and trailing spaces are allowed.
++// Values may be rounded on over- and underflow.
++template <typename T>
++bool SafeStringToNumeric(StringPiece s, T* value) {
++  return ProtoParseNumeric(s, value);
++}
++
+ // Converts from an int64 to a human readable string representing the
+ // same number, using decimal powers.  e.g. 1200000 -> "1.20M".
+ string HumanReadableNum(int64 value);
+diff --git a/tensorflow/core/lib/strings/proto_text_util.h b/tensorflow/core/lib/strings/proto_text_util.h
+index 3d0c6e4a376268..ed6d0af0105c37 100644
+--- a/tensorflow/core/lib/strings/proto_text_util.h
++++ b/tensorflow/core/lib/strings/proto_text_util.h
+@@ -118,30 +118,6 @@ class ProtoTextOutput {
+   TF_DISALLOW_COPY_AND_ASSIGN(ProtoTextOutput);
+ };
+
+-inline bool ProtoParseNumeric(StringPiece s, int32* value) {
+-  return ::tensorflow::strings::safe_strto32(s, value);
+-}
+-
+-inline bool ProtoParseNumeric(StringPiece s, uint32* value) {
+-  return ::tensorflow::strings::safe_strtou32(s, value);
+-}
+-
+-inline bool ProtoParseNumeric(StringPiece s, int64* value) {
+-  return ::tensorflow::strings::safe_strto64(s, value);
+-}
+-
+-inline bool ProtoParseNumeric(StringPiece s, uint64* value) {
+-  return ::tensorflow::strings::safe_strtou64(s, value);
+-}
+-
+-inline bool ProtoParseNumeric(StringPiece s, float* value) {
+-  return ::tensorflow::strings::safe_strtof(s.ToString().c_str(), value);
+-}
+-
+-inline bool ProtoParseNumeric(StringPiece s, double* value) {
+-  return ::tensorflow::strings::safe_strtod(s.ToString().c_str(), value);
+-}
+-
+ inline void ProtoSpaceAndComments(Scanner* scanner) {
+   for (;;) {
+     scanner->AnySpace();
+@@ -174,7 +150,7 @@ bool ProtoParseNumericFromScanner(Scanner* scanner, T* value) {
+   }
+
+   ProtoSpaceAndComments(scanner);
+-  return ProtoParseNumeric(numeric_str, value);
++  return SafeStringToNumeric<T>(numeric_str, value);
+ }
+
+ // Parse the next boolean value from <scanner>, returning false if parsing