Skip to content

Commit

Permalink
test_annotate.py: Test 'changes' vs 'diff_metadata' for multi-file diff
Browse files Browse the repository at this point in the history
The exact patch file (*.diff) to be tested was selected based on
examination of mismatch between -/+:count and n_rem, n_mod, n_add
values found in notebooks/panel/01-timeline.ipynb, section "Patch size
and spread, resampled".

The fact that no discrepancies were found in the test (added to the
tests/test_annotate.py::test_misc_patchsets_sizes_and_spreads test)
most likely measn that the bug was not in annotate.py - thus removal
of the task from TODO.md.

(It might be the case that unification of the code computing of patch
size and spread metrics in .process() fixed the issue instead, or that
if fixed the issue for the gather_data.py too.)
  • Loading branch information
jnareb committed Oct 28, 2024
1 parent 28cb96c commit 898b6cf
Show file tree
Hide file tree
Showing 4 changed files with 180 additions and 3 deletions.
3 changes: 0 additions & 3 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,6 @@ The result of annotation is saved in JSON files, one per patch / commit.
- [x] computing patch/diff size and spread, following
_"[Dissection of a bug dataset: Anatomy of 395 patches from Defects4J][dissection-defects4j-paper]"_
(and extending it) - independent implementation
- [ ] check if the **bug** of added, removed, and modified lines
not matching with the number of - and + lines in unified diff
is here, or with the `diff-gather-stats` step
- [x] _patch size_ counting added ('+'), removed ('-'), and **modified** ('!') lines,
with simplified changed lines detection:<br>
"Lines are considered modified when sequences of removed lines are straight followed by added lines
Expand Down
37 changes: 37 additions & 0 deletions tests/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,43 @@ def test_misc_patchsets_sizes_and_spreads():
#pprint(result)
assert result['n_files'] == 12, "there were 12 changed files in patch"

file_path = 'tests/test_dataset/tensorflow/87de301db14745ab920d7e32b53d926236a4f2af.diff'
patch_set = AnnotatedPatchSet.from_filename(file_path, encoding='utf-8')
diff_metadata = patch_set.compute_sizes_and_spreads()
changes_data = patch_set.process(sizes_and_spreads=False)['changes']

assert len(changes_data) == diff_metadata['n_files'] + diff_metadata['n_file_renames'], \
f"number of files matches between 'changes' and 'diff_metadata' for {file_path}"

# TODO: extract this common-ish code
total_m = total_p = 0
for file_name, file_data in changes_data.items():
for data_key, data_value in file_data.items():
if data_key == '-':
total_m += len(data_value)
elif data_key == '+':
total_p += len(data_value)

## DEBUG
#print(f"{file_name!r}: {total_m=}, {total_p=}")

## DEBUG
#print(f"TOTAL: {total_m=}, {total_p=}, {total_p+total_m=}")
#print(f"META: "
# f"'n_rem'={diff_metadata['n_rem']}, 2*'n_mod'={2*diff_metadata['n_mod']}, 'n_add'={diff_metadata['n_add']}")
#print(f"META: "
# f"'n_rem'+'n_mod'={diff_metadata['n_rem'] + diff_metadata['n_mod']}, ",
# f"'n_mod'+'n_add'={diff_metadata['n_mod'] + diff_metadata['n_add']}")
#print(f"META: "
# f"'n_rem'+2*'n_mod'+'n_add'={diff_metadata['n_rem'] + 2*diff_metadata['n_mod'] + diff_metadata['n_add']}")

assert total_m == diff_metadata['n_rem'] + diff_metadata['n_mod'], \
f"number of '-' lines matches between 'changes' and 'diff_metadata' in {file_path}"
assert total_p == diff_metadata['n_add'] + diff_metadata['n_mod'], \
f"number of '+' lines matches between 'changes' and 'diff_metadata' in {file_path}"
assert total_m + total_p == diff_metadata['n_rem'] + 2*diff_metadata['n_mod'] + diff_metadata['n_add'], \
f"number of -/+ lines matches between 'changes' and 'diff_metadata' in {file_path}"


@pytest.mark.parametrize("line_type", [unidiff.LINE_TYPE_REMOVED, unidiff.LINE_TYPE_ADDED])
def test_AnnotatedPatchedFile(line_type):
Expand Down
3 changes: 3 additions & 0 deletions tests/test_dataset/tensorflow/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# do not perform any end-of-line conversions;
# *.diff files here need to keep LF as end of line, but have embedded CR
*.diff -text
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
diff --git a/tensorflow/core/kernels/string_to_number_op.cc b/tensorflow/core/kernels/string_to_number_op.cc
index d583e4e6bba27d..70dbd15c46cb34 100644
--- a/tensorflow/core/kernels/string_to_number_op.cc
+++ b/tensorflow/core/kernels/string_to_number_op.cc
@@ -49,43 +49,15 @@ class StringToNumberOp : public OpKernel {
auto output_flat = output_tensor->flat<OutputType>();

for (int i = 0; i < input_flat.size(); ++i) {
- Convert(input_flat(i), &output_flat(i), context);
+ OP_REQUIRES(
+ context,
+ strings::SafeStringToNumeric<OutputType>(input_flat(i).c_str(),
+ &output_flat(i)),
+ errors::InvalidArgument(kErrorMessage, input_flat(i).c_str()));
}
}
-
- private:
- void Convert(const string& s, OutputType* output_data,
- OpKernelContext* context);
};

-template <>
-void StringToNumberOp<float>::Convert(const string& s, float* output_data,
- OpKernelContext* context) {
- OP_REQUIRES(context, strings::safe_strtof(s.c_str(), output_data),
- errors::InvalidArgument(kErrorMessage, s));
-}
-
-template <>
-void StringToNumberOp<double>::Convert(const string& s, double* output_data,
- OpKernelContext* context) {
- OP_REQUIRES(context, strings::safe_strtod(s.c_str(), output_data),
- errors::InvalidArgument(kErrorMessage, s));
-}
-
-template <>
-void StringToNumberOp<int32>::Convert(const string& s, int32* output_data,
- OpKernelContext* context) {
- OP_REQUIRES(context, strings::safe_strto32(s, output_data),
- errors::InvalidArgument(kErrorMessage, s));
-}
-
-template <>
-void StringToNumberOp<int64>::Convert(const string& s, int64* output_data,
- OpKernelContext* context) {
- OP_REQUIRES(context, strings::safe_strto64(s, output_data),
- errors::InvalidArgument(kErrorMessage, s));
-}
-
// Registers the currently supported output types.
#define REGISTER(type) \
REGISTER_KERNEL_BUILDER(Name("StringToNumber") \
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index 31b6abbac682bf..3c45b902740199 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -122,6 +122,38 @@ bool safe_strtof(const char* str, float* value);
// Values may be rounded on over- and underflow.
bool safe_strtod(const char* str, double* value);

+inline bool ProtoParseNumeric(StringPiece s, int32* value) {
+ return safe_strto32(s, value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, uint32* value) {
+ return safe_strtou32(s, value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, int64* value) {
+ return safe_strto64(s, value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, uint64* value) {
+ return safe_strtou64(s, value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, float* value) {
+ return safe_strtof(s.ToString().c_str(), value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, double* value) {
+ return safe_strtod(s.ToString().c_str(), value);
+}
+
+// Convert strings to number of type T.
+// Leading and trailing spaces are allowed.
+// Values may be rounded on over- and underflow.
+template <typename T>
+bool SafeStringToNumeric(StringPiece s, T* value) {
+ return ProtoParseNumeric(s, value);
+}
+
// Converts from an int64 to a human readable string representing the
// same number, using decimal powers. e.g. 1200000 -> "1.20M".
string HumanReadableNum(int64 value);
diff --git a/tensorflow/core/lib/strings/proto_text_util.h b/tensorflow/core/lib/strings/proto_text_util.h
index 3d0c6e4a376268..ed6d0af0105c37 100644
--- a/tensorflow/core/lib/strings/proto_text_util.h
+++ b/tensorflow/core/lib/strings/proto_text_util.h
@@ -118,30 +118,6 @@ class ProtoTextOutput {
TF_DISALLOW_COPY_AND_ASSIGN(ProtoTextOutput);
};

-inline bool ProtoParseNumeric(StringPiece s, int32* value) {
- return ::tensorflow::strings::safe_strto32(s, value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, uint32* value) {
- return ::tensorflow::strings::safe_strtou32(s, value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, int64* value) {
- return ::tensorflow::strings::safe_strto64(s, value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, uint64* value) {
- return ::tensorflow::strings::safe_strtou64(s, value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, float* value) {
- return ::tensorflow::strings::safe_strtof(s.ToString().c_str(), value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, double* value) {
- return ::tensorflow::strings::safe_strtod(s.ToString().c_str(), value);
-}
-
inline void ProtoSpaceAndComments(Scanner* scanner) {
for (;;) {
scanner->AnySpace();
@@ -174,7 +150,7 @@ bool ProtoParseNumericFromScanner(Scanner* scanner, T* value) {
}

ProtoSpaceAndComments(scanner);
- return ProtoParseNumeric(numeric_str, value);
+ return SafeStringToNumeric<T>(numeric_str, value);
}

// Parse the next boolean value from <scanner>, returning false if parsing

0 comments on commit 898b6cf

Please sign in to comment.