gather_data.py: Add 'timeline' subcommand
The structure of the data returned by the 'timeline' subcommand is
different from that of the existing subcommands: instead of a dict,
it returns (and saves) a list.
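For illustration, the shape difference is roughly as follows (a sketch only;
keys abbreviated, per-patch data elided):

    # existing subcommands (e.g. lines_stats()) gather a nested dict:
    {bug_id: {patch_id: {...}}}
    # the new 'timeline' subcommand gathers a flat list of records:
    [{"bug_id": ..., "patch_id": ..., ...}, ...]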

This required adding (and testing) the AnnotatedBugDataset.gather_data_list()
method and the map_diff_to_timeline() function.

The (only) *.json file in tests/test_dataset_annotated/ was enhanced
with appropriate 'commit_metadata' information, taken from the e54746bdf7d5c8...
commit in the https://github.com/matrix-org/synapse repository.

Some additional tests were performed 'by hand' by modifying test code.
jnareb committed Sep 28, 2024
1 parent e1b3101 commit 03160a0
Showing 4 changed files with 283 additions and 3 deletions.
211 changes: 210 additions & 1 deletion src/diffannotator/gather_data.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
import json
import os
from collections import Counter
from collections import Counter, defaultdict
from pathlib import Path
from types import SimpleNamespace
from typing import Any, List, Optional, TypeVar
@@ -235,6 +235,32 @@ def gather_data_dict(self, bug_dict_mapper: Callable[..., dict],
combined_results |= {bug_id: bug_results}
return combined_results

def gather_data_list(self, bug_to_dict_mapper: Callable[..., dict],
annotations_dir: str = Bug.DEFAULT_ANNOTATIONS_DIR,
**mapper_kwargs) -> list:
"""
Gathers dataset data via processing each bug using AnnotatedBug class and provided function
:param bug_to_dict_mapper: function to map diff annotations to dictionary
:param annotations_dir: subdirectory where annotations are; path
to annotation in a dataset is <bug_id>/<annotations_dir>/<patch_data>.json
:return: list of bug dictionaries
"""
combined_results = []
for bug_id in tqdm.tqdm(self.bugs, desc="patchset", position=2, leave=False):
bug_path = self._path / bug_id
bug = AnnotatedBug(bug_path, annotations_dir=annotations_dir)
bug_results = bug.gather_data_dict(bug_to_dict_mapper, **mapper_kwargs)
# NOTE: could have used `+=` instead of `.append()`
for patch_id, patch_data in bug_results.items():
combined_results.append({
'bug_id': bug_id,
'patch_id': patch_id,
**patch_data
})

return combined_results
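
# A minimal usage sketch (the dataset path and the expected record are
# taken from this repository's test data, see tests/test_gather_data.py):
#
#     dataset = AnnotatedBugDataset('tests/test_dataset_annotated')
#     rows = dataset.gather_data_list(map_diff_to_timeline)
#     # rows[0] == {'bug_id': 'CVE-2021-21332', 'patch_id': '<sha1>.json', ...}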


def map_diff_to_purpose_dict(_diff_file_path: str, data: dict) -> dict:
"""Extracts file purposes of changed file in a diff annotation
@@ -346,6 +372,123 @@ def map_diff_to_lines_stats(annotation_file_basename: str,
return result


def map_diff_to_timeline(annotation_file_basename: str,
annotation_data: dict) -> dict:
"""Mapper passed by timeline() to *.gather_data_dict() method
It gathers information about file, and counts information about
changed lines (in pre-image i.e. "-", in post-image i.e. "+",...).
:param annotation_file_basename: name of JSON file with annotation data
:param annotation_data: parsed annotations data, retrieved from
`annotation_file_basename` file.
"""
# Example fragment of annotation file:
#
# {
# "commit_metadata": {
# "id": "e54746bdf7d5c831eabe4dcea76a7626f1de73df",
# "parents": ["93b61589b0bdb3845ee839e9c2a4e1adb06bd483"],
# "tree": "262d65e6c945adfa2d64bfe51e70c09d2e1d7d06",
# "author": {
# "author": "Patrick Cloke <[email protected]>",
# "name": "Patrick Cloke",
# "email": "[email protected]",
# "timestamp": 1611763190,
# "tz_info": "-0500"
# },
# "committer": {
# "committer": "GitHub <[email protected]>",
# "name": "GitHub",
# "email": "[email protected]",
# "timestamp": 1611763190,
# "tz_info": "-0500"
# },
# },
# "third_party/xla/xla/service/gpu/ir_emitter_unnested.cc": {
# "language": "C++",
# "type": "programming",
# "purpose": "programming",
# "+": [
# {
# "id": 4,
# "type": "code",
# "purpose": "programming",
# "tokens": […],
# },
# {"id":…},
# ],
# "-": […],
# },…
# }

# TODO: add logging (info or debug)
result = Counter()
per_commit_info = {}

# gather summary data from all changed files
for filename, file_data in annotation_data.items():
# NOTE: each file should be present only once for a given patch/commit

if filename == 'commit_metadata':
# this is usually commit metadata, though a changed file could have this name
for metadata_key in ('author', 'committer'):
if metadata_key not in file_data:
continue
authorship_data = file_data[metadata_key]
for authorship_key in ('timestamp', 'tz_info', 'name', 'email'):
if authorship_key in authorship_data:
per_commit_info[f"{metadata_key}.{authorship_key}"] = file_data[metadata_key][authorship_key]

if 'parents' in file_data:
per_commit_info['n_parents'] = len(file_data['parents'])

if 'purpose' not in file_data:
# commit metadata, skip processing it as a file
continue
else:
print(f" warning: found file named 'commit_metadata' in {annotation_file_basename}")

result['file_names'] += 1

# gather per-file information, and aggregate it
per_file_data = {
key: value for key, value in file_data.items()
if key in ("language", "type", "purpose")
}
per_file_data.update({
"+": Counter(),
"-": Counter(),
})

for line_type in "+-": # str used as iterable
# the diff might not have any removed lines, or any added lines
if line_type not in file_data:
continue

for line in file_data[line_type]:
per_file_data[line_type]["count"] += 1 # count of added/removed lines

for data_type in ["type", "purpose"]: # ignore "id" and "tokens" fields
line_data = line[data_type]
per_file_data[line_type][f"{data_type}.{line_data}"] += 1

for key, value in per_file_data.items():
if isinstance(value, (dict, defaultdict, Counter)):
for sub_key, sub_value in value.items():
# don't expect anything deeper
result[f"{key}:{sub_key}"] += sub_value
elif isinstance(value, int):
result[key] += value
else:
result[f"{key}:{value}"] += 1

result = dict(result, **per_commit_info)

return result
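
# For the example annotation fragment above, the returned flat dict would
# look roughly like this (a sketch; counts are illustrative, not computed):
#
#     {'file_names': 1,
#      'language:C++': 1, 'type:programming': 1, 'purpose:programming': 1,
#      '+:count': 2, '+:type.code': 2, '+:purpose.programming': 2,
#      'author.timestamp': 1611763190, 'author.tz_info': '-0500',
#      'author.name': 'Patrick Cloke', ..., 'n_parents': 1}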



# TODO: make it common (move it to 'utils' module or '__init__.py' file)
def save_result(result: Any, result_json: Path) -> None:
"""Serialize `result` and save it in `result_json` JSON file
@@ -536,5 +679,71 @@ def lines_stats(
save_result(result, output_file)


@app.command(help="Gather timeline with per-bug count of lines per type")
def timeline(
ctx: typer.Context, # common arguments like --annotations-dir
output_file: Annotated[
Path,
typer.Argument(
dir_okay=False,
help="file to write gathered results to"
)
],
datasets: Annotated[
List[Path],
typer.Argument(
exists=True,
file_okay=False,
dir_okay=True,
readable=True,
writable=False,
help="list of dirs with datasets to process"
)
],

) -> None:
# TODO: extract common part of the command description
"""Calculate timeline of bugs with per-bug count of different types of lines
For each bug (bugfix commit), compute the count of lines removed and added
by the patch (commit) in all changed files, keeping separate counts for
lines with different types, and (separately) with different purposes.
The gathered data is then saved in a format easy to load into dataframe.
Each DATASET is expected to be generated by annotating dataset or creating
annotations from a repository, and should be an existing directory with
the following structure:
<dataset_directory>/<bug_directory>/annotation/<patch_file>.json
Each dataset can consist of many BUGs, each BUG should include JSON
file with its diff/patch annotations as *.json file in 'annotation/'
subdirectory (by default).
Saves gathered timeline results to the OUTPUT_FILE.
"""
result = {}

# often there is only one dataset, therefore joblib support is not needed
for dataset in tqdm.tqdm(datasets, desc='dataset'):
tqdm.tqdm.write(f"Dataset {dataset}")
annotated_bugs = AnnotatedBugDataset(dataset)
data = annotated_bugs.gather_data_list(map_diff_to_timeline,
annotations_dir=ctx.obj.annotations_dir)

# sanity check
if not data:
tqdm.tqdm.write(" warning: no data extracted from this dataset")
else:
if 'author.timestamp' not in data[0]:
tqdm.tqdm.write(" warning: dataset does not include time information")

result[dataset.name] = data

# TODO: support other formats than JSON
save_result(result, output_file)


if __name__ == "__main__":
app()
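The saved JSON output is straightforward to load into a dataframe, as the
timeline command's docstring promises. A minimal sketch (assuming the
subcommand was run with output file 'timeline.json' on the
'tests/test_dataset_annotated' dataset; pandas is not a dependency of this code):

    import json
    import pandas as pd

    with open('timeline.json') as f:
        result = json.load(f)

    # each dataset name maps to a flat list of per-patch records
    df = pd.DataFrame.from_records(result['test_dataset_annotated'])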
23 changes: 23 additions & 0 deletions tests/test_cli.py
@@ -479,3 +479,26 @@ def test_gather_data(tmp_path: Path):

# DEBUG
#print(json_path.read_text())

### for 'timeline'

#dataset_dir_annotations = 'tests/test_dataset_annotated'
json_path = Path(f"{dataset_dir_annotations}.timeline.json")
result = runner.invoke(gather_app, [
# select subcommand
"timeline",
# pass options and arguments to subcommand
f"{json_path}",
f"{dataset_dir_annotations}",
])

#print(result.stdout)
if result.exit_code != 0:
print(result.stdout)

assert result.exit_code == 0, \
"gather app runs 'timeline' subcommand on generated annotations without errors"
assert json_path.is_file(), \
"the output 'timeline' file that the app was asked to write exists (it was created)"
assert json_path.stat().st_size > 0, \
"generated 'timeline' JSON file with results is not empty"
21 changes: 20 additions & 1 deletion tests/test_dataset_annotated/CVE-2021-21332/annotation/e54746bdf7d5c831eabe4dcea76a7626f1de73df.json
@@ -1,4 +1,23 @@
{
"commit_metadata": {
"id": "e54746bdf7d5c831eabe4dcea76a7626f1de73df",
"parents": ["93b61589b0bdb3845ee839e9c2a4e1adb06bd483"],
"tree": "262d65e6c945adfa2d64bfe51e70c09d2e1d7d06",
"author": {
"author": "Patrick Cloke <[email protected]>",
"name": "Patrick Cloke",
"email": "[email protected]",
"timestamp": 1611763190,
"tz_info":"-0500"
},
"committer": {
"committer": "GitHub <[email protected]>",
"name": "GitHub",
"email": "[email protected]",
"timestamp": 1611763190,
"tz_info": "-0500"
}
},
"UPGRADE.rst": {
"language": "reStructuredText",
"type": "prose",
@@ -7547,4 +7566,4 @@
}
]
}
}
}
31 changes: 30 additions & 1 deletion tests/test_gather_data.py
@@ -1,7 +1,8 @@
from collections import Counter
from pathlib import Path

from diffannotator.gather_data import PurposeCounterResults, AnnotatedBugDataset, map_diff_to_purpose_dict
from diffannotator.gather_data import (PurposeCounterResults, AnnotatedBugDataset,
map_diff_to_purpose_dict, map_diff_to_timeline)


def test_AnnotatedBugDataset_with_PurposeCounterResults():
@@ -30,6 +31,34 @@ def test_AnnotatedBugDataset_with_dict_mapping():
'documentation']


def test_AnnotatedBugDataset_gather_data_list():
dataset_path = 'tests/test_dataset_annotated'
annotated_bug_dataset = AnnotatedBugDataset(dataset_path)
# TODO?: inject commit metadata, if missing
#print(f"{annotated_bug_dataset.bugs=}")

data_list = annotated_bug_dataset.gather_data_list(map_diff_to_timeline)

# DEBUG
#from pprint import pprint
#pprint(data_list)

# NOTE: change if the test data changes!
assert len(data_list) == 1, \
"only one annotation file was present"
assert data_list[0]['bug_id'] == 'CVE-2021-21332', \
"found expected bug id"

# NOTE: change if the test data changes!
annotation_file = 'e54746bdf7d5c831eabe4dcea76a7626f1de73df.json'
assert data_list[0]['patch_id'] == annotation_file, \
"found expected annotation file"

diff_data = data_list[0]
assert {'file_names', '+:count', '-:count'} <= set(diff_data.keys()), \
"expected keys present in extracted stats"


def test_PurposeCounterResults_create():
data = {
"synapse/push/mailer.py": {
