gather_data.py: Add 'timeline' subcommand
The structure of the data returned by the 'timeline' subcommand is
different from that of the existing subcommands: instead of a dict,
it returns (and saves) a list.
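For illustration, the shape difference is roughly as follows (a sketch only;
keys abbreviated, per-patch data elided):

    # existing subcommands (e.g. lines_stats()) gather a nested dict:
    {bug_id: {patch_id: {...}}}
    # the new 'timeline' subcommand gathers a flat list of records:
    [{"bug_id": ..., "patch_id": ..., ...}, ...]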

This required adding (and testing) the AnnotatedBugDataset.gather_data_list()
method and the map_diff_to_timeline() function.

The (only) *.json file in tests/test_dataset_annotated/ was enhanced
with appropriate 'commit_metadata' information, taken from the e54746bdf7d5c8...
commit in the https://github.com/matrix-org/synapse repository.

Some additional tests were performed 'by hand' by modifying test code.
jnareb committed Sep 28, 2024
1 parent e1b3101 commit 03160a0
Showing 4 changed files with 283 additions and 3 deletions.
211 changes: 210 additions & 1 deletion src/diffannotator/gather_data.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
import json
import os
from collections import Counter
from collections import Counter, defaultdict
from pathlib import Path
from types import SimpleNamespace
from typing import Any, List, Optional, TypeVar
@@ -235,6 +235,32 @@ def gather_data_dict(self, bug_dict_mapper: Callable[..., dict],
combined_results |= {bug_id: bug_results}
return combined_results

def gather_data_list(self, bug_to_dict_mapper: Callable[..., dict],
annotations_dir: str = Bug.DEFAULT_ANNOTATIONS_DIR,
**mapper_kwargs) -> list:
"""
Gathers dataset data via processing each bug using AnnotatedBug class and provided function
:param bug_to_dict_mapper: function to map diff annotations to dictionary
:param annotations_dir: subdirectory where annotations are; path
to annotation in a dataset is <bug_id>/<annotations_dir>/<patch_data>.json
:return: list of bug dictionaries
"""
combined_results = []
for bug_id in tqdm.tqdm(self.bugs, desc="patchset", position=2, leave=False):
bug_path = self._path / bug_id
bug = AnnotatedBug(bug_path, annotations_dir=annotations_dir)
bug_results = bug.gather_data_dict(bug_to_dict_mapper, **mapper_kwargs)
# NOTE: could have used `+=` instead of `.append()`
for patch_id, patch_data in bug_results.items():
combined_results.append({
'bug_id': bug_id,
'patch_id': patch_id,
**patch_data
})

return combined_results
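
# A minimal usage sketch (the dataset path and the expected record are
# taken from this repository's test data, see tests/test_gather_data.py):
#
#     dataset = AnnotatedBugDataset('tests/test_dataset_annotated')
#     rows = dataset.gather_data_list(map_diff_to_timeline)
#     # rows[0] == {'bug_id': 'CVE-2021-21332', 'patch_id': '<sha1>.json', ...}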


def map_diff_to_purpose_dict(_diff_file_path: str, data: dict) -> dict:
"""Extracts file purposes of changed file in a diff annotation
@@ -346,6 +372,123 @@ def map_diff_to_lines_stats(annotation_file_basename: str,
return result


def map_diff_to_timeline(annotation_file_basename: str,
annotation_data: dict) -> dict:
"""Mapper passed by timeline() to *.gather_data_dict() method
It gathers information about file, and counts information about
changed lines (in pre-image i.e. "-", in post-image i.e. "+",...).
:param annotation_file_basename: name of JSON file with annotation data
:param annotation_data: parsed annotations data, retrieved from
`annotation_file_basename` file.
"""
# Example fragment of annotation file:
#
# {
# "commit_metadata": {
# "id": "e54746bdf7d5c831eabe4dcea76a7626f1de73df",
# "parents": ["93b61589b0bdb3845ee839e9c2a4e1adb06bd483"],
# "tree": "262d65e6c945adfa2d64bfe51e70c09d2e1d7d06",
# "author": {
# "author": "Patrick Cloke <[email protected]>",
# "name": "Patrick Cloke",
# "email": "[email protected]",
# "timestamp": 1611763190,
# "tz_info": "-0500"
# },
# "committer": {
# "committer": "GitHub <[email protected]>",
# "name": "GitHub",
# "email": "[email protected]",
# "timestamp": 1611763190,
# "tz_info": "-0500"
# },
# },
# "third_party/xla/xla/service/gpu/ir_emitter_unnested.cc": {
# "language": "C++",
# "type": "programming",
# "purpose": "programming",
# "+": [
# {
# "id": 4,
# "type": "code",
# "purpose": "programming",
# "tokens": […],
# },
# {"id":…},
# ],
# "-": […],
# },…
# }

# TODO: add logging (info or debug)
result = Counter()
per_commit_info = {}

# gather summary data from all changed files
for filename, file_data in annotation_data.items():
# NOTE: each file should be present only once for a given patch/commit

if filename == 'commit_metadata':
# this is usually commit metadata, though a changed file could have this name
for metadata_key in ('author', 'committer'):
if metadata_key not in file_data:
continue
authorship_data = file_data[metadata_key]
for authorship_key in ('timestamp', 'tz_info', 'name', 'email'):
if authorship_key in authorship_data:
per_commit_info[f"{metadata_key}.{authorship_key}"] = file_data[metadata_key][authorship_key]

if 'parents' in file_data:
per_commit_info['n_parents'] = len(file_data['parents'])

if 'purpose' not in file_data:
# commit metadata, skip processing it as a file
continue
else:
print(f" warning: found file named 'commit_metadata' in {annotation_file_basename}")

result['file_names'] += 1

# gather per-file information, and aggregate it
per_file_data = {
key: value for key, value in file_data.items()
if key in ("language", "type", "purpose")
}
per_file_data.update({
"+": Counter(),
"-": Counter(),
})

for line_type in "+-": # str used as iterable
# the diff might not have any removed lines, or any added lines
if line_type not in file_data:
continue

for line in file_data[line_type]:
per_file_data[line_type]["count"] += 1 # count of added/removed lines

for data_type in ["type", "purpose"]: # ignore "id" and "tokens" fields
line_data = line[data_type]
per_file_data[line_type][f"{data_type}.{line_data}"] += 1

for key, value in per_file_data.items():
if isinstance(value, (dict, defaultdict, Counter)):
for sub_key, sub_value in value.items():
# don't expect anything deeper
result[f"{key}:{sub_key}"] += sub_value
elif isinstance(value, int):
result[key] += value
else:
result[f"{key}:{value}"] += 1

result = dict(result, **per_commit_info)

return result
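
# For the example annotation fragment above, the returned flat dict would
# look roughly like this (a sketch; counts are illustrative, not computed):
#
#     {'file_names': 1,
#      'language:C++': 1, 'type:programming': 1, 'purpose:programming': 1,
#      '+:count': 2, '+:type.code': 2, '+:purpose.programming': 2,
#      'author.timestamp': 1611763190, 'author.tz_info': '-0500',
#      'author.name': 'Patrick Cloke', ..., 'n_parents': 1}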



# TODO: make it common (move it to 'utils' module or '__init__.py' file)
def save_result(result: Any, result_json: Path) -> None:
"""Serialize `result` and save it in `result_json` JSON file
@@ -536,5 +679,71 @@ def lines_stats(
save_result(result, output_file)


@app.command(help="Gather timeline with per-bug count of lines per type")
def timeline(
ctx: typer.Context, # common arguments like --annotations-dir
output_file: Annotated[
Path,
typer.Argument(
dir_okay=False,
help="file to write gathered results to"
)
],
datasets: Annotated[
List[Path],
typer.Argument(
exists=True,
file_okay=False,
dir_okay=True,
readable=True,
writable=False,
help="list of dirs with datasets to process"
)
],

) -> None:
# TODO: extract common part of the command description
"""Calculate timeline of bugs with per-bug count of different types of lines
For each bug (bugfix commit), compute the count of lines removed and added
by the patch (commit) in all changed files, keeping separate counts for
lines with different types, and (separately) with different purposes.
The gathered data is then saved in a format easy to load into dataframe.
Each DATASET is expected to be generated by annotating dataset or creating
annotations from a repository, and should be an existing directory with
the following structure:
<dataset_directory>/<bug_directory>/annotation/<patch_file>.json
Each dataset can consist of many BUGs, each BUG should include JSON
file with its diff/patch annotations as *.json file in 'annotation/'
subdirectory (by default).
Saves gathered timeline results to the OUTPUT_FILE.
"""
result = {}

# often there is only one dataset, therefore joblib support is not needed
for dataset in tqdm.tqdm(datasets, desc='dataset'):
tqdm.tqdm.write(f"Dataset {dataset}")
annotated_bugs = AnnotatedBugDataset(dataset)
data = annotated_bugs.gather_data_list(map_diff_to_timeline,
annotations_dir=ctx.obj.annotations_dir)

# sanity check
if not data:
tqdm.tqdm.write(" warning: no data extracted from this dataset")
else:
if 'author.timestamp' not in data[0]:
tqdm.tqdm.write(" warning: dataset does not include time information")

result[dataset.name] = data

# TODO: support other formats than JSON
save_result(result, output_file)


if __name__ == "__main__":
app()
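The saved JSON output is straightforward to load into a dataframe, as the
timeline command's docstring promises. A minimal sketch (assuming the
subcommand was run with output file 'timeline.json' on the
'tests/test_dataset_annotated' dataset; pandas is not a dependency of this code):

    import json
    import pandas as pd

    with open('timeline.json') as f:
        result = json.load(f)

    # each dataset name maps to a flat list of per-patch records
    df = pd.DataFrame.from_records(result['test_dataset_annotated'])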
23 changes: 23 additions & 0 deletions tests/test_cli.py
@@ -479,3 +479,26 @@ def test_gather_data(tmp_path: Path):

# DEBUG
#print(json_path.read_text())

### for 'timeline'

#dataset_dir_annotations = 'tests/test_dataset_annotated'
json_path = Path(f"{dataset_dir_annotations}.timeline.json")
result = runner.invoke(gather_app, [
# select subcommand
"timeline",
# pass options and arguments to subcommand
f"{json_path}",
f"{dataset_dir_annotations}",
])

#print(result.stdout)
if result.exit_code != 0:
print(result.stdout)

assert result.exit_code == 0, \
"gather app runs 'timeline' subcommand on generated annotations without errors"
assert json_path.is_file(), \
"the output 'timeline' file that the app was asked to write exists (it was created)"
assert json_path.stat().st_size > 0, \
"generated 'timeline' JSON file with results is not empty"
21 changes: 20 additions & 1 deletion tests/test_dataset_annotated/CVE-2021-21332/annotation/e54746bdf7d5c831eabe4dcea76a7626f1de73df.json
@@ -1,4 +1,23 @@
{
"commit_metadata": {
"id": "e54746bdf7d5c831eabe4dcea76a7626f1de73df",
"parents": ["93b61589b0bdb3845ee839e9c2a4e1adb06bd483"],
"tree": "262d65e6c945adfa2d64bfe51e70c09d2e1d7d06",
"author": {
"author": "Patrick Cloke <[email protected]>",
"name": "Patrick Cloke",
"email": "[email protected]",
"timestamp": 1611763190,
"tz_info":"-0500"
},
"committer": {
"committer": "GitHub <[email protected]>",
"name": "GitHub",
"email": "[email protected]",
"timestamp": 1611763190,
"tz_info": "-0500"
}
},
"UPGRADE.rst": {
"language": "reStructuredText",
"type": "prose",
@@ -7547,4 +7566,4 @@
}
]
}
}
}
31 changes: 30 additions & 1 deletion tests/test_gather_data.py
@@ -1,7 +1,8 @@
from collections import Counter
from pathlib import Path

from diffannotator.gather_data import PurposeCounterResults, AnnotatedBugDataset, map_diff_to_purpose_dict
from diffannotator.gather_data import (PurposeCounterResults, AnnotatedBugDataset,
map_diff_to_purpose_dict, map_diff_to_timeline)


def test_AnnotatedBugDataset_with_PurposeCounterResults():
@@ -30,6 +31,34 @@ def test_AnnotatedBugDataset_with_dict_mapping():
'documentation']


def test_AnnotatedBugDataset_gather_data_list():
dataset_path = 'tests/test_dataset_annotated'
annotated_bug_dataset = AnnotatedBugDataset(dataset_path)
# TODO?: inject commit metadata, if missing
#print(f"{annotated_bug_dataset.bugs=}")

data_list = annotated_bug_dataset.gather_data_list(map_diff_to_timeline)

# DEBUG
#from pprint import pprint
#pprint(data_list)

# NOTE: change if the test data changes!
assert len(data_list) == 1, \
"only one annotation file was present"
assert data_list[0]['bug_id'] == 'CVE-2021-21332', \
"found expected bug id"

# NOTE: change if the test data changes!
annotation_file = 'e54746bdf7d5c831eabe4dcea76a7626f1de73df.json'
assert data_list[0]['patch_id'] == annotation_file, \
"found expected annotation file"

diff_data = data_list[0]
assert {'file_names', '+:count', '-:count'} <= set(diff_data.keys()), \
"expected keys present in extracted stats"


def test_PurposeCounterResults_create():
data = {
"synapse/push/mailer.py": {
