-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
gather_data.py: Add 'timeline' subcommand
The structure of the data returned by the 'timeline' subcommand is different: instead of returning a dict, it returns (and saves) a list. This required adding (and testing) AnnotateBugDataset.gather_data_list() method and map_diff_to_timeline() function. The (only) *.json file in tests/test_dataset_annotated/ was enhanced with appropriate 'commit_metadata' information: from the e54746bdf7d5c8... commit in https://github.com/matrix-org/synapse repository. Some additional tests were performed 'by hand' by modifying test code.
- Loading branch information
Showing
4 changed files
with
283 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
#!/usr/bin/env python | ||
import json | ||
import os | ||
from collections import Counter | ||
from collections import Counter, defaultdict | ||
from pathlib import Path | ||
from types import SimpleNamespace | ||
from typing import Any, List, Optional, TypeVar | ||
|
@@ -235,6 +235,32 @@ def gather_data_dict(self, bug_dict_mapper: Callable[..., dict], | |
combined_results |= {bug_id: bug_results} | ||
return combined_results | ||
|
||
def gather_data_list(self, bug_to_dict_mapper: Callable[..., dict],
                     annotations_dir: str = Bug.DEFAULT_ANNOTATIONS_DIR,
                     **mapper_kwargs) -> list:
    """Gather dataset data by processing each bug with AnnotatedBug and a mapper function

    Each patch of each bug becomes one flat dict in the returned list,
    tagged with its 'bug_id' and 'patch_id' (convenient for loading
    into a dataframe).

    :param bug_to_dict_mapper: function to map diff annotations to dictionary
    :param annotations_dir: subdirectory where annotations are; path
        to annotation in a dataset is <bug_id>/<annotations_dir>/<patch_data>.json
    :return: list of bug dictionaries
    """
    gathered: list = []
    for bug_id in tqdm.tqdm(self.bugs, desc="patchset", position=2, leave=False):
        annotated_bug = AnnotatedBug(self._path / bug_id,
                                     annotations_dir=annotations_dir)
        per_patch = annotated_bug.gather_data_dict(bug_to_dict_mapper,
                                                   **mapper_kwargs)
        # flatten {patch_id: patch_data} into tagged per-patch records
        gathered.extend(
            {'bug_id': bug_id, 'patch_id': patch_id, **patch_data}
            for patch_id, patch_data in per_patch.items()
        )

    return gathered
|
||
|
||
def map_diff_to_purpose_dict(_diff_file_path: str, data: dict) -> dict: | ||
"""Extracts file purposes of changed file in a diff annotation | ||
|
@@ -346,6 +372,123 @@ def map_diff_to_lines_stats(annotation_file_basename: str, | |
return result | ||
|
||
|
||
def map_diff_to_timeline(annotation_file_basename: str,
                         annotation_data: dict) -> dict:
    """Mapper passed by timeline() to *.gather_data_dict() method

    It gathers information about each changed file, and counts information
    about changed lines (in pre-image i.e. "-", in post-image i.e. "+",...).
    If the annotation includes a 'commit_metadata' entry, authorship
    information (author/committer timestamp, timezone, name, email) and
    the number of parent commits are extracted as well.

    :param annotation_file_basename: name of JSON file with annotation data
    :param annotation_data: parsed annotations data, retrieved from
        `annotation_file_basename` file
    :return: flat dict of aggregated counts, e.g. "file_names", "+:count",
        "+:type.code", "language:C++", merged with commit metadata fields,
        e.g. "author.timestamp", "n_parents"
    """
    # Example fragment of annotation file:
    #
    # {
    #   "commit_metadata": {
    #     "id": "e54746bdf7d5c831eabe4dcea76a7626f1de73df",
    #     "parents": ["93b61589b0bdb3845ee839e9c2a4e1adb06bd483"],
    #     "author": {
    #       "name": "Patrick Cloke",
    #       "email": "clokep@users.noreply.github.com",
    #       "timestamp": 1611763190,
    #       "tz_info": "-0500"
    #     },
    #     "committer": {…},
    #   },
    #   "path/to/changed_file.cc": {
    #     "language": "C++",
    #     "type": "programming",
    #     "purpose": "programming",
    #     "+": [ {"id": 4, "type": "code", "purpose": "programming", "tokens": […]}, … ],
    #     "-": […],
    #   },…
    # }

    # TODO: add logging (info or debug)
    result = Counter()
    per_commit_info = {}

    # gather summary data from all changed files
    for filename, file_data in annotation_data.items():
        # NOTE: each file should be present only once for given patch/commit

        if filename == 'commit_metadata':
            # this is commit metadata, not changed file information
            for metadata_key in ('author', 'committer'):
                if metadata_key not in file_data:
                    continue
                authorship_data = file_data[metadata_key]
                for authorship_key in ('timestamp', 'tz_info', 'name', 'email'):
                    if authorship_key in authorship_data:
                        # use the already-extracted sub-dict instead of a double lookup
                        per_commit_info[f"{metadata_key}.{authorship_key}"] = authorship_data[authorship_key]

            if 'parents' in file_data:
                per_commit_info['n_parents'] = len(file_data['parents'])

            if 'purpose' not in file_data:
                # commit metadata only, skip processing it as a file
                continue
            else:
                # a real changed file happens to be named 'commit_metadata'
                print(f" warning: found file named 'commit_metadata' in {annotation_file_basename}")

        result['file_names'] += 1

        # gather per-file information, and aggregate it
        per_file_data = {
            key: value for key, value in file_data.items()
            if key in ("language", "type", "purpose")
        }
        per_file_data.update({
            "+": Counter(),
            "-": Counter(),
        })

        for line_type in "+-":  # str used as iterable
            # diff might have no removed lines, or no added lines
            if line_type not in file_data:
                continue

            for line in file_data[line_type]:
                per_file_data[line_type]["count"] += 1  # count of added/removed lines

                for data_type in ("type", "purpose"):  # ignore "id" and "tokens" fields
                    per_file_data[line_type][f"{data_type}.{line[data_type]}"] += 1

        # flatten per-file data into the per-commit Counter
        for key, value in per_file_data.items():
            if isinstance(value, dict):  # Counter and defaultdict are dict subclasses
                for sub_key, sub_value in value.items():
                    # don't expect anything deeper
                    result[f"{key}:{sub_key}"] += sub_value
            elif isinstance(value, int):
                result[key] += value
            else:
                # str-valued fields like "language": count occurrences of each value
                result[f"{key}:{value}"] += 1

    # merge aggregated line counts with commit metadata (plain dict for JSON output)
    result = dict(result, **per_commit_info)

    return result
|
||
|
||
|
||
# TODO: make it common (move it to 'utils' module or '__init__.py' file) | ||
def save_result(result: Any, result_json: Path) -> None: | ||
"""Serialize `result` and save it in `result_json` JSON file | ||
|
@@ -536,5 +679,71 @@ def lines_stats( | |
save_result(result, output_file) | ||
|
||
|
||
@app.command(help="Gather timeline with per-bug count of lines per type")
def timeline(
    ctx: typer.Context,  # common arguments like --annotations-dir
    output_file: Annotated[
        Path,
        typer.Argument(
            dir_okay=False,
            help="file to write gathered results to"
        )
    ],
    datasets: Annotated[
        List[Path],
        typer.Argument(
            exists=True,
            file_okay=False,
            dir_okay=True,
            readable=True,
            writable=False,
            help="list of dirs with datasets to process"
        )
    ],
) -> None:
    # TODO: extract common part of the command description
    """Calculate timeline of bugs with per-bug count of different types of lines

    For each bug (bugfix commit), compute the count of lines removed and added
    by the patch (commit) in all changed files, keeping separate counts for
    lines with different types, and (separately) with different purposes.
    The gathered data is then saved in a format easy to load into dataframe.

    Each DATASET is expected to be generated by annotating dataset or creating
    annotations from a repository, and should be an existing directory with
    the following structure:

        <dataset_directory>/<bug_directory>/annotation/<patch_file>.json

    Each dataset can consist of many BUGs, each BUG should include JSON
    file with its diff/patch annotations as *.json file in 'annotation/'
    subdirectory (by default).

    Saves gathered timeline results to the OUTPUT_FILE.
    """
    result = {}

    # often there is only one dataset, therefore joblib support is not needed
    for dataset in tqdm.tqdm(datasets, desc='dataset'):
        tqdm.tqdm.write(f"Dataset {dataset}")
        annotated_bugs = AnnotatedBugDataset(dataset)
        dataset_data = annotated_bugs.gather_data_list(map_diff_to_timeline,
                                                       annotations_dir=ctx.obj.annotations_dir)

        # sanity check: warn on empty data or missing time information
        if not dataset_data:
            tqdm.tqdm.write(" warning: no data extracted from this dataset")
        elif 'author.timestamp' not in dataset_data[0]:
            tqdm.tqdm.write(" warning: dataset does not include time information")

        result[dataset.name] = dataset_data

    # TODO: support other formats than JSON
    save_result(result, output_file)
|
||
|
||
if __name__ == "__main__": | ||
app() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,23 @@ | ||
{ | ||
"commit_metadata": { | ||
"id": "e54746bdf7d5c831eabe4dcea76a7626f1de73df", | ||
"parents": ["93b61589b0bdb3845ee839e9c2a4e1adb06bd483"], | ||
"tree": "262d65e6c945adfa2d64bfe51e70c09d2e1d7d06", | ||
"author": { | ||
"author": "Patrick Cloke <[email protected]>", | ||
"name": "Patrick Cloke", | ||
"email": "[email protected]", | ||
"timestamp": 1611763190, | ||
"tz_info":"-0500" | ||
}, | ||
"committer": { | ||
"committer": "GitHub <[email protected]>", | ||
"name": "GitHub", | ||
"email": "[email protected]", | ||
"timestamp": 1611763190, | ||
"tz_info": "-0500" | ||
} | ||
}, | ||
"UPGRADE.rst": { | ||
"language": "reStructuredText", | ||
"type": "prose", | ||
|
@@ -7547,4 +7566,4 @@ | |
} | ||
] | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters