run.py

import logging
from collections import defaultdict

import pandas as pd

import helper as hlp
from data_collection.collect_commit_data import fetch_data
from mapping import get_contributor_names_from_file
from metrics import *  # noqa: F401,F403 (provides the compute_<metric> functions used in run_metrics)
from plot import plot


def aggregate(ledger, repo, commits_per_sample_window, contributor_type, contribution_type):
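    """
    Aggregates a repository's commits into sample windows of a fixed number of commits and writes the
    contributions per entity to a csv file in the 'output' directory.
    :param ledger: string with the name of the ledger that the repository belongs to
    :param repo: string with the name of the repository to process
    :param commits_per_sample_window: int that represents the number of commits per sample window (or None)
    :param contributor_type: string with the type of entity to consider in the analysis (author or committer)
    :param contribution_type: string with the type of contribution to consider in the analysis (commits,
        merge_commits, lines_added, lines_deleted, or lines_changed)
    """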
output_dir = hlp.get_output_dir(output_type='data', contribution_type=contribution_type, contributor_type=contributor_type,
commits_per_sample_window=commits_per_sample_window, data_type='contributions_per_entity', mkdir=True)
logging.info(f'Processing {repo}...')
contributor_names_by_email = get_contributor_names_from_file(repo)
commits = hlp.read_commit_data(ledger, repo)
# aggregate commits by the appropriate number of commits per sample window
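    # contributions_per_entity maps each contributor name to a dict of sample window index -> total contribution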
contributions_per_entity = defaultdict(dict)
sample_window_timestamps = defaultdict(list)
for i, commit in enumerate(reversed(commits)):
sample_window_idx = i // commits_per_sample_window if commits_per_sample_window else 0
sample_window_timestamps[sample_window_idx].append(commit[f'{contributor_type}_timestamp'])
contributor_email = commit[f'{contributor_type}_email']
contributor_name = contributor_names_by_email[contributor_email]
contributions_per_entity[contributor_name][sample_window_idx] = contributions_per_entity[contributor_name].get(
sample_window_idx, 0) + get_contribution_from_commit(commit, contribution_type)
    # remove the last sample window if it has fewer observations than the rest; the extra check on `commits`
    # guards against empty commit lists, where no sample window index was ever assigned
    if commits and commits_per_sample_window and len(sample_window_timestamps[sample_window_idx]) < commits_per_sample_window:
sample_window_timestamps.pop(sample_window_idx)
for entity, contributions in contributions_per_entity.items():
contributions.pop(sample_window_idx, None)
mean_timestamps = {idx: pd.to_datetime(timestamps).mean().date() for idx, timestamps in
sample_window_timestamps.items()}
filename = f'{repo}_contributions_per_entity.csv'
hlp.write_contributions_per_entity_to_file(contributions_per_entity, mean_timestamps, output_dir / filename)


def get_contribution_from_commit(commit, contribution_type):
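    """
    Returns the contribution value that a single commit represents under the given contribution type
    (e.g. 1 for 'commits', the number of added lines for 'lines_added').
    :param commit: dictionary with the commit's data, including 'lines_added', 'lines_deleted' and 'message'
    :param contribution_type: string with the type of contribution to consider (commits, merge_commits,
        lines_added, lines_deleted, or lines_changed)
    :raises ValueError: if the contribution type is not one of the above
    """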
if contribution_type == 'commits':
return 1
elif contribution_type == 'lines_added':
return commit['lines_added']
elif contribution_type == 'lines_deleted':
return commit['lines_deleted']
elif contribution_type == 'lines_changed':
return commit['lines_added'] + commit['lines_deleted']
elif contribution_type == 'merge_commits':
return 1 if commit['message'].startswith('Merge') else 0
else:
raise ValueError(f'Invalid contribution type: {contribution_type}')


def run_metrics(ledger_repos, metrics, commits_per_sample_window, contributor_type, contribution_type):
"""
Calculates metrics for the distribution in each sample window.
Saves the results in a csv file in the 'output' directory.
:param ledger_repos: dictionary with ledger names as keys and lists of repository names as values
:param metrics: list of metric names
:param commits_per_sample_window: int that represents the number of commits per sample window (or None)
:param contributor_type: string with the type of entity to consider in the analysis (author or committer)
:param contribution_type: string with the type of contribution to consider in the analysis (commits,
merge_commits, lines_added, lines_deleted, or lines_changed)
"""
logging.info('Calculating metrics...')
contributions_per_entity_data_dir = hlp.get_output_dir(output_type='data', contribution_type=contribution_type,
contributor_type=contributor_type, commits_per_sample_window=commits_per_sample_window,
data_type='contributions_per_entity')
metrics_data_dir = hlp.get_output_dir(output_type='data', contribution_type=contribution_type,
contributor_type=contributor_type, commits_per_sample_window=commits_per_sample_window,
data_type='metrics', mkdir=True)
repos = [repo for repos in ledger_repos.values() for repo in repos]
metric_dfs = {metric: pd.DataFrame() for metric in metrics}
all_metrics_rows = []
for repo in repos:
sample_windows, contributions_per_entity = hlp.get_contributions_per_entity_from_file(
contributions_per_entity_data_dir / f'{repo}_contributions_per_entity.csv')
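        # repos that span at most one sample window cannot yield a time series of metrics, so they are skipped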
if len(sample_windows) > 1:
sample_window_results = defaultdict(list)
for metric in metrics:
metric_repo_results = {}
for sample_window_id in range(len(sample_windows)):
sample_contributions_per_entity = {}
for entity, contribution_values in contributions_per_entity.items():
sample_contributions_per_entity[entity] = contribution_values[sample_window_id]
# Remove entities with no commits in the sample window
                    sample_contributions_per_entity = {
                        k: v for k, v in sample_contributions_per_entity.items() if v > 0}
sorted_sample_commits = sorted(sample_contributions_per_entity.values(), reverse=True)
                    # resolve the metric function by name instead of using eval; each metric is expected to
                    # have a matching compute_<metric> function in the metrics module (bound by the star import)
                    func = globals()[f'compute_{metric}']
metric_repo_results[sample_window_id] = func(sorted_sample_commits)
sample_window_results[sample_window_id].append(metric_repo_results[sample_window_id])
metric_df_repo = pd.DataFrame.from_dict(metric_repo_results, orient='index', columns=[repo])
metric_dfs[metric] = metric_dfs[metric].join(metric_df_repo, how='outer')
all_metrics_rows.extend([[repo, sample_windows[sample_window_id]] + results for sample_window_id, results in
sample_window_results.items()])
if all_metrics_rows:
all_metrics_df = pd.DataFrame(all_metrics_rows, columns=['ledger', 'date'] + metrics)
all_metrics_df.to_csv(metrics_data_dir / 'all_metrics.csv', index=False, date_format='%Y%m%d')


if __name__ == '__main__':
logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)
ledger_repos = hlp.get_ledger_repos()
refresh_data_flag = hlp.get_refresh_data_flag()
fetch_data(repos=[(ledger, repo) for ledger in ledger_repos for repo in ledger_repos[ledger]],
update_existing=refresh_data_flag)
metrics = hlp.get_metrics()
commits_per_sample_window_list = hlp.get_commits_per_sample_window_list()
contributor_types = hlp.get_contributor_types()
contribution_types = hlp.get_contribution_types()
for contribution_type in contribution_types:
logging.info(f'Processing by contribution type: {contribution_type}')
for contributor_type in contributor_types:
logging.info(f'Processing per contributor type: {contributor_type}')
for commits_per_sample_window in commits_per_sample_window_list:
logging.info(f'Processing with {commits_per_sample_window} commits per sample window')
for ledger, repos in ledger_repos.items():
for repo in repos:
aggregate(ledger, repo, commits_per_sample_window, contributor_type, contribution_type)
run_metrics(ledger_repos, metrics, commits_per_sample_window, contributor_type, contribution_type)
plot(ledger_repos, metrics, commits_per_sample_window, contributor_type, contribution_type)