Skip to content

Commit 4ca8819

Browse files
Full draft of fairness metrics
1 parent b580de0 commit 4ca8819

File tree

2 files changed

+203
-72
lines changed

2 files changed

+203
-72
lines changed

Project.toml

-3
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,7 @@ authors = ["TheCedarPrince <[email protected]> and contributors"]
44
version = "0.0.1"
55

66
[deps]
7-
Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"
8-
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
97
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
10-
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
118
OMOPCDMCohortCreator = "f525a15e-a73f-4eef-870f-f901257eae22"
129

1310
[compat]

src/fairness.jl

+203-69
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,34 @@ module Fairness
22

33
using DataFrames
44
using OMOPCDMCohortCreator
5+
import Base:
6+
Fix2
57

6-
import Combinatorics:
7-
powerset
8-
import Chain:
9-
@chain
8+
"""
    _counter_reducer(sub, count_name, funcs)

Run `sub` through every callable in `funcs` (in order), drop the `:person_id`
column from the result, and count how many rows fall into each distinct
combination of the remaining columns.

# Arguments
- `sub`: value handed to the first callable in `funcs`.
- `count_name`: name of the column that receives the per-group row count.
- `funcs`: iterable of one-argument callables; each one receives the output of
  the previous one. The final output must be a table with a `:person_id` column.

# Returns
The result of `combine` over the grouped table: one row per distinct combination
of the non-`person_id` columns, plus the `count_name` column.
"""
function _counter_reducer(sub, count_name, funcs)
    # Thread the input through the callables left to right.
    characterized = foldl((acc, f) -> f(acc), funcs; init = sub)

    # Individual identifiers are irrelevant once we only count per group.
    features = characterized[:, Not(:person_id)]
    grouped = groupby(features, names(features))

    return combine(grouped, nrow => count_name)
end
16+
17+
"""
    _subset_subjects(vec, process_size)

Split `vec` into consecutive, non-overlapping chunks holding at most
`process_size` elements each. Concatenating the chunks in order reproduces
`vec` exactly; only the last chunk may be shorter than `process_size`.

# Arguments
- `vec`: indexable vector to split (1-based, supports range indexing).
- `process_size`: maximum number of elements per chunk; must be positive.

# Returns
A vector of chunks. It is empty when `vec` is empty.

# Throws
- `ArgumentError`: if `process_size` is not positive.
"""
function _subset_subjects(vec, process_size)
    process_size > 0 ||
        throw(ArgumentError("process_size must be positive, got $process_size"))

    n = length(vec)

    # The inclusive upper bound is `start + process_size - 1`. Ending the range at
    # `start + process_size` would put one element too many in the chunk and repeat
    # the first element of the following chunk, double counting it downstream.
    return [vec[start:min(start + process_size - 1, n)] for start in 1:process_size:n]
end
1029

1130
function _overlapped_subjects(cohorts, conn)
1231

32+
# Requirement is based on majority of phenotypes provided
1333
required_overlapping_phenotypes = ceil(length(cohorts) / 2)
1434

1535
subjects = GetCohortSubjects(cohorts, conn)
@@ -27,109 +47,223 @@ module Fairness
2747

2848
end
2949

30-
function demographic_parity(cohorts, classes, conn)
50+
"""
    demographic_parity(cohorts, funcs, conn; labels, silver, reference_subjects, process_size)

Entry point for the demographic parity calculation. It forwards its arguments to
one of the two `_demographic_parity` methods and returns that method's result.

# Arguments
- `cohorts`: cohort identifier(s), forwarded unchanged.
- `funcs`: characterization functions, forwarded unchanged.
- `conn`: database connection, forwarded unchanged.

# Keywords
- `labels = false`: when `labels == true`, the six-argument method is used (and
  `silver` is forwarded to it); otherwise the five-argument method is used.
- `silver = false`: only consulted when `labels == true`.
- `reference_subjects = ""`: forwarded unchanged.
- `process_size = 10000`: forwarded unchanged.
"""
function demographic_parity(
    cohorts,
    funcs,
    conn;
    labels = false,
    silver = false,
    reference_subjects = "",
    process_size = 10000
)
    # Unlabelled request: `silver` plays no role on this path.
    labels == true ||
        return _demographic_parity(cohorts, funcs, conn, reference_subjects, process_size)

    return _demographic_parity(
        cohorts, funcs, conn, reference_subjects, process_size, silver
    )
end
3169

32-
study, PP, PN = _overlapped_subjects(cohorts, conn)
33-
34-
dps = DataFrame()
35-
for class in classes
36-
for cohort in cohorts
37-
cohort_subjects = GetCohortSubjects(cohort, conn).subject_id
70+
"""
    _demographic_parity(cohorts, funcs, conn, reference_subjects, process_size, silver)

Compute demographic parity separately for every cohort definition found in
`GetCohortSubjects(cohorts, conn)`.

For each combination of characteristic values produced by `funcs`, the
numerator is the number of cohort subjects in that group and the denominator is
the number of reference subjects in that group.

# Arguments
- `cohorts`: cohort identifier(s) passed to `GetCohortSubjects`.
- `funcs`: two-argument callables; each is partially applied to `conn` as its
  second argument before use.
- `conn`: database connection handed to every query helper.
- `reference_subjects`: subjects forming the denominator. When empty, the result
  of `GetDatabasePersonIDs(conn)` is used instead.
- `process_size`: chunk size passed to `_subset_subjects`.
- `silver`: when `silver == true`, an extra set of rows labelled `:silver` is
  added, built from the second value returned by `_overlapped_subjects`.

# Returns
A table with the characteristic columns, `cohort_definition_id`, `count_num`,
`count_denom` and `demographic_parity`. Values missing after the outer join are
replaced by `0`.
"""
function _demographic_parity(
    cohorts, funcs, conn, reference_subjects, process_size, silver
)
    characterizers = [Fix2(f, conn) for f in funcs]

    if isempty(reference_subjects)
        reference_subjects = GetDatabasePersonIDs(conn)
    end

    cohort_members = GetCohortSubjects(cohorts, conn)

    # Denominator: per-group counts over the reference population, gathered chunk
    # by chunk and then summed across chunks.
    denom = DataFrame()
    for chunk in _subset_subjects(reference_subjects, process_size)
        denom = vcat(denom, _counter_reducer(chunk, :count_denom, characterizers))
    end
    denom = combine(
        groupby(denom, Not(:count_denom)),
        :count_denom => sum => :count_denom,
    )

    # Numerator: the same per-group counts, tagged with the owning cohort.
    num = DataFrame()
    for cohort_id in unique(cohort_members.cohort_definition_id)
        members =
            filter(:cohort_definition_id => ==(cohort_id), cohort_members).subject_id
        for chunk in _subset_subjects(members, process_size)
            counts = _counter_reducer(chunk, :count_num, characterizers)
            counts.cohort_definition_id .= cohort_id
            num = vcat(num, counts)
        end
    end

    if silver == true
        # NOTE(review): the second return value is presumably the set of subjects
        # shared by a majority of the cohorts -- confirm against
        # `_overlapped_subjects`.
        _, agreed_subjects, _ = _overlapped_subjects(cohorts, conn)

        silver_counts = DataFrame()
        for chunk in _subset_subjects(agreed_subjects, process_size)
            silver_counts = vcat(
                silver_counts, _counter_reducer(chunk, :count_num, characterizers)
            )
        end

        silver_counts.cohort_definition_id .= :silver
        silver_counts = combine(
            groupby(silver_counts, Not(:count_num)),
            :count_num => sum => :count_num,
        )

        num = vcat(num, silver_counts)
    end

    num = combine(groupby(num, Not(:count_num)), :count_num => sum => :count_num)

    # After the combine above the last two columns of `num` are
    # `cohort_definition_id` and `count_num`; everything before them is a
    # characteristic column shared with `denom`.
    join_keys = [Symbol(col) => Symbol(col) for col in names(num)[1:end-2]]
    dps = coalesce.(outerjoin(num, denom; on = join_keys), 0)

    dps.demographic_parity = dps.count_num ./ dps.count_denom

    return dps
end
66130

67-
function equality_of_opportunity(cohorts, classes, conn)
131+
"""
    _demographic_parity(cohorts, funcs, conn, reference_subjects, process_size)

Compute demographic parity for all subjects returned by
`GetCohortSubjects(cohorts, conn)` taken together, without distinguishing
between cohort definitions.

For each combination of characteristic values produced by `funcs`, the
numerator is the number of cohort subjects in that group and the denominator is
the number of reference subjects in that group.

# Arguments
- `cohorts`: cohort identifier(s) passed to `GetCohortSubjects`.
- `funcs`: two-argument callables; each is partially applied to `conn` as its
  second argument before use.
- `conn`: database connection handed to every query helper.
- `reference_subjects`: subjects forming the denominator. When empty, the result
  of `GetDatabasePersonIDs(conn)` is used instead.
- `process_size`: chunk size passed to `_subset_subjects`.

# Returns
A table with the characteristic columns, `count_num`, `count_denom` and
`demographic_parity`. Values missing after the outer join are replaced by `0`.
"""
function _demographic_parity(cohorts, funcs, conn, reference_subjects, process_size)
    characterizers = [Fix2(f, conn) for f in funcs]

    if isempty(reference_subjects)
        reference_subjects = GetDatabasePersonIDs(conn)
    end

    # NOTE(review): `subject_id` is used as returned; if a subject can appear in
    # more than one of `cohorts` it is counted once per appearance -- confirm this
    # is intended.
    cohort_subjects = GetCohortSubjects(cohorts, conn).subject_id

    # Denominator: per-group counts over the reference population.
    denom = DataFrame()
    for chunk in _subset_subjects(reference_subjects, process_size)
        denom = vcat(denom, _counter_reducer(chunk, :count_denom, characterizers))
    end
    denom = combine(
        groupby(denom, names(denom)[1:end-1]),
        :count_denom => sum => :count_denom,
    )

    # Numerator: per-group counts over the pooled cohort subjects.
    num = DataFrame()
    for chunk in _subset_subjects(cohort_subjects, process_size)
        num = vcat(num, _counter_reducer(chunk, :count_num, characterizers))
    end
    num = combine(groupby(num, names(num)[1:end-1]), :count_num => sum => :count_num)

    # Every column of `num` except the trailing count is a characteristic column
    # shared with `denom`.
    join_keys = [Symbol(col) => Symbol(col) for col in names(num)[1:end-1]]
    dps = coalesce.(outerjoin(num, denom; on = join_keys), 0)

    dps.demographic_parity = dps.count_num ./ dps.count_denom

    return dps
end
100169

101-
function predictive_rate_parity(cohorts, classes, conn)
170+
"""
    equality_of_opportunity(cohorts, funcs, conn; reference_subjects = "", process_size = 10000)

Compute, for every cohort in `cohorts` and every combination of characteristic
values produced by `funcs`, the ratio

    (cohort subjects that are also in `true_subjects`) / (all `true_subjects`)

where `true_subjects` is the second value returned by
`_overlapped_subjects(cohorts, conn)`.

# Arguments
- `cohorts`: iterable of cohort identifiers; each is queried on its own through
  `GetCohortSubjects`.
- `funcs`: two-argument callables; each is partially applied to `conn` as its
  second argument before use.
- `conn`: database connection handed to every query helper.

# Keywords
- `reference_subjects = ""`: accepted for signature parity with
  `demographic_parity`; not used by this function.
- `process_size = 10000`: chunk size passed to `_subset_subjects`.

# Returns
A table with the characteristic columns, `count_num`, `count_denom`,
`equality_of_opportunity` and `cohort_definition_id`, holding the rows of all
cohorts stacked together. Values missing after the outer join are replaced by
`0`.
"""
function equality_of_opportunity(
    cohorts, funcs, conn; reference_subjects = "", process_size = 10000
)
    characterizers = [Fix2(f, conn) for f in funcs]

    # Only the second return value is needed here.
    _, true_subjects, _ = _overlapped_subjects(cohorts, conn)

    # Hash-based lookup: membership is tested once per cohort row, so scanning the
    # `true_subjects` vector each time would be quadratic.
    true_lookup = Set(true_subjects)

    # Denominator: per-group counts over all true subjects. It is identical for
    # every cohort, so it is computed once up front.
    denom = DataFrame()
    for chunk in _subset_subjects(true_subjects, process_size)
        denom = vcat(denom, _counter_reducer(chunk, :count_denom, characterizers))
    end
    denom = combine(
        groupby(denom, names(denom)[1:end-1]),
        :count_denom => sum => :count_denom,
    )

    eoo = DataFrame()
    for cohort_idx in cohorts
        members = GetCohortSubjects(cohort_idx, conn)
        true_members = filter(:subject_id => in(true_lookup), members)

        # Numerator: per-group counts over this cohort's true subjects.
        num = DataFrame()
        for chunk in _subset_subjects(true_members.subject_id, process_size)
            num = vcat(num, _counter_reducer(chunk, :count_num, characterizers))
        end
        num = combine(
            groupby(num, names(num)[1:end-1]),
            :count_num => sum => :count_num,
        )

        # Every column of `num` except the trailing count is a characteristic
        # column shared with `denom`.
        join_keys = [Symbol(col) => Symbol(col) for col in names(num)[1:end-1]]
        result = coalesce.(outerjoin(num, denom; on = join_keys), 0)

        result.equality_of_opportunity = result.count_num ./ result.count_denom
        result.cohort_definition_id .= cohort_idx

        eoo = vcat(eoo, result)
    end

    return eoo
end
214+
215+
"""
    predictive_rate_parity(cohorts, funcs, conn; reference_subjects = "", process_size = 10000)

Compute, for every cohort in `cohorts` and every combination of characteristic
values produced by `funcs`, the ratio

    (cohort subjects in `true_subjects`) /
    (cohort subjects in `true_subjects` + cohort subjects in `false_subjects`)

where `true_subjects` and `false_subjects` are the second and third values
returned by `_overlapped_subjects(cohorts, conn)`.

# Arguments
- `cohorts`: iterable of cohort identifiers; each is queried on its own through
  `GetCohortSubjects`.
- `funcs`: two-argument callables; each is partially applied to `conn` as its
  second argument before use.
- `conn`: database connection handed to every query helper.

# Keywords
- `reference_subjects = ""`: accepted for signature parity with
  `demographic_parity`; not used by this function.
- `process_size = 10000`: chunk size passed to `_subset_subjects`.

# Returns
A table with the characteristic columns, `count_num`, `count_denom`,
`predictive_rate_parity` and `cohort_definition_id`, holding the rows of all
cohorts stacked together. Values missing after the outer join are replaced by
`0`.
"""
function predictive_rate_parity(
    cohorts, funcs, conn; reference_subjects = "", process_size = 10000
)
    characterizers = [Fix2(f, conn) for f in funcs]

    # The first return value is not needed here.
    _, true_subjects, false_subjects = _overlapped_subjects(cohorts, conn)

    # Hash-based lookups: membership is tested once per cohort row, so scanning
    # the subject vectors each time would be quadratic.
    true_lookup = Set(true_subjects)
    false_lookup = Set(false_subjects)

    prp = DataFrame()
    for cohort_idx in cohorts
        members = GetCohortSubjects(cohort_idx, conn)
        true_cohort = filter(:subject_id => in(true_lookup), members)
        false_cohort = filter(:subject_id => in(false_lookup), members)

        # Per-group counts over this cohort's true subjects (not yet summed
        # across chunks).
        num = DataFrame()
        for chunk in _subset_subjects(true_cohort.subject_id, process_size)
            num = vcat(num, _counter_reducer(chunk, :count_num, characterizers))
        end

        # Denominator: true counts plus, when there are any, false counts.
        denom = num
        false_chunks = _subset_subjects(false_cohort.subject_id, process_size)
        if !isempty(false_chunks)
            false_counts = DataFrame()
            for chunk in false_chunks
                false_counts = vcat(
                    false_counts, _counter_reducer(chunk, :count_num, characterizers)
                )
            end
            denom = vcat(num, false_counts)
        end
        denom = combine(
            groupby(denom, names(denom)[1:end-1]),
            :count_num => sum => :count_denom,
        )

        num = combine(
            groupby(num, names(num)[1:end-1]),
            :count_num => sum => :count_num,
        )

        # Every column of `num` except the trailing count is a characteristic
        # column shared with `denom`.
        join_keys = [Symbol(col) => Symbol(col) for col in names(num)[1:end-1]]
        result = coalesce.(outerjoin(num, denom; on = join_keys), 0)

        result.predictive_rate_parity = result.count_num ./ result.count_denom
        result.cohort_definition_id .= cohort_idx

        prp = vcat(prp, result)
    end

    return prp
end
266+
267+
export demographic_parity, equality_of_opportunity,predictive_rate_parity
268+
135269
end

0 commit comments

Comments
 (0)