Skip to content

Commit d60bb2a

Browse files
committed
Reorganize code a little bit in preparation for using chain decomposition in the main compilation step.
1 parent c174631 commit d60bb2a

4 files changed

+19
-179
lines changed

lib/BUILD.bazel

+1
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,7 @@ cc_library(
550550
"nnc/ccv_nnc_symbolic_graph_parallel.c",
551551
"nnc/ccv_nnc_symbolic_graph_memory_compression.c",
552552
"nnc/ccv_nnc_symbolic_graph_memory_reduction.c",
553+
"nnc/ccv_nnc_symbolic_graph_chain_decomposition.c",
553554
"nnc/ccv_nnc_xpu_alloc.c",
554555
"nnc/ccv_nnc_dynamic_graph.c",
555556
"nnc/ccv_nnc_dynamic_graph_alloc.c",

lib/nnc/_ccv_nnc_symbolic_graph.h

+12
Original file line numberDiff line numberDiff line change
@@ -217,5 +217,17 @@ void ccv_nnc_symbolic_graph_symbol_infer(const ccv_nnc_symbolic_graph_t* const s
217217
int ccv_nnc_over_tensor_symbol_aliases(const ccv_nnc_tensor_symbol_info_t* const tensor_a, const ccv_nnc_tensor_symbol_info_t* const tensor_b);
218218
int ccv_nnc_tensor_symbol_map_raw(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol);
219219

220+
typedef struct {
221+
int flags;
222+
int* chain_ids;
223+
int* chain_pos;
224+
ccv_sparse_matrix_t* deps;
225+
} ccv_nnc_exec_dep_t;
226+
227+
ccv_nnc_exec_dep_t ccv_nnc_exec_dep_new(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_graph_visit_t* const reversed_visit);
228+
int ccv_nnc_exec_dep_hop(const ccv_nnc_exec_dep_t exec_dep, const int d, ccv_sparse_matrix_vector_t* const vector, const int dd);
229+
int ccv_nnc_exec_dep_check(const ccv_nnc_exec_dep_t exec_dep, const int d, const int dd);
230+
void ccv_nnc_exec_dep_free(const ccv_nnc_exec_dep_t exec_dep);
231+
220232
#endif
221233

lib/nnc/ccv_nnc_symbolic_graph_memory_reduction.c

+5-178
Original file line numberDiff line numberDiff line change
@@ -13,179 +13,6 @@ static void _ccv_nnc_remove_unused_from_marked(const uint32_t* const tensor_used
1313
tensor_marked[i] &= tensor_used[i];
1414
}
1515

16-
typedef struct {
17-
int* chain_ids;
18-
int* chain_pos;
19-
ccv_sparse_matrix_t* deps;
20-
} ccv_nnc_exec_dep_t;
21-
22-
// Implement the new method for exec_dep. We use chain decomposition such that each node only needs to log which chain and at which node to be dependent on.
23-
static ccv_nnc_exec_dep_t _ccv_nnc_exec_dep_new(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_visit_t* const visit, const ccv_nnc_graph_visit_t* const reversed_visit)
24-
{
25-
const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
26-
int* chain_ids = ccmalloc(sizeof(int) * exec_symbol_info_size * 2);
27-
int* chain_pos = chain_ids + exec_symbol_info_size;
28-
int* buf = (int*)ccmalloc(sizeof(int) * exec_symbol_info_size * 3);
29-
int* reversed_depth = buf;
30-
const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0);
31-
int i, j;
32-
// Go reverse order to generate the distance from sink.
33-
ccv_nnc_graph_visit_for(reversed_visit, exec_symbol_info, node, idx, term) {
34-
chain_ids[idx] = -1;
35-
if (!node->outgoings || node->outgoings->rnum == 0)
36-
{
37-
reversed_depth[idx] = 0;
38-
continue;
39-
}
40-
const int outgoing = *(int*)ccv_array_get(node->outgoings, 0);
41-
int depth = reversed_depth[outgoing];
42-
for (i = 1; i < node->outgoings->rnum; i++)
43-
{
44-
const int outgoing = *(int*)ccv_array_get(node->outgoings, i);
45-
depth = ccv_max(depth, reversed_depth[outgoing]);
46-
}
47-
reversed_depth[idx] = depth + 1;
48-
} ccv_nnc_graph_visit_endfor
49-
// Go in order to generate chain ids (if there are multiple exits, we use the reverse depth to break the tie).
50-
// Note that we cannot use depth so-far because then multiple exit nodes are equally good to "inherit" the chain selection.
51-
int chain_count = 0;
52-
ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
53-
int chain_id = chain_ids[idx];
54-
if (chain_ids[idx] < 0)
55-
{
56-
chain_id = chain_count;
57-
chain_ids[idx] = chain_id;
58-
chain_pos[idx] = 1; // The first one in this chain. 1-based index because in sparse matrix, 0 is the default value.
59-
chain_count += 1;
60-
}
61-
if (!node->outgoings || node->outgoings->rnum == 0)
62-
continue;
63-
int depth = 0;
64-
int next_idx = -1;
65-
for (i = 0; i < node->outgoings->rnum; i++)
66-
{
67-
const int outgoing = *(int*)ccv_array_get(node->outgoings, i);
68-
if (chain_ids[outgoing] < 0 && reversed_depth[outgoing] > depth)
69-
depth = reversed_depth[outgoing], next_idx = outgoing;
70-
}
71-
if (next_idx >= 0)
72-
{
73-
chain_ids[next_idx] = chain_id;
74-
chain_pos[next_idx] = chain_pos[idx] + 1;
75-
}
76-
} ccv_nnc_graph_visit_endfor
77-
ccv_sparse_matrix_t* deps = ccv_sparse_matrix_new(graph->exec_symbol_info->rnum, chain_count, CCV_32S | CCV_C2, CCV_SPARSE_ROW_MAJOR, 0);
78-
// It logs which pos on that chain we depend on. We can simply compare that with the chain_pos for a node to know if they are ancestors.
79-
#define for_block(x, val) \
80-
do { \
81-
if (((int32_t*)val)[0] > 0) \
82-
{ \
83-
buf[buf_size * 3] = x; \
84-
buf[buf_size * 3 + 1] = ((int32_t*)val)[0]; \
85-
buf[buf_size * 3 + 2] = ((int32_t*)val)[1] + 1; \
86-
++buf_size; \
87-
} \
88-
} while (0)
89-
int buf_size;
90-
ccv_nnc_graph_visit_for(visit, exec_symbol_info, node, idx, term) {
91-
buf_size = 0; /* save all its parent deps to this buffer */
92-
ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(deps, idx);
93-
if (vector)
94-
CCV_SPARSE_VECTOR_FOREACH(deps, vector, for_block);
95-
if (!node->outgoings)
96-
continue;
97-
const int chain_id = chain_ids[idx];
98-
const int pos = chain_pos[idx];
99-
for (i = 0; i < node->outgoings->rnum; i++)
100-
{
101-
const int outgoing = *(int*)ccv_array_get(node->outgoings, i);
102-
const int outgoing_chain_id = chain_ids[outgoing];
103-
if (outgoing_chain_id != chain_id)
104-
{
105-
ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(deps, outgoing, chain_id);
106-
/* If not found, set, if the current node is the destination node, no need
107-
* to set itself as parent of subsequent nodes because of its terminal nature. */
108-
if (!cell.i32 || cell.i32[0] == 0 || cell.i32[0] < pos)
109-
{
110-
int p[2] = { pos, 1 };
111-
ccv_set_sparse_matrix_cell(deps, outgoing, chain_id, &p);
112-
}
113-
}
114-
if (buf_size > 0)
115-
{
116-
ccv_sparse_matrix_vector_t* vector = ccv_get_sparse_matrix_vector(deps, outgoing);
117-
for (j = 0; j < buf_size; j++) /* set with all idx's dependencies as well */
118-
{
119-
if (outgoing_chain_id == buf[j * 3]) // We don't need to add as dependency for the same chain.
120-
continue;
121-
if (!vector)
122-
{
123-
ccv_set_sparse_matrix_cell(deps, outgoing, buf[j * 3], &buf[j * 3 + 1]);
124-
vector = ccv_get_sparse_matrix_vector(deps, outgoing);
125-
continue;
126-
}
127-
ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(deps, vector, buf[j * 3]);
128-
/* If not found, set. Otherwise, set to the latest one only if it is later. */
129-
if (!cell.i32)
130-
ccv_set_sparse_matrix_cell_from_vector(deps, vector, buf[j * 3], &buf[j * 3 + 1]);
131-
else if (cell.i32[0] == 0 || cell.i32[0] < buf[j * 3 + 1])
132-
ccv_set_sparse_matrix_cell_from_vector(deps, vector, buf[j * 3], &buf[j * 3 + 1]);
133-
else if (cell.i32[0] == buf[j * 3 + 1]) { // If we point to the same one, use the longest.
134-
int p[2] = { cell.i32[0], ccv_max(buf[j * 3 + 2], cell.i32[1]) };
135-
ccv_set_sparse_matrix_cell_from_vector(deps, vector, buf[j * 3], &p);
136-
}
137-
}
138-
}
139-
}
140-
} ccv_nnc_graph_visit_endfor
141-
#undef for_block
142-
ccfree(buf);
143-
ccv_nnc_exec_dep_t exec_dep = {
144-
.chain_ids = chain_ids,
145-
.chain_pos = chain_pos,
146-
.deps = deps
147-
};
148-
return exec_dep;
149-
}
150-
151-
static int _ccv_nnc_exec_dep_dist(const ccv_nnc_exec_dep_t exec_dep, const int d, ccv_sparse_matrix_vector_t* const vector, const int dd)
152-
{
153-
// Check if dd is d's ancestor.
154-
const int dd_chain_id = exec_dep.chain_ids[dd];
155-
const int dd_chain_pos = exec_dep.chain_pos[dd];
156-
if (exec_dep.chain_ids[d] == dd_chain_id)
157-
return exec_dep.chain_pos[d] - dd_chain_pos;
158-
const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell_from_vector(exec_dep.deps, vector, dd_chain_id);
159-
if (cell.i32 && cell.i32[0] > 0 && cell.i32[0] >= dd_chain_pos)
160-
{
161-
// Check if the chain pos is greater than or equal to dd_chain_pos. If it is, it is an ancestor.
162-
return cell.i32[0] - dd_chain_pos + cell.i32[1];
163-
}
164-
return -1;
165-
}
166-
167-
static int _ccv_nnc_exec_dep_check(const ccv_nnc_exec_dep_t exec_dep, const int d, const int dd)
168-
{
169-
// Check if dd is d's ancestor.
170-
const int dd_chain_id = exec_dep.chain_ids[dd];
171-
const int dd_chain_pos = exec_dep.chain_pos[dd];
172-
if (exec_dep.chain_ids[d] == dd_chain_id)
173-
return exec_dep.chain_pos[d] > dd_chain_pos;
174-
const ccv_numeric_data_t cell = ccv_get_sparse_matrix_cell(exec_dep.deps, d, dd_chain_id);
175-
if (cell.i32 && cell.i32[0] > 0)
176-
{
177-
// Check if the chain pos is greater than or equal to dd_chain_pos. If it is, it is an ancestor.
178-
return cell.i32[0] >= dd_chain_pos;
179-
}
180-
return 0;
181-
}
182-
183-
static void _ccv_nnc_exec_dep_free(const ccv_nnc_exec_dep_t exec_dep)
184-
{
185-
ccfree(exec_dep.chain_ids);
186-
ccv_matrix_free(exec_dep.deps);
187-
}
188-
18916
typedef struct {
19017
int okay;
19118
int original;
@@ -254,7 +81,7 @@ void ccv_nnc_symbolic_graph_memory_reduction(ccv_nnc_symbolic_graph_t* const gra
25481
tensor_marked[d >> 5] &= ~(1u << (d & 0x1f));
25582
}
25683
ccv_nnc_graph_visit_t* const reversed_visit = ccv_nnc_graph_visit_new(graph, reversed_nodes, exec_symbol_info_size, destinations, destination_size, sources, source_size, 0);
257-
ccv_nnc_exec_dep_t exec_deps = _ccv_nnc_exec_dep_new(graph, visit, reversed_visit);
84+
ccv_nnc_exec_dep_t exec_deps = ccv_nnc_exec_dep_new(graph, visit, reversed_visit);
25885
ccv_nnc_graph_visit_free(reversed_visit);
25986
// Now tensor_marked only contains the tensors that we think beneficial to reconvert. Find the best place to insert conversion.
26087
ccv_nnc_conversion_info_t* const conversion_info = cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_conversion_info_t));
@@ -304,8 +131,8 @@ void ccv_nnc_symbolic_graph_memory_reduction(ccv_nnc_symbolic_graph_t* const gra
304131
for (k = 0; k < old_conversion_nodes->rnum; k++)
305132
{
306133
const int dd = *(int*)ccv_array_get(old_conversion_nodes, k);
307-
const int dist = _ccv_nnc_exec_dep_dist(exec_deps, d, vector, dd);
308-
if (dist >= 0 && dist <= 3)
134+
const int hop = ccv_nnc_exec_dep_hop(exec_deps, d, vector, dd);
135+
if (hop >= 0 && hop <= 3)
309136
flag = 1;
310137
}
311138
if (flag)
@@ -338,7 +165,7 @@ void ccv_nnc_symbolic_graph_memory_reduction(ccv_nnc_symbolic_graph_t* const gra
338165
continue;
339166
}
340167
// Check dependencies, if there is a dependency from y node to dd, dd cannot be source.
341-
const int checked = _ccv_nnc_exec_dep_check(exec_deps, dd, ddd);
168+
const int checked = ccv_nnc_exec_dep_check(exec_deps, dd, ddd);
342169
if (checked)
343170
flag = 1;
344171
}
@@ -393,7 +220,7 @@ void ccv_nnc_symbolic_graph_memory_reduction(ccv_nnc_symbolic_graph_t* const gra
393220
}
394221
}
395222
ccv_nnc_graph_visit_free(visit);
396-
_ccv_nnc_exec_dep_free(exec_deps);
223+
ccv_nnc_exec_dep_free(exec_deps);
397224
ccfree(tensor_marked);
398225
for (i = 0; i < tensor_symbol_info_size; i++)
399226
{

lib/nnc/makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ include ../config.mk
33
CFLAGS := -O3 -Wall -I"../" $(CFLAGS)
44
NVFLAGS := -O3 $(NVFLAGS)
55

6-
SRCS := ccv_nnc_cmd.c ccv_nnc_tensor.c ccv_nnc_tensor_io.c ccv_nnc_stream.c ccv_nnc_micro.c ccv_nnc_micro_core.c ccv_nnc_micro_interpret.c ccv_nnc_micro_simplify.c ccv_nnc_graph.c ccv_nnc_symbolic_graph.c ccv_nnc_symbolic_graph_io.c ccv_nnc_symbolic_graph_compile.c ccv_nnc_symbolic_graph_backward.c ccv_nnc_symbolic_graph_while.c ccv_nnc_graph_while.c ccv_nnc_tensor_tape.c ccv_nnc_symbolic_graph_case_of.c ccv_nnc_graph_case_of.c ccv_nnc_symbolic_graph_minimize.c ccv_nnc_symbolic_graph_parallel.c ccv_nnc_symbolic_graph_simplify.c ccv_nnc_symbolic_graph_memory_compression.c ccv_nnc_symbolic_graph_memory_reduction.c ccv_nnc_graph_run.c ccv_nnc_xpu_alloc.c ccv_nnc_dynamic_graph.c ccv_nnc_dynamic_graph_alloc.c ccv_nnc_dynamic_graph_backward.c ccv_nnc_dynamic_graph_apply_gradients.c ccv_nnc_dynamic_graph_minimize.c ccv_nnc_dynamic_graph_evaluate.c ccv_cnnp_dataframe.c ccv_cnnp_dataframe_core.c ccv_cnnp_dataframe_addons.c ccv_cnnp_dataframe_csv.c ccv_cnnp_model.c ccv_cnnp_model_io.c ccv_cnnp_model_core.c ccv_cnnp_model_addons.c co.c ccv_nnc_palettize.c ccv_cnnp_model_gradient_checkpointing.c
6+
SRCS := ccv_nnc_cmd.c ccv_nnc_tensor.c ccv_nnc_tensor_io.c ccv_nnc_stream.c ccv_nnc_micro.c ccv_nnc_micro_core.c ccv_nnc_micro_interpret.c ccv_nnc_micro_simplify.c ccv_nnc_graph.c ccv_nnc_symbolic_graph.c ccv_nnc_symbolic_graph_io.c ccv_nnc_symbolic_graph_compile.c ccv_nnc_symbolic_graph_backward.c ccv_nnc_symbolic_graph_while.c ccv_nnc_graph_while.c ccv_nnc_tensor_tape.c ccv_nnc_symbolic_graph_case_of.c ccv_nnc_graph_case_of.c ccv_nnc_symbolic_graph_minimize.c ccv_nnc_symbolic_graph_parallel.c ccv_nnc_symbolic_graph_simplify.c ccv_nnc_symbolic_graph_memory_compression.c ccv_nnc_symbolic_graph_memory_reduction.c ccv_nnc_graph_run.c ccv_nnc_xpu_alloc.c ccv_nnc_dynamic_graph.c ccv_nnc_dynamic_graph_alloc.c ccv_nnc_dynamic_graph_backward.c ccv_nnc_dynamic_graph_apply_gradients.c ccv_nnc_dynamic_graph_minimize.c ccv_nnc_dynamic_graph_evaluate.c ccv_cnnp_dataframe.c ccv_cnnp_dataframe_core.c ccv_cnnp_dataframe_addons.c ccv_cnnp_dataframe_csv.c ccv_cnnp_model.c ccv_cnnp_model_io.c ccv_cnnp_model_core.c ccv_cnnp_model_addons.c co.c ccv_nnc_palettize.c ccv_cnnp_model_gradient_checkpointing.c ccv_nnc_symbolic_graph_chain_decomposition.c
77

88
SRC_OBJS := $(patsubst %.c,%.o,$(SRCS))
99

0 commit comments

Comments
 (0)