Skip to content

Commit

Permalink
Merge pull request #1643 from CEED/jeremy/less-rstr
Browse files Browse the repository at this point in the history
Minor performance improvement for operators with repeated input restrictions (rstr)
  • Loading branch information
jeremylt authored Aug 5, 2024
2 parents 68667be + 3aab95c commit 50ca0d9
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 44 deletions.
38 changes: 31 additions & 7 deletions backends/blocked/ceed-blocked-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
//------------------------------------------------------------------------------
// Setup Input/Output Fields
//------------------------------------------------------------------------------
static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt block_size,
static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, bool *skip_rstr, const CeedInt block_size,
CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs,
CeedInt start_e, CeedInt num_fields, CeedInt Q) {
Ceed ceed;
Expand Down Expand Up @@ -135,6 +135,28 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo
break;
}
}
// Drop duplicate input restrictions
if (is_input) {
for (CeedInt i = 0; i < num_fields; i++) {
CeedVector vec_i;
CeedElemRestriction rstr_i;

CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
for (CeedInt j = i + 1; j < num_fields; j++) {
CeedVector vec_j;
CeedElemRestriction rstr_j;

CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
if (vec_i == vec_j && rstr_i == rstr_j) {
CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
CeedCallBackend(CeedVectorReferenceCopy(e_vecs_full[i], &e_vecs_full[j]));
skip_rstr[j] = true;
}
}
}
}
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -166,6 +188,7 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {
CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->block_rstr));
CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full));

CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->e_vecs_out));
Expand All @@ -177,11 +200,11 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) {

// Set up infield and outfield pointer arrays
// Infields
CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0,
num_input_fields, Q));
CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, true, impl->skip_rstr_in, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in,
impl->q_vecs_in, 0, num_input_fields, Q));
// Outfields
CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out,
num_input_fields, num_output_fields, Q));
CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, NULL, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out,
impl->q_vecs_out, num_input_fields, num_output_fields, Q));

// Identity QFunctions
if (impl->is_identity_qf) {
Expand Down Expand Up @@ -226,10 +249,10 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed
} else {
// Restrict
CeedCallBackend(CeedVectorGetState(vec, &state));
if (state != impl->input_states[i] || vec == in_vec) {
if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) {
CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request));
impl->input_states[i] = state;
}
impl->input_states[i] = state;
// Get evec
CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs_full[i], CEED_MEM_HOST, (const CeedScalar **)&e_data_full[i]));
}
Expand Down Expand Up @@ -647,6 +670,7 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) {

CeedCallBackend(CeedOperatorGetData(op, &impl));

CeedCallBackend(CeedFree(&impl->skip_rstr_in));
for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
CeedCallBackend(CeedElemRestrictionDestroy(&impl->block_rstr[i]));
CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i]));
Expand Down
5 changes: 3 additions & 2 deletions backends/blocked/ceed-blocked.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@ typedef struct {

typedef struct {
bool is_identity_qf, is_identity_rstr_op;
CeedElemRestriction *block_rstr; /* Blocked versions of restrictions */
CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */
bool *skip_rstr_in;
uint64_t *input_states; /* State counter of inputs */
CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */
CeedVector *e_vecs_in; /* Element block input E-vectors */
CeedVector *e_vecs_out; /* Element block output E-vectors */
CeedVector *q_vecs_in; /* Element block input Q-vectors */
CeedVector *q_vecs_out; /* Element block output Q-vectors */
CeedElemRestriction *block_rstr; /* Blocked versions of restrictions */
CeedInt num_inputs, num_outputs;
CeedInt qf_size_in, qf_size_out;
CeedVector qf_l_vec;
Expand Down
42 changes: 34 additions & 8 deletions backends/cuda-ref/ceed-cuda-ref-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
CeedCallBackend(CeedOperatorGetData(op, &impl));

// Apply data
CeedCallBackend(CeedFree(&impl->skip_rstr_in));
for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
}
Expand Down Expand Up @@ -96,8 +97,8 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
//------------------------------------------------------------------------------
// Setup infields or outfields
//------------------------------------------------------------------------------
static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, CeedVector *e_vecs, CeedVector *q_vecs,
CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, CeedVector *e_vecs,
CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
Ceed ceed;
CeedQFunctionField *qf_fields;
CeedOperatorField *op_fields;
Expand Down Expand Up @@ -183,6 +184,27 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool
break;
}
}
// Drop duplicate input restrictions
if (is_input) {
for (CeedInt i = 0; i < num_fields; i++) {
CeedVector vec_i;
CeedElemRestriction rstr_i;

CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
for (CeedInt j = i + 1; j < num_fields; j++) {
CeedVector vec_j;
CeedElemRestriction rstr_j;

CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
if (vec_i == vec_j && rstr_i == rstr_j) {
CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
skip_rstr[j] = true;
}
}
}
}
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -211,6 +233,7 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {

// Allocate
CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
Expand All @@ -219,10 +242,11 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) {

// Set up infield and outfield e_vecs and q_vecs
// Infields
CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
CeedCallBackend(
CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
// Outfields
CeedCallBackend(
CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
CeedOperatorSetupFields_Cuda(qf, op, false, false, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));

CeedCallBackend(CeedOperatorSetSetupDone(op));
return CEED_ERROR_SUCCESS;
Expand Down Expand Up @@ -262,10 +286,10 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFu
uint64_t state;

CeedCallBackend(CeedVectorGetState(vec, &state));
if (state != impl->input_states[i]) {
if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) {
CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
impl->input_states[i] = state;
}
impl->input_states[i] = state;
// Get evec
CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
}
Expand Down Expand Up @@ -474,6 +498,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {

// Allocate
CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
Expand All @@ -482,9 +507,10 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {

// Set up infield and outfield e_vecs and q_vecs
// Infields
CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, max_num_points, num_elem));
CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
max_num_points, num_elem));
// Outfields
CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
max_num_points, num_elem));

CeedCallBackend(CeedOperatorSetSetupDone(op));
Expand Down
3 changes: 2 additions & 1 deletion backends/cuda-ref/ceed-cuda-ref.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,9 @@ typedef struct {
} CeedOperatorAssemble_Cuda;

typedef struct {
CeedVector *e_vecs; // E-vectors, inputs followed by outputs
bool *skip_rstr_in;
uint64_t *input_states; // State tracking for passive inputs
CeedVector *e_vecs; // E-vectors, inputs followed by outputs
CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator
CeedVector *q_vecs_out; // Output Q-vectors needed to apply operator
CeedInt num_inputs, num_outputs;
Expand Down
42 changes: 34 additions & 8 deletions backends/hip-ref/ceed-hip-ref-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
CeedCallBackend(CeedOperatorGetData(op, &impl));

// Apply data
CeedCallBackend(CeedFree(&impl->skip_rstr_in));
for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) {
CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i]));
}
Expand Down Expand Up @@ -95,8 +96,8 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
//------------------------------------------------------------------------------
// Setup infields or outfields
//------------------------------------------------------------------------------
static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, CeedVector *e_vecs, CeedVector *q_vecs,
CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, bool is_at_points, bool *skip_rstr, CeedVector *e_vecs,
CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q, CeedInt num_elem) {
Ceed ceed;
CeedQFunctionField *qf_fields;
CeedOperatorField *op_fields;
Expand Down Expand Up @@ -182,6 +183,27 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i
break;
}
}
// Drop duplicate input restrictions
if (is_input) {
for (CeedInt i = 0; i < num_fields; i++) {
CeedVector vec_i;
CeedElemRestriction rstr_i;

CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec_i));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr_i));
for (CeedInt j = i + 1; j < num_fields; j++) {
CeedVector vec_j;
CeedElemRestriction rstr_j;

CeedCallBackend(CeedOperatorFieldGetVector(op_fields[j], &vec_j));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[j], &rstr_j));
if (vec_i == vec_j && rstr_i == rstr_j) {
CeedCallBackend(CeedVectorReferenceCopy(e_vecs[i], &e_vecs[j]));
skip_rstr[j] = true;
}
}
}
}
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -210,6 +232,7 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {

// Allocate
CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
Expand All @@ -218,10 +241,11 @@ static int CeedOperatorSetup_Hip(CeedOperator op) {

// Set up infield and outfield e_vecs and q_vecs
// Infields
CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, false, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
CeedCallBackend(
CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem));
// Outfields
CeedCallBackend(
CeedOperatorSetupFields_Hip(qf, op, false, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));
CeedOperatorSetupFields_Hip(qf, op, false, false, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem));

CeedCallBackend(CeedOperatorSetSetupDone(op));
return CEED_ERROR_SUCCESS;
Expand Down Expand Up @@ -261,10 +285,10 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun
uint64_t state;

CeedCallBackend(CeedVectorGetState(vec, &state));
if (state != impl->input_states[i]) {
if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) {
CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request));
impl->input_states[i] = state;
}
impl->input_states[i] = state;
// Get evec
CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i]));
}
Expand Down Expand Up @@ -473,6 +497,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {

// Allocate
CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->skip_rstr_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in));
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out));
Expand All @@ -481,9 +506,10 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {

// Set up infield and outfield e_vecs and q_vecs
// Infields
CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, max_num_points, num_elem));
CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields,
max_num_points, num_elem));
// Outfields
CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, NULL, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields,
max_num_points, num_elem));

CeedCallBackend(CeedOperatorSetSetupDone(op));
Expand Down
3 changes: 2 additions & 1 deletion backends/hip-ref/ceed-hip-ref.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,9 @@ typedef struct {
} CeedOperatorAssemble_Hip;

typedef struct {
CeedVector *e_vecs; // E-vectors, inputs followed by outputs
bool *skip_rstr_in;
uint64_t *input_states; // State tracking for passive inputs
CeedVector *e_vecs; // E-vectors, inputs followed by outputs
CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator
CeedVector *q_vecs_out; // Output Q-vectors needed to apply operator
CeedInt num_inputs, num_outputs;
Expand Down
Loading

0 comments on commit 50ca0d9

Please sign in to comment.