In practice, no additional overhead will be incurred. In C++, small functions are usually inlined by the compiler as an optimization, so the resulting assembly will have all the operations at the callsite— the functions won't call each other, since the functions won't exist in the final code, only the mathematical operations.
Depending on the compiler, you may see one of these functions calling the other with no or low optimization (as with debug builds). At higher optimizations level though (release builds), they'll be optimized down to just the math.
If you'd still like to be pedantic about it (say you're creating a library), adding the inline
keyword to operator*()
(and similar wrapper functions) may hint your compiler to perform the inline, or using compiler-specific flags/syntax like: -finline-small-functions
, -finline-functions
, -findirect-inlining
, __attribute__((always_inline))
(credit to @Stephane Hockenhull's helpful info in the comments). Personally, I tend to follow what the framework/libs I'm using do— if I'm using GLKit's math library, I'll just use the GLK_INLINE
macro that it provides too.
Double-checking using Clang (Xcode 7.2's Apple LLVM version 7.0.2 / clang-700.1.81), the following main()
function (in combination with your functions and a naive Vector3<T>
implementation):
int main(int argc, const char * argv[])
{
Vector3<int> a = { 1, 2, 3 };
Vector3<int> b;
scanf("%d", &b.x);
scanf("%d", &b.y);
scanf("%d", &b.z);
Vector3<int> c = a * b;
printf("%d, %d, %d\n", c.x, c.y, c.z);
return 0;
}
compiles to this assembly using optimization flag -O0
:
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## @main
Lfunc_begin0:
.loc 6 30 0 ## main.cpp:30:0
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
subq $128, %rsp
leaq L_.str1(%rip), %rax
##DEBUG_VALUE: main:argc <- undef
##DEBUG_VALUE: main:argv <- undef
movl $0, -4(%rbp)
movl %edi, -8(%rbp)
movq %rsi, -16(%rbp)
.loc 6 31 15 prologue_end ## main.cpp:31:15
Ltmp3:
movl l__ZZ4mainE1a+8(%rip), %edi
movl %edi, -24(%rbp)
movq l__ZZ4mainE1a(%rip), %rsi
movq %rsi, -32(%rbp)
.loc 6 33 2 ## main.cpp:33:2
leaq L_.str(%rip), %rsi
xorl %edi, %edi
movb %dil, %cl
leaq -48(%rbp), %rdx
movq %rsi, %rdi
movq %rsi, -88(%rbp) ## 8-byte Spill
movq %rdx, %rsi
movq %rax, -96(%rbp) ## 8-byte Spill
movb %cl, %al
movb %cl, -97(%rbp) ## 1-byte Spill
movq %rdx, -112(%rbp) ## 8-byte Spill
callq _scanf
.loc 6 34 17 ## main.cpp:34:17
leaq -44(%rbp), %rsi
.loc 6 34 2 is_stmt 0 ## main.cpp:34:2
movq -88(%rbp), %rdi ## 8-byte Reload
movb -97(%rbp), %cl ## 1-byte Reload
movl %eax, -116(%rbp) ## 4-byte Spill
movb %cl, %al
callq _scanf
.loc 6 35 17 is_stmt 1 ## main.cpp:35:17
leaq -40(%rbp), %rsi
.loc 6 35 2 is_stmt 0 ## main.cpp:35:2
movq -88(%rbp), %rdi ## 8-byte Reload
movb -97(%rbp), %cl ## 1-byte Reload
movl %eax, -120(%rbp) ## 4-byte Spill
movb %cl, %al
callq _scanf
leaq -32(%rbp), %rdi
.loc 6 37 21 is_stmt 1 ## main.cpp:37:21
movq -112(%rbp), %rsi ## 8-byte Reload
movl %eax, -124(%rbp) ## 4-byte Spill
callq __ZmlIiiE7Vector3IDTmldtfp_1xdtfp0_1xEERKS0_IT_ERKS0_IT0_E
movl %edx, -72(%rbp)
movq %rax, -80(%rbp)
movq -80(%rbp), %rax
movq %rax, -64(%rbp)
movl -72(%rbp), %edx
movl %edx, -56(%rbp)
.loc 6 39 27 ## main.cpp:39:27
movl -64(%rbp), %esi
.loc 6 39 32 is_stmt 0 ## main.cpp:39:32
movl -60(%rbp), %edx
.loc 6 39 37 ## main.cpp:39:37
movl -56(%rbp), %ecx
.loc 6 39 2 ## main.cpp:39:2
movq -96(%rbp), %rdi ## 8-byte Reload
movb $0, %al
callq _printf
xorl %ecx, %ecx
.loc 6 41 5 is_stmt 1 ## main.cpp:41:5
movl %eax, -128(%rbp) ## 4-byte Spill
movl %ecx, %eax
addq $128, %rsp
popq %rbp
retq
Ltmp4:
Lfunc_end0:
.cfi_endproc
In the above, __ZmlIiiE7Vector3IDTmldtfp_1xdtfp0_1xEERKS0_IT_ERKS0_IT0_E
is your operator*()
function and ends up callq
ing another __…Vector3…
function. It amounts to quite a lot of assembly. Compiling with -O1
is almost the same, still calling out to __…Vector3…
functions.
However, when we bump it up to -O2
, the callq
s to __…Vector3…
disappear, replaced with a imull
instruction (the * a.z
≈ * 3
), an addl
instruction (the * a.y
≈ * 2
), and just using the b.x
value straight-up (because * a.x
≈ * 1
).
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## @main
Lfunc_begin0:
.loc 6 30 0 ## main.cpp:30:0
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
.loc 6 33 2 prologue_end ## main.cpp:33:2
Ltmp3:
pushq %rbx
subq $24, %rsp
Ltmp4:
.cfi_offset %rbx, -24
##DEBUG_VALUE: main:argc <- EDI
##DEBUG_VALUE: main:argv <- RSI
leaq L_.str(%rip), %rbx
leaq -24(%rbp), %rsi
Ltmp5:
##DEBUG_VALUE: operator*=<int, int>:rhs <- [RSI+0]
##DEBUG_VALUE: operator*<int, int>:rhs <- [RSI+0]
##DEBUG_VALUE: main:b <- [RSI+0]
xorl %eax, %eax
movq %rbx, %rdi
Ltmp6:
callq _scanf
.loc 6 34 17 ## main.cpp:34:17
leaq -20(%rbp), %rsi
Ltmp7:
xorl %eax, %eax
.loc 6 34 2 is_stmt 0 ## main.cpp:34:2
movq %rbx, %rdi
callq _scanf
.loc 6 35 17 is_stmt 1 ## main.cpp:35:17
leaq -16(%rbp), %rsi
xorl %eax, %eax
.loc 6 35 2 is_stmt 0 ## main.cpp:35:2
movq %rbx, %rdi
callq _scanf
.loc 6 22 18 is_stmt 1 ## main.cpp:22:18
Ltmp8:
movl -24(%rbp), %esi
.loc 6 23 18 ## main.cpp:23:18
movl -20(%rbp), %edx
.loc 6 23 11 is_stmt 0 ## main.cpp:23:11
addl %edx, %edx
.loc 6 24 11 is_stmt 1 ## main.cpp:24:11
imull $3, -16(%rbp), %ecx
Ltmp9:
##DEBUG_VALUE: main:c [bit_piece offset=64 size=32] <- ECX
.loc 6 39 2 ## main.cpp:39:2
leaq L_.str1(%rip), %rdi
xorl %eax, %eax
callq _printf
xorl %eax, %eax
.loc 6 41 5 ## main.cpp:41:5
addq $24, %rsp
popq %rbx
popq %rbp
retq
Ltmp10:
Lfunc_end0:
.cfi_endproc
For this code, the assembly at -O2
, -O3
, -Os
, & -Ofast
all look identical.
*
and*=
are doing two different things - the former adds the individual values, the latter multiplying them. They also appear to have different type signatures. – Clockwork-Muse Jan 10 '16 at 14:50+
is an obvious mistake (but the same pattern applies tooperator +=
). The signatures are different because the syntax isa*=b
versusa = b*c
(wherea*=b
is the more efficient form ofa=a*b
). Since they're so obviously related, it makes sense to define one in terms of the other. On the other hand, it's a don't care since modern compilers generate the same code regardless. – MSalters Jan 11 '16 at 09:16