그래서 토론이 있었에 대해 쓰고 더욱 이해할 수 있는(대한 다른 사람)코드,이상을 만들려고 노력하 어셈블리 코드를 더 친절합니다.
나의 찬성 쓰 이해할 수 있는 코드는 별개의 행동을,별도의 responsibilies,을 준수하는 C++지침,etc. 그래서,일반적으로,하나는 첫번째로 돌 알고리즘의 복잡성과에 대한 코드를 작성된 추상적인 기계. 컴파일러는 정교한 만들기에 충분하 깔끔한 최적화되어 있습니다.
그래서 제가 한쪽에서는 조각의 코드가 나지 않을 찾을 청소하거나 이해하기 쉬운 눈에서(에서 Stroustroup 의 책 afaic):
void Window::move_to_top(Shape& s)
{
for (int i = 0; i < shapes.size(); ++i)
if (&s == shapes[i]) {
for (++i; i < shapes.size(); ++i)
shapes[i - 1] = shapes[i];
shapes[shapes.size() - 1] = &s;
return;
}
}
또 다른 버전이있는 더 많은 자명하고 이해하기 쉽게 사용하여,적인 stanard 라이브러리 알고리즘이 있습니다.
void Window::move_to_top(Shape& s)
{
auto iterShape = std::find(shapes.begin(), shapes.end(), &s);
std::rotate(iterShape, std::next(iterShape), shapes.end());
}
두 버전 모두가 선형 복잡성과 기본적으로 동(모두 가정 Shape &s
저장된 내 편). 그러나,gcc -O3
:
"수동"버:
Window::move_to_top(Shape&):
mov r8, QWORD PTR [rdi+8]
mov rcx, QWORD PTR [rdi]
mov rdi, r8
sub rdi, rcx
sar rdi, 3
je .L1
mov eax, 1
jmp .L7
.L3:
lea rdx, [rax+1]
cmp rdi, rax
je .L1
mov rax, rdx
.L7:
mov edx, eax
cmp QWORD PTR [rcx-8+rax*8], rsi
jne .L3
add edx, 1
movsx rdx, edx
cmp rdi, rax
jbe .L6
.L5:
mov rax, QWORD PTR [rcx+rax*8]
mov QWORD PTR [rcx-16+rdx*8], rax
mov rax, rdx
add rdx, 1
cmp rdi, rax
ja .L5
.L6:
mov QWORD PTR [r8-8], rsi
ret
.L1:
ret
STL 버전:
Window::move_to_top(Shape&):
push r12
push rbp
push rbx
mov rax, QWORD PTR [rdi+8]
mov rbp, QWORD PTR [rdi]
mov rdx, rax
sub rdx, rbp
mov rcx, rdx
sar rdx, 5
sar rcx, 3
test rdx, rdx
jle .L2
sal rdx, 5
add rdx, rbp
jmp .L7
.L76:
cmp rsi, QWORD PTR [rbp+8]
je .L72
cmp rsi, QWORD PTR [rbp+16]
je .L73
cmp rsi, QWORD PTR [rbp+24]
je .L74
add rbp, 32
cmp rbp, rdx
je .L75
.L7:
cmp rsi, QWORD PTR [rbp+0]
jne .L76
.L3:
lea rdx, [rbp+8]
cmp rax, rdx
je .L1
sub rax, rbp
cmp rax, 16
je .L14
.L35:
sar rax, 3
mov esi, 1
.L15:
mov rdi, rax
sub rdi, rsi
cmp rsi, rdi
jge .L16
.L78:
cmp rsi, 1
je .L77
lea rcx, [0+rsi*8]
lea rdx, [rbp+0+rcx]
test rdi, rdi
jle .L19
lea r8, [rbp+16]
cmp rdx, r8
setnb r8b
add rcx, 16
test rcx, rcx
setle cl
or r8b, cl
je .L36
lea rcx, [rdi-1]
cmp rcx, 1
jbe .L36
mov r9, rdi
xor ecx, ecx
xor r8d, r8d
shr r9
.L21:
movdqu xmm0, XMMWORD PTR [rbp+0+rcx]
movdqu xmm1, XMMWORD PTR [rdx+rcx]
add r8, 1
movups XMMWORD PTR [rbp+0+rcx], xmm1
movups XMMWORD PTR [rdx+rcx], xmm0
add rcx, 16
cmp r9, r8
jne .L21
mov r9, rdi
and r9, -2
lea rcx, [0+r9*8]
lea r8, [rbp+0+rcx]
add rdx, rcx
cmp rdi, r9
je .L24
mov rcx, QWORD PTR [r8]
mov r9, QWORD PTR [rdx]
mov QWORD PTR [r8], r9
mov QWORD PTR [rdx], rcx
.L24:
lea rbp, [rbp+0+rdi*8]
.L19:
cqo
idiv rsi
test rdx, rdx
je .L1
mov rax, rsi
sub rsi, rdx
mov rdi, rax
sub rdi, rsi
cmp rsi, rdi
jl .L78
.L16:
lea rdx, [0+rax*8]
lea r11, [rbp+0+rdx]
cmp rdi, 1
je .L79
lea rcx, [0+rdi*8]
mov r10, r11
sub r10, rcx
test rsi, rsi
jle .L37
mov rbx, rdx
lea r8, [rdx-16]
sub rbx, rcx
cmp rbx, r8
lea r9, [rbx-16]
setle cl
cmp rdx, r9
setle dl
or cl, dl
je .L38
lea rdx, [rsi-1]
cmp rdx, 1
jbe .L38
mov rbx, rsi
add r9, rbp
add r8, rbp
xor ecx, ecx
shr rbx
xor edx, edx
.L31:
movdqu xmm0, XMMWORD PTR [r9+rcx]
movdqu xmm2, XMMWORD PTR [r8+rcx]
add rdx, 1
movups XMMWORD PTR [r9+rcx], xmm2
movups XMMWORD PTR [r8+rcx], xmm0
sub rcx, 16
cmp rdx, rbx
jne .L31
mov rdx, rsi
and rdx, -2
mov rcx, rdx
neg rcx
sal rcx, 3
cmp rsi, rdx
je .L34
sub rcx, 8
lea rdx, [r10+rcx]
add rcx, r11
mov r8, QWORD PTR [rdx]
mov r9, QWORD PTR [rcx]
mov QWORD PTR [rdx], r9
mov QWORD PTR [rcx], r8
.L34:
neg rsi
lea rbp, [r10+rsi*8]
.L29:
cqo
idiv rdi
mov rsi, rdx
test rdx, rdx
je .L1
mov rax, rdi
jmp .L15
.L14:
movdqu xmm0, XMMWORD PTR [rbp+0]
shufpd xmm0, xmm0, 1
movups XMMWORD PTR [rbp+0], xmm0
.L1:
pop rbx
pop rbp
pop r12
ret
.L38:
mov rdx, -8
xor ecx, ecx
.L30:
mov r8, QWORD PTR [r10+rdx]
mov r9, QWORD PTR [r11+rdx]
add rcx, 1
mov QWORD PTR [r10+rdx], r9
mov QWORD PTR [r11+rdx], r8
sub rdx, 8
cmp rsi, rcx
jne .L30
jmp .L34
.L36:
xor ecx, ecx
.L20:
mov r8, QWORD PTR [rbp+0+rcx*8]
mov r9, QWORD PTR [rdx+rcx*8]
mov QWORD PTR [rbp+0+rcx*8], r9
mov QWORD PTR [rdx+rcx*8], r8
add rcx, 1
cmp rdi, rcx
jne .L20
jmp .L24
.L37:
mov rbp, r10
jmp .L29
.L75:
mov rcx, rax
sub rcx, rbp
sar rcx, 3
.L2:
cmp rcx, 2
je .L8
cmp rcx, 3
je .L9
cmp rcx, 1
je .L10
.L11:
mov rbp, rax
xor eax, eax
jmp .L35
.L9:
cmp rsi, QWORD PTR [rbp+0]
je .L3
add rbp, 8
.L8:
cmp rsi, QWORD PTR [rbp+0]
je .L3
add rbp, 8
.L10:
cmp rsi, QWORD PTR [rbp+0]
jne .L11
jmp .L3
.L72:
add rbp, 8
jmp .L3
.L73:
add rbp, 16
jmp .L3
.L74:
add rbp, 24
jmp .L3
.L77:
sal rax, 3
lea rsi, [rbp+8]
mov r12, QWORD PTR [rbp+0]
lea rbx, [rbp+0+rax]
cmp rbx, rsi
je .L18
lea rdx, [rax-8]
mov rdi, rbp
call memmove
.L18:
mov QWORD PTR [rbx-8], r12
jmp .L1
.L79:
lea rax, [r11-8]
mov rbx, QWORD PTR [r11-8]
cmp rbp, rax
je .L28
sub rax, rbp
mov rdi, r11
mov rsi, rbp
mov rdx, rax
sub rdi, rax
call memmove
.L28:
mov QWORD PTR [rbp+0], rbx
jmp .L1
https://godbolt.org/z/P83YT37f8
그래서 대부분의 중요한 질문은: 그것은 그것을 의미하는 STL 버전이 느리게? 그렇다면,무엇에 의해 크기가?
그리고 또 다른 문제는 것: 왜 어셈블리는 그렇게 부풀어-O3?
그래서 나는 일부 벤치마킹:
#include <cstddef>
struct Shape
{
static size_t count;
size_t id = count++;
};
size_t Shape::count = 0;
#include <vector>
struct Window
{
std::vector<Shape*> shapes;
void move_to_top_manual(Shape& s);
void move_to_top(Shape& s);
};
void Window::move_to_top_manual(Shape& s)
{
for (int i = 0; i < shapes.size(); ++i)
if (&s == shapes[i])
{
for (++i; i < shapes.size(); ++i)
shapes[i - 1] = shapes[i];
shapes[shapes.size() - 1] = &s;
return;
}
}
#include <algorithm>
void Window::move_to_top(Shape& s)
{
auto iterShape = std::find(shapes.begin(), shapes.end(), &s);
std::rotate(iterShape, std::next(iterShape), shapes.end());
}
#include <chrono>
#include <string>
#include <iostream>
int main(int argc, char **argv)
{
size_t indx = argc < 2 ? 0 : std::stoll(argv[1]);
std::vector<Shape> shapes{size_t(5000000)};
Window window{};
window.shapes.reserve(shapes.size());
for(auto &s : shapes)
window.shapes.push_back(std::addressof(s));
Window windowsManual = window;
std::chrono::nanoseconds stlDuration{},
manualDuration{};
{
Shape &moveToTop = *window.shapes[indx];
auto timePointBegin = std::chrono::steady_clock::now();
window.move_to_top(moveToTop);
auto timePointEnd = std::chrono::steady_clock::now();
stlDuration = timePointEnd - timePointBegin;
std::cout << "STL nanosec: " << stlDuration.count() << std::endl;
if (&moveToTop != window.shapes.back())
return -1;
}
{
Shape &moveToTop = *windowsManual.shapes[indx];
auto timePointBegin = std::chrono::steady_clock::now();
windowsManual.move_to_top_manual(moveToTop);
auto timePointEnd = std::chrono::steady_clock::now();
manualDuration = timePointEnd - timePointBegin;
std::cout << "Manual nanosec: " << manualDuration.count() << std::endl;
if (&moveToTop != windowsManual.shapes.back())
return -1;
}
std::cout << "STL/Manual = " << double(stlDuration.count()) / manualDuration.count();
return 0;
}
평균 가 MSVC STL 버전이 1.5-2.0 시대보다 느리게 수동입니다.
STL nanosec: 19559200
Manual nanosec: 10636300
STL/Manual = 1.83891
STL nanosec: 20507700
Manual nanosec: 11418600
STL/Manual = 1.79599
Manual nanosec: 11685300
STL/Manual = 1.70025
그래서, 그것은 느와 MSVC.
와 MinGW64 그것은 거의 동일:
STL nanosec: 9069300
Manual nanosec: 7860500
STL/Manual = 1.15378
STL nanosec: 8865300
Manual nanosec: 11290500
STL/Manual = 0.7852
STL nanosec: 10713700
Manual nanosec: 8235200
STL/Manual = 1.30096
실행하는 이에 godbolt 예기치 않은 결과가 나타납:
STL nanosec: 3551159
Manual nanosec: 30056271
STL/Manual = 0.11815