를 나누는 방법에는 128 비트의 배당에는 64 비트 제수는 배당의 비트는 모든 1 개의,그리고 어디에 내가가 필요합 64LSBs 의 몫?

Question 1

요 계산 ((2^128) - 1) / x. 제수, x이,서명되지 않은 64 비트 숫자입니다. 배당가로 구성된 두 가지의 서명이 없는 64 비트 숫자(높고 낮음),두 번호 UINT64_MAX. 나는 사용할 수 있는 64 비트 산야 하 휴대용(의 사용 GNU's __int128,MSCV 의 _udiv128어셈블리,또는 그와 같은 아무것도). 저는 필요하지 않 고 부품의 몫,나만의 낮은 64 비트입니다.

이렇게 하려면 어떻게 해야 합니까 작업?

도: x >= 3, x 은 전력의 2.

편집:내가 만들어 내 자신의 솔루션을(아래 답변). 그러나 나는 환영하는 다른 솔루션을 수행하는 더:)

Question 2

나의 최적화에 적용되는 정수 부서와 함께 일정한 배당입니다. 번 확인 했 테스트 사례는 모든 사람을 배당으로 컴파일러 Explorer. 사용 gcc,icc,와 그 소리,최고 최적화 지정한 수준,생성된 코드를 보여주는 최적화되고 적용됩니다.

그것은 확실히 구축이 가능하 고 성능 128-bit 분열 루틴이지만,개인적인 경험에서 나는 이것은 아주 오류가 발생하기 쉬운,그리고 매우 정교한 테스트를 달성하기 위해 필요한 좋은 테스트 범위를 포함한 사례로,철저한 시험이 가능하지 않은 이 연산자 크기입니다. 의 노력을 디자인하고 테스트 쉽게 초과하는 것에 대한 합리적인 응답에서 유래에 의해 두 개 소수 orders of magnitude.

가장 쉽게 수행할 수 있는 방법 정수 부분은 알고리즘을 사용합니다 우리는 모두 초등 학교에서 배운만에 바이너리입니다. 이 결정에 대해 다음의 몫 조금 특히 쉽게:1 경우 현재의 나머지 부분보다 크거나 같음,제수,그렇지 않으면 0 하여 자세한 바이너리 부문,정수만 우리는 작업이 필요 및 빼기.

우리를 구축할 수 있는 휴대용 기본체 위해 수행하는 이들에 피연산자의 모든 비트 길이의 방법을 모방하여 프로세서의 기계 지침 사용하여 효과 작업에 여러 단어의 정수와 함께 추가 수하이,추가로 수행에서,추가로 수행 및 수행웃;유사한다. 아래 코드에서는 내가 사용하여 간단한 C 매크로는 확실히 더 정교한 방법이 가능합니다.

때문에 나는 지금 일하고 지원하지 않는 128 비트의 정수,나는 프로토타입 및 이 테스트 방식에 대한 64 비트 정수입니다. 128-bit 버전 다음 운동에 간단한 기계적인 이름을 변경. 현대에는 64 비트 프로세서 사이에는 128 비트 부문에 기능을 실행에서 약 3000 주기입니다.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>

#define SUBCcc(a,b,cy,t0,t1,t2) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)

#define SUBcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), cy=t1<t0, t1-t0)

#define SUBC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t1-t0)

#define ADDCcc(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)

#define ADDcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)

#define ADDC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t0+t1)

typedef struct {
    uint64_t l;
    uint64_t h;
} my_uint128;

my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
    my_uint128 quot, rem, tmp;
    uint64_t cy, t0, t1, t2;
    int bits_left = CHAR_BIT * sizeof (my_uint128);
    
    quot.h = dvnd.h;
    quot.l = dvnd.l;
    rem.h = 0;
    rem.l = 0;
    do {
        quot.l = ADDcc  (quot.l, quot.l, cy, t0, t1);
        quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
        rem.l  = ADDCcc (rem.l,  rem.l,  cy, t0, t1);
        rem.h  = ADDC   (rem.h,  rem.h,  cy, t0, t1);
        tmp.l  = SUBcc  (rem.l,  dvsr.l, cy, t0, t1);
        tmp.h  = SUBCcc (rem.h,  dvsr.h, cy, t0, t1, t2);
        if (!cy) { // remainder >= divisor
            rem.l = tmp.l;
            rem.h = tmp.h;
            quot.l = quot.l | 1;
        }
        bits_left--;
    } while (bits_left);
    return quot;
}

typedef struct {
    uint32_t l;
    uint32_t h;
} my_uint64;

my_uint64 bitwise_division_64 (my_uint64 dvnd, my_uint64 dvsr)
{
    my_uint64 quot, rem, tmp;
    uint32_t cy, t0, t1, t2;
    int bits_left = CHAR_BIT * sizeof (my_uint64);
    
    quot.h = dvnd.h;
    quot.l = dvnd.l;
    rem.h = 0;
    rem.l = 0;
    do {
        quot.l = ADDcc  (quot.l, quot.l, cy, t0, t1);
        quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
        rem.l  = ADDCcc (rem.l,  rem.l,  cy, t0, t1);
        rem.h  = ADDC   (rem.h,  rem.h,  cy, t0, t1);
        tmp.l  = SUBcc  (rem.l,  dvsr.l, cy, t0, t1);
        tmp.h  = SUBCcc (rem.h,  dvsr.h, cy, t0, t1, t2);
        if (!cy) { // remainder >= divisor
            rem.l = tmp.l;
            rem.h = tmp.h;
            quot.l = quot.l | 1;
        }
        bits_left--;
    } while (bits_left);
    return quot;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

int main (void)
{
    uint64_t a, b, res, ref;
    my_uint64 aa, bb, rr;
    do {
        a = KISS64;
        b = KISS64;
        ref = a / b;

        aa.l = (uint32_t)a;
        aa.h = (uint32_t)(a >> 32);
        bb.l = (uint32_t)b;
        bb.h = (uint32_t)(b >> 32);
        rr = bitwise_division_64 (aa, bb);
        res = (((uint64_t)rr.h) << 32) + rr.l;

        if (ref != res) {
            printf ("a=%016llx b=%016llx res=%016llx ref=%016llx\n", a, b, res, ref);
            return EXIT_FAILURE;
        }
    } while (a);
    return EXIT_SUCCESS;
}

빠른 접근 보다는 비트 계산 계산하는 상호의 제수,곱하여 배당 결과에 대한 예비의 몫,그 계산의 나머지 부분을 정확하게 조정 몫. 전체 계산이 수행될 수 있습에서 고정 소수점 연산입니다. 그러나,현대적 프로세서와 함께 빠르게 부동 소수점 단위는 것이 더 편리성 시작 근사치에 대한 상호 이중 정밀도 division. 단일 누나 반복과 큐빅 융합 그 결과 전체에서 정밀한 상호.

핼리 반복을 위해 상호주 정수 곱 집중적으로,64×64-bit 곱 128 비트 결과(umul64wide() 아래 코드에서)는 빌딩 블록은 중요한 성과입니다. 에 현대적이 64 비트는 아키텍처 이것은 일반적으로 단일 컴퓨터 명령어의 실행에서 몇 가지 사이클,그러나 이에 접근할 수 없 휴대용 코드입니다. 휴대용 코드 에뮬레이션 명령에 대한 요구 15 20 지침에 따라 건축과 컴파일러입니다.

전체 128-bit 부문 약 300 사이클,또는 열 번으로 빠른 비트 단위조작에 의존하고 있습니다. 기 때문에 이 코드는 상당히 복잡한,그것은 필요한 상당한 금액의 테스트는 올바른 작동을 보장하기 위해. 틀에서 아래 내가 사용하는 패턴을 기반의 임의의 테스트를 위해 적당히 집중적인 테스트를 사용하여 간단 bit-wise 구현는 전문 기술을 제공합니다.

의 구현 udiv128() 아래는 프로그래밍 환경을 사용합 IEEE-754 준수 부동 소수점 연산,는 double 형식은 매핑을 IEEE-754 의 binary64 형식,그리고 그 부문 double 피연산자가 올바르게 둥글게 됩니다.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>

typedef struct {
    uint64_t l;
    uint64_t h;
} my_uint128;

my_uint128 make_my_uint128 (uint64_t h, uint64_t l);
my_uint128 add128 (my_uint128 a, my_uint128 b);
my_uint128 sub128 (my_uint128 a, my_uint128 b);
my_uint128 lsl128 (my_uint128 a, int sh);
my_uint128 lsr128 (my_uint128 a, int sh);
my_uint128 not128 (my_uint128 a);
my_uint128 umul128lo (my_uint128 a, my_uint128 b);
my_uint128 umul128hi (my_uint128 a, my_uint128 b);
double my_uint128_to_double (my_uint128 a);
int lt128 (my_uint128 a, my_uint128 b);
int eq128 (my_uint128 a, my_uint128 b);
uint64_t double_as_uint64 (double a);
double uint64_as_double (uint64_t a);

#define FP64_EXPO_BIAS   (1023)
#define FP64_MANT_BITS   (53)
#define FP64_MANT_IBIT   (0x0010000000000000ULL)
#define FP64_MANT_MASK   (0x000fffffffffffffULL)
#define FP64_INC_EXP_128 (0x0800000000000000ULL)
#define FP64_MANT_ADJ    (2)  // adjustment to ensure underestimate

my_uint128 udiv128 (my_uint128 dividend, my_uint128 divisor)
{
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    const my_uint128 one  = make_my_uint128 (0ULL, 1ULL);
    const my_uint128 two  = make_my_uint128 (0ULL, 2ULL);
    my_uint128 recip, temp, quo, rem;
    my_uint128 neg_divisor = sub128 (zero, divisor);
    double r;

    /* compute initial approximation for reciprocal; must be underestimate! */
    r = 1.0 / my_uint128_to_double (divisor);
    uint64_t i = double_as_uint64 (r) - FP64_MANT_ADJ + FP64_INC_EXP_128;
    temp = make_my_uint128 (0ULL, (i & FP64_MANT_MASK) | FP64_MANT_IBIT);
    int sh = (i >> (FP64_MANT_BITS-1)) - FP64_EXPO_BIAS - (FP64_MANT_BITS-1);
    recip = (sh < 0) ? lsr128 (temp, -sh) : lsl128 (temp, sh);

    /* perform Halley iteration with cubic convergence to refine reciprocal */
    temp = umul128lo (neg_divisor, recip);
    temp = add128 (umul128hi (temp, temp), temp);
    recip = add128 (umul128hi (recip, temp), recip);

    /* compute preliminary quotient and remainder */
    quo = umul128hi (dividend, recip); 
    rem = sub128 (dividend, umul128lo (divisor, quo));

    /* adjust quotient if too small; quotient off by 2 at most */
    if (! lt128 (rem, divisor)) {
        quo = add128 (quo, lt128 (sub128 (rem, divisor), divisor) ? one : two);
    }

    /* handle division by zero */
    if (eq128 (divisor, zero)) quo = not128 (zero);

    return quo;
}

#define SUBCcc(a,b,cy,t0,t1,t2) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)

#define SUBcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), cy=t1<t0, t1-t0)

#define SUBC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t1-t0)

#define ADDCcc(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)

#define ADDcc(a,b,cy,t0,t1) \
  (t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)

#define ADDC(a,b,cy,t0,t1) \
  (t0=(b)+cy, t1=(a), t0+t1)

uint64_t double_as_uint64 (double a) 
{ 
    uint64_t r; 
    memcpy (&r, &a, sizeof r); 
    return r; 
}

double uint64_as_double (uint64_t a) 
{ 
    double r; 
    memcpy (&r, &a, sizeof r); 
    return r; 
}

my_uint128 add128 (my_uint128 a, my_uint128 b)
{
    uint64_t cy, t0, t1;
    a.l = ADDcc (a.l, b.l, cy, t0, t1);
    a.h = ADDC  (a.h, b.h, cy, t0, t1);
    return a;
}

my_uint128 sub128 (my_uint128 a, my_uint128 b)
{
    uint64_t cy, t0, t1;
    a.l = SUBcc (a.l, b.l, cy, t0, t1);
    a.h = SUBC  (a.h, b.h, cy, t0, t1);
    return a;
}

my_uint128 lsl128 (my_uint128 a, int sh)
{
    if (sh >= 64) {
        a.h = a.l << (sh - 64);
        a.l = 0ULL;
    } else if (sh) {
        a.h = (a.h << sh) + (a.l >> (64 - sh));
        a.l = a.l << sh;
    }
    return a;
}

my_uint128 lsr128 (my_uint128 a, int sh)
{
    if (sh >= 64) {
        a.l = a.h >> (sh - 64);
        a.h = 0ULL;
    } else if (sh) {
        a.l = (a.l >> sh) + (a.h << (64 - sh));
        a.h = a.h >> sh;
    } 
    return a;
}

my_uint128 not128 (my_uint128 a)
{
    a.l = ~a.l;
    a.h = ~a.h;
    return a;
}

int lt128 (my_uint128 a, my_uint128 b)
{
    uint64_t cy, t0, t1, t2;
    a.l = SUBcc  (a.l, b.l, cy, t0, t1);
    a.h = SUBCcc (a.h, b.h, cy, t0, t1, t2);
    return cy;
}

int eq128 (my_uint128 a, my_uint128 b)
{
    return (a.l == b.l) && (a.h == b.h);
}

// derived from Hacker's Delight 2nd ed. figure 8-2
my_uint128 umul64wide (uint64_t u, uint64_t v)
{
    my_uint128 r;
    uint64_t u0, v0, u1, v1, w0, w1, w2, t;
    u0 = (uint32_t)u;  u1 = u >> 32;
    v0 = (uint32_t)v;  v1 = v >> 32;
    w0 = u0 * v0;
    t  = u1 * v0 + (w0 >> 32);
    w1 = (uint32_t)t;
    w2 = t >> 32;
    w1 = u0 * v1 + w1;
    r.h = u1 * v1 + w2 + (w1 >> 32);
    r.l = (w1 << 32) + (uint32_t)w0;
    return r;
}

my_uint128 make_my_uint128 (uint64_t h, uint64_t l)
{
    my_uint128 r;
    r.h = h;
    r.l = l;
    return r;
}

my_uint128 umul128lo (my_uint128 a, my_uint128 b)
{
    my_uint128 r;
    r = umul64wide (a.l, b.l);
    r.h = r.h + a.l * b.h + a.h * b.l;
    return r;
}

my_uint128 umul128hi (my_uint128 a, my_uint128 b)
{
    my_uint128 t0, t1, t2, t3;
    t0 = umul64wide (a.l, b.l);
    t3 = add128 (umul64wide (a.h, b.l), make_my_uint128 (0ULL, t0.h));
    t1 = make_my_uint128 (0ULL, t3.l);
    t2 = make_my_uint128 (0ULL, t3.h);
    t1 = add128 (umul64wide (a.l, b.h), t1);
    return add128 (add128 (umul64wide (a.h, b.h), t2), make_my_uint128 (0ULL, t1.h));
}

double my_uint128_to_double (my_uint128 a)
{
    const int intbits = sizeof (a) * CHAR_BIT;
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    my_uint128 rnd, i = a;
    uint64_t j;
    int sh = 0;
    double r;

    // normalize integer so MSB is set
    if (lt128 (i, make_my_uint128(0x0000000000000001ULL, 0))) {i = lsl128 (i,64); sh += 64; }
    if (lt128 (i, make_my_uint128(0x0000000100000000ULL, 0))) {i = lsl128 (i,32); sh += 32; }
    if (lt128 (i, make_my_uint128(0x0001000000000000ULL, 0))) {i = lsl128 (i,16); sh += 16; }
    if (lt128 (i, make_my_uint128(0x0100000000000000ULL, 0))) {i = lsl128 (i, 8); sh +=  8; } 
    if (lt128 (i, make_my_uint128(0x1000000000000000ULL, 0))) {i = lsl128 (i, 4); sh +=  4; }
    if (lt128 (i, make_my_uint128(0x4000000000000000ULL, 0))) {i = lsl128 (i, 2); sh +=  2; }
    if (lt128 (i, make_my_uint128(0x8000000000000000ULL, 0))) {i = lsl128 (i, 1); sh +=  1; }
    // form mantissa with explicit integer bit 
    rnd = lsl128 (i, FP64_MANT_BITS);
    i = lsr128 (i, intbits - FP64_MANT_BITS);
    j = i.l;
    // add in exponent, taking into account integer bit of mantissa
    if (! eq128 (a, zero)) {
        j += (uint64_t)(FP64_EXPO_BIAS + (intbits-1) - 1 - sh) << (FP64_MANT_BITS-1);
    }
    // round to nearest or even
    rnd.h = rnd.h | (rnd.l != 0);
    if ((rnd.h > 0x8000000000000000ULL) || 
        ((rnd.h == 0x8000000000000000ULL) && (j & 1))) j++;
    // reinterpret bit pattern as IEEE-754 'binary64'
    r = uint64_as_double (j);
    return r;
}

my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
    my_uint128 quot, rem, tmp;
    uint64_t cy, t0, t1, t2;
    int bits_left = CHAR_BIT * sizeof (dvsr);
    
    quot.h = dvnd.h;
    quot.l = dvnd.l;
    rem.h = 0;
    rem.l = 0;
    do {
        quot.l = ADDcc  (quot.l, quot.l, cy, t0, t1);
        quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
        rem.l  = ADDCcc (rem.l,  rem.l,  cy, t0, t1);
        rem.h  = ADDC   (rem.h,  rem.h,  cy, t0, t1);
        tmp.l  = SUBcc  (rem.l,  dvsr.l, cy, t0, t1);
        tmp.h  = SUBCcc (rem.h,  dvsr.h, cy, t0, t1, t2);
        if (!cy) { // remainder >= divisor
            rem.l = tmp.l;
            rem.h = tmp.h;
            quot.l = quot.l | 1;
        }
        bits_left--;
    } while (bits_left);
    return quot;
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

my_uint128 v[100000]; /* FIXME: size appropriately */

int main (void)
{
    const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
    const my_uint128 one = make_my_uint128 (0ULL, 1ULL);
    my_uint128 dividend, divisor, quot, ref;
    int i, j, patterns, idx = 0, nbrBits = sizeof (v[0]) * CHAR_BIT;
    int patterns_done = 0;

    /* pattern class 1: 2**i */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = lsl128 (one, i);
        idx++;
    }
    /* pattern class 2: 2**i-1 */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = sub128 (lsl128 (one, i), one);
        idx++;
    }
    /* pattern class 3: 2**i+1 */
    for (i = 0; i < nbrBits; i++) {
        v [idx] = add128 (lsl128 (one, i), one); 
        idx++;
    }
    /* pattern class 4: 2**i + 2**j */
    for (i = 0; i < nbrBits; i++) {
        for (j = 0; j < nbrBits; j++) {
            v [idx] = add128 (lsl128 (one, i), lsl128 (one, j));
            idx++;
        }
    }
    /* pattern class 5: 2**i - 2**j */
    for (i = 0; i < nbrBits; i++) {
        for (j = 0; j < nbrBits; j++) {
            v [idx] = sub128 (lsl128 (one, i), lsl128 (one, j));
            idx++;
        }
    }
    patterns = idx;
    /* pattern class 6: one's complement of pattern classes 1 through 5 */
    for (i = 0; i < patterns; i++) {
        v [idx] = not128 (v [i]);
        idx++;
    }
    /* pattern class 7: two's complement of pattern classes 1 through 5 */
    for (i = 0; i < patterns; i++) {
        v [idx] = sub128 (zero, v[i]);
        idx++;
    }
    patterns = idx;
    printf ("Starting pattern-based tests. Number of patterns: %d\n", patterns);

    for (long long int k = 0; k < 100000000000LL; k++) {
        if (k < patterns * patterns) {
            dividend = v [k / patterns];
            divisor  = v [k % patterns];
        } else {
            if (!patterns_done) {
                printf ("Starting random tests\n");
                patterns_done = 1;
            }
            dividend.l = KISS64;
            dividend.h = KISS64;
            divisor.h  = KISS64;
            divisor.l  = KISS64;
        }
        /* exclude cases with undefined results: division by zero */
        if (! eq128 (divisor, zero)) {
            quot = udiv128 (dividend, divisor);
            ref = bitwise_division_128 (dividend, divisor);
            if (! eq128 (quot, ref)) {
                printf ("@ (%016llx_%016llx, %016llx_%016llx): quot = %016llx_%016llx  ref=%016llx_%016llx\n", 
                        dividend.h, dividend.l, divisor.h, divisor.l, 
                        quot.h, quot.l, ref.h, ref.l);
                return EXIT_FAILURE;
            }
        }
    }
    printf ("unsigned 128-bit division: tests passed\n");
    return EXIT_SUCCESS;
}

Question 3

이것은 내가 결국 코딩이다. 나는 확실히 훨씬 더 빠른 대안,하지만 적어도 이 기능이 갖추어져 있습니다.

Based on: https://en.wikipedia.org/wiki/Division_algorithm#Integer_division_(unsigned)_with_remainder. 에 대한 적응을 위한 이는 특정 사용한 경우.

// q = (2^128 - 1) / d, where q is the 64 LSBs of the quotient
uint64_t two_pow_128_minus_1_div_d(uint64_t d) {
    uint64_t q = 0, r_hi = 0, r_lo = 0;

    for (int i = 127; i >= 0; --i) {
        r_hi = (r_hi << 1) | (r_lo >> 63);
        r_lo <<= 1;

        r_lo |= 1UL;

        if (r_hi || r_lo >= d) {
            const uint64_t borrow = d > r_lo;
            r_lo -= d;
            r_hi -= borrow;

            if (i < 64)
                q |= 1UL << i;
        }
    }
    return q;
}

njuffa · Answer 1 · 2021-11-22T01:55:43