2,279,635 cycles—7373 bytes (deterministic)
One-by-one:
robert@unity:~/golf-cpu/factor$ ./test.sh
13
23
31
37
41
47
47
59
61
79
Execution terminated after 1088 cycles with exit code 0.
2
2
2
2
3
3
107
164546579
1121707
Execution terminated after 41325 cycles with exit code 0.
3
3
3
7
7
7
13
50759273983933
Execution terminated after 9413 cycles with exit code 0.
7
7
7
13
250250521
2531
4091
Execution terminated after 22282 cycles with exit code 0.
31
89
97
357653
3881
18211
Execution terminated after 47656 cycles with exit code 0.
2
2
2
3
702924188994286579
Execution terminated after 17423 cycles with exit code 0.
7
103
7209485975851
727
Execution terminated after 15973 cycles with exit code 0.
1920043
66449
1604167
Execution terminated after 153683 cycles with exit code 0.
52528036667
322255481
Execution terminated after 1952229 cycles with exit code 0.
9929766466606501253
Execution terminated after 18563 cycles with exit code 0.
robert@unity:~/golf-cpu/factor$
Summary:
robert@unity:~/golf-cpu/factor$ ./test.py factor.golf factors_testset
binary length: 7373 bytes
10/10 passed in 0:00:08.53 s
2,279,635 cycles (227963.50 average)
(267.14 KCPS average)
0 ######################################################################
100000 ########
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000 ########
robert@unity:~/golf-cpu/factor$
I use a combination of trial division and the Brent-Pollard rho algorithm for factorization, and table lookup plus Miller-Rabin for primality testing. I'll add some more explanation tomorrow.
Note that because of the character limit on post length, the second data table is truncated. This gist includes the full table.
primes_53 = \
b'\x03\x05\x07\x0b\x0d\x11\x13\x17\x1d\x1f\x25\x29\x2b\x2f\x35\x3b' \
b'\x3d\x43\x47\x49\x4f\x53\x59\x61\x65\x67\x6b\x6d\x71\x7f\x83\x89' \
b'\x8b\x95\x97\x9d\xa3\xa7\xad\xb3\xb5\xbf\xc1\xc5\xc7\xd3\xdf\xe3' \
b'\xe5\xe9\xef\xf1\xfb'
# for 2**16+1:2:2**17-1, 1 if prime, 0 otherwise
prime_bytes = \
b'\x8b\x24\x60\x82\x10\x41\x81\x12\x40\x08\x26\x0d\x03\x00\x01\x41' \
...
b'\x41\x30\x20\x10\x00\x00\x80\x00\x40\x50\x24\x00\x83\x00\x01\x8a'
main:
call input
# check if input is 1
cmp c, n, 1
sz c, 3
mov o, n
call print
halt 0
# remove factors of two
mov o, 2
div_2_loop:
and c, n, 1
snz c, 3
call print
shr n, n, 1
jmp div_2_loop
# check if n is a power of two
cmp c, n, 1
sz c, 1
halt 0
# trial division
mov d, data(primes_53)
mov i, 53
trial_division_loop:
lbu p, d
trial_division_check:
divu q, r, n, p
snz r, 4
mov o, p
call print
mov n, q
jmp trial_division_check
mulu s, t, p, p
leu c, n, s
sz c, 5
cmp c, n, 1
snz c, 2
mov o, n
call print
halt 0
sub i, i, 1
add d, d, 1
jnz trial_division_loop, i
# mov g, 0 # initialize factor list pointers
# mov h, 0 # all registers are zero at program start
# *g is the next factor to check for primality
# *h is the last factor we've found so far
# the first factor is always *0
sw g, n
prime_test:
# by this point we know that n has no prime factors less than 256,
# so if n is less than 256**2, then it must be prime
lequ c, n, 2**16
jnz is_prime, c
geu c, n, 2**17+1
jnz miller_rabin, c
sub d, n, 2**16+1 # table starts at 2**16+1 (by above argument)
shr d, d, 1 # we know n is odd, so we ignore the last bit
and m, d, 2**3-1 # each byte is 3-bit lookup table
shr d, d, 3
add d, d, data(prime_bytes)
lbu p, d
shr p, p, m
and p, p, 1
jnz is_prime, p
jmp pollard_rho # I should probably do trial division here
# since we only have around 20 more primes to check
################
# Miller-Rabin primality test; bases from https://miller-rabin.appspot.com/
# Note that this subroutine, as well as most of the ones before the 'utility
# functions' section are not proper functions, as they sometimes jump directly
# to the next step instead of returning!
#
# input: n
# output: n/a
################
miller_rabin:
shr d, n, 1
mov b, 1
mr_rd_loop: # n - 1 == d * 2**b, d is odd
and c, d, 1
snz c, 3
shr d, d, 1
inc b
jmp mr_rd_loop
geu c, n, 2**32-1 # test if we can use single-precision arithmetic
jnz miller_rabin_montgomery, c
sub m, n, 1
mr_test_1:
gequ c, n, 0x000000000005361b
jnz mr_test_2, c
mov a, 0x81b33f22efdceaa9
call miller_rabin_test
jmp is_prime
mr_test_2:
gequ c, n, 0x000000003e9de64d
jnz mr_test_3, c
mov a, 0x0000004e69b6552d
call miller_rabin_test
mov a, 0x00223f5bb83fc553
call miller_rabin_test
jmp is_prime
mr_test_3:
mov a, 0x3ab4f88ff0cc7c80
call miller_rabin_test
mov a, 0xcbee4cdf120c10aa
call miller_rabin_test
mov a, 0xe6f1343b0edca8e7
call miller_rabin_test
jmp is_prime
miller_rabin_montgomery:
call montgomery_precompute
sub m, n, r # montgomery representation of n-1
mr_test_mt_3:
gequ c, n, 0x000000518dafbfd1
jnz mr_test_mt_4, c
mov a, 0x3ab4f88ff0cc7c80
call miller_rabin_test_montgomery
mov a, 0xcbee4cdf120c10aa
call miller_rabin_test_montgomery
mov a, 0xe6f1343b0edca8e7
call miller_rabin_test_montgomery
jmp is_prime
mr_test_mt_4:
gequ c, n, 0x0000323ee0e55e6b
jnz mr_test_mt_5, c
mov a, 0x0000000000000002
call miller_rabin_test_montgomery
mov a, 0x0000810c207b08bf
call miller_rabin_test_montgomery
mov a, 0x10a42595b01d3765
call miller_rabin_test_montgomery
mov a, 0x99fd2b545eab5322
call miller_rabin_test_montgomery
jmp is_prime
mr_test_mt_5:
gequ c, n, 0x001c6b470864f683
jnz mr_test_mt_6, c
mov a, 0x0000000000000002
call miller_rabin_test_montgomery
mov a, 0x000003c1c7396f6d
call miller_rabin_test_montgomery
mov a, 0x02142e2e3f22de5c
call miller_rabin_test_montgomery
mov a, 0x0297105b6b7b29dd
call miller_rabin_test_montgomery
mov a, 0x370eb221a5f176dd
call miller_rabin_test_montgomery
jmp is_prime
mr_test_mt_6:
gequ c, n, 0x081f23f390affe89
jnz mr_test_mt_7, c
mov a, 0x0000000000000002
call miller_rabin_test_montgomery
mov a, 0x000070722e8f5cd0
call miller_rabin_test_montgomery
mov a, 0x0020cd6bd5ace2d1
call miller_rabin_test_montgomery
mov a, 0x009bbc940c751630
call miller_rabin_test_montgomery
mov a, 0x0a90404784bfcb4d
call miller_rabin_test_montgomery
mov a, 0x1189b3f265c2b0c7
call miller_rabin_test_montgomery
jmp is_prime
mr_test_mt_7:
mov a, 0x0000000000000002
call miller_rabin_test_montgomery
mov a, 0x0000000000000145
call miller_rabin_test_montgomery
mov a, 0x000000000000249f
call miller_rabin_test_montgomery
mov a, 0x0000000000006e12
call miller_rabin_test_montgomery
mov a, 0x000000000006e0d7
call miller_rabin_test_montgomery
mov a, 0x0000000000953d18
call miller_rabin_test_montgomery
mov a, 0x000000006b0191fe
call miller_rabin_test_montgomery
jmp is_prime
is_prime:
mov o, n
call print
cmp c, g, h
sz c, 1
halt 0
add g, g, 8
lw n, g
jmp prime_test
found_factor:
divu n, b, n, a
add h, h, 8
sw h, a
jmp prime_test
################
# Miller-Rabin primality test subroutine; needs m == n - 1
#
# input: a, n, m
# output: n/a
################
miller_rabin_test:
divu q, a, a, n # a = a mod n
snz a, 1
ret # a certifies primality if a == 0 mod n
call power # a = a**d mod n
cmp c, a, 1 # r == 1
sz c, 1
ret # a certifies primality if a**d == 1 mod n
cmp c, a, m # m == n-1
sz c, 1
ret # a certifies primality if a**d == -1 mod n
mov d, 2
mr_square_loop:
dec b
jz pollard_rho, b
mulu x, y, a, a # a = a**2 mod n
divu q, a, x, n # q is unused
cmp c, a, 1 # r == 1
jnz pollard_rho, c # a witnesses compositeness if a**(d*2**i) == 1 mod n
cmp c, a, m # m == n-1
sz c, 1
ret # a certifies primality if a**(d*2**i) == -1 mod n
jmp mr_square_loop
################
# Miller-Rabin primality test subroutine using Montgomery arithmetic; needs
# the values from the precomputation step as well as m == n - 1
#
# input: a, n, m, (u, v, r, s)
# output: n/a
################
miller_rabin_test_montgomery:
divu q, a, a, n # a = a mod n
snz a, 1
ret # a certifies primality if a == 0 mod n
mulu x, y, a, s
call montgomery_reduce # convert a to montgomery form
mov a, x
call montgomery_power # a = a**d mod n
cmp c, a, r # r == 1 (mongtomery form)
sz c, 1
ret # a certifies primality if a**d == 1 mod n
cmp c, a, m # m == n-1 (montgomery form)
sz c, 1
ret # a certifies primality if a**d == -1 mod n
mr_mt_square_loop:
dec b
jz pollard_rho_montgomery, b
mulu x, y, a, a # a = a**2 mod n
call montgomery_reduce
mov a, x
cmp c, a, r # r == 1 (mongtomery form)
jnz pollard_rho_montgomery, c # a witnesses compositeness if a**(d*2**i) == 1 mod n
cmp c, a, m # m == n-1 (montgomery form)
sz c, 1
ret # a certifies primality if a**(d*2**i) == -1 mod n
jmp mr_mt_square_loop
################
# Pollard-Brent rho factorization algorithm
#
# input: n
# output: n/a
################
pollard_rho:
rho_skip_count = 64
mov o, 42
rho_retry:
mov b, o
mov j, 1
mov f, 1
mov d, 1
rho_outer_loop:
mov a, b
mov i, j
rho_fast_loop:
mulu b, y, b, b
add b, b, 1
divu y, b, b, n
#mov o, b
#call print
sub i, i, 1
jnz rho_fast_loop, i
mov k, 0
rho_slow_loop:
mov e, b
sub i, j, k
leu c, rho_skip_count, i
sz c, 1
mov i, rho_skip_count
rho_inner_loop:
mulu b, y, b, b
add b, b, 1
divu y, b, b, n
leu c, a, b
sz c, 1
sub x, b, a
snz c, 1
sub x, a, b
mulu f, y, x, f
divu y, f, f, n
#add o, f, 1000000000
#call print
sub i, i, 1
jnz rho_inner_loop, i
mov x, f
call gcd
mov d, x
#mov o, d
#call print
add k, k, rho_skip_count
gequ c, k, j
snz c, 2
lequ c, d, 1
jnz rho_slow_loop, c
shl j, j, 1
lequ c, d, 1
jnz rho_outer_loop, c
cmp c, d, n
sz c, 12
rho_final_loop:
mulu e, y, e, e
add e, e, 1
divu y, e, e, n
#add o, e, 2000000000
#call print
leu c, a, e
sz c, 1
sub x, e, a
snz c, 1
sub x, a, e
call gcd
mov d, x
#mov o, d
#call print
lequ c, d, 1
jnz rho_final_loop, c
cmp c, d, n
sz c, 2
add o, o, 1
jmp rho_retry
mov a, d
jmp found_factor
################
# Pollard-Brent rho factorization algorithm using Montgomery arithmetic.
# Requires m = n - r
#
# input: n, (u, v, r, s, m)
# output: n/a
################
pollard_rho_montgomery:
rho_mt_skip_count = 64
mov o, 42
rho_mt_retry:
mulu x, y, o, s
call montgomery_reduce
mov b, x
mov j, 1
mov f, r
mov d, 1
rho_mt_outer_loop:
mov a, b
mov i, j
rho_mt_fast_loop:
mulu x, y, b, b
call montgomery_reduce
mov b, x
geu c, b, m
sz c, 1
sub b, b, m
snz c, 1
add b, b, r
#mov o, b
#call print
sub i, i, 1
jnz rho_mt_fast_loop, i
mov k, 0
rho_mt_slow_loop:
mov e, b
sub i, j, k
leu c, rho_mt_skip_count, i
sz c, 1
mov i, rho_mt_skip_count
rho_mt_inner_loop:
mulu x, y, b, b
call montgomery_reduce
mov b, x
geu c, b, m
sz c, 1
sub b, b, m
snz c, 1
add b, b, r
leu c, a, b
sz c, 1
sub x, b, a
snz c, 1
sub x, a, b
mulu x, y, x, f
call montgomery_reduce
mov f, x
sub i, i, 1
jnz rho_mt_inner_loop, i
mov x, f
#mov o, x
#call print
call gcd
mov d, x
#mov o, d
#call print
add k, k, rho_mt_skip_count
gequ c, k, j
snz c, 2
lequ c, d, 1
jnz rho_mt_slow_loop, c
shl j, j, 1
lequ c, d, 1
jnz rho_mt_outer_loop, c
cmp c, d, n
sz c, 15
rho_mt_final_loop:
mulu x, y, e, e
call montgomery_reduce
mov e, x
geu c, e, m
sz c, 1
sub e, e, m
snz c, 1
add e, e, r
leu c, a, e
sz c, 1
sub x, e, a
snz c, 1
sub x, a, e
call gcd
mov d, x
lequ c, d, 1
jnz rho_mt_final_loop, c
cmp c, d, n
sz c, 2
add o, o, 1
jmp rho_mt_retry
mov a, d
jmp found_factor
################ UTILITY FUNCTIONS ################
################
# Loads a decimal number terminated by a newline from stdin to register n.
#
# inputs: none
# outputs: n
################
input:
lw c, -1
input_add_digit:
sub c, c, ord('0')
mov o, c
mulu n, h, n, 10
add n, n, c
lw c, -1
cmp q, c, ord('\n')
jz input_add_digit, q
ret n
################
# Prints the value of register o to stdout in decimal, followed by a newline.
#
# inputs: o
# outputs: none
################
print:
call print_char
sw -1, ord('\n')
ret
print_char:
divu o, x, o, 10
sz o, 1
call print_char
add x, x, ord('0')
sw -1, x
ret
################
# Precomputes values used for Montgomery multiplication:
# * u = 1/2**64 mod n
# * v = -1/n mod 2**64
# * r = 2**64 mod n (montgomery representation of 1)
# * s = 2**128 mod n (montgomery representation of 2**64)
#
# inputs: n
# outputs: u, v, s, t
################
montgomery_precompute:
# first compute the inverses; based on xbinGCD from
# http://www.hackersdelight.org/MontgomeryMultiplication.pdf
# This step takes something like an average of 7 cycles per bit in the
# 'fast' mode, and 8 cycles per bit in 'safe' mode, for a total of around
# 500 cycles: but we only have to do it once per modulus
mov u, 1
mov v, 0
mov i, 64
# HD defensively computes (u+beta)/2 to avoid overflow, taking two extra
# instructions; however, since u < beta overflow is only possible when
# beta >= 2**63, in most cases we can use a faster version and save
# 2 * 64 = 128 cycles, at the expense of a couple cycles comparison
gequ c, n, 2**63
jz mt_pre_gcd_loop, c
mt_pre_gcd_loop_safe:
dec i
and c, u, 1
snz c, 3
shr u, u, 1
shr v, v, 1
sz 0, 6
and c, u, n
xor u, u, n
shr u, u, 1
add u, u, c
shr v, v, 1
add v, v, 2**63
jnz mt_pre_gcd_loop_safe, i
jmp mt_pre_gcd_end
mt_pre_gcd_loop:
dec i
and c, u, 1
snz c, 3
shr u, u, 1
shr v, v, 1
sz 0, 4
add u, u, n
shr u, u, 1
shr v, v, 1
add v, v, 2**63
jnz mt_pre_gcd_loop, i
mt_pre_gcd_end:
# compute 2**64 mod n; since 2**64 is too large to fit in a register, we
# first calculate r = 2**63 mod n, if it's greater than n/2, r = 2r - n,
# otherwise just r = 2r
divu q, r, 2**63, n # q is unused
shr c, n, 1
geu c, r, c
shl r, r, 1
sz c, 1
sub r, r, n
# It would be entirely possible to compute 2**128 mod n by starting with
# r and using repeated doubling (64 times); this would take something
# like 4*64 = 256 cycles. Instead I'll use a doubleword division
# algorithm from Hacker's Delight (ch. 9-4, fig. 9-3), which takes
# fewer than 100 cycles.
mov l, 0
# normalize the divisor (n)
and c, n, 2**64 - 2**32
snz c, 2
shl n, n, 32
add l, l, 32
and c, n, 2**64 - 2**48
snz c, 2
shl n, n, 16
add l, l, 16
and c, n, 2**64 - 2**56
snz c, 2
shl n, n, 8
add l, l, 8
and c, n, 2**64 - 2**60
snz c, 2
shl n, n, 4
add l, l, 4
and c, n, 2**64 - 2**62
snz c, 2
shl n, n, 2
add l, l, 2
and c, n, 2**64 - 2**63
snz c, 2
shl n, n, 1
add l, l, 1
# normalize the numerator
shl a, r, l
# split the divisor into halfwords
shr b, n, 32
and m, n, 2**32 - 1
# compute first halfword of remainder
divu q, p, a, b
mt_pre_div_adjust1: # loop runs at most two times, 035x on average
gequ c, q, 2**32
snz c, 4
mulu d, f, q, m # f is unused
shl c, p, 32
geu c, d, c
sz c, 4
sub q, q, 1
add p, p, b
leu c, p, 2**32
jnz mt_pre_div_adjust1, c
shl a, a, 32
mulu d, f, q, n
sub a, a, d
# compute second halfword of remainder
divu q, p, a, b
mt_pre_div_adjust2:
snz c, 4
mulu d, f, q, m
shl c, p, 32
geu c, d, c
sz c, 4
sub q, q, 1
add p, p, b
leu c, p, 2**32
jnz mt_pre_div_adjust2, c
shl a, a, 32
mulu d, f, q, n
sub a, a, d
# de-normalize remainder
shr s, a, l
ret u, v, r, s
################
# Performs the Montgomery reduction step, x = [yx]/2**64 mod n
# Requres the u and v calculated in mongomery_precompute.
#
# inputs: x, y, n, (u, v)
# outputs: x
################
montgomery_reduce:
mulu m, u, x, v # m = ([yx] mod 2**64) / (-n) mod 2**64
mulu z, w, m, n # [wz] = mN
add x, x, z # [yx] += [wz]
leu c, x, z # this is where I really miss an ADC instruction
sz c, 1
inc w
add x, y, w # x = [yx] / 2**64 = y (combined with y+w from above)
leu c, x, y # x -= n if overflow
sz c, 1
sub x, x, n
gequ c, x, n # if x >= n
sz c, 1
sub x, x, n # x -= n
ret x
################
# Computes a = a**d mod n by repeated squaring.
#
# inputs: a, d, n
# outputs: a
################
power:
mov b, a
mov a, 1
mt_pow_loop:
and c, d, 1
sz c, 2
mulu x, y, a, b # a = a*b mod n
divu q, a, x, n # q is unused
mulu x, y, b, b # b = b*b mod n
divu q, b, x, n
shr d, d, 1
jnz mt_pow_loop, d
ret a
################
# Uses Montgomery multiplication to compute a = a**d mod n. Treats d as an
# ordinary unsigned integer; treats d as an integer in Montgomery form.
# Requres the u, v, and r calculated in mongomery_precompute.
#
# inputs: a, d, n, (u, v, r)
# outputs: a
################
montgomery_power:
mov b, a
mov a, r
pow_loop:
and c, d, 1
sz c, 3
mulu x, y, a, b
call montgomery_reduce
mov a, x
mulu x, y, b, b
call montgomery_reduce
mov b, x
shr d, d, 1
jnz pow_loop, d
ret a
################
# Computes x = GCD(x,n) using the binary GCD algorithm. Assumes that x < n
# and that n is odd. Note that we don't need a separate Montgomery GCD,
# since, surprisingly, GCD(x*(2**64) mod n, n) == GCD(x, n).
#
# inputs: x, n
# outputs: x
################
gcd:
snz x, 2 # GCD(0, n) == n
mov x, n
ret x
gcd_loop:
and c, x, 1 # n is odd, so the GCD has no factors of two
snz c, 3
gcd_div_2_loop:
shr x, x, 1
and c, x, 1
jz gcd_div_2_loop, c
geu c, n, x
sz c, 5
sub x, n, x
sub n, n, x
jnz gcd_loop, x
mov x, n
ret x
sub x, x, n
jnz gcd_loop, x
mov x, n
ret x
1About golf: what do you mean by "Signed division and modulus work like they do in Python - the sign of the result is the same as the sign of the divisor"? -4/-2 should be -2? (I don't think so) – edc65 – 2015-04-25T16:50:11.363
1@edc65 Sorry, the sign portion only applies to modulus, I'll edit the docs. – orlp – 2015-04-25T16:51:50.483
Just a friendly reminder that comments are not meant for extended discussion. I've migrated the comments to chat and cleaned them up here.
– Martin Ender – 2015-04-25T18:00:25.267If the program must work for any 64-bit input integer, what happens if the input is prime? Since
1
is not a prime factor, should it only print the input integer? – mbomb007 – 2015-04-27T20:45:53.990@mbomb007 Correct, it should only print the input integer. – orlp – 2015-04-27T20:54:13.827
I really want to use SQUFOF on this one, but it just so happens that the second-to-last test case is one of the ~2% of numbers (of that difficulty) that fail to factor without a multiplier. What modifications would you consider "optimization towards this set of numbers"? – 2012rcampion – 2015-10-06T03:31:58.567
@2012rcampion I think having fallback algorithms is fine, but at every stage your program must be general for all numbers. I'm sorry that 2% of the input space is so overrepresented for the scoring of your answer though =/ – orlp – 2015-10-06T04:16:12.040
SQUFOF does succeed eventually on all numbers (if it fails in one round, you try again with a different multiplier), but it takes a long time to fail on the first round. If I started with a multiplier, that would cut my time on that particular test case by 50-70%. However, I feel like that's cheating since the average time over all numbers would stay the same. (I'm probably going to end up using Pollard-Brent rho factorization anyway, it doesn't require any expensive square roots.) – 2012rcampion – 2015-10-06T04:38:19.283