C, 0.026119s (Mar 12 2016)
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define cache_size 16384
#define Phi_prec_max (47 * a)
#define bit(k) (1ULL << ((k) & 63))
#define word(k) sieve[(k) >> 6]
#define sbit(k) ((word(k >> 1) >> (k >> 1)) & 1)
#define ones(k) (~0ULL >> (64 - (k)))
#define m2(k) ((k + 1) / 2)
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
#define ns(t) (1000000000 * t.tv_sec + t.tv_nsec)
#define popcnt __builtin_popcountll
#define mask_build(i, p, o, m) mask |= m << i, i += o, i -= p * (i >= p)
#define Phi_prec_bytes ((m2(Phi_prec_max) + 1) * sizeof(int16_t))
#define Phi_prec(i, j) Phi_prec_pointer[(j) * (m2(Phi_prec_max) + 1) + (i)]
#define Phi_6_next ((i / 1155) * 480 + Phi_5[i % 1155] - Phi_5[(i + 6) / 13])
#define Phi_6_upd_1() t = Phi_6_next, i += 1, *(l++) = t
#define Phi_6_upd_2() t = Phi_6_next, i += 2, *(l++) = t, *(l++) = t
#define Phi_6_upd_3() t = Phi_6_next, i += 3, *(l++) = t, *(l++) = t, *(l++) = t
typedef unsigned __int128 uint128_t;
struct timespec then, now;
uint64_t a, primes[4648] = { 2, 3, 5, 7, 11, 13, 17, 19 }, *primes_fastdiv;
uint16_t *Phi_6, *Phi_prec_pointer;
static inline uint64_t Phi_6_mod(uint64_t y)
{
if (y < 30030)
return Phi_6[m2(y)];
else
return (y / 30030) * 5760 + Phi_6[m2(y % 30030)];
}
static inline uint64_t fastdiv(uint64_t dividend, uint64_t fast_divisor)
{
return ((uint128_t) dividend * fast_divisor) >> 64;
}
uint64_t Phi(uint64_t y, uint64_t c)
{
uint64_t *d = primes_fastdiv, i = 0, r = Phi_6_mod(y), t = y / 17;
r -= Phi_6_mod(t), t = y / 19;
while (i < c && t > Phi_prec_max) r -= Phi(t, i++), t = fastdiv(y, *(d++));
while (i < c && t) r -= Phi_prec(m2(t), i++), t = fastdiv(y, *(d++));
return r;
}
uint64_t Phi_small(uint64_t y, uint64_t c)
{
if (!c--) return y;
return Phi_small(y, c) - Phi_small(y / primes[c], c);
}
uint64_t pi_small(uint64_t y)
{
uint64_t i, r = 0;
for (i = 0; i < 8; i++) r += (primes[i] <= y);
for (i = 21; i <= y; i += 2)
r += i % 3 && i % 5 && i % 7 && i % 11 && i % 13 && i % 17 && i % 19;
return r;
}
int output(int result)
{
clock_gettime(CLOCK_REALTIME, &now);
printf("pi(x) = %9d real time:%9ld ns\n", result , ns(now) - ns(then));
return 0;
}
int main(int argc, char *argv[])
{
uint64_t b, i, j, k, limit, mask, P2, *p, start, t = 8, x = atoi(argv[1]);
uint64_t root2 = sqrt(x), root3 = pow(x, 1./3), top = x / root3 + 1;
uint64_t halftop = m2(top), *sieve, sieve_length = (halftop + 63) / 64;
uint64_t i3 = 1, i5 = 2, i7 = 3, i11 = 5, i13 = 6, i17 = 8, i19 = 9;
uint16_t Phi_3[] = { 0, 1, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 7, 8 };
uint16_t *l, *m, Phi_4[106], Phi_5[1156];
clock_gettime(CLOCK_REALTIME, &then);
sieve = malloc(sieve_length * sizeof(int64_t));
if (x < 529) return output(pi_small(x));
for (i = 0; i < sieve_length; i++)
{
mask = 0;
mask_build( i3, 3, 2, 0x9249249249249249ULL);
mask_build( i5, 5, 1, 0x1084210842108421ULL);
mask_build( i7, 7, 6, 0x8102040810204081ULL);
mask_build(i11, 11, 2, 0x0080100200400801ULL);
mask_build(i13, 13, 1, 0x0010008004002001ULL);
mask_build(i17, 17, 4, 0x0008000400020001ULL);
mask_build(i19, 19, 12, 0x0200004000080001ULL);
sieve[i] = ~mask;
}
limit = min(halftop, 8 * cache_size);
for (i = 21; i < root3 + 1; i += 2)
if (sbit(i))
for (primes[t++] = i, j = i * i / 2; j < limit; j += i)
word(j) &= ~bit(j);
a = t;
for (i = (root3 + 1) | 1; i < root2 + 1; i += 2)
if (sbit(i)) primes[t++] = i;
b = t;
while (limit < halftop)
{
start = 2 * limit + 1, limit = min(halftop, limit + 8 * cache_size);
for (p = &primes[8]; p < &primes[a]; p++)
for (j = max(start / *p | 1, *p) * *p / 2; j < limit; j += *p)
word(j) &= ~bit(j);
}
P2 = (a - b) * (a + b - 1) / 2;
for (i = m2(root2); b --> a; P2 += t, i = limit)
{
limit = m2(x / primes[b]), j = limit & ~63;
if (i < j)
{
t += popcnt((word(i)) >> (i & 63)), i = (i | 63) + 1;
while (i < j) t += popcnt(word(i)), i += 64;
if (i < limit) t += popcnt(word(i) & ones(limit - i));
}
else if (i < limit) t += popcnt((word(i) >> (i & 63)) & ones(limit - i));
}
if (a < 7) return output(Phi_small(x, a) + a - 1 - P2);
a -= 7, Phi_6 = malloc(a * Phi_prec_bytes + 15016 * sizeof(int16_t));
Phi_prec_pointer = &Phi_6[15016];
for (i = 0; i <= 105; i++)
Phi_4[i] = (i / 15) * 8 + Phi_3[i % 15] - Phi_3[(i + 3) / 7];
for (i = 0; i <= 1155; i++)
Phi_5[i] = (i / 105) * 48 + Phi_4[i % 105] - Phi_4[(i + 5) / 11];
for (i = 1, l = Phi_6, *l++ = 0; i <= 15015; )
{
Phi_6_upd_3(); Phi_6_upd_2(); Phi_6_upd_1(); Phi_6_upd_2();
Phi_6_upd_1(); Phi_6_upd_2(); Phi_6_upd_3(); Phi_6_upd_1();
}
for (i = 0; i <= m2(Phi_prec_max); i++)
Phi_prec(i, 0) = Phi_6[i] - Phi_6[(i + 8) / 17];
for (j = 1, p = &primes[7]; j < a; j++, p++)
{
i = 1, memcpy(&Phi_prec(0, j), &Phi_prec(0, j - 1), Phi_prec_bytes);
l = &Phi_prec(*p / 2 + 1, j), m = &Phi_prec(m2(Phi_prec_max), j) - *p;
while (l <= m)
for (k = 0, t = Phi_prec(i++, j - 1); k < *p; k++) *(l++) -= t;
t = Phi_prec(i++, j - 1);
while (l <= m + *p) *(l++) -= t;
}
primes_fastdiv = malloc(a * sizeof(int64_t));
for (i = 0, p = &primes[8]; i < a; i++, p++)
{
t = 96 - __builtin_clzll(*p);
primes_fastdiv[i] = (bit(t) / *p + 1) << (64 - t);
}
return output(Phi(x, a) + a + 6 - P2);
}
This uses the Meissel-Lehmer method.
Timings
On my machine, I'm getting roughly 5.7 milliseconds for the combined test cases. This is on an Intel Core i7-3770 with DDR3 RAM at 1867 MHz, running openSUSE 13.2.
$ ./timepi '-march=native -O3' pi 1000
pi(x) = 93875448 real time: 2774958 ns
pi(x) = 66990613 real time: 2158491 ns
pi(x) = 62366021 real time: 2023441 ns
pi(x) = 34286170 real time: 1233158 ns
pi(x) = 5751639 real time: 384284 ns
pi(x) = 2465109 real time: 239783 ns
pi(x) = 1557132 real time: 196248 ns
pi(x) = 4339 real time: 60597 ns
0.00572879 s
Because the variance got too high, I'm using timings from within the program for the unofficial run times. This is the script that computed the average of the combined run times.
#!/bin/bash
all() { for j in ${a[@]}; do ./$1 $j; done; }
gcc -Wall $1 -lm -o $2 $2.c
a=(1907000000 1337000000 1240000000 660000000 99820000 40550000 24850000 41500)
all $2
r=$(seq 1 $3)
for i in $r; do all $2; done > times
awk -v it=$3 '{ sum += $6 } END { print "\n" sum / (1e9 * it) " s" }' times
rm times
Official times
This time is for doing the score cases 1000 times.
real 0m28.006s
user 0m15.703s
sys 0m14.319s
How it works
Formula
Let \$x\$ be a positive integer.
Each positive integer \$n \le x\$ satisfies exactly one of the following conditions.
\$n = 1\$
\$n\$ is divisible by a prime number \$p\$ in \$[1, \sqrt[3]{x}]\$.
\$n = pq\$, where \$p\$ and \$q\$ are (not necessarily distinct) prime numbers in \$(\sqrt[3]{x}, \sqrt[3]{x^2})\$.
\$n\$ is prime and \$n > \sqrt[3]{x}\$
Let \$\pi(y)\$ denote the number of primes \$p\$ such that \$p \le y\$. There are \$\pi(x) - \pi(\sqrt[3]{x})\$ numbers that fall in the fourth category.
Let \$P_k(y, c)\$ denote the amount of positive integers \$m \le y\$ that are a product of exactly \$k\$ prime numbers not among the first \$c\$ prime numbers. There are \$P_2(x, \pi(\sqrt[3]{x}))\$ numbers that fall in the third category.
Finally, let \$\phi(y, c)\$ denote the amount of positive integers \$k \le y\$ that are coprime to the first \$c\$ prime numbers. There are \$x - \phi(x, \pi(\sqrt[3]{x}))\$ numbers that fall into the second category.
Since there are \$x\$ numbers in all categories,
$$1 + x - \phi(x, \pi(\sqrt[3]{x})) + P_2(x, \pi(\sqrt[3]{x})) + \pi(x) - \pi(\sqrt[3]{x}) = x$$
and, therefore,
$$\pi(x) = \phi(x, \pi(\sqrt[3]{x})) + \pi(\sqrt[3]{x}) - 1 - P_2(x, \pi(\sqrt[3]{x}))$$
The numbers in the third category have a unique representation if we require that \$p \le q\$ and, therefore \$p \le \sqrt{x}\$. This way, the product of the primes \$p\$ and \$q\$ is in the third category if and only if \$\sqrt[3]{x} < p \le q \le \frac{x}{p}\$, so there are \$\pi(\frac{x}{p}) - \pi(p) + 1\$ possible values for \$q\$ for a fixed value of \$p\$, and \$P_2(x, \pi(\sqrt[3]{x})) = \sum_{\pi(\sqrt[3]{x}) < k \le \pi(\sqrt{x})} (\pi(\frac{x}{p_k}) - \pi(p_k) + 1)\$, where \$p_k\$ denotes the \$k^{\text{th}}\$ prime number.
Finally, every positive integer \$n \le y\$ that is not coprime to the first \$c\$ prime numbers can be expressed in unique fashion as \$n = p_kf\$, where \$p_k\$ is the lowest prime factor of \$n\$. This way, \$k \le c\$, and \$f\$ is coprime to the first \$k - 1\$ primes numbers.
This leads to the recursive formula \$\phi(y, c) = y - \sum_{1 \le k \le c} \phi(\frac{y}{p_k}, k - 1)\$. In particular, the sum is empty if \$c = 0\$, so \$\phi(y, 0) = y\$.
We now have a formula that allows us to compute \$\pi(x)\$ by generating only the first \$\pi(\sqrt[3]{x^2})\$ prime numbers (millions vs billions).
Algorithm
We'll need to compute \$\pi(\frac{x}{p})\$, where \$p\$ can get as low as \$\sqrt[3]{x}\$. While there are other ways to do this (like applying our formula recursively), the fastest way seems to be to enumerate all primes up to \$\sqrt[3]{x^2}\$, which can be done with the sieve of Eratosthenes.
First, we identify and store all prime numbers in \$[1, \sqrt{x}]\$, and compute \$\pi(\sqrt[3]{x})\$ and \$\pi(\sqrt{x})\$ at the same time. Then, we compute \$\frac{x}{p_k}\$ for all \$k\$ in \$(\pi(\sqrt[3]{x}), \pi(\sqrt{x})]\$, and count the primes up to each successive quotient.
Also, \$\sum_{\pi(\sqrt[3]{x}) < k \le \pi(\sqrt{x})} (-\pi(p_k) + 1)\$ has the closed form \$\frac{\pi(\sqrt[3]{x}) - \pi(\sqrt{x}))(\pi(\sqrt[3]x) + \pi(\sqrt{x}) - 1}{2}\$, which allows us to complete the computation of \$P_2(x, \pi(\sqrt[3]{x}))\$.
That leaves the computation of \$\phi\$, which is the most expensive part of the algorithm. Simply using the recursive formula would require \$2^c\$ function calls to compute \$\phi(y, c)\$.
First of all, \$\phi(0, c) = 0\$ for all values of \$c\$, so \$\phi(y, c) = y - \sum_{1 \le k \le c, p_k \le y} \phi(\frac{y}{p_k}, k - 1)\$. By itself, this observation is already enough to make the computation feasible. This is because any number below \$2 \cdot 10^9\$ is smaller than the product of any ten distinct primes, so the overwhelming majority of summands vanishes.
Also, by grouping \$y\$ and the first \$c'\$ summands of the definition of \$\phi\$, we get the alternative formula \$\phi(y, c) = \phi(y, c') - \sum_{c' < k \le c, p_k \le y} \phi(\frac{y}{p_k}, k - 1)\$. Thus, precomputing \$\phi(y, c')\$ for a fixed \$c'\$ and appropriate values of \$y\$ saves most of the remaining function calls and the associated computations.
If \$m_c = \prod_{1 \le k \le c} p_k\$, then \$\phi(m_c, c) = \varphi(m_c)\$, since the integers in \$[1, m_c]\$ that are divisible by none of \$p_1, \cdots, p_c\$ are precisely those that are coprime to \$m_c\$. Also, since \$\gcd(z + m_c, m_c) = \gcd(z, m_c)\$, we have that \$\phi(y, c) = \phi(\lfloor \frac{y}{m_c} \rfloor m_c, c) + \phi(y % m_c, c) = \lfloor \frac{y}{m_c} \rfloor \varphi(m_c) + \phi(y % m_c, c)\$.
Since Euler's totient function is multiplicative, \$\varphi(m_c) = \prod_{1 \le k \le c} \varphi(p_k) = \prod_{1 \le k \le c} (p_k - 1)\$, and we have an easy way to derive \$\phi(y, c)\$ for all \$y\$ by precomputing the values for only those \$y\$ in \$[0, m_c)\$.
Also, if we set \$c' = c - 1\$, we obtain \$\phi(y, c) = \phi(y, c - 1) - \phi(\frac{y}{p_c}, c - 1)\$, the original definition from Lehmer's paper. This gives us a simple way to precompute \$\phi(y, c)\$ for increasing values of \$c\$.
In addition for precomputing \$\phi(y, c)\$ for a certain, low value of \$c\$, we'll also precompute it for low values of \$y\$, cutting the recursion short after falling below a certain threshold.
Implementation
The previous section covers most parts of the code. One remaining, important detail is how the divisions in the function Phi
are performed.
Since computing \$\phi\$ only requires dividing by the first \$\pi(\sqrt[3]{x})\$ prime numbers, we can use the fastdiv
function instead. Rather than simply dividing an \$y\$ by a prime \$p\$, we multiply \$y\$ by \$d_p \approx \frac{2^{64}}{p}\$ instead and recover \$\frac{y}{p}\$ as \$\frac{d_py}{2^{64}}\$. Because of how integer multiplication is implemented on x64, dividing by \$2^{64}\$ is not required; the higher 64 bits of \$d_py\$ are stored in their own register.
Note that this method requires precomputing \$d_p\$, which isn't faster than computing \$\frac{y}{p}\$ directly. However, since we have to divide by the same primes over and over again and division is so much slower than multiplication, this results in an important speed-up. More details on this algorithm, as well as a formal proof, can be found in Division by Invariant Integers using Multiplication.
What prevents us from using a lookup table with the first 2e9 values of pi(n)? Would that be acceptable? (Not sure how fast it would be, though, because it would be a large table) – Luis Mendo – 2016-02-26T19:15:41.440
@DonMuesli That wouldn't be acceptable (goes against the spirit of the challenge), I've edited to make it expressly forbidden as well now. – Liam – 2016-02-26T19:18:45.937
8It's dangerous to refer to the "spirit" of the challenge. Your "against the spirit" may be someone else's "great trick" :-) It's better that you made it explicit – Luis Mendo – 2016-02-26T19:20:51.020
@DonMuesli Yeah. Honestly I think that is actually a standard loophole so it was closed by default. I would consider it Fetching the desired output from an external source unless the language for some reason has a built-in for it. In which case, "built-ins that trivialize" were already disallowed.
– Liam – 2016-02-26T19:24:18.4801What's a built-in? I have a list-of-primes function in a library. May I use it? If not, can I copy the source code of the library in my program and use that? – nimi – 2016-02-26T19:26:01.300
@nimi "Built-ins that test primality or generate primes are not allowed. This includes lists of primes." also "you may hardcode primes up to and including 19 and no higher." – Liam – 2016-02-26T19:27:19.570
1@Liam: Yes, I know, but what counts as a built-in? Is copying source code from a library a built-in? – nimi – 2016-02-26T19:28:05.163
@nimi a function in your language. If it's not a function you wrote, then it is a built-in. I've added a bit to the challenge to hopefully clear things up. – Liam – 2016-02-26T19:28:58.173
@nimi if the definition is in your submission, then I suppose it would not be a built-in, it is now hard-coded. Just make sure to give credit. – Liam – 2016-02-26T19:33:59.713
@DonMuesli You could hardcode it and call it with reflection? That'd be pretty quick, provided the lookup was. – Addison Crump – 2016-02-26T23:24:58.470
@VoteToClose Anyway, the challenge now excludes that: may not be looked up externally or hardcoded locally – Luis Mendo – 2016-02-26T23:26:50.333
Do you have a 64-bit system? – LegionMammal978 – 2016-02-27T02:45:21.113
Is there any limit on compile time/executable size because I may be able to use a preprocessor-macro to generate all the primes up to 2000000000 into a table and then during the execution it would just be to look up in table. – 永劫回帰 – 2016-02-28T08:40:18.147
@変幻出没 That's an interesting question. I haven't considered compile time with the other entries, so I'm leaning toward that it's not expressly forbidden, but is still kind of questionable. Having said that, it's a bit late to change the rules now, so go ahead. – Liam – 2016-02-29T03:05:19.240
@Liam I've been meaning to refamiliarize myself with simd instructions and this seems like a good opportunity. Even if it's non-competing, could you give your cpu type so I can attempt an entry with simd stuff? – Michael Klein – 2016-02-29T13:05:04.733
@MichaelKlein AMD FX(tm)-6300 Six-Core Processor. If you need more info, please let me know. – Liam – 2016-02-29T16:37:13.553
@Liam, should be enough, it has AVX (advanced vector extensions, 128 bit) and fused-multiply add. – Michael Klein – 2016-02-29T16:59:22.723
What, no answers in machine code, or even assembler? – Cary Swoveland – 2016-03-01T07:40:50.730
@CarySwoveland There is time yet! – Liam – 2016-03-01T07:41:43.060
launching a new process for each test seems rather unfair to interpreted languages. should entries not just be able to include a test harness main() function? – Sparr – 2016-03-07T14:31:00.513
@Liam, Hi.. I know its too late.. I'm planning to generate primes with Sieve of Eratosthenes (ofcourse it's not a plain Sieve, I'm gng to heavily customise it) to generate primes upto given number, and print the number of primes upto given number.. I assume my program will be provided with many test cases. One thing I want to confirm is, whether all the numbers will be given in an one run, or does one number will be given for every exection upto n test cases..? – The Coder – 2016-03-24T16:46:52.477
@thecoder inputs are given one at a time. – Liam – 2016-03-24T17:51:39.453
What are the specs on your computer's GPU? – Mego – 2016-06-11T02:15:14.097
@mego Graphics Card Manufacturer - Powered by AMD Graphics Chipset - AMD Radeon R9 200 Series Device ID - 6810 Vendor ID - 1002 SubSystem ID - E271 SubSystem Vendor ID - 174B Memory Size - 2048 MB Memory Type - GDDR5 Memory Clock - 1400 MHz Core Clock - 1070 MHz Total Memory Bandwidth - 179 GByte/s – Liam – 2016-06-11T19:43:13.503
@Liam Thanks! I might be able to compete with Dennis! – Mego – 2016-06-11T19:45:57.887
@Mego I'm looking forward to it! If you need more details, let me know – Liam – 2016-06-11T19:46:55.983