x86_64 assembly, 48 bytes
This routine expects arguments eax/ebx, ecx/edx
and returns the result in r9/r11
(and registers r8
and r10
are clobbered).
.text
.globl f
f:
xor %r8, %r8
xor %r9, %r9
xor %r10, %r10
inc %r8
mov %r8, %r11
.Lloop:
cmp %rdx, %rcx
ja .L1
# a/b < c/d <= 1: return 1 / f(d/c,b/a)
xchg %rax, %rdx
xchg %rbx, %rcx
# 1/ matrix to multiply on right is [0 1; 1 0]
xchg %r8, %r9
xchg %r10, %r11
jmp .Lloop
.L1:
# 1+ matrix to multiply on right is [1 1; 0 1]
add %r8, %r9
add %r10, %r11
# 1 <= a/b < c/d: return 1 + f(a/b-1, c/d-1)
sub %rdx, %rcx
sub %rbx, %rax
jae .Lloop
# a/b < 1 < c/d:
# return 0/1 (which becomes a return of 1/1 with the 1+ matrix
# merged between this case and the previous one)
ret
C test driver (requires GCC):
#include <stdio.h>
void f_wrap(unsigned long a,
unsigned long b,
unsigned long c,
unsigned long d,
unsigned long *e,
unsigned long *f) {
register unsigned long r9 __asm__("r9");
register unsigned long r11 __asm__("r11");
__asm__("call f" :
"=a"(a), "=b"(b), "=c"(c), "=d"(d), "=r"(r9), "=r"(r11) :
"0"(a), "1"(b), "2"(c), "3"(d) :
"r8", "r10");
*e = r9;
*f = r11;
}
void test_f(unsigned long a,
unsigned long b,
unsigned long c,
unsigned long d) {
unsigned long e;
unsigned long f;
f_wrap(a, b, c, d, &e, &f);
printf("%lu/%lu %lu/%lu -> %lu/%lu\n", a, b, c, d, e, f);
}
int main() {
test_f(2, 5, 4, 5);
test_f(1, 1, 2, 1);
test_f(3, 5, 1, 1);
test_f(5, 13, 7, 18);
test_f(12, 31, 7, 18);
test_f(2, 5, 1, 2);
test_f(3, 7, 1, 2);
// 3.14 = 314/100 = 157/50; 3.15 = 315/100 = 63/20
test_f(157, 50, 63, 20);
// 3.1415 = 31415/10000 = 6283/2000; 3.1416 = 31416/10000 = 3927/1250
test_f(6283, 2000, 3927, 1250);
return 0;
}
And disassembly of the code as it ended up in the executable:
0000000000001229 <f>:
1229: 4d 31 c0 xor %r8,%r8
122c: 4d 31 c9 xor %r9,%r9
122f: 4d 31 d2 xor %r10,%r10
1232: 49 ff c0 inc %r8
1235: 4d 89 c3 mov %r8,%r11
1238: 48 39 d1 cmp %rdx,%rcx
123b: 77 0d ja 124a <f+0x21>
123d: 48 92 xchg %rax,%rdx
123f: 48 87 d9 xchg %rbx,%rcx
1242: 4d 87 c1 xchg %r8,%r9
1245: 4d 87 d3 xchg %r10,%r11
1248: eb ee jmp 1238 <f+0xf>
124a: 4d 01 c1 add %r8,%r9
124d: 4d 01 d3 add %r10,%r11
1250: 48 29 d1 sub %rdx,%rcx
1253: 48 29 d8 sub %rbx,%rax
1256: 73 e0 jae 1238 <f+0xf>
1258: c3 ret
The basic idea in pseudocode behind this implementation is inspired by continued fractions:
f(x,y):
if y <= 1
return 1 / f(1/y, 1/x)
else if x >= 1
return 1 + f(x-1, y-1)
else
return 1/1
(Do note, however, that in certain cases, this makes a recursive call to f
with y = 1 / 0
where we treat that as infinity.)
Now, the biggest optimization I made was to note that at any point, the transformations on the stack to the return value form a projective linear transformation. So, in the assembly version, instead of maintaining a call stack, I maintain a 2x2 matrix representing the required projective linear transformation so far in registers [r8 r9; r10 r11]
.
Aside from that, and some usual optimizations, there was a folding of common code between the second and third cases: both involve taking the sum of the two columns of the transformation matrix.
(However, I didn't make any effort to try different register allocations, and see if others might end up saving a couple bytes.)