/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RECIPROCAL_DIV_H
#define _LINUX_RECIPROCAL_DIV_H

#include <linux/types.h>

/*
 * This algorithm is based on the paper "Division by Invariant
 * Integers Using Multiplication" by Torbjörn Granlund and Peter
 * L. Montgomery.
 *
 * The assembler implementation from Agner Fog, which this code is
 * based on, can be found here:
 * http://www.agner.org/optimize/asmlib.zip
 *
 * This optimization for A/B is helpful if the divisor B is mostly
 * runtime invariant. The reciprocal of B is calculated in the
 * slow-path with reciprocal_value(). The fast-path can then just use
 * a much faster multiplication operation with a variable dividend A
 * to calculate the division A/B.
 */

/*
 * Precomputed reciprocal of a divisor, consumed by reciprocal_divide().
 *
 * @m:   32-bit multiplier; reciprocal_divide() keeps the upper 32 bits of
 *       the 64-bit product a * m.
 * @sh1: shift applied to the (a - t) correction term.
 * @sh2: final shift applied to the corrected sum.
 */
struct reciprocal_value {
	u32 m;
	u8 sh1, sh2;
};

/* "reciprocal_value" and "reciprocal_divide" together implement the basic
 * version of the algorithm described in Figure 4.1 of the paper.
 */
struct reciprocal_value reciprocal_value(u32 d);

/*
 * reciprocal_divide - divide @a by the divisor that @R was computed from.
 * @a: dividend.
 * @R: reciprocal previously returned by reciprocal_value() for the divisor.
 *
 * Returns the quotient, computed with one widening multiplication, one
 * subtraction, one addition and two shifts instead of a hardware divide.
 *
 * NOTE(review): R.sh1 and R.sh2 are used directly as shift counts, so they
 * are assumed to be smaller than 32; confirm against reciprocal_value().
 */
static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
{
	/* Widen to 64 bits so the product cannot wrap; keep its high word. */
	u32 t = (u32)(((u64)a * R.m) >> 32);

	/* R.m < 2^32 implies t <= a, so (a - t) never underflows. */
	return (t + ((a - t) >> R.sh1)) >> R.sh2;
}

/*
 * Result of reciprocal_value_adv(); see the pseudo code below for how each
 * field is consumed.
 *
 * @m:         32-bit multiplier.
 * @sh:        post-multiplication shift.
 * @exp:       shift count used when the divisor equals 1U << exp.
 * @is_wide_m: when true, the longer subtract/halve/add sequence has to be
 *             generated around the multiplication.
 */
struct reciprocal_value_adv {
	u32 m;
	u8 sh, exp;
	bool is_wide_m;
};

/* "reciprocal_value_adv" implements the advanced version of the algorithm
 * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
 * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
 * exception case could be easily handled before calling "reciprocal_value_adv".
 *
 * The advanced version requires more complex calculation to get the reciprocal
 * multiplier and other control variables, but then could reduce the required
 * emulation operations.
 *
 * It makes no sense to use this advanced version for host divide emulation,
 * as the extra complexity of calculating the multiplier etc. could completely
 * cancel out the saving on emulation operations.
 *
 * However, it makes sense to use it for JIT divide code generation, where we
 * are willing to spend host time to improve the performance of the JITed code.
 * As shown by the following pseudo code, the required emulation operations
 * could go down from 6 (the basic version) to 3 or 4.
 *
 * To use the result of "reciprocal_value_adv", suppose we want to calculate
 * n/d, the pseudo C code will be:
 *
 *	struct reciprocal_value_adv rvalue;
 *	u8 pre_shift, exp;
 *
 *	// handle exception case.
 *	if (d >= (1U << 31)) {
 *		result = n >= d;
 *		return;
 *	}
 *
 *	rvalue = reciprocal_value_adv(d, 32);
 *	exp = rvalue.exp;
 *	if (rvalue.is_wide_m && !(d & 1)) {
 *		// floor(log2(d & (2^32 - d)))
 *		pre_shift = fls(d & -d) - 1;
 *		rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
 *	} else {
 *		pre_shift = 0;
 *	}
 *
 *	// code generation starts.
 *	if (d == 1U << exp) {
 *		result = n >> exp;
 *	} else if (rvalue.is_wide_m) {
 *		// pre_shift must be zero when reached here.
 *		t = ((u64)n * rvalue.m) >> 32;
 *		result = n - t;
 *		result >>= 1;
 *		result += t;
 *		result >>= rvalue.sh - 1;
 *	} else {
 *		result = n;
 *		if (pre_shift)
 *			result >>= pre_shift;
 *		result = ((u64)result * rvalue.m) >> 32;
 *		result >>= rvalue.sh;
 *	}
 */
struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);

#endif /* _LINUX_RECIPROCAL_DIV_H */