A non-answer: tinyAVR assembler (web doc) instead of C++ or C. I modified a pretty generic multiply-by-squares-lookup for speed (< 50 cycles, excluding call & return overhead) at the cost of only fitting into AVRs with no less than 1 KByte of RAM, using 512 aligned bytes for a table of the lower half of squares. At 20 MHz, that would nicely meet the 2 (max. 3) µs
time limit still not showing up in the question proper - but Sergio Formiggini wanted 16 MHz. As of 2015/04, there is just one ATtiny from Atmel with that much RAM, and that is specified up to 8 MHz … (If you roll your "own" (e.g., from OpenCores), your FPGA probably has a bunch of fast multipliers (18×18 bits seems popular), if not processor cores.)
For a stab at fast shift-and-add, have a look at shift and add, factor shifting left, unrolled 16×16→16 and/or improve on it (wiki post). (You might well create that community wiki answer begged for in the question.)
; Register assignments for the multiply routines.
; a1:a0 and b1:b0 are the 16-bit factors, p1:p0 receives the 16-bit product.
.def a0 = r16 ; factor a, low byte
.def a1 = r17 ; factor a, high byte
#warning two warnings about preceding definitions of
#warning r16 and r17 are due and may as well be ignored
.def a = r16 ; 8-bit factor (alias of a0)
.def b = r17 ; 8-bit factor (alias of a1) ; or r18, rather?
.def b0 = r18 ; factor b, low byte
.def b1 = r19 ; factor b, high byte
.def p0 = r20 ; product low byte
.def p1 = r21 ; product high byte
; mpy16F16 loads every table value it adds/subtracts into "sq"; the symbol
; was used but never defined. r22 is free inside mpy16F16 (fillSqTable uses
; r22 only while building the table, before the multiply runs).
.def sq = r22 ; scratch: squares-table value
; "squares table" SqTab shall be two 512 Byte tables of
; squares of the 9-bit natural numbers 0..511, divided by 4:
; low bytes in RAM (filled by fillSqTable), high bytes in flash (SqTableH).
; Idea: exploit p = a * b = Squares[a+b] - Squares[|a-b|]
; init - demo entry point.
; Loads sample operands a = 0xAB73 (a1:a0) and b = 0x0117 (b1:b0), then
; selects a multiply routine by SRAM size.
; The table-lookup routine keeps 512 bytes of squares in data space at
; SqTabH:00 .. (SqTabH+1):FF (0x0200..0x03FF with the flash table at
; .org 256). A part with only 512 bytes of SRAM cannot hold that range, so
; at least 1 KByte is required; smaller parts fall back to mpy16T16
; (not part of this listing).
; Clobbers: r16..r20, SREG.
init:
    ldi  a0, 0x73               ; a = 0xAB73
    ldi  a1, 0xab
    ldi  b0, 23                 ; b = 0x0117
    ldi  b1, 1
    ldi  r20, HIGH(SRAM_SIZE)
    cpi  r20, HIGH(1024)        ; was 2: let 512-byte parts through
    brsh fillSqTable            ; enough RAM, e.g. ATtiny1634
    rjmp mpy16T16               ; small-RAM fallback
; fillSqTable - build the RAM half of the squares table, then run mpy16F16.
; For n = 0..511 the byte at data address SqTabH:00 + n receives bits 9:2
; of n*n, i.e. the low byte of n*n/4.
; Squares are generated incrementally by summing odd numbers:
; r23:r22 starts at 1 and X at -1, so the first pass yields 0, then 1, 4, 9 ...
; (16-bit wrap-around is harmless, only bits 9:2 are kept).
; Clobbers: r20..r23, X, Z, SREG (including T).
fillSqTable:
    ldi  ZL, 0
    ldi  ZH, SqTabH             ; Z -> first table byte
    ldi  r20, SqTabH + 2        ; page just past the 512-byte table
    ldi  XL, 0xFF
    ldi  XH, 0xFF               ; X = -1: odd-number increment
    ldi  r22, 1
    ldi  r23, 0                 ; r23:r22 = 1: running square
fillLoop:
    add  r22, XL
    adc  r23, XH                ; square += increment
    adiw XL, 2                  ; next odd number
    mov  r21, r23
    lsr  r21                    ; C = bit 8 of the square
    mov  r21, r22               ; (mov leaves C untouched)
    ror  r21                    ; r21 = bits 8:1
    lsr  r21                    ; r21 = bits 8:2, bit 7 cleared
    bst  r23, 1
    bld  r21, 7                 ; bit 7 = bit 9 of the square
    st   Z+, r21
    cp   ZH, r20
    brne fillLoop               ; until Z has passed 512 entries
    rjmp mpy16F16
; assembly lines are marked up with cycle count
; and (latest) start cycle in block.
; If first line in code block, the (latest) block start cycle
; follows; else if last line, the (max) block cycle total
;**************************************************************
;*
;* "mpy16F16" - 16x16->16 Bit Unsigned Multiplication
;* using table lookup
;* Sergio Formiggini special edition
;* Multiplies two 16-bit register values a1:a0 and b1:b0.
;* The result is placed in p1:p0.
;*
;* Number of flash words: 318 + return =
;* (40 + 256(flash table) + 22(RAM init))
;* Number of cycles : 49 + return
;* Low registers used : None
;* High registers used : 7+2 (a1:a0, b1:b0, p1:p0, sq;
;* + Z(r31:r30))
;* RAM bytes used : 512 (squares table)
;*
;**************************************************************
; mpy16F16: p1:p0 = (a1:a0 * b1:b0) mod 65536, unsigned.
; Uses x*y = Sq[x+y] - Sq[|x-y|] with Sq[n] = n*n/4 (integer part):
;   p1:p0 = Sq[a0+b0] - Sq[|a0-b0|]                      (full 16 bits)
;   p1   += low(Sq[a1+b0]) - low(Sq[|a1-b0|])            (low byte of a1*b0)
;   p1   += low(Sq[a0+b1]) - low(Sq[|a0-b1|])            (low byte of a0*b1)
; Low bytes of Sq come from data space via "ld", high bytes from program
; memory via "lpm", both addressed with the same Z = SqTabH:00 + n.
; Sum indices (9 bits): ZH is preloaded with SqTabH>>1 and "rol" shifts in
; the carry of the 8-bit add, giving SqTabH + carry (SqTabH must be even).
; Difference indices: a borrow is undone with "neg" to get |x-y|; taken and
; not-taken paths both cost 2 cycles, so the run time is constant.
; Trailing comment columns: cycles, start cycle within the group, then (on
; a group's first line) its start cycle or (last line) its cycle total.
; In: a1:a0, b1:b0   Out: p1:p0   Clobbers: sq, Z, SREG
mpy16F16:
; p1:p0 = Sq[a0+b0]
ldi ZH, SqTabH>>1;1 0 0 squares table>>1
mov ZL, a0 ; 1 1
add ZL, b0 ; 1 2 a0+b0
rol ZH ; 1 3 9 bit offset
ld p0, Z ; 2 4 a0+b0l 1
lpm p1, Z ; 3 6 9 a0+b0h 2
; p1 -= low(Sq[|a1-b0|])
ldi ZH, SqTabH ; 1 0 9 squares table
mov ZL, a1 ; 1 0 10
sub ZL, b0 ; 1 1 a1-b0
brcc noNegF10 ; 1 2
neg ZL ; 1 3
noNegF10:
ld sq, Z ; 2 4 a1-b0l 3
sub p1, sq ; 1 6 7
; p1 -= low(Sq[|a0-b1|])
mov ZL, a0 ; 1 0 17
sub ZL, b1 ; 1 1 a0-b1
brcc noNegF01 ; 1 2
neg ZL ; 1 3
noNegF01:
ld sq, Z ; 2 4 a0-b1l 4
sub p1, sq ; 1 6 7
; p1:p0 -= Sq[|a0-b0|]; lpm leaves the borrow from "sub p0" intact for sbc
mov ZL, a0 ; 1 0 24
sub ZL, b0 ; 1 1 a0-b0
brcc noNegF00 ; 1 2
neg ZL ; 1 3
noNegF00:
ld sq, Z ; 2 4 a0-b0l 5
sub p0, sq ; 1 6
lpm sq, Z ; 3 7 a0-b0h 6*
sbc p1, sq ; 1 10 11
; p1 += low(Sq[a1+b0])
ldi ZH, SqTabH>>1;1 0 35
mov ZL, a1 ; 1 1
add ZL, b0 ; 1 2 a1+b0
rol ZH ; 1 3
ld sq, Z ; 2 4 a1+b0l 7
add p1, sq ; 1 6 7
; p1 += low(Sq[a0+b1])
ldi ZH, SqTabH>>1;1 0 42
mov ZL, a0 ; 1 1
add ZL, b1 ; 1 2 a0+b1
rol ZH ; 1 3
ld sq, Z ; 2 4 a0+b1l 8
add p1, sq ; 1 6 7
ret ; 49
; Flash half of the squares table: SqTableH[n] = high byte of n*n/4
; (integer part) for n = 0..511, 512 bytes = 256 words.
; mpy16F16 reads it with "lpm" using the same Z value that addresses the
; RAM half, so the table's flash byte address must equal the RAM table's
; data address.
.CSEG
.org 256; word address (CSEG is word-addressed), i.e. byte address 0x0200
SqTableH:
.db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.db 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
.db 1, 1, 1, 1, 1, 1, 2, 2, 2, 2
.db 2, 2, 2, 2, 2, 2, 3, 3, 3, 3
.db 3, 3, 3, 3, 4, 4, 4, 4, 4, 4
.db 4, 4, 5, 5, 5, 5, 5, 5, 5, 6
.db 6, 6, 6, 6, 6, 7, 7, 7, 7, 7
.db 7, 8, 8, 8, 8, 8, 9, 9, 9, 9
.db 9, 9, 10, 10, 10, 10, 10, 11, 11, 11
.db 11, 12, 12, 12, 12, 12, 13, 13, 13, 13
.db 14, 14, 14, 14, 15, 15, 15, 15, 16, 16
.db 16, 16, 17, 17, 17, 17, 18, 18, 18, 18
.db 19, 19, 19, 19, 20, 20, 20, 21, 21, 21
.db 21, 22, 22, 22, 23, 23, 23, 24, 24, 24
.db 25, 25, 25, 25, 26, 26, 26, 27, 27, 27
.db 28, 28, 28, 29, 29, 29, 30, 30, 30, 31
.db 31, 31, 32, 32, 33, 33, 33, 34, 34, 34
.db 35, 35, 36, 36, 36, 37, 37, 37, 38, 38
.db 39, 39, 39, 40, 40, 41, 41, 41, 42, 42
.db 43, 43, 43, 44, 44, 45, 45, 45, 46, 46
.db 47, 47, 48, 48, 49, 49, 49, 50, 50, 51
.db 51, 52, 52, 53, 53, 53, 54, 54, 55, 55
.db 56, 56, 57, 57, 58, 58, 59, 59, 60, 60
.db 61, 61, 62, 62, 63, 63, 64, 64, 65, 65
.db 66, 66, 67, 67, 68, 68, 69, 69, 70, 70
.db 71, 71, 72, 72, 73, 73, 74, 74, 75, 76
.db 76, 77, 77, 78, 78, 79, 79, 80, 81, 81
.db 82, 82, 83, 83, 84, 84, 85, 86, 86, 87
.db 87, 88, 89, 89, 90, 90, 91, 92, 92, 93
.db 93, 94, 95, 95, 96, 96, 97, 98, 98, 99
.db 100, 100, 101, 101, 102, 103, 103, 104, 105, 105
.db 106, 106, 107, 108, 108, 109, 110, 110, 111, 112
.db 112, 113, 114, 114, 115, 116, 116, 117, 118, 118
.db 119, 120, 121, 121, 122, 123, 123, 124, 125, 125
.db 126, 127, 127, 128, 129, 130, 130, 131, 132, 132
.db 133, 134, 135, 135, 136, 137, 138, 138, 139, 140
.db 141, 141, 142, 143, 144, 144, 145, 146, 147, 147
.db 148, 149, 150, 150, 151, 152, 153, 153, 154, 155
.db 156, 157, 157, 158, 159, 160, 160, 161, 162, 163
.db 164, 164, 165, 166, 167, 168, 169, 169, 170, 171
.db 172, 173, 173, 174, 175, 176, 177, 178, 178, 179
.db 180, 181, 182, 183, 183, 184, 185, 186, 187, 188
.db 189, 189, 190, 191, 192, 193, 194, 195, 196, 196
.db 197, 198, 199, 200, 201, 202, 203, 203, 204, 205
.db 206, 207, 208, 209, 210, 211, 212, 212, 213, 214
.db 215, 216, 217, 218, 219, 220, 221, 222, 223, 224
.db 225, 225, 226, 227, 228, 229, 230, 231, 232, 233
.db 234, 235, 236, 237, 238, 239, 240, 241, 242, 243
.db 244, 245, 246, 247, 248, 249, 250, 251, 252, 253
.db 254, 255
; SqTabH = high byte of the table's BYTE address: the label is a word
; address, hence the shift. Shifting only the high byte is exact because
; the table starts on a 256-word boundary (low byte of the word address
; is 0); mpy16F16 additionally needs SqTabH to be even.
.equ SqTabH = (high(SqTableH) << 1)
.DSEG
; RAM half of the squares table (low bytes of n*n/4, n = 0..511).
; fillSqTable writes it at data address SqTabH:00 and mpy16F16 reads it
; with the same Z it uses for the flash half, so the reservation has to sit
; at exactly that address; without .org it would land at the start of SRAM
; and leave the real table area unprotected. A label also needs its colon.
.org (SqTabH << 8)
RAMTab: .BYTE 512
Replacing `a+=a;` with `a<<=1` doesn't really change anything in the generated code, correct? – Creaky
`a<b`: you should use as `b` the smaller of the two, so that you have the minimum number of cycles. – Creaky
`uint8_t mul(uint8_t, uint8_t)` and `uint16_t mul_wide(uint8_t, uint8_t)`, and then treat `uint16_t` as a multiprecision integer consisting of two 8-bit words. – Necrotomy