mirror of https://github.com/prometheus/prometheus
234 lines
4.1 KiB
ArmAsm
234 lines
4.1 KiB
ArmAsm
|
// +build !appengine
|
||
|
// +build gc
|
||
|
// +build !noasm
|
||
|
|
||
|
#include "textflag.h"
|
||
|
|
||
|
// Register allocation:
// AX	h
// CX	pointer to advance through b
// DX	n
// BX	loop end
// R8	v1, k1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp
// R13	prime1v
// R14	prime2v
// DI	prime4v
//
// R15 is intentionally left unused. When this file is assembled with
// -dynlink or -shared, the amd64 assembler rewrites every access to a global
// symbol (such as ·prime5v(SB)) to go through R15, so a value parked in R15
// does not survive any later global load. Keep long-lived values out of R15.

// round reads from and advances the buffer pointer in CX.
// It assumes that R13 has prime1v and R14 has prime2v.
// Effect: r = rol64(r + load64(CX)*prime2v, 31) * prime1v; CX += 8.
// Clobbers R12.
#define round(r) \
	MOVQ  (CX), R12 \
	ADDQ  $8, CX    \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r

// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
// Effect: val = rol64(val*prime2v, 31) * prime1v;
//         acc = (acc ^ val)*prime1v + prime4v.
// val is overwritten.
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  DI, acc

// func Sum64(b []byte) uint64
//
// Sum64 hashes b into a 64-bit value. The input is consumed in four phases:
// 32-byte blocks (four 8-byte lanes v1..v4), then remaining 8-byte words,
// then at most one 4-byte word, then single bytes, followed by a final
// shift/xor/multiply mixing sequence.
//
// Frame: no locals; 24 bytes of arguments (slice header) + 8 bytes of result.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes. prime4v lives in DI (not R15) so that it survives
	// the later global loads when built with -dynlink/-shared.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// Load slice: CX = &b[0], DX = len(b), BX = one past the last byte.
	MOVQ b_base+0(FP), CX
	MOVQ b_len+8(FP), DX
	LEAQ (CX)(DX*1), BX

	// The first loop limit will be len(b)-32.
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//   v1 = prime1v + prime2v, v2 = prime2v, v3 = 0, v4 = -prime1v.
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Loop until CX > BX. Each iteration consumes 32 bytes (four rounds of 8).
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  blockLoop

	// h = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each lane into h.
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes: start from prime5v instead of the merged lanes.
	MOVQ ·prime5v(SB), AX

afterBlocks:
	// h += len(b)
	ADDQ DX, AX

	// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
	ADDQ $24, BX

	CMPQ CX, BX
	JG   fourByte

wordLoop:
	// Calculate k1.
	MOVQ  (CX), R8
	ADDQ  $8, CX
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rol(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	CMPQ CX, BX
	JLE  wordLoop

fourByte:
	// Move the limit to len(b)-4; take one 4-byte word if it fits.
	ADDQ $4, BX
	CMPQ CX, BX
	JG   singles

	// MOVL zero-extends the 32-bit load into R8.
	MOVL  (CX), R8
	ADDQ  $4, CX
	IMULQ R13, R8
	XORQ  R8, AX

	// h = rol(h, 23)*prime2v + prime3v
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

singles:
	// Move the limit to len(b), i.e. one past the last byte.
	ADDQ $4, BX
	CMPQ CX, BX
	JGE  finalize

singlesLoop:
	// h = rol(h ^ (byte*prime5v), 11) * prime1v
	MOVBQZX (CX), R12
	ADDQ    $1, CX
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX

	ROLQ  $11, AX
	IMULQ R13, AX

	CMPQ CX, BX
	JL   singlesLoop

finalize:
	// Final mixing:
	//   h ^= h >> 33; h *= prime2v; h ^= h >> 29; h *= prime3v; h ^= h >> 32
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX
	MOVQ  AX, R12
	SHRQ  $32, R12
	XORQ  R12, AX

	MOVQ AX, ret+24(FP)
	RET
|
||
|
|
||
|
// writeBlocks runs the 32-byte block rounds over b, updating the four
// accumulators stored at offsets 0, 8, 16 and 24 of *x, and returns the
// unconsumed tail of b (fewer than 32 bytes).
//
// Register roles in this function:
//   AX        x (pointer to the accumulator state)
//   CX        read cursor into b (advanced by round)
//   DX        len(b)
//   BX        loop limit, later the length of the returned tail
//   R8..R11   v1..v4
//   R12       scratch used by round
//   R13, R14  prime1v, prime2v

// func writeBlocks(x *xxh, b []byte) []byte
TEXT ·writeBlocks(SB), NOSPLIT, $0-56
	// Slice setup. The result base is pre-set to the input base so that it
	// is already correct if nothing is left over (see the NOTE further down).
	MOVQ b_base+8(FP), CX
	MOVQ CX, ret_base+32(FP)
	MOVQ b_len+16(FP), DX

	// BX = &b[0] + len(b) - 32: the last address at which a whole block
	// still fits.
	LEAQ -32(CX)(DX*1), BX

	// Pull v1..v4 out of *x.
	MOVQ x+0(FP), AX
	MOVQ 0(AX), R8
	MOVQ 8(AX), R9
	MOVQ 16(AX), R10
	MOVQ 24(AX), R11

	// Constants required by the round macro.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// No up-front bounds check: callers always pass at least one full block,
	// so the loop body may run once unconditionally.
nextBlock:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  nextBlock

	// Store v1..v4 back into *x.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Build the returned slice.
	// NOTE: the base pointer of the result must never point past the end of
	// the original slice; in Go 1.7+ that causes runtime crashes (see, for
	// example, https://github.com/golang/go/issues/16772). So the remaining
	// length is computed first, and the base is only moved forward when that
	// length is non-zero. The compiler does the same for code such as
	//   b = b[len(b):]

	// Remaining length = (BX + 32) - CX.
	LEAQ 32(BX), BX
	SUBQ CX, BX
	JEQ  storeLen

	MOVQ CX, ret_base+32(FP)

storeLen:
	MOVQ BX, ret_len+40(FP)
	MOVQ BX, ret_cap+48(FP) // cap is set equal to len

	RET
|