/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifdef USE_HW_SHA1

#ifndef __ARM_FEATURE_CRYPTO
#error "This file requires ARMv8 Crypto Extensions (e.g. compile with -march=armv8-a+crypto)."
#endif

#ifdef FREEBL_NO_DEPEND
#include "stubs.h"
#endif

#include <arm_neon.h>
#include <memory.h>
#include "blapi.h"
#include "sha_fast.h"
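
/*
 * H2X is chosen so that XH(n) in shaCompress() below resolves to ctx->H[n]
 * under either SHA1Context layout configured in sha_fast.h.
 */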
#if !defined(SHA_PUT_W_IN_STACK)
#define H2X 11
#else
#define H2X 0
#endif

static void shaCompress(SHA_HW_t *X, const PRUint32 *inbuf);
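
/*
 * SHA: Compress the 64-byte block currently buffered in ctx->u.w.
 */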
void
SHA1_Compress_Native(SHA1Context *ctx)
{
    shaCompress(&ctx->H[H2X], ctx->u.w);
}

/*
 * SHA: Add data to context.
 */
void
SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn, unsigned int len)
{
    unsigned int lenB;
    unsigned int togo;

    if (!len) {
        return;
    }

    /* lenB = number of bytes already buffered (mod 64); then accumulate
     * the running byte count. */
    lenB = (unsigned int)(ctx->size) & 63U;

    ctx->size += len;

    /*
     * Read the data into W and process blocks as they get full.
     */
    if (lenB > 0) {
        togo = 64U - lenB;
        if (len < togo) {
            togo = len;
        }
        memcpy(ctx->u.b + lenB, dataIn, togo);
        len -= togo;
        dataIn += togo;
        lenB = (lenB + togo) & 63U;
        if (!lenB) {
            shaCompress(&ctx->H[H2X], ctx->u.w);
        }
    }

    while (len >= 64U) {
        len -= 64U;
        shaCompress(&ctx->H[H2X], (const PRUint32 *)dataIn);
        dataIn += 64U;
    }

    if (len) {
        memcpy(ctx->u.b, dataIn, len);
    }
}
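
/*
 * For illustration only: callers reach the functions above through the
 * public SHA-1 interface declared in blapi.h. A minimal sketch of the
 * streaming flow (error handling omitted):
 *
 *   SHA1Context *ctx = SHA1_NewContext();
 *   unsigned char digest[SHA1_LENGTH];
 *   unsigned int digestLen;
 *
 *   SHA1_Begin(ctx);
 *   SHA1_Update(ctx, data, dataLen);
 *   SHA1_End(ctx, digest, &digestLen, sizeof(digest));
 *   SHA1_DestroyContext(ctx, PR_TRUE);
 */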

/*
 * SHA: Compression function, unrolled.
 */
static void
shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
{
#define XH(n) X[n - H2X]
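
    /* K0..K3 are the FIPS 180-4 round constants:
     * floor(2^30 * sqrt(n)) for n = 2, 3, 5, 10. */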
    const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
    const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
    const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
    const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);

    /* Working state: a, b, c, d packed in one vector, e kept separately. */
    uint32x4_t abcd = vld1q_u32(&XH(0));
    PRUint32 e = XH(4);

    /* Saved for the final feed-forward addition. */
    const uint32x4_t origABCD = abcd;
    const PRUint32 origE = e;

    /* Load the 64-byte message block as four vectors of four words. */
    uint32x4_t w0 = vld1q_u32(inbuf);
    uint32x4_t w1 = vld1q_u32(inbuf + 4);
    uint32x4_t w2 = vld1q_u32(inbuf + 8);
    uint32x4_t w3 = vld1q_u32(inbuf + 12);

    /* SHA-1 treats the message as big-endian words; byte-swap each one. */
    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));

    /* Pre-add the round constant to the first two schedule vectors. */
    uint32x4_t t0 = vaddq_u32(w0, K0);
    uint32x4_t t1 = vaddq_u32(w1, K0);

    PRUint32 tmpE;

    /*
     * The following ARM instructions accelerate SHA-1:
     *
     *   sha1c for rounds 0-19
     *   sha1p for rounds 20-39
     *   sha1m for rounds 40-59
     *   sha1p for rounds 60-79
     *   sha1su0 and sha1su1 for the message schedule
     *   sha1h for the rotate left by 30
     */
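
    /*
     * For reference (FIPS 180-4), the per-round functions those
     * instructions implement on the b, c, d lanes of the state are:
     *
     *   sha1c: Ch(b, c, d)  = (b & c) | (~b & d)
     *   sha1p: Par(b, c, d) = b ^ c ^ d
     *   sha1m: Maj(b, c, d) = (b & c) | (b & d) | (c & d)
     *
     * and sha1su0/sha1su1 together evaluate the message schedule
     *
     *   W[i] = rotl1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16])
     *
     * four words at a time.
     */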

    /* Round 0-3 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K0);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 4-7 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K0);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 8-11 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K0);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 12-15 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 16-19 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1cq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K1);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 20-23 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K1);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 24-27 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K1);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 28-31 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K1);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 32-35 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 36-39 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K2);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 40-43 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K2);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 44-47 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K2);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 48-51 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K2);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 52-55 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);
    w1 = vsha1su0q_u32(w1, w2, w3);

    /* Round 56-59 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1mq_u32(abcd, e, t0);
    t0 = vaddq_u32(w0, K3);
    w1 = vsha1su1q_u32(w1, w0);
    w2 = vsha1su0q_u32(w2, w3, w0);

    /* Round 60-63 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w1, K3);
    w2 = vsha1su1q_u32(w2, w1);
    w3 = vsha1su0q_u32(w3, w0, w1);

    /* Round 64-67 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);
    t0 = vaddq_u32(w2, K3);
    w3 = vsha1su1q_u32(w3, w2);
    w0 = vsha1su0q_u32(w0, w1, w2);

    /* Round 68-71 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);
    t1 = vaddq_u32(w3, K3);
    w0 = vsha1su1q_u32(w0, w3);

    /* Round 72-75 */
    tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, e, t0);

    /* Round 76-79 */
    e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
    abcd = vsha1pq_u32(abcd, tmpE, t1);

    /* Feed the result forward into the chaining state. */
    e += origE;
    abcd = vaddq_u32(origABCD, abcd);

    vst1q_u32(&XH(0), abcd);
    XH(4) = e;
}
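
/*
 * Selection of these _Native entry points happens outside this file; the
 * sketch below assumes the arm_sha1_support() CPU-feature probe (the exact
 * wiring lives in sha_fast.c, not here):
 *
 *   if (arm_sha1_support())
 *       SHA1_Update_Native(ctx, dataIn, len);
 *   else
 *       ...fall back to the portable C implementation...
 */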

#endif /* USE_HW_SHA1 */