#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# IALU(*)/gcc-4.4 NEON
#
# ARM11xx(ARMv6) 7.78/+100% -
# Cortex-A5 6.35/+130% 3.00
# Cortex-A8 6.25/+115% 2.36
# Cortex-A9 5.10/+95% 2.55
# Cortex-A15 3.85/+85% 1.25(**)
# Snapdragon S4 5.70/+100% 1.48(**)
#
# (*) this is for -march=armv6, i.e. with a bunch of ldrb loading data;
# (**) these are trade-off results, they can be improved by ~8% but at
# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
# to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
# Command line: [flavour] [other args ...] [output-file]
#
# An argument is taken as the output file when it ends in "name.ext".  If the
# very first argument already looks like a file name there is no flavour;
# otherwise the remaining arguments are consumed until one looks like a file
# name (or the list runs out, leaving $output false).
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # A real flavour was requested: everything printed to STDOUT is piped
    # through arm-xlate.pl, which is looked up next to this script first and
    # then under ../../perlasm/ relative to it.
    # Only trust $1 when the match on $0 succeeded; with no directory
    # component in $0 the lookup is relative to the current directory.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Fail loudly instead of silently generating nothing usable.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined($output) && $output ne "") {
    # No translation step: write the generated text straight to the file.
    # Three-argument open keeps characters in the name from being read as
    # an open mode.
    open STDOUT,">",$output
        or die "can't open $output: $!";
}
# With neither a flavour nor an output file, STDOUT is left as it is.
# Symbolic names for registers r0..r3, interpolated into the assembly text
# below as $ctx, $inp, $len and $padbit.
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

# Append the file preamble and the poly1305_init routine to $code.
#
# Preamble
#   - Outside __KERNEL__ builds "arm_arch.h" is included.  Inside them
#     __ARM_ARCH__ and __ARM_MAX_ARCH__ are both defined from
#     __LINUX_ARM_ARCH__, and poly1305_init / poly1305_blocks are renamed to
#     poly1305_block_init / poly1305_blocks_arm.
#   - Thumb-2 builds select unified syntax and .thumb, others ".code 32".
#   - poly1305_emit, poly1305_blocks and poly1305_init are declared .globl.
#
# poly1305_init, as written below
#   r0 ($ctx)  context pointer.  The words at offsets 0..16 (hash value) and
#              36 (is_base2_26) are zeroed, then r0 is advanced by 20, so all
#              later [$ctx,#n] accesses are relative to the original +20.
#   r1 ($inp)  key pointer.  If it is zero the routine returns 0 right after
#              the zeroing (branch to .Lno_key).
#   r2         only used when __ARM_MAX_ARCH__>=7 and __KERNEL__ is not
#              defined: two function addresses are stored there (see below).
#   r4-r11 are saved on entry and restored on exit.
#
#   Key handling: 16 bytes are read from $inp one byte at a time and combined
#   little-endian into four 32-bit words in r4..r7.  r4 is masked with
#   0x0fffffff, r5..r7 with 0x0ffffffc, and the results are stored at
#   [$ctx,#0..#12] (after the +20 adjustment).
#
#   __ARM_MAX_ARCH__>=7: -1 is stored at [$ctx,#28] ("impossible key power
#   value").  If additionally __KERNEL__ is not defined, the capability word
#   reached through .LOPENSSL_armcap (label not defined in this part of the
#   file) is tested for ARMV7_NEON; r11 receives the address of
#   .Lpoly1305_blocks_neon when the bit is set and of .Lpoly1305_blocks
#   otherwise, r12 the address of .Lpoly1305_emit (bit 0 is set in both for
#   Thumb-2), both are stored at [r2] and the routine returns 1.  In every
#   other configuration it returns 0.
#
#   Return sequence: "ret" (annotated as bx lr) for __ARM_ARCH__>=5; older
#   targets use "moveq pc,lr" when bit 0 of lr is clear and "bx lr" otherwise.
#
# NOTE(review): the byte loads are interleaved with the ORs and with the
# capability check, presumably for instruction scheduling; keep the order
# when editing.
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define poly1305_init poly1305_block_init
# define poly1305_blocks poly1305_blocks_arm
#endif
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.text
.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
.Lpoly1305_init:
stmdb sp!,{r4-r11}
eor r3,r3,r3
cmp $inp,#0
str r3,[$ctx,#0] @ zero hash value
str r3,[$ctx,#4]
str r3,[$ctx,#8]
str r3,[$ctx,#12]
str r3,[$ctx,#16]
str r3,[$ctx,#36] @ clear is_base2_26
add $ctx,$ctx,#20
#ifdef __thumb2__
it eq
#endif
moveq r0,#0
beq .Lno_key
#if __ARM_MAX_ARCH__>=7
mov r3,#-1
str r3,[$ctx,#28] @ impossible key power value
# ifndef __KERNEL__
adr r11,.Lpoly1305_init
ldr r12,.LOPENSSL_armcap
# endif
#endif
ldrb r4,[$inp,#0]
mov r10,#0x0fffffff
ldrb r5,[$inp,#1]
and r3,r10,#-4 @ 0x0ffffffc
ldrb r6,[$inp,#2]
ldrb r7,[$inp,#3]
orr r4,r4,r5,lsl#8
ldrb r5,[$inp,#4]
orr r4,r4,r6,lsl#16
ldrb r6,[$inp,#5]
orr r4,r4,r7,lsl#24
ldrb r7,[$inp,#6]
and r4,r4,r10
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if !defined(_WIN32)
ldr r12,[r11,r12] @ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
#endif
ldrb r8,[$inp,#7]
orr r5,r5,r6,lsl#8
ldrb r6,[$inp,#8]
orr r5,r5,r7,lsl#16
ldrb r7,[$inp,#9]
orr r5,r5,r8,lsl#24
ldrb r8,[$inp,#10]
and r5,r5,r3
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
tst r12,#ARMV7_NEON @ check for NEON
# ifdef __thumb2__
adr r9,.Lpoly1305_blocks_neon
adr r11,.Lpoly1305_blocks
it ne
movne r11,r9
adr r12,.Lpoly1305_emit
orr r11,r11,#1 @ thumb-ify addresses
orr r12,r12,#1
# else
add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
ite eq
addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
ldrb r9,[$inp,#11]
orr r6,r6,r7,lsl#8
ldrb r7,[$inp,#12]
orr r6,r6,r8,lsl#16
ldrb r8,[$inp,#13]
orr r6,r6,r9,lsl#24
ldrb r9,[$inp,#14]
and r6,r6,r3
ldrb r10,[$inp,#15]
orr r7,r7,r8,lsl#8
str r4,[$ctx,#0]
orr r7,r7,r9,lsl#16
str r5,[$ctx,#4]
orr r7,r7,r10,lsl#24
str r6,[$ctx,#8]
and r7,r7,r3
str r7,[$ctx,#12]
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
stmia r2,{r11,r12} @ fill functions table
mov r0,#1
#else
mov r0,#0
#endif
.Lno_key:
ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);
$code.=<<___;
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
stmdb sp!,{r3-r11,lr}
ands $len,$len,#-16
beq .Lno_data
add $len,$len,$inp @ end pointer
sub sp,sp,#32
#if __ARM_ARCH__<7
ldmia $ctx,{$h0-$r3} @ load context
add $ctx,$ctx,#20
str $len,[sp,#16] @ offload stuff
str $ctx,[sp,#12]
#else
ldr lr,[$ctx,#36] @ is_base2_26
ldmia $ctx!,{$h0-$h4} @ load hash value
str $len,[sp,#16] @ offload stuff
str $ctx,[sp,#12]
adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
mov $r1,$h1,lsr#6
adcs $r1,$r1,$h2,lsl#20
mov $r2,$h2,lsr#12
adcs $r2,$r2,$h3,lsl#14
mov $r3,$h3,lsr#18