/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
*
* Copyright (C) 2016 - 2018 Linaro Ltd.
* Copyright (C) 2024 Google LLC
*
* Author: Ard Biesheuvel <ardb@kernel.org>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.cpu generic+crc+crypto
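/*
 * Added explanatory note: the helper macros below abstract away the
 * difference between the little-endian (bit-reflected) CRC-32 flavours
 * and the big-endian one. The AArch64 CRC32 instructions operate on
 * bit-reflected data, so the *be variants bit-reverse the loaded data
 * (and the CRC value itself, via bitbe) so that the same instructions
 * can, in effect, be reused to compute crc32_be.
 */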
.macro bitle, reg
.endm
.macro bitbe, reg
rbit \reg, \reg
.endm
.macro bytele, reg
.endm
.macro bytebe, reg
rbit \reg, \reg
lsr \reg, \reg, #24
.endm
.macro hwordle, reg
CPU_BE( rev16 \reg, \reg )
.endm
.macro hwordbe, reg
CPU_LE( rev \reg, \reg )
rbit \reg, \reg
CPU_BE( lsr \reg, \reg, #16 )
.endm
.macro le, regs:vararg
.irp r, \regs
CPU_BE( rev \r, \r )
.endr
.endm
.macro be, regs:vararg
.irp r, \regs
CPU_LE( rev \r, \r )
.endr
.irp r, \regs
rbit \r, \r
.endr
.endm
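/*
 * __crc32 - scalar CRC-32 using the ARMv8 CRC32 instructions
 *
 * w0: input CRC at entry, output CRC at exit
 * x1: pointer to input buffer
 * x2: length of input in bytes
 *
 * \c selects the CRC-32C instruction forms; \order selects the
 * little- or big-endian handling implemented by the macros above.
 */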
.macro __crc32, c, order=le
bit\order w0
cmp x2, #16
b.lt 8f // less than 16 bytes
and x7, x2, #0x1f
and x2, x2, #~0x1f
cbz x7, 32f // multiple of 32 bytes
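/*
 * Consume the leading (len % 32) bytes using two potentially
 * overlapping 16-byte loads: x3/x4 cover the first 16 bytes and are
 * folded in 8/4/2/1-byte steps selected by the low bits of the
 * remainder, while x5/x6 cover its final 16 bytes and are folded in
 * only when bit 4 of the remainder is set.
 */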
and x8, x7, #0xf
ldp x3, x4, [x1]
add x8, x8, x1
add x1, x1, x7
ldp x5, x6, [x8]
\order x3, x4, x5, x6
tst x7, #8
crc32\c\()x w8, w0, x3
csel x3, x3, x4, eq
csel w0, w0, w8, eq
tst x7, #4
lsr x4, x3, #32
crc32\c\()w w8, w0, w3
csel x3, x3, x4, eq
csel w0, w0, w8, eq
tst x7, #2
lsr w4, w3, #16
crc32\c\()h w8, w0, w3
csel w3, w3, w4, eq
csel w0, w0, w8, eq
tst x7, #1
crc32\c\()b w8, w0, w3
csel w0, w0, w8, eq
tst x7, #16
crc32\c\()x w8, w0, x5
crc32\c\()x w8, w8, x6
csel w0, w0, w8, eq
cbz x2, 0f
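/* Main loop: fold 32 bytes per iteration, 8 bytes per CRC32X step */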
32: ldp x3, x4, [x1], #32
sub x2, x2, #32
ldp x5, x6, [x1, #-16]
\order x3, x4, x5, x6
crc32\c\()x w0, w0, x3
crc32\c\()x w0, w0, x4
crc32\c\()x w0, w0, x5
crc32\c\()x w0, w0, x6
cbnz x2, 32b
0: bit\order w0
ret
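/* Inputs shorter than 16 bytes: 8/4/2/1-byte steps keyed off the length bits */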
8: tbz x2, #3, 4f
ldr x3, [x1], #8
\order x3
crc32\c\()x w0, w0, x3
4: tbz x2, #2, 2f
ldr w3, [x1], #4
\order w3
crc32\c\()w w0, w0, w3
2: tbz x2, #1, 1f
ldrh w3, [x1], #2
hword\order w3
crc32\c\()h w0, w0, w3
1: tbz x2, #0, 0f
ldrb w3, [x1]
byte\order w3
crc32\c\()b w0, w0, w3
0: bit\order w0
ret
.endm
.align 5
SYM_FUNC_START(crc32_le_arm64)
__crc32
SYM_FUNC_END(crc32_le_arm64)
.align 5
SYM_FUNC_START(crc32c_le_arm64)
__crc32 c
SYM_FUNC_END(crc32c_le_arm64)
.align 5
SYM_FUNC_START(crc32_be_arm64)
__crc32 order=be
SYM_FUNC_END(crc32_be_arm64)
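/*
 * The *_4way variants below split the input into 4 contiguous blocks
 * that are CRC-ed in parallel, and then merge the partial results
 * using carryless polynomial multiplication (PMULL) and per-block-count
 * folding coefficients taken from the tables in .rodata.
 */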
in .req x1
len .req x2
/*
* w0: input CRC at entry, output CRC at exit
* x1: pointer to input buffer
* x2: length of input in bytes
*/
.macro crc4way, insn, table, order=le
bit\order w0
lsr len, len, #6 // len := # of 64-byte blocks
/* Process up to 64 blocks of 64 bytes at a time */
.La\@: mov x3, #64
cmp len, #64
csel x3, x3, len, hi // x3 := min(len, 64)
sub len, len, x3
/* Divide the input into 4 contiguous blocks */
add x4, x3, x3, lsl #1 // x4 := 3 * x3
add x7, in, x3, lsl #4 // x7 := in + 16 * x3
add x8, in, x3, lsl #5 // x8 := in + 32 * x3
add x9, in, x4, lsl #4 // x9 := in + 16 * x4
/* Load the folding coefficients from the lookup table */
adr_l x5, \table - 12 // entry 0 omitted
add x5, x5, x4, lsl #2 // x5 += 12 * x3
ldp s0, s1, [x5]
ldr s2, [x5, #8]
/* Zero init partial CRCs for this iteration */
mov w4, wzr
mov w5, wzr
mov w6, wzr
mov x17, xzr
.Lb\@: sub x3, x3, #1
\insn w6, w6, x17
ldp x10, x11, [in], #16
ldp x12, x13, [x7], #16
ldp x14, x15, [x8], #16
ldp x16, x17, [x9], #16
\order x10, x11, x12, x13, x14, x15, x16, x17
/* Apply the CRC transform to 4 16-byte blocks in parallel */
\insn w0, w0, x10
\insn w4, w4, x12
\insn w5, w5, x14
\insn w6, w6, x16
\insn w0, w0, x11
\insn w4, w4, x13
\insn w5, w5, x15
cbnz x3, .Lb\@
/* Combine the 4 partial results into w0 */
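/*
 * Each of the first three partial CRCs is multiplied (carryless, via
 * PMULL) by a table coefficient that accounts for the data processed
 * by the blocks following it. The products are XORed into the final
 * 8 bytes of the fourth block (still pending in x17), so the last
 * CRC32 step both reduces them modulo the CRC polynomial and folds in
 * those remaining bytes.
 */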
mov v3.d[0], x0
mov v4.d[0], x4
mov v5.d[0], x5
pmull v0.1q, v0.1d, v3.1d
pmull v1.1q, v1.1d, v4.1d
pmull v2.1q, v2.1d, v5.1d
eor v0.8b, v0.8b, v1.8b
eor v0.8b, v0.8b, v2.8b
mov x5, v0.d[0]
eor x5, x5, x17
\insn w0, w6, x5
mov in, x9
cbnz len, .La\@
bit\order w0
ret
.endm
.align 5
SYM_FUNC_START(crc32c_le_arm64_4way)
crc4way crc32cx, .L0
SYM_FUNC_END(crc32c_le_arm64_4way)
.align 5
SYM_FUNC_START(crc32_le_arm64_4way)
crc4way crc32x, .L1
SYM_FUNC_END(crc32_le_arm64_4way)
.align 5
SYM_FUNC_START(crc32_be_arm64_4way)
crc4way crc32x, .L1, be
SYM_FUNC_END(crc32_be_arm64_4way)
.section .rodata, "a", %progbits
.align 6
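/*
 * Folding coefficients for the CRC-32C combine step: entry n (three
 * 32-bit words, entry 0 omitted) is used when each of the 4 parallel
 * blocks consists of n 16-byte chunks.
 */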
.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27
.long 0x0715ce53, 0x9e4addf8, 0xba4fc28e
.long 0xc96cfdc0, 0x0715ce53, 0xddc0152b