  1. /*
  2. * This source file is licensed under the Apache License 2.0 *and* the MIT
  3. * License. Please agree to *both* of the licensing terms!
  4. *
  5. *
  6. * `transformH` function is a derivative work of OpenSSL. The original work
  7. * is covered by the following license:
  8. *
  9. * Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  10. *
  11. * Licensed under the Apache License 2.0 (the "License"). You may not use
  12. * this file except in compliance with the License. You can obtain a copy
  13. * in the file LICENSE in the source distribution or at
  14. * https://www.openssl.org/source/license.html
  15. *
  16. *
  17. * All other work, including modifications to the `transformH` function is
  18. * covered by the following MIT license:
  19. *
  20. * Copyright (c) 2020-2022 Fastly, Kazuho Oku
  21. *
  22. * Permission is hereby granted, free of charge, to any person obtaining a copy
  23. * of this software and associated documentation files (the "Software"), to
  24. * deal in the Software without restriction, including without limitation the
  25. * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  26. * sell copies of the Software, and to permit persons to whom the Software is
  27. * furnished to do so, subject to the following conditions:
  28. *
  29. * The above copyright notice and this permission notice shall be included in
  30. * all copies or substantial portions of the Software.
  31. *
  32. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  33. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  34. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  35. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  36. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  37. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  38. * IN THE SOFTWARE.
  39. */
  40. #include <stdint.h>
  41. #include <stdlib.h>
  42. #include <string.h>
  43. #include <immintrin.h>
  44. #include <tmmintrin.h>
  45. #include <nmmintrin.h>
  46. #include <wmmintrin.h>
  47. #include "picotls.h"
  48. #include "picotls/fusion.h"
  49. #if defined(__clang__)
  50. #if __has_feature(address_sanitizer)
  51. #define NO_SANITIZE_ADDRESS __attribute__((no_sanitize("address")))
  52. #endif
  53. #elif __SANITIZE_ADDRESS__ /* gcc */
  54. #define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
  55. #endif
  56. #ifndef NO_SANITIZE_ADDRESS
  57. #define NO_SANITIZE_ADDRESS
  58. #endif
  59. #ifdef _WINDOWS
  60. #define aligned_alloc(a, s) _aligned_malloc((s), (a))
  61. #define aligned_free(p) _aligned_free(p)
  62. #else
  63. #define aligned_free(p) free(p)
  64. #endif
  65. struct ptls_fusion_aesgcm_context {
  66. ptls_fusion_aesecb_context_t ecb;
  67. size_t capacity;
  68. size_t ghash_cnt;
  69. };
  70. struct ptls_fusion_aesgcm_context128 {
  71. struct ptls_fusion_aesgcm_context super;
  72. struct ptls_fusion_aesgcm_ghash_precompute128 {
  73. __m128i H;
  74. __m128i r;
  75. } ghash[0];
  76. };
  77. struct ptls_fusion_aesgcm_context256 {
  78. struct ptls_fusion_aesgcm_context super;
  79. union ptls_fusion_aesgcm_ghash_precompute256 {
  80. struct {
  81. __m128i H[2];
  82. __m128i r[2];
  83. };
  84. struct {
  85. __m256i Hx2;
  86. __m256i rx2;
  87. };
  88. } ghash[0];
  89. };
  90. struct ctr_context {
  91. ptls_cipher_context_t super;
  92. ptls_fusion_aesecb_context_t fusion;
  93. __m128i bits;
  94. uint8_t is_ready;
  95. };
  96. struct aesgcm_context {
  97. ptls_aead_context_t super;
  98. ptls_fusion_aesgcm_context_t *aesgcm;
  99. /**
  100. * retains the static IV in the upper 96 bits (in little endian)
  101. */
  102. __m128i static_iv;
  103. };
  104. static const uint64_t poly_[2] __attribute__((aligned(16))) = {1, 0xc200000000000000};
  105. #define poly (*(__m128i *)poly_)
  106. static const uint8_t byteswap_[32] __attribute__((aligned(32))) = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  107. 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  108. #define byteswap128 (*(__m128i *)byteswap_)
  109. #define byteswap256 (*(__m256i *)byteswap_)
  110. static const uint8_t one_[16] __attribute__((aligned(16))) = {1};
  111. #define one8 (*(__m128i *)one_)
  112. static const uint8_t incr128x2_[32] __attribute__((aligned(32))) = {2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2};
  113. #define incr128x2 (*(__m256i *)incr128x2_)
  114. /* This function is covered by the Apache License and the MIT License. The origin is crypto/modes/asm/ghash-x86_64.pl of openssl
  115. * at commit 33388b4. */
  116. static __m128i transformH(__m128i H)
  117. {
  118. // # <<1 twist
  119. // pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  120. __m128i t2 = _mm_shuffle_epi32(H, 0xff);
  121. // movdqa $Hkey,$T1
  122. __m128i t1 = H;
  123. // psllq \$1,$Hkey
  124. H = _mm_slli_epi64(H, 1);
  125. // pxor $T3,$T3 #
  126. __m128i t3 = _mm_setzero_si128();
  127. // psrlq \$63,$T1
  128. t1 = _mm_srli_epi64(t1, 63);
  129. // pcmpgtd $T2,$T3 # broadcast carry bit
  130. t3 = _mm_cmplt_epi32(t2, t3);
  131. // pslldq \$8,$T1
  132. t1 = _mm_slli_si128(t1, 8);
  133. // por $T1,$Hkey # H<<=1
  134. H = _mm_or_si128(t1, H);
  135. // # magic reduction
  136. // pand .L0x1c2_polynomial(%rip),$T3
  137. t3 = _mm_and_si128(t3, poly);
  138. // pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
  139. H = _mm_xor_si128(t3, H);
  140. return H;
  141. }
  142. // end of Apache License code
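/* Note on gfmul (below): it is a Karatsuba-style multiply in GF(2^128). Three PCLMULQDQ invocations produce
 * lo = x_lo*y_lo, hi = x_hi*y_hi and the middle term (x_lo^x_hi)*(y_lo^y_hi); after XORing lo and hi out of the
 * middle term, the 256-bit product is folded back to 128 bits with the two-step reduction from the Gueron slides
 * cited inside the function. Roughly, per 128-bit block:
 *
 *   product = karatsuba_clmul(x, y);     // 256-bit carry-less product
 *   result  = reduce_mod_poly(product);  // fold against the (reflected) GHASH polynomial x^128 + x^7 + x^2 + x + 1
 *
 * (pseudocode for orientation only; the real code keeps everything in __m128i registers). */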
  143. static __m128i gfmul(__m128i x, __m128i y)
  144. {
  145. __m128i lo = _mm_clmulepi64_si128(x, y, 0x00);
  146. __m128i hi = _mm_clmulepi64_si128(x, y, 0x11);
  147. __m128i a = _mm_shuffle_epi32(x, 78);
  148. __m128i b = _mm_shuffle_epi32(y, 78);
  149. a = _mm_xor_si128(a, x);
  150. b = _mm_xor_si128(b, y);
  151. a = _mm_clmulepi64_si128(a, b, 0x00);
  152. a = _mm_xor_si128(a, lo);
  153. a = _mm_xor_si128(a, hi);
  154. b = _mm_slli_si128(a, 8);
  155. a = _mm_srli_si128(a, 8);
  156. lo = _mm_xor_si128(lo, b);
  157. hi = _mm_xor_si128(hi, a);
  158. // from https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
  159. __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10);
  160. lo = _mm_shuffle_epi32(lo, 78);
  161. lo = _mm_xor_si128(lo, t);
  162. t = _mm_clmulepi64_si128(lo, poly, 0x10);
  163. lo = _mm_shuffle_epi32(lo, 78);
  164. lo = _mm_xor_si128(lo, t);
  165. return _mm_xor_si128(hi, lo);
  166. }
  167. static inline __m128i gfmul_do_reduce(__m128i hi, __m128i lo, __m128i mid)
  168. {
  169. mid = _mm_xor_si128(mid, hi);
  170. mid = _mm_xor_si128(mid, lo);
  171. lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
  172. hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
  173. /* fast reduction, using https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf */
  174. __m128i r = _mm_clmulepi64_si128(lo, poly, 0x10);
  175. lo = _mm_shuffle_epi32(lo, 78);
  176. lo = _mm_xor_si128(lo, r);
  177. r = _mm_clmulepi64_si128(lo, poly, 0x10);
  178. lo = _mm_shuffle_epi32(lo, 78);
  179. lo = _mm_xor_si128(lo, r);
  180. lo = _mm_xor_si128(hi, lo);
  181. return lo;
  182. }
  183. struct ptls_fusion_gfmul_state128 {
  184. __m128i hi, lo, mid;
  185. };
  186. #if defined(__GNUC__) && !defined(__clang__)
  187. static inline __m128i xor128(__m128i x, __m128i y)
  188. {
  189. __m128i ret;
  190. __asm__("vpxor %2, %1, %0" : "=x"(ret) : "x"(x), "xm"(y));
  191. return ret;
  192. }
  193. #else
  194. #define xor128 _mm_xor_si128
  195. #endif
  196. static inline void gfmul_do_step128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X,
  197. struct ptls_fusion_aesgcm_ghash_precompute128 *precompute)
  198. {
  199. __m128i t1 = _mm_clmulepi64_si128(precompute->H, X, 0x00);
  200. __m128i t2 = _mm_clmulepi64_si128(precompute->H, X, 0x11);
  201. __m128i t3 = _mm_shuffle_epi32(X, 78);
  202. t3 = _mm_xor_si128(t3, X);
  203. t3 = _mm_clmulepi64_si128(precompute->r, t3, 0x00);
  204. gstate->lo = xor128(gstate->lo, t1);
  205. gstate->hi = xor128(gstate->hi, t2);
  206. gstate->mid = xor128(gstate->mid, t3);
  207. }
  208. #undef xor128
  209. static inline void gfmul_firststep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X,
  210. struct ptls_fusion_aesgcm_ghash_precompute128 *precompute)
  211. {
  212. X = _mm_shuffle_epi8(X, byteswap128);
  213. X = _mm_xor_si128(gstate->lo, X);
  214. gstate->lo = _mm_setzero_si128();
  215. gstate->hi = _mm_setzero_si128();
  216. gstate->mid = _mm_setzero_si128();
  217. gfmul_do_step128(gstate, X, precompute);
  218. }
  219. static inline void gfmul_nextstep128(struct ptls_fusion_gfmul_state128 *gstate, __m128i X,
  220. struct ptls_fusion_aesgcm_ghash_precompute128 *precompute)
  221. {
  222. X = _mm_shuffle_epi8(X, byteswap128);
  223. gfmul_do_step128(gstate, X, precompute);
  224. }
  225. static inline void gfmul_reduce128(struct ptls_fusion_gfmul_state128 *gstate)
  226. {
  227. gstate->lo = gfmul_do_reduce(gstate->hi, gstate->lo, gstate->mid);
  228. }
  229. static inline __m128i gfmul_get_tag128(struct ptls_fusion_gfmul_state128 *gstate, __m128i ek0)
  230. {
  231. __m128i tag = _mm_shuffle_epi8(gstate->lo, byteswap128);
  232. tag = _mm_xor_si128(tag, ek0);
  233. return tag;
  234. }
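/* The helpers above implement GHASH with the reduction deferred: gfmul_firststep128 folds the running digest into the
 * first block, each gfmul_nextstep128 accumulates one block multiplied by a precomputed power of H (the callers walk
 * the precompute table downward so that earlier blocks are paired with higher powers), and gfmul_reduce128 performs
 * the modular reduction once per batch. A sketch of the calling convention, assuming `n` blocks and a table holding
 * H^1..H^n:
 *
 *   gfmul_firststep128(&st, block[0], &ghash[n - 1]);
 *   for (size_t i = 1; i < n; ++i)
 *       gfmul_nextstep128(&st, block[i], &ghash[n - 1 - i]);
 *   gfmul_reduce128(&st);
 *   tag = gfmul_get_tag128(&st, ek0);
 *
 * (illustrative only; the encrypt/decrypt loops below interleave these calls with the AES rounds.) */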
  235. struct ptls_fusion_gfmul_state256 {
  236. __m256i hi, lo, mid;
  237. };
  238. static inline void gfmul_do_step256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X,
  239. union ptls_fusion_aesgcm_ghash_precompute256 *precompute)
  240. {
  241. __m256i t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x00);
  242. gstate->lo = _mm256_xor_si256(gstate->lo, t);
  243. t = _mm256_clmulepi64_epi128(precompute->Hx2, X, 0x11);
  244. gstate->hi = _mm256_xor_si256(gstate->hi, t);
  245. t = _mm256_shuffle_epi32(X, 78);
  246. t = _mm256_xor_si256(t, X);
  247. t = _mm256_clmulepi64_epi128(precompute->rx2, t, 0x00);
  248. gstate->mid = _mm256_xor_si256(gstate->mid, t);
  249. }
  250. static inline void gfmul_firststep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X, int half,
  251. union ptls_fusion_aesgcm_ghash_precompute256 *precompute)
  252. {
  253. X = _mm256_shuffle_epi8(X, byteswap256);
  254. X = _mm256_xor_si256(gstate->lo, X);
  255. if (half)
  256. X = _mm256_permute2f128_si256(X, X, 0x08);
  257. gstate->lo = _mm256_setzero_si256();
  258. gstate->hi = _mm256_setzero_si256();
  259. gstate->mid = _mm256_setzero_si256();
  260. gfmul_do_step256(gstate, X, precompute);
  261. }
  262. static inline void gfmul_nextstep256(struct ptls_fusion_gfmul_state256 *gstate, __m256i X,
  263. union ptls_fusion_aesgcm_ghash_precompute256 *precompute)
  264. {
  265. X = _mm256_shuffle_epi8(X, byteswap256);
  266. gfmul_do_step256(gstate, X, precompute);
  267. }
  268. static inline void gfmul_reduce256(struct ptls_fusion_gfmul_state256 *gstate)
  269. {
  270. #define XOR_256TO128(y) _mm_xor_si128(_mm256_castsi256_si128(y), _mm256_extractf128_si256((y), 1))
  271. __m128i hi = XOR_256TO128(gstate->hi);
  272. __m128i lo = XOR_256TO128(gstate->lo);
  273. __m128i mid = XOR_256TO128(gstate->mid);
  274. #undef XOR_256TO128
  275. lo = gfmul_do_reduce(hi, lo, mid);
  276. gstate->lo = _mm256_castsi128_si256(lo);
  277. }
  278. static inline __m128i gfmul_get_tag256(struct ptls_fusion_gfmul_state256 *gstate, __m128i ek0)
  279. {
  280. __m128i tag = _mm_shuffle_epi8(_mm256_castsi256_si128(gstate->lo), byteswap128);
  281. tag = _mm_xor_si128(tag, ek0);
  282. return tag;
  283. }
  284. static inline __m128i aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, __m128i v)
  285. {
  286. #define ROUNDKEY(i) (ctx->aesni256 ? _mm256_castsi256_si128(ctx->keys.m256[i]) : ctx->keys.m128[i])
  287. v = _mm_xor_si128(v, ROUNDKEY(0));
  288. for (size_t i = 1; i < ctx->rounds; ++i)
  289. v = _mm_aesenc_si128(v, ROUNDKEY(i));
  290. v = _mm_aesenclast_si128(v, ROUNDKEY(ctx->rounds));
  291. return v;
  292. #undef ROUNDKEY
  293. }
  294. // 32 bytes of 0xff followed by 31 bytes of 0x00
  295. static const uint8_t loadn_mask[63] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  296. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  297. 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
  298. static const uint8_t loadn_shuffle[31] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  299. 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // first 16 bytes map to byte offsets
  300. 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
  301. 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // latter 15 bytes map to zero
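/* Roughly speaking, loadn_end_of_page() reads the 16 aligned bytes that contain `p` (so it never touches the next page)
 * and then uses PSHUFB with the table above to move the wanted bytes to the front; shuffle indices >= 0x80 produce zeros.
 * loadn128()/loadn256() take this slow path only when a plain unaligned load starting at `p` could spill into the
 * following 4 KB page and the requested length does not itself reach into that page. The result is then masked with
 * `loadn_mask` so that exactly `l` bytes survive. */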
  302. NO_SANITIZE_ADDRESS
  303. static inline __m128i loadn_end_of_page(const void *p, size_t l)
  304. {
  305. uintptr_t shift = (uintptr_t)p & 15;
  306. __m128i pattern = _mm_loadu_si128((const __m128i *)(loadn_shuffle + shift));
  307. return _mm_shuffle_epi8(_mm_load_si128((const __m128i *)((uintptr_t)p - shift)), pattern);
  308. }
  309. NO_SANITIZE_ADDRESS
  310. static inline __m128i loadn128(const void *p, size_t l)
  311. {
  312. __m128i v, mask = _mm_loadu_si128((__m128i *)(loadn_mask + 32 - l));
  313. uintptr_t mod4k = (uintptr_t)p % 4096;
  314. if (PTLS_LIKELY(mod4k <= 4096 - 16) || mod4k + l > 4096) {
  315. v = _mm_loadu_si128(p);
  316. } else {
  317. v = loadn_end_of_page(p, l);
  318. }
  319. v = _mm_and_si128(v, mask);
  320. return v;
  321. }
  322. NO_SANITIZE_ADDRESS
  323. static inline __m256i loadn256(const void *p, size_t l)
  324. {
  325. __m256i v, mask = _mm256_loadu_si256((__m256i *)(loadn_mask + 32 - l));
  326. uintptr_t mod4k = (uintptr_t)p % 4096;
  327. if (PTLS_LIKELY(mod4k < 4096 - 32) || mod4k + l > 4096) {
  328. v = _mm256_loadu_si256(p);
  329. } else if (l > 16) {
  330. __m128i first16 = _mm_loadu_si128(p), second16 = loadn128((uint8_t *)p + 16, l - 16);
  331. v = _mm256_permute2f128_si256(_mm256_castsi128_si256(first16), _mm256_castsi128_si256(second16), 0x20);
  332. } else if (l == 16) {
  333. v = _mm256_castsi128_si256(_mm_loadu_si128(p));
  334. } else {
  335. v = _mm256_castsi128_si256(loadn_end_of_page(p, l));
  336. }
  337. v = _mm256_and_si256(v, mask);
  338. return v;
  339. }
  340. static inline void storen128(void *_p, size_t l, __m128i v)
  341. {
  342. uint8_t buf[16], *p = _p;
  343. *(__m128i *)buf = v;
  344. for (size_t i = 0; i != l; ++i)
  345. p[i] = buf[i];
  346. }
  347. void ptls_fusion_aesgcm_encrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr,
  348. const void *_aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp)
  349. {
  350. /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
  351. #define AESECB6_INIT() \
  352. do { \
  353. ctr = _mm_add_epi64(ctr, one8); \
  354. bits0 = _mm_shuffle_epi8(ctr, byteswap128); \
  355. ctr = _mm_add_epi64(ctr, one8); \
  356. bits1 = _mm_shuffle_epi8(ctr, byteswap128); \
  357. ctr = _mm_add_epi64(ctr, one8); \
  358. bits2 = _mm_shuffle_epi8(ctr, byteswap128); \
  359. ctr = _mm_add_epi64(ctr, one8); \
  360. bits3 = _mm_shuffle_epi8(ctr, byteswap128); \
  361. ctr = _mm_add_epi64(ctr, one8); \
  362. bits4 = _mm_shuffle_epi8(ctr, byteswap128); \
  363. if (PTLS_LIKELY(srclen > 16 * 5)) { \
  364. ctr = _mm_add_epi64(ctr, one8); \
  365. bits5 = _mm_shuffle_epi8(ctr, byteswap128); \
  366. } else { \
  367. if ((state & STATE_EK0_BEEN_FED) == 0) { \
  368. bits5 = ek0; \
  369. state |= STATE_EK0_BEEN_FED; \
  370. } \
  371. if ((state & STATE_SUPP_USED) != 0 && srclen <= 16 * 4 && (const __m128i *)supp->input + 1 <= dst_ghash) { \
  372. bits4 = _mm_loadu_si128(supp->input); \
  373. bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128; \
  374. state |= STATE_SUPP_IN_PROCESS; \
  375. } \
  376. } \
  377. __m128i k = ctx->super.ecb.keys.m128[0]; \
  378. bits0 = _mm_xor_si128(bits0, k); \
  379. bits1 = _mm_xor_si128(bits1, k); \
  380. bits2 = _mm_xor_si128(bits2, k); \
  381. bits3 = _mm_xor_si128(bits3, k); \
  382. bits4 = _mm_xor_si128(bits4, bits4keys[0]); \
  383. bits5 = _mm_xor_si128(bits5, k); \
  384. } while (0)
  385. /* aes block update */
  386. #define AESECB6_UPDATE(i) \
  387. do { \
  388. __m128i k = ctx->super.ecb.keys.m128[i]; \
  389. bits0 = _mm_aesenc_si128(bits0, k); \
  390. bits1 = _mm_aesenc_si128(bits1, k); \
  391. bits2 = _mm_aesenc_si128(bits2, k); \
  392. bits3 = _mm_aesenc_si128(bits3, k); \
  393. bits4 = _mm_aesenc_si128(bits4, bits4keys[i]); \
  394. bits5 = _mm_aesenc_si128(bits5, k); \
  395. } while (0)
  396. /* aesenclast */
  397. #define AESECB6_FINAL(i) \
  398. do { \
  399. __m128i k = ctx->super.ecb.keys.m128[i]; \
  400. bits0 = _mm_aesenclast_si128(bits0, k); \
  401. bits1 = _mm_aesenclast_si128(bits1, k); \
  402. bits2 = _mm_aesenclast_si128(bits2, k); \
  403. bits3 = _mm_aesenclast_si128(bits3, k); \
  404. bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]); \
  405. bits5 = _mm_aesenclast_si128(bits5, k); \
  406. } while (0)
  407. struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx;
  408. __m128i ek0, bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128();
  409. const __m128i *bits4keys = ctx->super.ecb.keys.m128; /* switched to the supplementary context's keys when calculating the supplementary output */
  410. struct ptls_fusion_gfmul_state128 gstate = {0};
  411. __m128i gdatabuf[6];
  412. __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128);
  413. // src and dst are updated after the chunk is processed
  414. const __m128i *src = input;
  415. __m128i *dst = output;
  416. size_t srclen = inlen;
  417. // aad and src_ghash are updated before the chunk is processed (i.e., when the pointers are fed into the processor)
  418. const __m128i *aad = _aad, *dst_ghash = dst;
  419. size_t dst_ghashlen = srclen;
  420. struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (srclen + 15) / 16 + 1;
  421. #define STATE_EK0_BEEN_FED 0x3
  422. #define STATE_EK0_INCOMPLETE 0x2
  423. #define STATE_EK0_READY() ((state & STATE_EK0_BEEN_FED) == 0x1)
  424. #define STATE_SUPP_USED 0x4
  425. #define STATE_SUPP_IN_PROCESS 0x8
  426. int32_t state = supp != NULL ? STATE_SUPP_USED : 0;
  427. /* build counter */
  428. ctr = _mm_insert_epi32(ctr, 1, 0);
  429. ek0 = _mm_shuffle_epi8(ctr, byteswap128);
  430. /* start preparing AES */
  431. AESECB6_INIT();
  432. AESECB6_UPDATE(1);
  433. /* build first ghash data (only AAD can be fed at this point, as this would be calculated alongside the first AES block) */
  434. const __m128i *gdata = gdatabuf; // points to the elements fed into GHASH
  435. size_t gdata_cnt = 0;
  436. if (PTLS_LIKELY(aadlen != 0)) {
  437. while (gdata_cnt < 6) {
  438. if (PTLS_LIKELY(aadlen < 16)) {
  439. if (aadlen != 0) {
  440. gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
  441. aadlen = 0;
  442. }
  443. goto MainLoop;
  444. }
  445. gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
  446. aadlen -= 16;
  447. }
  448. }
  449. /* the main loop */
  450. MainLoop:
  451. while (1) {
  452. /* run AES and multiplication in parallel */
  453. size_t i;
  454. for (i = 2; i < gdata_cnt + 2; ++i) {
  455. AESECB6_UPDATE(i);
  456. gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
  457. }
  458. for (; i < ctx->super.ecb.rounds; ++i)
  459. AESECB6_UPDATE(i);
  460. AESECB6_FINAL(i);
  461. /* apply the bit stream to src and write to dest */
  462. if (PTLS_LIKELY(srclen >= 6 * 16)) {
  463. #define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src + i), bits##i))
  464. APPLY(0);
  465. APPLY(1);
  466. APPLY(2);
  467. APPLY(3);
  468. APPLY(4);
  469. APPLY(5);
  470. #undef APPLY
  471. dst += 6;
  472. src += 6;
  473. srclen -= 6 * 16;
  474. } else {
  475. if ((state & STATE_EK0_BEEN_FED) == STATE_EK0_BEEN_FED) {
  476. ek0 = bits5;
  477. state &= ~STATE_EK0_INCOMPLETE;
  478. }
  479. if ((state & STATE_SUPP_IN_PROCESS) != 0) {
  480. _mm_storeu_si128((__m128i *)supp->output, bits4);
  481. state &= ~(STATE_SUPP_USED | STATE_SUPP_IN_PROCESS);
  482. }
  483. if (srclen != 0) {
  484. #define APPLY(i) \
  485. do { \
  486. if (PTLS_LIKELY(srclen >= 16)) { \
  487. _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src++), bits##i)); \
  488. srclen -= 16; \
  489. } else if (PTLS_LIKELY(srclen != 0)) { \
  490. bits0 = bits##i; \
  491. goto ApplyRemainder; \
  492. } else { \
  493. goto ApplyEnd; \
  494. } \
  495. } while (0)
  496. APPLY(0);
  497. APPLY(1);
  498. APPLY(2);
  499. APPLY(3);
  500. APPLY(4);
  501. APPLY(5);
  502. #undef APPLY
  503. goto ApplyEnd;
  504. ApplyRemainder:
  505. storen128(dst, srclen, _mm_xor_si128(loadn128(src, srclen), bits0));
  506. dst = (__m128i *)((uint8_t *)dst + srclen);
  507. srclen = 0;
  508. ApplyEnd:;
  509. }
  510. }
  511. /* next block AES starts here */
  512. AESECB6_INIT();
  513. AESECB6_UPDATE(1);
  514. /* setup gdata */
  515. if (PTLS_UNLIKELY(aadlen != 0)) {
  516. gdata_cnt = 0;
  517. while (gdata_cnt < 6) {
  518. if (aadlen < 16) {
  519. if (aadlen != 0) {
  520. gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
  521. aadlen = 0;
  522. }
  523. goto GdataFillDST;
  524. }
  525. gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
  526. aadlen -= 16;
  527. }
  528. gdata = gdatabuf;
  529. } else if (PTLS_LIKELY(dst_ghashlen >= 6 * 16)) {
  530. gdata = dst_ghash;
  531. gdata_cnt = 6;
  532. dst_ghash += 6;
  533. dst_ghashlen -= 96;
  534. } else {
  535. gdata_cnt = 0;
  536. GdataFillDST:
  537. while (gdata_cnt < 6) {
  538. if (dst_ghashlen < 16) {
  539. if (dst_ghashlen != 0) {
  540. gdatabuf[gdata_cnt++] = loadn128(dst_ghash, dst_ghashlen);
  541. dst_ghashlen = 0;
  542. }
  543. if (gdata_cnt < 6)
  544. goto Finish;
  545. break;
  546. }
  547. gdatabuf[gdata_cnt++] = _mm_loadu_si128(dst_ghash++);
  548. dst_ghashlen -= 16;
  549. }
  550. gdata = gdatabuf;
  551. }
  552. }
  553. Finish:
  554. gdatabuf[gdata_cnt++] = ac;
  555. /* We have a complete set of data to be fed into GHASH. Let's finish the remaining calculation.
  556. * Note that by now, all AES operations for payload encryption and ek0 are complete. This is because it is necessary for GCM
  557. * to process at least the same amount of data (i.e. payload-blocks + AC), and because AES is at least one 96-byte block ahead.
  558. */
  559. assert(STATE_EK0_READY());
  560. for (size_t i = 0; i < gdata_cnt; ++i)
  561. gfmul_nextstep128(&gstate, gdatabuf[i], --ghash_precompute);
  562. gfmul_reduce128(&gstate);
  563. _mm_storeu_si128(dst, gfmul_get_tag128(&gstate, ek0));
  564. /* Finish the calculation of the supplemental vector. Done at the very end, because the sample might cover the GCM tag. */
  565. if ((state & STATE_SUPP_USED) != 0) {
  566. size_t i;
  567. if ((state & STATE_SUPP_IN_PROCESS) == 0) {
  568. bits4keys = ((struct ctr_context *)supp->ctx)->fusion.keys.m128;
  569. bits4 = _mm_xor_si128(_mm_loadu_si128(supp->input), bits4keys[0]);
  570. i = 1;
  571. } else {
  572. i = 2;
  573. }
  574. do {
  575. bits4 = _mm_aesenc_si128(bits4, bits4keys[i++]);
  576. } while (i != ctx->super.ecb.rounds);
  577. bits4 = _mm_aesenclast_si128(bits4, bits4keys[i]);
  578. _mm_storeu_si128((__m128i *)supp->output, bits4);
  579. }
  580. #undef AESECB6_INIT
  581. #undef AESECB6_UPDATE
  582. #undef AESECB6_FINAL
  583. #undef STATE_EK0_BEEN_FED
  584. #undef STATE_EK0_READY
  585. #undef STATE_SUPP_IN_PROCESS
  586. }
  587. int ptls_fusion_aesgcm_decrypt(ptls_fusion_aesgcm_context_t *_ctx, void *output, const void *input, size_t inlen, __m128i ctr,
  588. const void *_aad, size_t aadlen, const void *tag)
  589. {
  590. struct ptls_fusion_aesgcm_context128 *ctx = (void *)_ctx;
  591. __m128i ek0 = _mm_setzero_si128(), bits0, bits1 = _mm_setzero_si128(), bits2 = _mm_setzero_si128(), bits3 = _mm_setzero_si128(),
  592. bits4 = _mm_setzero_si128(), bits5 = _mm_setzero_si128();
  593. struct ptls_fusion_gfmul_state128 gstate = {0};
  594. __m128i gdatabuf[6];
  595. __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128);
  596. struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (aadlen + 15) / 16 + (inlen + 15) / 16 + 1;
  597. const __m128i *gdata; // points to the elements fed into GHASH
  598. size_t gdata_cnt;
  599. const __m128i *src_ghash = input, *src_aes = input, *aad = _aad;
  600. __m128i *dst = output;
  601. size_t nondata_aes_cnt = 0, src_ghashlen = inlen, src_aeslen = inlen;
  602. /* schedule ek0 and suppkey */
  603. ctr = _mm_add_epi64(ctr, one8);
  604. bits0 = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), ctx->super.ecb.keys.m128[0]);
  605. ++nondata_aes_cnt;
  606. #define STATE_IS_FIRST_RUN 0x1
  607. #define STATE_GHASH_HAS_MORE 0x2
  608. int state = STATE_IS_FIRST_RUN | STATE_GHASH_HAS_MORE;
  609. /* the main loop */
  610. while (1) {
  611. /* setup gdata */
  612. if (PTLS_UNLIKELY(aadlen != 0)) {
  613. gdata = gdatabuf;
  614. gdata_cnt = 0;
  615. while (gdata_cnt < 6) {
  616. if (aadlen < 16) {
  617. if (aadlen != 0) {
  618. gdatabuf[gdata_cnt++] = loadn128(aad, aadlen);
  619. aadlen = 0;
  620. ++nondata_aes_cnt;
  621. }
  622. goto GdataFillSrc;
  623. }
  624. gdatabuf[gdata_cnt++] = _mm_loadu_si128(aad++);
  625. aadlen -= 16;
  626. ++nondata_aes_cnt;
  627. }
  628. } else if (PTLS_LIKELY(src_ghashlen >= 6 * 16)) {
  629. gdata = src_ghash;
  630. gdata_cnt = 6;
  631. src_ghash += 6;
  632. src_ghashlen -= 6 * 16;
  633. } else {
  634. gdata = gdatabuf;
  635. gdata_cnt = 0;
  636. GdataFillSrc:
  637. while (gdata_cnt < 6) {
  638. if (src_ghashlen < 16) {
  639. if (src_ghashlen != 0) {
  640. gdatabuf[gdata_cnt++] = loadn128(src_ghash, src_ghashlen);
  641. src_ghash = (__m128i *)((uint8_t *)src_ghash + src_ghashlen);
  642. src_ghashlen = 0;
  643. }
  644. if (gdata_cnt < 6 && (state & STATE_GHASH_HAS_MORE) != 0) {
  645. gdatabuf[gdata_cnt++] = ac;
  646. state &= ~STATE_GHASH_HAS_MORE;
  647. }
  648. break;
  649. }
  650. gdatabuf[gdata_cnt++] = _mm_loadu_si128(src_ghash++);
  651. src_ghashlen -= 16;
  652. }
  653. }
  654. /* setup aes bits */
  655. if (PTLS_LIKELY(nondata_aes_cnt == 0))
  656. goto InitAllBits;
  657. switch (nondata_aes_cnt) {
  658. #define INIT_BITS(n, keys) \
  659. case n: \
  660. ctr = _mm_add_epi64(ctr, one8); \
  661. bits##n = _mm_xor_si128(_mm_shuffle_epi8(ctr, byteswap128), keys.m128[0]);
  662. InitAllBits:
  663. INIT_BITS(0, ctx->super.ecb.keys);
  664. INIT_BITS(1, ctx->super.ecb.keys);
  665. INIT_BITS(2, ctx->super.ecb.keys);
  666. INIT_BITS(3, ctx->super.ecb.keys);
  667. INIT_BITS(4, ctx->super.ecb.keys);
  668. INIT_BITS(5, ctx->super.ecb.keys);
  669. #undef INIT_BITS
  670. }
  671. { /* run aes and ghash */
  672. #define AESECB6_UPDATE(i) \
  673. do { \
  674. __m128i k = ctx->super.ecb.keys.m128[i]; \
  675. bits0 = _mm_aesenc_si128(bits0, k); \
  676. bits1 = _mm_aesenc_si128(bits1, k); \
  677. bits2 = _mm_aesenc_si128(bits2, k); \
  678. bits3 = _mm_aesenc_si128(bits3, k); \
  679. bits4 = _mm_aesenc_si128(bits4, k); \
  680. bits5 = _mm_aesenc_si128(bits5, k); \
  681. } while (0)
  682. size_t aesi;
  683. for (aesi = 1; aesi <= gdata_cnt; ++aesi) {
  684. AESECB6_UPDATE(aesi);
  685. gfmul_nextstep128(&gstate, _mm_loadu_si128(gdata++), --ghash_precompute);
  686. }
  687. for (; aesi < ctx->super.ecb.rounds; ++aesi)
  688. AESECB6_UPDATE(aesi);
  689. __m128i k = ctx->super.ecb.keys.m128[aesi];
  690. bits0 = _mm_aesenclast_si128(bits0, k);
  691. bits1 = _mm_aesenclast_si128(bits1, k);
  692. bits2 = _mm_aesenclast_si128(bits2, k);
  693. bits3 = _mm_aesenclast_si128(bits3, k);
  694. bits4 = _mm_aesenclast_si128(bits4, k);
  695. bits5 = _mm_aesenclast_si128(bits5, k);
  696. #undef AESECB6_UPDATE
  697. }
  698. /* apply aes bits */
  699. if (PTLS_LIKELY(nondata_aes_cnt == 0 && src_aeslen >= 6 * 16)) {
  700. #define APPLY(i) _mm_storeu_si128(dst + i, _mm_xor_si128(_mm_loadu_si128(src_aes + i), bits##i))
  701. APPLY(0);
  702. APPLY(1);
  703. APPLY(2);
  704. APPLY(3);
  705. APPLY(4);
  706. APPLY(5);
  707. #undef APPLY
  708. dst += 6;
  709. src_aes += 6;
  710. src_aeslen -= 6 * 16;
  711. } else {
  712. if ((state & STATE_IS_FIRST_RUN) != 0) {
  713. ek0 = bits0;
  714. state &= ~STATE_IS_FIRST_RUN;
  715. }
  716. switch (nondata_aes_cnt) {
  717. #define APPLY(i) \
  718. case i: \
  719. if (PTLS_LIKELY(src_aeslen > 16)) { \
  720. _mm_storeu_si128(dst++, _mm_xor_si128(_mm_loadu_si128(src_aes++), bits##i)); \
  721. src_aeslen -= 16; \
  722. } else { \
  723. bits0 = bits##i; \
  724. goto Finish; \
  725. }
  726. APPLY(0);
  727. APPLY(1);
  728. APPLY(2);
  729. APPLY(3);
  730. APPLY(4);
  731. APPLY(5);
  732. #undef APPLY
  733. }
  734. nondata_aes_cnt = 0;
  735. }
  736. }
  737. Finish:
  738. if (src_aeslen == 16) {
  739. _mm_storeu_si128(dst, _mm_xor_si128(_mm_loadu_si128(src_aes), bits0));
  740. } else if (src_aeslen != 0) {
  741. storen128(dst, src_aeslen, _mm_xor_si128(loadn128(src_aes, src_aeslen), bits0));
  742. }
  743. assert((state & STATE_IS_FIRST_RUN) == 0);
  744. /* the only case where the AES operations are complete but GHASH is not is when the AC block remains to be applied */
  745. if ((state & STATE_GHASH_HAS_MORE) != 0) {
  746. assert(ghash_precompute - 1 == ctx->ghash);
  747. gfmul_nextstep128(&gstate, ac, --ghash_precompute);
  748. }
  749. gfmul_reduce128(&gstate);
  750. __m128i calctag = gfmul_get_tag128(&gstate, ek0);
  751. return _mm_movemask_epi8(_mm_cmpeq_epi8(calctag, _mm_loadu_si128(tag))) == 0xffff;
  752. #undef STATE_IS_FIRST_RUN
  753. #undef STATE_GHASH_HAS_MORE
  754. }
  755. static __m128i expand_key(__m128i key, __m128i temp)
  756. {
  757. key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
  758. key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
  759. key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
  760. key = _mm_xor_si128(key, temp);
  761. return key;
  762. }
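/* Key-expansion notes (informal): ptls_fusion_aesecb_init() below follows the FIPS-197 schedule using AESKEYGENASSIST.
 * expand_key() XORs the previous round key with three 4-byte-shifted copies of itself (the running XOR of the schedule
 * words) and then with the broadcast word produced by AESKEYGENASSIST: dword 3 (RotWord + SubWord + Rcon) for the regular
 * steps, and, for AES-256 only, dword 2 (SubWord without Rcon) for the intermediate steps taken when `key_size > 24`. */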
  763. void ptls_fusion_aesecb_init(ptls_fusion_aesecb_context_t *ctx, int is_enc, const void *key, size_t key_size, int aesni256)
  764. {
  765. assert(is_enc && "decryption is not supported (yet)");
  766. size_t i = 0;
  767. switch (key_size) {
  768. case 16: /* AES128 */
  769. ctx->rounds = 10;
  770. break;
  771. case 32: /* AES256 */
  772. ctx->rounds = 14;
  773. break;
  774. default:
  775. assert(!"invalid key size; AES128 / AES256 are supported");
  776. break;
  777. }
  778. ctx->aesni256 = aesni256;
  779. /* load and expand keys using keys.m128 */
  780. ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key);
  781. if (key_size == 32)
  782. ctx->keys.m128[i++] = _mm_loadu_si128((__m128i *)key + 1);
  783. while (1) {
  784. #define EXPAND(R) \
  785. { \
  786. ctx->keys.m128[i] = \
  787. expand_key(ctx->keys.m128[i - key_size / 16], \
  788. _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(3, 3, 3, 3))); \
  789. if (i == ctx->rounds) \
  790. break; \
  791. ++i; \
  792. if (key_size > 24) { \
  793. ctx->keys.m128[i] = \
  794. expand_key(ctx->keys.m128[i - key_size / 16], \
  795. _mm_shuffle_epi32(_mm_aeskeygenassist_si128(ctx->keys.m128[i - 1], R), _MM_SHUFFLE(2, 2, 2, 2))); \
  796. ++i; \
  797. } \
  798. }
  799. EXPAND(0x1);
  800. EXPAND(0x2);
  801. EXPAND(0x4);
  802. EXPAND(0x8);
  803. EXPAND(0x10);
  804. EXPAND(0x20);
  805. EXPAND(0x40);
  806. EXPAND(0x80);
  807. EXPAND(0x1b);
  808. EXPAND(0x36);
  809. #undef EXPAND
  810. }
  811. /* convert to keys.m256 if aesni256 is used */
  812. if (ctx->aesni256) {
  813. size_t i = ctx->rounds;
  814. do {
  815. ctx->keys.m256[i] = _mm256_broadcastsi128_si256(ctx->keys.m128[i]);
  816. } while (i-- != 0);
  817. }
  818. }
  819. void ptls_fusion_aesecb_dispose(ptls_fusion_aesecb_context_t *ctx)
  820. {
  821. ptls_clear_memory(ctx, sizeof(*ctx));
  822. }
  823. void ptls_fusion_aesecb_encrypt(ptls_fusion_aesecb_context_t *ctx, void *dst, const void *src)
  824. {
  825. __m128i v = _mm_loadu_si128(src);
  826. v = aesecb_encrypt(ctx, v);
  827. _mm_storeu_si128(dst, v);
  828. }
  829. /**
  830. * returns the number of ghash entries required to handle an AEAD block of the given size
  831. */
  832. static size_t aesgcm_calc_ghash_cnt(size_t capacity)
  833. {
  834. // round up by the block size, add one to handle the worst split of the size between AAD and payload, plus one for hashing AC
  835. return (capacity + 15) / 16 + 2;
  836. }
  837. static void setup_one_ghash_entry(ptls_fusion_aesgcm_context_t *ctx)
  838. {
  839. __m128i *H, *r, *Hprev, H0;
  840. if (ctx->ecb.aesni256) {
  841. struct ptls_fusion_aesgcm_context256 *ctx256 = (void *)ctx;
  842. #define GET_SLOT(i, mem) (&ctx256->ghash[(i) / 2].mem[(i) % 2 == 0])
  843. H = GET_SLOT(ctx->ghash_cnt, H);
  844. r = GET_SLOT(ctx->ghash_cnt, r);
  845. Hprev = ctx->ghash_cnt == 0 ? NULL : GET_SLOT(ctx->ghash_cnt - 1, H);
  846. #undef GET_SLOT
  847. H0 = ctx256->ghash[0].H[1];
  848. } else {
  849. struct ptls_fusion_aesgcm_context128 *ctx128 = (void *)ctx;
  850. H = &ctx128->ghash[ctx->ghash_cnt].H;
  851. r = &ctx128->ghash[ctx->ghash_cnt].r;
  852. Hprev = ctx->ghash_cnt == 0 ? NULL : &ctx128->ghash[ctx->ghash_cnt - 1].H;
  853. H0 = ctx128->ghash[0].H;
  854. }
  855. if (Hprev != NULL)
  856. *H = gfmul(*Hprev, H0);
  857. *r = _mm_shuffle_epi32(*H, 78);
  858. *r = _mm_xor_si128(*r, *H);
  859. ++ctx->ghash_cnt;
  860. }
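/* The precompute table built by setup_one_ghash_entry() holds increasing powers of the hash key in byte-reflected form:
 * ghash[i].H == H^(i+1) (entry 0 is H itself, written by new_aesgcm() below), and ghash[i].r == (H_hi ^ H_lo) duplicated
 * into both 64-bit halves, which is the per-key half of the Karatsuba middle product consumed by gfmul_do_step128/256. */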
  861. static size_t calc_aesgcm_context_size(size_t *ghash_cnt, int aesni256)
  862. {
  863. size_t sz;
  864. if (aesni256) {
  865. if (*ghash_cnt % 2 != 0)
  866. ++*ghash_cnt;
  867. sz = offsetof(struct ptls_fusion_aesgcm_context256, ghash) +
  868. sizeof(union ptls_fusion_aesgcm_ghash_precompute256) * *ghash_cnt / 2;
  869. } else {
  870. sz = offsetof(struct ptls_fusion_aesgcm_context128, ghash) +
  871. sizeof(struct ptls_fusion_aesgcm_ghash_precompute128) * *ghash_cnt;
  872. }
  873. return sz;
  874. }
  875. static ptls_fusion_aesgcm_context_t *new_aesgcm(const void *key, size_t key_size, size_t capacity, int aesni256)
  876. {
  877. ptls_fusion_aesgcm_context_t *ctx;
  878. size_t ghash_cnt = aesgcm_calc_ghash_cnt(capacity), ctx_size = calc_aesgcm_context_size(&ghash_cnt, aesni256);
  879. if ((ctx = aligned_alloc(32, ctx_size)) == NULL)
  880. return NULL;
  881. ptls_fusion_aesecb_init(&ctx->ecb, 1, key, key_size, aesni256);
  882. ctx->capacity = capacity;
  883. __m128i H0 = aesecb_encrypt(&ctx->ecb, _mm_setzero_si128());
  884. H0 = _mm_shuffle_epi8(H0, byteswap128);
  885. H0 = transformH(H0);
  886. if (ctx->ecb.aesni256) {
  887. ((struct ptls_fusion_aesgcm_context256 *)ctx)->ghash[0].H[1] = H0;
  888. } else {
  889. ((struct ptls_fusion_aesgcm_context128 *)ctx)->ghash[0].H = H0;
  890. }
  891. ctx->ghash_cnt = 0;
  892. while (ctx->ghash_cnt < ghash_cnt)
  893. setup_one_ghash_entry(ctx);
  894. return ctx;
  895. }
  896. ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_new(const void *key, size_t key_size, size_t capacity)
  897. {
  898. return new_aesgcm(key, key_size, capacity, 0);
  899. }
  900. ptls_fusion_aesgcm_context_t *ptls_fusion_aesgcm_set_capacity(ptls_fusion_aesgcm_context_t *ctx, size_t capacity)
  901. {
  902. size_t new_ghash_cnt = aesgcm_calc_ghash_cnt(capacity);
  903. if (new_ghash_cnt <= ctx->ghash_cnt)
  904. return ctx;
  905. size_t new_ctx_size = calc_aesgcm_context_size(&new_ghash_cnt, ctx->ecb.aesni256),
  906. old_ctx_size = calc_aesgcm_context_size(&ctx->ghash_cnt, ctx->ecb.aesni256);
  907. ptls_fusion_aesgcm_context_t *newp;
  908. if ((newp = aligned_alloc(32, new_ctx_size)) == NULL)
  909. return NULL;
  910. memcpy(newp, ctx, old_ctx_size);
  911. ptls_clear_memory(ctx, old_ctx_size);
  912. aligned_free(ctx);
  913. ctx = newp;
  914. ctx->capacity = capacity;
  915. while (ctx->ghash_cnt < new_ghash_cnt)
  916. setup_one_ghash_entry(ctx);
  917. return ctx;
  918. }
  919. void ptls_fusion_aesgcm_free(ptls_fusion_aesgcm_context_t *ctx)
  920. {
  921. ptls_clear_memory(ctx, calc_aesgcm_context_size(&ctx->ghash_cnt, ctx->ecb.aesni256));
  922. /* skip ptls_fusion_aesecb_dispose, based on the knowledge that it does not allocate memory elsewhere */
  923. aligned_free(ctx);
  924. }
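/* Illustrative usage of the raw AES-GCM interface defined above (a sketch, not code taken from this library's tests; it
 * assumes `enc`/`dec` are uint8_t buffers, that the counter block `ctr` is built the same way calc_counter() does further
 * below, and that the 16-byte tag sits right after the ciphertext, which is how aead_do_encrypt()/aead_do_decrypt() use
 * these functions):
 *
 *   ptls_fusion_aesgcm_context_t *gcm = ptls_fusion_aesgcm_new(key, PTLS_AES128_KEY_SIZE, aadlen + textlen);
 *   ptls_fusion_aesgcm_encrypt(gcm, enc, text, textlen, ctr, aad, aadlen, NULL); // writes textlen bytes plus the 16-byte tag
 *   int ok = ptls_fusion_aesgcm_decrypt(gcm, dec, enc, textlen, ctr, aad, aadlen, enc + textlen);
 *   ptls_fusion_aesgcm_free(gcm);
 */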
  925. static void ctr_dispose(ptls_cipher_context_t *_ctx)
  926. {
  927. struct ctr_context *ctx = (struct ctr_context *)_ctx;
  928. ptls_fusion_aesecb_dispose(&ctx->fusion);
  929. _mm_storeu_si128(&ctx->bits, _mm_setzero_si128());
  930. }
  931. static void ctr_init(ptls_cipher_context_t *_ctx, const void *iv)
  932. {
  933. struct ctr_context *ctx = (struct ctr_context *)_ctx;
  934. _mm_storeu_si128(&ctx->bits, aesecb_encrypt(&ctx->fusion, _mm_loadu_si128(iv)));
  935. ctx->is_ready = 1;
  936. }
  937. static void ctr_transform(ptls_cipher_context_t *_ctx, void *output, const void *input, size_t len)
  938. {
  939. struct ctr_context *ctx = (struct ctr_context *)_ctx;
  940. assert((ctx->is_ready && len <= 16) ||
  941. !"CTR transfomation is supported only once per call to `init` and the maximum size is limited to 16 bytes");
  942. ctx->is_ready = 0;
  943. if (len < 16) {
  944. storen128(output, len, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), loadn128(input, len)));
  945. } else {
  946. _mm_storeu_si128(output, _mm_xor_si128(_mm_loadu_si128(&ctx->bits), _mm_loadu_si128(input)));
  947. }
  948. }
  949. static int aesctr_setup(ptls_cipher_context_t *_ctx, int is_enc, const void *key, size_t key_size)
  950. {
  951. struct ctr_context *ctx = (struct ctr_context *)_ctx;
  952. ctx->super.do_dispose = ctr_dispose;
  953. ctx->super.do_init = ctr_init;
  954. ctx->super.do_transform = ctr_transform;
  955. ptls_fusion_aesecb_init(&ctx->fusion, 1, key, key_size, 0 /* probably we do not need aesni256 for CTR? */);
  956. ctx->is_ready = 0;
  957. return 0;
  958. }
  959. static int aes128ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key)
  960. {
  961. return aesctr_setup(ctx, is_enc, key, PTLS_AES128_KEY_SIZE);
  962. }
  963. static int aes256ctr_setup(ptls_cipher_context_t *ctx, int is_enc, const void *key)
  964. {
  965. return aesctr_setup(ctx, is_enc, key, PTLS_AES256_KEY_SIZE);
  966. }
  967. static void aesgcm_dispose_crypto(ptls_aead_context_t *_ctx)
  968. {
  969. struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;
  970. ptls_fusion_aesgcm_free(ctx->aesgcm);
  971. }
  972. static void aead_do_encrypt_init(ptls_aead_context_t *_ctx, uint64_t seq, const void *aad, size_t aadlen)
  973. {
  974. assert(!"FIXME");
  975. }
  976. static size_t aead_do_encrypt_update(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen)
  977. {
  978. assert(!"FIXME");
  979. return SIZE_MAX;
  980. }
  981. static size_t aead_do_encrypt_final(ptls_aead_context_t *_ctx, void *_output)
  982. {
  983. assert(!"FIXME");
  984. return SIZE_MAX;
  985. }
  986. static inline __m128i calc_counter(struct aesgcm_context *ctx, uint64_t seq)
  987. {
  988. __m128i ctr = _mm_setzero_si128();
  989. ctr = _mm_insert_epi64(ctr, seq, 0);
  990. ctr = _mm_slli_si128(ctr, 4);
  991. ctr = _mm_xor_si128(ctx->static_iv, ctr);
  992. return ctr;
  993. }
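/* calc_counter() assembles the per-record counter block in the byte-reversed layout used throughout this file: the 64-bit
 * sequence number is shifted so that it overlaps the last eight bytes of the 96-bit static IV and XORed in (the usual
 * TLS 1.3 nonce construction), while the 32-bit block-counter word is left zero; the encrypt/decrypt paths set it to 1 for
 * Ek(0) and increment it per block before byte-swapping into the form fed to AES. */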
  994. void aead_do_encrypt(struct st_ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq,
  995. const void *aad, size_t aadlen, ptls_aead_supplementary_encryption_t *supp)
  996. {
  997. struct aesgcm_context *ctx = (void *)_ctx;
  998. if (inlen + aadlen > ctx->aesgcm->capacity)
  999. ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, inlen + aadlen);
  1000. ptls_fusion_aesgcm_encrypt(ctx->aesgcm, output, input, inlen, calc_counter(ctx, seq), aad, aadlen, supp);
  1001. }
  1002. static void aead_do_encrypt_v(struct st_ptls_aead_context_t *ctx, void *output, ptls_iovec_t *input, size_t incnt, uint64_t seq,
  1003. const void *aad, size_t aadlen)
  1004. {
  1005. assert(!"FIXME");
  1006. }
  1007. static size_t aead_do_decrypt(ptls_aead_context_t *_ctx, void *output, const void *input, size_t inlen, uint64_t seq,
  1008. const void *aad, size_t aadlen)
  1009. {
  1010. struct aesgcm_context *ctx = (void *)_ctx;
  1011. if (inlen < 16)
  1012. return SIZE_MAX;
  1013. size_t enclen = inlen - 16;
  1014. if (enclen + aadlen > ctx->aesgcm->capacity)
  1015. ctx->aesgcm = ptls_fusion_aesgcm_set_capacity(ctx->aesgcm, enclen + aadlen);
  1016. if (!ptls_fusion_aesgcm_decrypt(ctx->aesgcm, output, input, enclen, calc_counter(ctx, seq), aad, aadlen,
  1017. (const uint8_t *)input + enclen))
  1018. return SIZE_MAX;
  1019. return enclen;
  1020. }
  1021. static inline void aesgcm_get_iv(ptls_aead_context_t *_ctx, void *iv)
  1022. {
  1023. struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;
  1024. __m128i m128 = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
  1025. storen128(iv, PTLS_AESGCM_IV_SIZE, m128);
  1026. }
  1027. static inline void aesgcm_set_iv(ptls_aead_context_t *_ctx, const void *iv)
  1028. {
  1029. struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;
  1030. ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE);
  1031. ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
  1032. }
  1033. static int aesgcm_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size)
  1034. {
  1035. struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;
  1036. ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE);
  1037. ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
  1038. if (key == NULL)
  1039. return 0;
  1040. ctx->super.dispose_crypto = aesgcm_dispose_crypto;
  1041. ctx->super.do_get_iv = aesgcm_get_iv;
  1042. ctx->super.do_set_iv = aesgcm_set_iv;
  1043. ctx->super.do_encrypt_init = aead_do_encrypt_init;
  1044. ctx->super.do_encrypt_update = aead_do_encrypt_update;
  1045. ctx->super.do_encrypt_final = aead_do_encrypt_final;
  1046. ctx->super.do_encrypt = aead_do_encrypt;
  1047. ctx->super.do_encrypt_v = aead_do_encrypt_v;
  1048. ctx->super.do_decrypt = aead_do_decrypt;
  1049. ctx->aesgcm = new_aesgcm(key, key_size, 1500 /* assume ordinary packet size */, 0 /* no support for aesni256 yet */);
  1050. return 0;
  1051. }
  1052. static int aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
  1053. {
  1054. return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE);
  1055. }
  1056. static int aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
  1057. {
  1058. return aesgcm_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE);
  1059. }
  1060. int ptls_fusion_can_aesni256 = 0;
  1061. ptls_cipher_algorithm_t ptls_fusion_aes128ctr = {"AES128-CTR",
  1062. PTLS_AES128_KEY_SIZE,
  1063. 1, // block size
  1064. PTLS_AES_IV_SIZE,
  1065. sizeof(struct ctr_context),
  1066. aes128ctr_setup};
  1067. ptls_cipher_algorithm_t ptls_fusion_aes256ctr = {"AES256-CTR",
  1068. PTLS_AES256_KEY_SIZE,
  1069. 1, // block size
  1070. PTLS_AES_IV_SIZE,
  1071. sizeof(struct ctr_context),
  1072. aes256ctr_setup};
  1073. ptls_aead_algorithm_t ptls_fusion_aes128gcm = {"AES128-GCM",
  1074. PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
  1075. PTLS_AESGCM_INTEGRITY_LIMIT,
  1076. &ptls_fusion_aes128ctr,
  1077. NULL, // &ptls_fusion_aes128ecb,
  1078. PTLS_AES128_KEY_SIZE,
  1079. PTLS_AESGCM_IV_SIZE,
  1080. PTLS_AESGCM_TAG_SIZE,
  1081. {0}, // while it may work, no reason to support TLS/1.2
  1082. 0,
  1083. 0,
  1084. sizeof(struct aesgcm_context),
  1085. aes128gcm_setup};
  1086. ptls_aead_algorithm_t ptls_fusion_aes256gcm = {"AES256-GCM",
  1087. PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
  1088. PTLS_AESGCM_INTEGRITY_LIMIT,
  1089. &ptls_fusion_aes256ctr,
  1090. NULL, // &ptls_fusion_aes256ecb,
  1091. PTLS_AES256_KEY_SIZE,
  1092. PTLS_AESGCM_IV_SIZE,
  1093. PTLS_AESGCM_TAG_SIZE,
  1094. {0}, // while it may work, no reason to support TLS/1.2
  1095. 0,
  1096. 0,
  1097. sizeof(struct aesgcm_context),
  1098. aes256gcm_setup};
  1099. static inline size_t calc_total_length(ptls_iovec_t *input, size_t incnt)
  1100. {
  1101. size_t totlen = 0;
  1102. for (size_t i = 0; i < incnt; ++i)
  1103. totlen += input[i].len;
  1104. return totlen;
  1105. }
  1106. static inline void reduce_aad128(struct ptls_fusion_gfmul_state128 *gstate, struct ptls_fusion_aesgcm_ghash_precompute128 *ghash,
  1107. const void *_aad, size_t aadlen)
  1108. {
  1109. struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute;
  1110. const uint8_t *aad = _aad;
  1111. while (PTLS_UNLIKELY(aadlen >= 6 * 16)) {
  1112. ghash_precompute = ghash + 6;
  1113. gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
  1114. aad += 16;
  1115. aadlen -= 16;
  1116. for (int i = 1; i < 6; ++i) {
  1117. gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
  1118. aad += 16;
  1119. aadlen -= 16;
  1120. }
  1121. gfmul_reduce128(gstate);
  1122. }
  1123. if (PTLS_LIKELY(aadlen != 0)) {
  1124. ghash_precompute = ghash + (aadlen + 15) / 16;
  1125. if (PTLS_UNLIKELY(aadlen >= 16)) {
  1126. gfmul_firststep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
  1127. aad += 16;
  1128. aadlen -= 16;
  1129. while (aadlen >= 16) {
  1130. gfmul_nextstep128(gstate, _mm_loadu_si128((void *)aad), --ghash_precompute);
  1131. aad += 16;
  1132. aadlen -= 16;
  1133. }
  1134. if (PTLS_LIKELY(aadlen != 0))
  1135. gfmul_nextstep128(gstate, loadn128(aad, aadlen), --ghash_precompute);
  1136. } else {
  1137. gfmul_firststep128(gstate, loadn128(aad, aadlen), --ghash_precompute);
  1138. }
  1139. assert(ghash == ghash_precompute);
  1140. gfmul_reduce128(gstate);
  1141. }
  1142. }
  1143. NO_SANITIZE_ADDRESS
  1144. static inline uint8_t *load_preceding_unaligned(uint8_t *encbuf, uint8_t **output)
  1145. {
  1146. uint8_t *encp;
  1147. if ((encp = encbuf + ((uintptr_t)*output & 63)) != encbuf) {
  1148. _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(*output - (encp - encbuf))));
  1149. _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(*output - (encp - encbuf) + 32)));
  1150. *output -= encp - encbuf;
  1151. }
  1152. return encp;
  1153. }
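/* Roughly, load_preceding_unaligned() exists because the non-temporal stores used below operate on whole 64-byte cache
 * lines: the up to 63 bytes that already occupy the beginning of `output`'s cache line are copied into `encbuf` and
 * `output` is rewound to the line boundary, so that the main loop can emit only full, aligned 64-byte writes. */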
  1154. NO_SANITIZE_ADDRESS
  1155. static inline void write_remaining_bytes(uint8_t *dst, const uint8_t *src, const uint8_t *end)
  1156. {
  1157. /* Write in 64-byte chunks, using NT store instructions. Last partial block, if any, is written to cache, as that cache line
  1158. * would likely be read when the next TLS record is being built. */
  1159. for (; end - src >= 64; dst += 64, src += 64) {
  1160. _mm256_stream_si256((void *)dst, _mm256_load_si256((void *)src));
  1161. _mm256_stream_si256((void *)(dst + 32), _mm256_load_si256((void *)(src + 32)));
  1162. }
  1163. _mm_sfence(); /* weakly ordered writes have to be synced before being passed to NIC */
  1164. if (src != end) {
  1165. for (; end - src >= 16; dst += 16, src += 16)
  1166. _mm_store_si128((void *)dst, _mm_load_si128((void *)src));
  1167. if (src != end)
  1168. storen128((void *)dst, end - src, loadn128((void *)src, end - src));
  1169. }
  1170. }
  1171. NO_SANITIZE_ADDRESS
  1172. static void non_temporal_encrypt_v128(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt,
  1173. uint64_t seq, const void *aad, size_t aadlen)
  1174. {
  1175. /* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
  1176. #define AESECB6_INIT() \
  1177. do { \
  1178. ctr = _mm_add_epi64(ctr, one8); \
  1179. bits0 = _mm_shuffle_epi8(ctr, byteswap128); \
  1180. ctr = _mm_add_epi64(ctr, one8); \
  1181. bits1 = _mm_shuffle_epi8(ctr, byteswap128); \
  1182. ctr = _mm_add_epi64(ctr, one8); \
  1183. bits2 = _mm_shuffle_epi8(ctr, byteswap128); \
  1184. ctr = _mm_add_epi64(ctr, one8); \
  1185. bits3 = _mm_shuffle_epi8(ctr, byteswap128); \
  1186. ctr = _mm_add_epi64(ctr, one8); \
  1187. bits4 = _mm_shuffle_epi8(ctr, byteswap128); \
  1188. if (PTLS_LIKELY(srclen > 16 * 5) || src_vecleft != 0) { \
  1189. ctr = _mm_add_epi64(ctr, one8); \
  1190. bits5 = _mm_shuffle_epi8(ctr, byteswap128); \
  1191. } else { \
  1192. bits5 = ek0; \
  1193. state |= STATE_EK0_READY; \
  1194. } \
  1195. __m128i k = ctx->super.ecb.keys.m128[0]; \
  1196. bits0 = _mm_xor_si128(bits0, k); \
  1197. bits1 = _mm_xor_si128(bits1, k); \
  1198. bits2 = _mm_xor_si128(bits2, k); \
  1199. bits3 = _mm_xor_si128(bits3, k); \
  1200. bits4 = _mm_xor_si128(bits4, k); \
  1201. bits5 = _mm_xor_si128(bits5, k); \
  1202. } while (0)
/* aes block update */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, k); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)
/* aesenclast */
#define AESECB6_FINAL(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenclast_si128(bits0, k); \
        bits1 = _mm_aesenclast_si128(bits1, k); \
        bits2 = _mm_aesenclast_si128(bits2, k); \
        bits3 = _mm_aesenclast_si128(bits3, k); \
        bits4 = _mm_aesenclast_si128(bits4, k); \
        bits5 = _mm_aesenclast_si128(bits5, k); \
    } while (0)
    struct aesgcm_context *agctx = (void *)_ctx;
    uint8_t *output = _output;
#define STATE_EK0_READY 0x1
#define STATE_COPY_128B 0x2
    int32_t state = 0;
    /* Bytes are written here first, then flushed using NT store instructions, 64 bytes at a time. */
    uint8_t encbuf[32 * 6] __attribute__((aligned(32))), *encp;
    /* `encbuf` should be large enough to store up to 63 bytes of unaligned data, 6 16-byte AES blocks, plus the AEAD tag that is
     * appended to the ciphertext before the bytes are written to main memory using NT store instructions. */
    PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 16 + 16);
    /* load unaligned data within same cache line preceding `output`, adjusting pointers accordingly */
    encp = load_preceding_unaligned(encbuf, &output);
    /* First write would be 128 bytes (32 + 6 * 16), if encbuf already contains no less than 32 bytes. */
    if (encp - encbuf >= 32)
        state |= STATE_COPY_128B;
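    /* From here on, each main-loop iteration produces 96 bytes (6 blocks) of ciphertext in `encbuf` while flushing either 64 or
     * 128 bytes to `output` with NT stores; alternating the two flush sizes keeps the stores cache-line aligned and bounds the
     * amount of data retained in `encbuf`. */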
    /* setup ctr, retain Ek(0), len(A) | len(C) to be fed into GCM */
    __m128i ctr = calc_counter(agctx, seq);
    ctr = _mm_insert_epi32(ctr, 1, 0);
    __m128i ek0 = _mm_shuffle_epi8(ctr, byteswap128);
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128);
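    /* In GCM, the final GHASH input block is the concatenation of the 64-bit big-endian bit lengths of the AAD and of the
     * ciphertext; `ac` prepares that block up front. */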
    struct ptls_fusion_aesgcm_context128 *ctx = (void *)agctx->aesgcm;
    __m128i bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128();
    struct ptls_fusion_gfmul_state128 gstate = {0};
    /* find the first non-empty vec */
    const uint8_t *src = NULL;
    size_t srclen = 0, src_vecleft = incnt;
    while (srclen == 0 && src_vecleft != 0) {
        src = (void *)input[0].base;
        srclen = input[0].len;
        ++input;
        --src_vecleft;
    }
    /* Prepare first 6 blocks of bit stream, at the same time calculating ghash of AAD. */
    AESECB6_INIT();
    AESECB6_UPDATE(1);
    AESECB6_UPDATE(2);
    reduce_aad128(&gstate, ctx->ghash, aad, aadlen);
    for (size_t i = 3; i < ctx->super.ecb.rounds; ++i)
        AESECB6_UPDATE(i);
    AESECB6_FINAL(ctx->super.ecb.rounds);
    /* Main loop. This loop:
     * 1. using the current keystream (bits0..bits5), XORs up to 6 * 16 bytes of input and writes the result to encbuf,
     * 2. then, if there is no more data to be encrypted, exits the loop, otherwise,
     * 3. calculates ghash of the blocks just written to encbuf,
     * 4. calculates the next 6 * 16 bytes of keystream,
     * 5. writes encbuf to output in 64-byte blocks.
     * When exiting the loop, `remaining_ghash_from` represents the offset within `encbuf` from which ghash remains to be
     * calculated. */
    size_t remaining_ghash_from = encp - encbuf;
    if (srclen != 0) {
        while (1) {
            /* apply the bit stream to input, writing to encbuf */
            if (PTLS_LIKELY(srclen >= 6 * 16)) {
#define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(src + i * 16)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += 6 * 16;
                src += 6 * 16;
                srclen -= 6 * 16;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    if (src_vecleft == 0) {
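                        /* the 96 bytes just written to encbuf have not been ghashed yet; record where ghash has to resume */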
                        remaining_ghash_from = (encp - encbuf) - 96;
                        break;
                    }
                    src = (void *)input[0].base;
                    srclen = input[0].len;
                    ++input;
                    --src_vecleft;
                }
            } else {
                /* slow path, load at most 6 * 16 bytes to encbuf then encrypt in-place */
                size_t bytes_copied = 0;
                do {
                    if (srclen >= 16 && bytes_copied < 5 * 16) {
                        _mm_storeu_si128((void *)(encp + bytes_copied), _mm_loadu_si128((void *)src));
                        bytes_copied += 16;
                        src += 16;
                        srclen -= 16;
                    } else {
                        encp[bytes_copied++] = *src++;
                        --srclen;
                    }
                    if (PTLS_UNLIKELY(srclen == 0)) {
                        do {
                            if (src_vecleft == 0)
                                break;
                            src = (void *)input[0].base;
                            srclen = input[0].len;
                            ++input;
                            --src_vecleft;
                        } while (srclen == 0);
                        if (srclen == 0)
                            break;
                    }
                } while (bytes_copied < 6 * 16);
#define APPLY(i) _mm_storeu_si128((void *)(encp + i * 16), _mm_xor_si128(_mm_loadu_si128((void *)(encp + i * 16)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += bytes_copied;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    /* Calculate the amount of data left to be ghashed, and zero-clear the remainder of the partial block, as it
                     * will be fed into ghash. */
                    remaining_ghash_from = (encp - encbuf) - bytes_copied;
                    if ((bytes_copied & 15) != 0)
                        _mm_storeu_si128((void *)encp, _mm_setzero_si128());
                    break;
                }
            }
            /* Next 96-byte block starts here. Run AES and ghash in parallel while writing output using non-temporal stores in
             * 64-byte blocks. */
            AESECB6_INIT();
            struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + 6;
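            /* The precompute table holds ascending powers of H (ctx->ghash[i] corresponds to H^(i+1)); the six pending
             * ciphertext blocks are multiplied oldest-first by descending powers of H and accumulated, so a single reduction at
             * the end of the iteration is equivalent to six sequential GHASH updates. */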
            gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encp - 6 * 16)), --ghash_precompute);
            AESECB6_UPDATE(1);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 5 * 16)), --ghash_precompute);
            AESECB6_UPDATE(2);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 4 * 16)), --ghash_precompute);
            AESECB6_UPDATE(3);
            _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf));
            _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32)));
            AESECB6_UPDATE(4);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 3 * 16)), --ghash_precompute);
            AESECB6_UPDATE(5);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 2 * 16)), --ghash_precompute);
            AESECB6_UPDATE(6);
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encp - 1 * 16)), --ghash_precompute);
            AESECB6_UPDATE(7);
            if ((state & STATE_COPY_128B) != 0) {
                _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64)));
                _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96)));
                output += 128;
                encp -= 128;
                AESECB6_UPDATE(8);
                _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 128)));
                _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 160)));
            } else {
                output += 64;
                encp -= 64;
                _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 64)));
                _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 96)));
                AESECB6_UPDATE(8);
            }
            state ^= STATE_COPY_128B;
            AESECB6_UPDATE(9);
            if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) {
                for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i)
                    AESECB6_UPDATE(i);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce128(&gstate);
            AESECB6_FINAL(ctx->super.ecb.rounds);
        }
    }
    /* Now, all the encrypted bits are built in encbuf. Calculate the AEAD tag and append it to encbuf. */
    { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing 7
       * blocks at once. */
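        /* Any partial final block was zero-padded above, so the length block can simply be stored at the next 16-byte boundary
         * past the pending ciphertext. */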
        size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16;
        _mm_storeu_si128((void *)(encbuf + ac_off), ac);
        size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */
        assert(blocks <= 7);
        struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + blocks;
        gfmul_firststep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
        remaining_ghash_from += 16;
        while (ghash_precompute != ctx->ghash) {
            gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
            remaining_ghash_from += 16;
        }
        gfmul_reduce128(&gstate);
    }
    /* Calculate EK0, in the unlikely case it has not been done yet. When encrypting records of full size (16KB), EK0 will already
     * be ready. */
    if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) {
        bits5 = _mm_xor_si128(ek0, ctx->super.ecb.keys.m128[0]);
        for (size_t i = 1; i < ctx->super.ecb.rounds; ++i)
            bits5 = _mm_aesenc_si128(bits5, ctx->super.ecb.keys.m128[i]);
        bits5 = _mm_aesenclast_si128(bits5, ctx->super.ecb.keys.m128[ctx->super.ecb.rounds]);
    }
    /* append tag to encbuf */
    _mm_storeu_si128((void *)encp, gfmul_get_tag128(&gstate, bits5));
    encp += 16;
    /* write remaining bytes */
    write_remaining_bytes(output, encbuf, encp);
#undef AESECB6_INIT
#undef AESECB6_UPDATE
#undef AESECB6_FINAL
#undef STATE_EK0_READY
#undef STATE_COPY_128B
}
static size_t non_temporal_decrypt128(ptls_aead_context_t *_ctx, void *_output, const void *_input, size_t inlen, uint64_t seq,
                                      const void *aad, size_t aadlen)
{
    /* Bail out if the input is too short, or remove the tag from the range. */
    if (inlen < 16)
        return SIZE_MAX;
    inlen -= 16;
    size_t textlen = inlen;
/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT() \
    do { \
        ctr = _mm_add_epi64(ctr, one8); \
        bits0 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits1 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits2 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits3 = _mm_shuffle_epi8(ctr, byteswap128); \
        ctr = _mm_add_epi64(ctr, one8); \
        bits4 = _mm_shuffle_epi8(ctr, byteswap128); \
        if (PTLS_LIKELY(inlen > 16 * 5)) { \
            ctr = _mm_add_epi64(ctr, one8); \
            bits5 = _mm_shuffle_epi8(ctr, byteswap128); \
        } else { \
            bits5 = ek0; \
            state |= STATE_EK0_READY; \
        } \
        __m128i k = ctx->super.ecb.keys.m128[0]; \
        bits0 = _mm_xor_si128(bits0, k); \
        bits1 = _mm_xor_si128(bits1, k); \
        bits2 = _mm_xor_si128(bits2, k); \
        bits3 = _mm_xor_si128(bits3, k); \
        bits4 = _mm_xor_si128(bits4, k); \
        bits5 = _mm_xor_si128(bits5, k); \
    } while (0)
/* aes block update */
#define AESECB6_UPDATE(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenc_si128(bits0, k); \
        bits1 = _mm_aesenc_si128(bits1, k); \
        bits2 = _mm_aesenc_si128(bits2, k); \
        bits3 = _mm_aesenc_si128(bits3, k); \
        bits4 = _mm_aesenc_si128(bits4, k); \
        bits5 = _mm_aesenc_si128(bits5, k); \
    } while (0)
/* aesenclast */
#define AESECB6_FINAL(i) \
    do { \
        __m128i k = ctx->super.ecb.keys.m128[i]; \
        bits0 = _mm_aesenclast_si128(bits0, k); \
        bits1 = _mm_aesenclast_si128(bits1, k); \
        bits2 = _mm_aesenclast_si128(bits2, k); \
        bits3 = _mm_aesenclast_si128(bits3, k); \
        bits4 = _mm_aesenclast_si128(bits4, k); \
        bits5 = _mm_aesenclast_si128(bits5, k); \
    } while (0)
    struct aesgcm_context *agctx = (void *)_ctx;
    uint8_t *output = _output;
    const uint8_t *input = _input;
#define STATE_EK0_READY 0x1
    int32_t state = 0;
    /* setup ctr, retain Ek(0), len(A) | len(C) to be fed into GCM */
    __m128i ctr = calc_counter(agctx, seq);
    ctr = _mm_insert_epi32(ctr, 1, 0);
    __m128i ek0 = _mm_shuffle_epi8(ctr, byteswap128);
    __m128i ac = _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)inlen * 8), byteswap128);
    struct ptls_fusion_aesgcm_context128 *ctx = (void *)agctx->aesgcm;
    __m128i bits0, bits1, bits2, bits3, bits4, bits5 = _mm_setzero_si128();
    struct ptls_fusion_gfmul_state128 gstate = {0};
    /* Prepare first 6 blocks of bit stream, at the same time calculating ghash of AAD. */
    AESECB6_INIT();
    AESECB6_UPDATE(1);
    AESECB6_UPDATE(2);
    reduce_aad128(&gstate, ctx->ghash, aad, aadlen);
    for (size_t i = 3; i < ctx->super.ecb.rounds; ++i)
        AESECB6_UPDATE(i);
    AESECB6_FINAL(ctx->super.ecb.rounds);
    /* Main loop. Operate in full blocks (6 * 16 bytes). */
    while (PTLS_LIKELY(inlen >= 6 * 16)) {
#define DECRYPT(i) _mm_storeu_si128((void *)(output + i * 16), _mm_xor_si128(bits##i, _mm_loadu_si128((void *)(input + i * 16))))
        DECRYPT(0);
        DECRYPT(1);
        DECRYPT(2);
        DECRYPT(3);
        DECRYPT(4);
        DECRYPT(5);
#undef DECRYPT
#define GFMUL_NEXT(i) gfmul_nextstep128(&gstate, _mm_loadu_si128((void *)(input + i * 16)), ctx->ghash + 5 - i)
        AESECB6_INIT();
        AESECB6_UPDATE(1);
        AESECB6_UPDATE(2);
        AESECB6_UPDATE(3);
        gfmul_firststep128(&gstate, _mm_loadu_si128((void *)input), ctx->ghash + 5);
        AESECB6_UPDATE(4);
        GFMUL_NEXT(1);
        AESECB6_UPDATE(5);
        GFMUL_NEXT(2);
        AESECB6_UPDATE(6);
        GFMUL_NEXT(3);
        AESECB6_UPDATE(7);
        GFMUL_NEXT(4);
        AESECB6_UPDATE(8);
        GFMUL_NEXT(5);
        AESECB6_UPDATE(9);
        gfmul_reduce128(&gstate);
        if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) {
            size_t i = 10;
            do {
                AESECB6_UPDATE(i);
            } while (++i < ctx->super.ecb.rounds);
        }
        AESECB6_FINAL(ctx->super.ecb.rounds);
        output += 6 * 16;
        input += 6 * 16;
        inlen -= 6 * 16;
#undef GFMUL_NEXT
    }
    /* Decrypt the remainder while finishing the GHASH calculation. */
    if (inlen != 0) {
        struct ptls_fusion_aesgcm_ghash_precompute128 *ghash_precompute = ctx->ghash + (inlen + 15) / 16 + 1;
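        /* Note: the `+ 1` reserves a slot for the final len(A) | len(C) block, which is ghashed together with the remaining
         * ciphertext below. */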
#define ONEBLOCK(i) \
    do { \
        if (inlen != 0) { \
            __m128i b = inlen >= 16 ? _mm_loadu_si128((void *)input) : loadn128(input, inlen); \
            if (i == 0) { \
                gfmul_firststep128(&gstate, b, --ghash_precompute); \
            } else { \
                gfmul_nextstep128(&gstate, b, --ghash_precompute); \
            } \
            b = _mm_xor_si128(b, bits##i); \
            if (inlen >= 16) { \
                _mm_storeu_si128((void *)output, b); \
                output += 16; \
                input += 16; \
                inlen -= 16; \
            } else { \
                storen128(output, inlen, b); \
                output += inlen; \
                input += inlen; \
                inlen = 0; \
            } \
        } \
    } while (0)
        ONEBLOCK(0);
        ONEBLOCK(1);
        ONEBLOCK(2);
        ONEBLOCK(3);
        ONEBLOCK(4);
        ONEBLOCK(5);
#undef ONEBLOCK
        gfmul_nextstep128(&gstate, ac, --ghash_precompute);
        assert(ghash_precompute == ctx->ghash);
    } else {
        gfmul_firststep128(&gstate, ac, ctx->ghash);
    }
    gfmul_reduce128(&gstate);
    /* Calculate EK0 if not yet available in bits5. */
    if ((state & STATE_EK0_READY) == 0) {
        bits5 = _mm_xor_si128(ek0, ctx->super.ecb.keys.m128[0]);
        for (size_t i = 1; i < ctx->super.ecb.rounds; ++i)
            bits5 = _mm_aesenc_si128(bits5, ctx->super.ecb.keys.m128[i]);
        bits5 = _mm_aesenclast_si128(bits5, ctx->super.ecb.keys.m128[ctx->super.ecb.rounds]);
    }
    /* Calculate GCM tag and compare. */
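    /* The comparison below checks all 16 tag bytes with a single SIMD compare (no early exit), so its timing does not depend on
     * how many bytes of the tag match. */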
    __m128i calctag = gfmul_get_tag128(&gstate, bits5);
    __m128i recvtag = _mm_loadu_si128((void *)input);
    if (_mm_movemask_epi8(_mm_cmpeq_epi8(calctag, recvtag)) != 0xffff)
        return SIZE_MAX;
    return textlen;
#undef AESECB6_INIT
#undef AESECB6_UPDATE
#undef AESECB6_FINAL
#undef STATE_EK0_READY
}
NO_SANITIZE_ADDRESS
static void non_temporal_encrypt_v256(struct st_ptls_aead_context_t *_ctx, void *_output, ptls_iovec_t *input, size_t incnt,
                                      uint64_t seq, const void *_aad, size_t aadlen)
{
/* init the bits (we can always run in full), but use the last slot for calculating ek0, if possible */
#define AESECB6_INIT() \
    do { \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits0 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits1 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits2 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits3 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits4 = _mm256_shuffle_epi8(ctr, byteswap256); \
        ctr = _mm256_add_epi64(ctr, incr128x2); \
        bits5 = _mm256_shuffle_epi8(ctr, byteswap256); \
        if (PTLS_UNLIKELY(srclen <= 32 * 6 - 16) && src_vecleft == 0) { \
            bits5 = _mm256_permute2f128_si256(bits5, ac_ek0, 0x30); \
            state |= STATE_EK0_READY; \
        } \
        __m256i k = ctx->super.ecb.keys.m256[0]; \
        bits0 = _mm256_xor_si256(bits0, k); \
        bits1 = _mm256_xor_si256(bits1, k); \
        bits2 = _mm256_xor_si256(bits2, k); \
        bits3 = _mm256_xor_si256(bits3, k); \
        bits4 = _mm256_xor_si256(bits4, k); \
        bits5 = _mm256_xor_si256(bits5, k); \
    } while (0)
/* aes block update */
#define AESECB6_UPDATE(i) \
    do { \
        __m256i k = ctx->super.ecb.keys.m256[i]; \
        bits0 = _mm256_aesenc_epi128(bits0, k); \
        bits1 = _mm256_aesenc_epi128(bits1, k); \
        bits2 = _mm256_aesenc_epi128(bits2, k); \
        bits3 = _mm256_aesenc_epi128(bits3, k); \
        bits4 = _mm256_aesenc_epi128(bits4, k); \
        bits5 = _mm256_aesenc_epi128(bits5, k); \
    } while (0)
/* aesenclast */
#define AESECB6_FINAL(i) \
    do { \
        __m256i k = ctx->super.ecb.keys.m256[i]; \
        bits0 = _mm256_aesenclast_epi128(bits0, k); \
        bits1 = _mm256_aesenclast_epi128(bits1, k); \
        bits2 = _mm256_aesenclast_epi128(bits2, k); \
        bits3 = _mm256_aesenclast_epi128(bits3, k); \
        bits4 = _mm256_aesenclast_epi128(bits4, k); \
        bits5 = _mm256_aesenclast_epi128(bits5, k); \
    } while (0)
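/* Note: _mm256_aesenc_epi128 / _mm256_aesenclast_epi128 process two AES blocks per YMM register and require the VAES extension
 * (and the 256-bit GF multiplications used below require VPCLMULQDQ); this code path is therefore selected only when
 * ptls_fusion_can_aesni256 is set (see non_temporal_setup). */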
    struct aesgcm_context *agctx = (void *)_ctx;
    uint8_t *output = _output;
    const uint8_t *aad = _aad;
#define STATE_EK0_READY 0x1
    int32_t state = 0;
    /* Bytes are written here first, then flushed using NT store instructions, 64 bytes at a time. */
    uint8_t encbuf[32 * 9] __attribute__((aligned(32))), *encp;
    /* `encbuf` should be large enough to store up to 63 bytes of unaligned data, 6 32-byte AES blocks, plus the AEAD tag that is
     * appended to the ciphertext before the bytes are written to main memory using NT store instructions. */
    PTLS_BUILD_ASSERT(sizeof(encbuf) >= 64 + 6 * 32 + 16);
    /* load unaligned data within same cache line preceding `output`, adjusting pointers accordingly */
    encp = load_preceding_unaligned(encbuf, &output);
    /* setup ctr, retaining Ek(0), len(A) | len(C) to be fed into GCM */
    __m256i ctr = _mm256_broadcastsi128_si256(calc_counter(agctx, seq));
    ctr = _mm256_insert_epi32(ctr, 1, 4);
    __m256i ac_ek0 = _mm256_permute2f128_si256(
        /* first half: ac */
        _mm256_castsi128_si256(
            _mm_shuffle_epi8(_mm_set_epi32(0, (int)aadlen * 8, 0, (int)calc_total_length(input, incnt) * 8), byteswap128)),
        /* second half: ek0 */
        _mm256_shuffle_epi8(ctr, byteswap256), 0x30);
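    /* ac_ek0: the lower 128 bits hold len(A) | len(C) (the final GHASH block), while the upper 128 bits hold the byte-swapped
     * counter block that gets encrypted to produce EK0 for the tag. */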
    struct ptls_fusion_aesgcm_context256 *ctx = (void *)agctx->aesgcm;
    __m256i bits0, bits1, bits2, bits3, bits4, bits5 = _mm256_setzero_si256();
    struct ptls_fusion_gfmul_state256 gstate = {0};
    /* find the first non-empty vec */
    const uint8_t *src = NULL;
    size_t srclen = 0, src_vecleft = incnt;
    while (srclen == 0 && src_vecleft != 0) {
        src = (void *)input[0].base;
        srclen = input[0].len;
        ++input;
        --src_vecleft;
    }
    /* Prepare first 6 blocks of bit stream, at the same time calculating ghash of AAD. */
    AESECB6_INIT();
    AESECB6_UPDATE(1);
    AESECB6_UPDATE(2);
    if (PTLS_LIKELY(aadlen != 0)) {
        union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute;
        while (PTLS_UNLIKELY(aadlen >= 6 * 32)) {
            ghash_precompute = ctx->ghash + 6;
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute);
            aad += 32;
            aadlen -= 32;
            for (int i = 1; i < 6; ++i) {
                gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute);
                aad += 32;
                aadlen -= 32;
            }
            gfmul_reduce256(&gstate);
        }
        if (PTLS_LIKELY(aadlen != 0)) {
            ghash_precompute = ctx->ghash + (aadlen + 31) / 32;
            if (PTLS_UNLIKELY(aadlen >= 32)) {
                if (aadlen % 32 == 0 || aadlen % 32 > 16) {
                    gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 0, --ghash_precompute);
                    aad += 32;
                    aadlen -= 32;
                } else {
                    gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)aad), 1, --ghash_precompute);
                    aad += 16;
                    aadlen -= 16;
                }
                while (aadlen >= 32) {
                    gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)aad), --ghash_precompute);
                    aad += 32;
                    aadlen -= 32;
                }
                if (PTLS_LIKELY(aadlen != 0)) {
                    assert(aadlen > 16);
                    gfmul_nextstep256(&gstate, loadn256(aad, aadlen), --ghash_precompute);
                }
            } else {
                gfmul_firststep256(&gstate, loadn256(aad, aadlen), aadlen <= 16, --ghash_precompute);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce256(&gstate);
        }
    }
    for (size_t i = 3; i < ctx->super.ecb.rounds; ++i)
        AESECB6_UPDATE(i);
    AESECB6_FINAL(ctx->super.ecb.rounds);
    /* Main loop. This loop:
     * 1. using the current keystream (bits0..bits5), XORs up to 6 * 32 bytes of input and writes the result to encbuf,
     * 2. then, if there is no more data to be encrypted, exits the loop, otherwise,
     * 3. calculates ghash of the blocks just written to encbuf,
     * 4. calculates the next 6 * 32 bytes of keystream,
     * 5. writes encbuf to output in 64-byte blocks.
     * When exiting the loop, `remaining_ghash_from` represents the offset within `encbuf` from which ghash remains to be
     * calculated. */
    size_t remaining_ghash_from = encp - encbuf;
    if (srclen != 0) {
        while (1) {
            /* apply the bit stream to input, writing to encbuf */
            if (PTLS_LIKELY(srclen >= 6 * 32)) {
#define APPLY(i) _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(src + i * 32)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += 6 * 32;
                src += 6 * 32;
                srclen -= 6 * 32;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    if (src_vecleft == 0) {
                        remaining_ghash_from = (encp - encbuf) - 6 * 32;
                        break;
                    }
                    src = (void *)input[0].base;
                    srclen = input[0].len;
                    ++input;
                    --src_vecleft;
                }
            } else {
                /* slow path, load at most 6 * 32 bytes to encbuf then encrypt in-place */
                size_t bytes_copied = 0;
                do {
                    if (srclen >= 32 && bytes_copied < 5 * 32) {
                        _mm256_storeu_si256((void *)(encp + bytes_copied), _mm256_loadu_si256((void *)src));
                        bytes_copied += 32;
                        src += 32;
                        srclen -= 32;
                    } else {
                        encp[bytes_copied++] = *src++;
                        --srclen;
                    }
                    if (PTLS_UNLIKELY(srclen == 0)) {
                        do {
                            if (src_vecleft == 0)
                                break;
                            src = (void *)input[0].base;
                            srclen = input[0].len;
                            ++input;
                            --src_vecleft;
                        } while (srclen == 0);
                        if (srclen == 0)
                            break;
                    }
                } while (bytes_copied < 6 * 32);
#define APPLY(i) \
    _mm256_storeu_si256((void *)(encp + i * 32), _mm256_xor_si256(_mm256_loadu_si256((void *)(encp + i * 32)), bits##i))
                APPLY(0);
                APPLY(1);
                APPLY(2);
                APPLY(3);
                APPLY(4);
                APPLY(5);
#undef APPLY
                encp += bytes_copied;
                if (PTLS_UNLIKELY(srclen == 0)) {
                    /* Calculate the amount of data left to be ghashed, and zero-clear the remainder of the partial block, as it
                     * will be fed into ghash. */
                    remaining_ghash_from = (encp - encbuf) - bytes_copied;
                    if ((bytes_copied & 15) != 0)
                        _mm_storeu_si128((void *)encp, _mm_setzero_si128());
                    break;
                }
            }
            /* Next 192-byte block starts here. Run AES and ghash in parallel while writing output using non-temporal store
             * instructions. */
            AESECB6_INIT();
            union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + 6;
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encp - 6 * 32)), 0, --ghash_precompute);
            AESECB6_UPDATE(1);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 5 * 32)), --ghash_precompute);
            AESECB6_UPDATE(2);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 4 * 32)), --ghash_precompute);
            AESECB6_UPDATE(3);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 3 * 32)), --ghash_precompute);
            AESECB6_UPDATE(4);
            _mm256_stream_si256((void *)output, _mm256_load_si256((void *)encbuf));
            _mm256_stream_si256((void *)(output + 32), _mm256_load_si256((void *)(encbuf + 32)));
            _mm256_stream_si256((void *)(output + 64), _mm256_load_si256((void *)(encbuf + 64)));
            _mm256_stream_si256((void *)(output + 96), _mm256_load_si256((void *)(encbuf + 96)));
            _mm256_stream_si256((void *)(output + 128), _mm256_load_si256((void *)(encbuf + 128)));
            _mm256_stream_si256((void *)(output + 160), _mm256_load_si256((void *)(encbuf + 160)));
            AESECB6_UPDATE(5);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 2 * 32)), --ghash_precompute);
            AESECB6_UPDATE(6);
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encp - 1 * 32)), --ghash_precompute);
            output += 192;
            encp -= 192;
            AESECB6_UPDATE(7);
            _mm256_store_si256((void *)encbuf, _mm256_load_si256((void *)(encbuf + 192)));
            AESECB6_UPDATE(8);
            _mm256_store_si256((void *)(encbuf + 32), _mm256_load_si256((void *)(encbuf + 224)));
            AESECB6_UPDATE(9);
            if (PTLS_UNLIKELY(ctx->super.ecb.rounds != 10)) {
                for (size_t i = 10; PTLS_LIKELY(i < ctx->super.ecb.rounds); ++i)
                    AESECB6_UPDATE(i);
            }
            assert(ctx->ghash == ghash_precompute);
            gfmul_reduce256(&gstate);
            AESECB6_FINAL(ctx->super.ecb.rounds);
        }
    }
    /* Now, all the encrypted bits are built in encbuf. Calculate the AEAD tag and append it to encbuf. */
    { /* Run ghash against the remaining bytes, after appending `ac` (i.e., len(A) | len(C)). At this point, we might be ghashing
       * up to 13 blocks at once. */
        size_t ac_off = remaining_ghash_from + ((encp - encbuf) - remaining_ghash_from + 15) / 16 * 16;
        _mm_storeu_si128((void *)(encbuf + ac_off), _mm256_castsi256_si128(ac_ek0));
        size_t blocks = ((encp - encbuf) - remaining_ghash_from + 15) / 16 + 1; /* round up, +1 for AC */
        assert(blocks <= 13);
        union ptls_fusion_aesgcm_ghash_precompute256 *ghash_precompute = ctx->ghash + blocks / 2;
        if (blocks % 2 != 0) {
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 1, ghash_precompute);
            remaining_ghash_from += 16;
        } else {
            gfmul_firststep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), 0, --ghash_precompute);
            remaining_ghash_from += 32;
        }
        while (ghash_precompute != ctx->ghash) {
            gfmul_nextstep256(&gstate, _mm256_loadu_si256((void *)(encbuf + remaining_ghash_from)), --ghash_precompute);
            remaining_ghash_from += 32;
        }
        gfmul_reduce256(&gstate);
    }
    /* Calculate EK0, in the unlikely case it has not been done yet. When encrypting records of full size (16KB), EK0 will already
     * be ready. */
    if (PTLS_UNLIKELY((state & STATE_EK0_READY) == 0)) {
        bits5 = ac_ek0;
        bits5 = _mm256_xor_si256(bits5, ctx->super.ecb.keys.m256[0]);
        for (size_t i = 1; i < ctx->super.ecb.rounds; ++i)
            bits5 = _mm256_aesenc_epi128(bits5, ctx->super.ecb.keys.m256[i]);
        bits5 = _mm256_aesenclast_epi128(bits5, ctx->super.ecb.keys.m256[ctx->super.ecb.rounds]);
    }
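    /* Either way, the upper 128-bit lane of bits5 now holds E(K, counter 0); gfmul_get_tag256 XORs it with the GHASH output to
     * produce the tag. */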
    /* append tag to encbuf */
    _mm_storeu_si128((void *)encp,
                     gfmul_get_tag256(&gstate, _mm256_castsi256_si128(_mm256_permute2f128_si256(bits5, bits5, 0x11))));
    encp += 16;
    /* write remaining bytes */
    write_remaining_bytes(output, encbuf, encp);
}
static int non_temporal_setup(ptls_aead_context_t *_ctx, int is_enc, const void *key, const void *iv, size_t key_size)
{
    struct aesgcm_context *ctx = (struct aesgcm_context *)_ctx;
    int aesni256 = is_enc && ptls_fusion_can_aesni256;
    ctx->static_iv = loadn128(iv, PTLS_AESGCM_IV_SIZE);
    ctx->static_iv = _mm_shuffle_epi8(ctx->static_iv, byteswap128);
    if (key == NULL)
        return 0;
    ctx->super.dispose_crypto = aesgcm_dispose_crypto;
    ctx->super.do_get_iv = aesgcm_get_iv;
    ctx->super.do_set_iv = aesgcm_set_iv;
    ctx->super.do_encrypt_init = NULL;
    ctx->super.do_encrypt_update = NULL;
    ctx->super.do_encrypt_final = NULL;
    if (is_enc) {
        ctx->super.do_encrypt = ptls_aead__do_encrypt;
        ctx->super.do_encrypt_v = aesni256 ? non_temporal_encrypt_v256 : non_temporal_encrypt_v128;
        ctx->super.do_decrypt = NULL;
    } else {
        assert(!aesni256);
        ctx->super.do_encrypt = NULL;
        ctx->super.do_encrypt_v = NULL;
        ctx->super.do_decrypt = non_temporal_decrypt128;
    }
    ctx->aesgcm =
        new_aesgcm(key, key_size,
                   7 * (ptls_fusion_can_aesni256 ? 32 : 16), // 6 blocks at once, plus len(A) | len(C) that we might append
                   aesni256);
    return 0;
}
static int non_temporal_aes128gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return non_temporal_setup(ctx, is_enc, key, iv, PTLS_AES128_KEY_SIZE);
}
static int non_temporal_aes256gcm_setup(ptls_aead_context_t *ctx, int is_enc, const void *key, const void *iv)
{
    return non_temporal_setup(ctx, is_enc, key, iv, PTLS_AES256_KEY_SIZE);
}
ptls_aead_algorithm_t ptls_non_temporal_aes128gcm = {"AES128-GCM",
                                                     PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                                     PTLS_AESGCM_INTEGRITY_LIMIT,
                                                     &ptls_fusion_aes128ctr,
                                                     NULL, // &ptls_fusion_aes128ecb,
                                                     PTLS_AES128_KEY_SIZE,
                                                     PTLS_AESGCM_IV_SIZE,
                                                     PTLS_AESGCM_TAG_SIZE,
                                                     {PTLS_TLS12_AESGCM_FIXED_IV_SIZE, PTLS_TLS12_AESGCM_RECORD_IV_SIZE},
                                                     1,
                                                     PTLS_X86_CACHE_LINE_ALIGN_BITS,
                                                     sizeof(struct aesgcm_context),
                                                     non_temporal_aes128gcm_setup};
ptls_aead_algorithm_t ptls_non_temporal_aes256gcm = {"AES256-GCM",
                                                     PTLS_AESGCM_CONFIDENTIALITY_LIMIT,
                                                     PTLS_AESGCM_INTEGRITY_LIMIT,
                                                     &ptls_fusion_aes256ctr,
                                                     NULL, // &ptls_fusion_aes128ecb,
                                                     PTLS_AES256_KEY_SIZE,
                                                     PTLS_AESGCM_IV_SIZE,
                                                     PTLS_AESGCM_TAG_SIZE,
                                                     {PTLS_TLS12_AESGCM_FIXED_IV_SIZE, PTLS_TLS12_AESGCM_RECORD_IV_SIZE},
                                                     1,
                                                     PTLS_X86_CACHE_LINE_ALIGN_BITS,
                                                     sizeof(struct aesgcm_context),
                                                     non_temporal_aes256gcm_setup};
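/* Usage sketch (illustrative only, not part of this file): applications reach the descriptors above through the generic picotls
 * AEAD interface. Assuming the public `ptls_aead_new_direct` and `ptls_aead_encrypt_v` entry points (names taken from the picotls
 * public API, not defined in this file), encrypting one record might look roughly like:
 *
 *     ptls_aead_context_t *aead = ptls_aead_new_direct(&ptls_non_temporal_aes128gcm, 1, key, iv);
 *     ptls_iovec_t vec = ptls_iovec_init(plaintext, plaintext_len);
 *     ptls_aead_encrypt_v(aead, output, &vec, 1, seq, aad, aadlen);
 *     ptls_aead_free(aead);
 */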
#ifdef _WINDOWS
/**
 * ptls_fusion_is_supported_by_cpu:
 * Checks that the CPU has the extended instructions for PCLMUL, AES and AVX2.
 * This test assumes that the CPU follows the x86/x64 architecture.
 * A slightly more refined test could check that cpu_info spells out
 * "GenuineIntel" or "AuthenticAMD", but that would fail in the presence of
 * little-known CPU brands or some VMs. */
int ptls_fusion_is_supported_by_cpu(void)
{
    uint32_t cpu_info[4];
    uint32_t nb_ids;
    int is_supported = 0;
    __cpuid(cpu_info, 0);
    nb_ids = cpu_info[0];
    if (nb_ids >= 7) {
        uint32_t leaf1_ecx;
        __cpuid(cpu_info, 1);
        leaf1_ecx = cpu_info[2];
        if (/* PCLMUL */ (leaf1_ecx & (1 << 1)) != 0 && /* AES */ (leaf1_ecx & (1 << 25)) != 0) {
            uint32_t leaf7_ebx, leaf7_ecx;
            __cpuid(cpu_info, 7);
            leaf7_ebx = cpu_info[1];
            leaf7_ecx = cpu_info[2];
            is_supported = /* AVX2 */ (leaf7_ebx & (1 << 5)) != 0;
            /* enable 256-bit mode if possible */
            if (is_supported && (leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_aesni256)
                ptls_fusion_can_aesni256 = 1;
        }
    }
    return is_supported;
}
#else
int ptls_fusion_is_supported_by_cpu(void)
{
    unsigned leaf1_ecx, leaf7_ebx, leaf7_ecx;
    { /* GCC-specific code to obtain CPU features */
        unsigned leaf_cnt;
        __asm__("cpuid" : "=a"(leaf_cnt) : "a"(0) : "ebx", "ecx", "edx");
        if (leaf_cnt < 7)
            return 0;
        __asm__("cpuid" : "=c"(leaf1_ecx) : "a"(1) : "ebx", "edx");
        __asm__("cpuid" : "=b"(leaf7_ebx), "=c"(leaf7_ecx) : "a"(7), "c"(0) : "edx");
    }
    /* AVX2 */
    if ((leaf7_ebx & (1 << 5)) == 0)
        return 0;
    /* AES */
    if ((leaf1_ecx & (1 << 25)) == 0)
        return 0;
    /* PCLMUL */
    if ((leaf1_ecx & (1 << 1)) == 0)
        return 0;
    /* enable 256-bit mode if possible */
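    /* leaf 7 ECX bit 9 is VAES and bit 10 is VPCLMULQDQ; 0x600 covers both */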
    if ((leaf7_ecx & 0x600) != 0 && !ptls_fusion_can_aesni256)
        ptls_fusion_can_aesni256 = 1;
    return 1;
}
#endif