diff --git a/go.mod b/go.mod
index d9f0b7272152b0658a7b145e1757a213f35242b7..752a9cf00b7ee8ca06e9302c4a8f2d0bbc43534b 100644
--- a/go.mod
+++ b/go.mod
@@ -17,11 +17,12 @@ require (
 	// Do not update obfs4 past e330d1b7024b, a backwards incompatible change was
 	// made that will break negotiation!! riseup should move to the newest asap.
 	gitlab.com/yawning/obfs4.git v0.0.0-20231012084234-c3e2d44b1033 // indirect
-	golang.org/x/sys v0.23.0
+	golang.org/x/sys v0.28.0
 )
 
 require (
-	0xacab.org/leap/bitmask-core v0.0.0-20241015185856-0812b9aadf98
+	0xacab.org/leap/bitmask-core v0.0.0-20241213164419-d3c23078a4e0
+	0xacab.org/leap/menshen v0.0.0-20241122184833-0524d7e093d5
 	github.com/natefinch/npipe v0.0.0-20160621034901-c1b8fa8bdcce
 	github.com/prometheus-community/pro-bing v0.4.0
 	github.com/rs/zerolog v1.33.0
@@ -100,12 +101,12 @@ require (
 	go.opentelemetry.io/otel/metric v1.28.0 // indirect
 	go.opentelemetry.io/otel/trace v1.28.0 // indirect
 	go.uber.org/mock v0.4.0 // indirect
-	golang.org/x/crypto v0.26.0 // indirect
+	golang.org/x/crypto v0.30.0 // indirect
 	golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect
 	golang.org/x/mod v0.17.0 // indirect
-	golang.org/x/net v0.28.0 // indirect
-	golang.org/x/sync v0.8.0 // indirect
-	golang.org/x/text v0.17.0 // indirect
+	golang.org/x/net v0.32.0 // indirect
+	golang.org/x/sync v0.10.0 // indirect
+	golang.org/x/text v0.21.0 // indirect
 	golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
 	gopkg.in/natefinch/npipe.v2 v2.0.0-20160621034901-c1b8fa8bdcce // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
diff --git a/go.sum b/go.sum
index 915e51ff514b4c1bd2aed0d4107f5d4d09de3f4f..f1c0f4e0035856c58d59f7a5b5f41b34dfea6583 100644
--- a/go.sum
+++ b/go.sum
@@ -1,5 +1,7 @@
-0xacab.org/leap/bitmask-core v0.0.0-20241015185856-0812b9aadf98 h1:0tsyOms2hcHOe9V1aJ8fhRpw/DxpswZzCisIvulLamk=
-0xacab.org/leap/bitmask-core v0.0.0-20241015185856-0812b9aadf98/go.mod h1:VDzbMRiY/bCOu4z1DwpzzqgQyj6i82TpuvuCU6ogl6Q=
+0xacab.org/leap/bitmask-core v0.0.0-20241213164419-d3c23078a4e0 h1:l/jiquEeu11GLIf8sLbjl2qf0Stq3JADpwv07tlRnFU=
+0xacab.org/leap/bitmask-core v0.0.0-20241213164419-d3c23078a4e0/go.mod h1:bAbIbmlJo9XVro/IBGdj7rYIKr0M8s+mBM2yZ+k8+Bc=
+0xacab.org/leap/menshen v0.0.0-20241122184833-0524d7e093d5 h1:/Etlz1PLzdz5sNV/CW1IELIU/5XiZ7OcLmXjc0PUkBk=
+0xacab.org/leap/menshen v0.0.0-20241122184833-0524d7e093d5/go.mod h1:F9EdDMDvVwurBjfQExqDXpIObJf2/f4cCp8CtACfr1M=
 0xacab.org/leap/obfsvpn v1.3.1-0.20241121155258-e6b06efc4456 h1:vPw0Uof9ArxiwPL/mlVPHDY3lCPFCEJzM4V5c6bdCYY=
 0xacab.org/leap/obfsvpn v1.3.1-0.20241121155258-e6b06efc4456/go.mod h1:2Nb7wDMNovIkwXOOBcb6/ocaRj9djAoPYM3NsGhu8JA=
 0xacab.org/leap/tunnel-telemetry v0.0.0-20240830081933-7328bb50078b h1:GI6SVhECFVdHalARYd5Qt/i5/+M0fjcLtJIEEQVO4Sk=
@@ -560,8 +562,8 @@ golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIi
 golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw=
 golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
 golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg=
-golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
-golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
+golang.org/x/crypto v0.30.0 h1:RwoQn3GkWiMkzlX562cLB7OxWvjH1L8xutO2WoJcRoY=
+golang.org/x/crypto v0.30.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM=
 golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
@@ -615,8 +617,8 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
 golang.org/x/net v0.13.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA=
 golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI=
 golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY=
-golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
-golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
+golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
+golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -629,8 +631,8 @@ golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJ
 golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
-golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
+golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -677,8 +679,8 @@ golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM=
-golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
+golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
@@ -700,8 +702,8 @@ golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
-golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc=
-golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
+golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
+golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
 golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
diff --git a/pkg/vpn/menshen/backwards.go b/pkg/vpn/menshen/backwards.go
index f1f5e647d36c8e5fffdd18fe172c721771fa727d..a8940b861b626a122ff2ff58d5ed3dc72f8d3bf2 100644
--- a/pkg/vpn/menshen/backwards.go
+++ b/pkg/vpn/menshen/backwards.go
@@ -4,8 +4,8 @@ import (
 	"fmt"
 	"strings"
 
-	"0xacab.org/leap/bitmask-core/models"
 	"0xacab.org/leap/bitmask-vpn/pkg/vpn/bonafide"
+	"0xacab.org/leap/menshen/models"
 )
 
 func NewBonafideGatewayArray(gatewaysV5 []*models.ModelsGateway) []bonafide.Gateway {
diff --git a/pkg/vpn/menshen/gateway.go b/pkg/vpn/menshen/gateway.go
index f9fe9bf8d310ef0e75bcc95c7fc7478e187984a9..6c76b9198ec07696a53ff488232026f47a081a71 100644
--- a/pkg/vpn/menshen/gateway.go
+++ b/pkg/vpn/menshen/gateway.go
@@ -7,8 +7,8 @@ import (
 	"slices"
 	"strings"
 
-	"0xacab.org/leap/bitmask-core/models"
 	"0xacab.org/leap/bitmask-vpn/pkg/vpn/bonafide"
+	"0xacab.org/leap/menshen/models"
 	"github.com/rs/zerolog/log"
 )
 
diff --git a/pkg/vpn/menshen/init.go b/pkg/vpn/menshen/init.go
index f7354fea3329094268847feb6b2ae9c8f9b2de3b..957cb9fcb03c73ea8b11ead9038d62abf6f1c2cc 100644
--- a/pkg/vpn/menshen/init.go
+++ b/pkg/vpn/menshen/init.go
@@ -5,11 +5,11 @@ import (
 	"os"
 	"strings"
 
-	"0xacab.org/leap/bitmask-core/models"
 	"0xacab.org/leap/bitmask-core/pkg/bootstrap"
 	"0xacab.org/leap/bitmask-core/pkg/storage"
 	"0xacab.org/leap/bitmask-vpn/pkg/config"
 	"0xacab.org/leap/bitmask-vpn/pkg/snowflake"
+	"0xacab.org/leap/menshen/models"
 
 	"github.com/rs/zerolog/log"
 )
diff --git a/vendor/0xacab.org/leap/bitmask-core/models/models_e_ip_service.go b/vendor/0xacab.org/leap/bitmask-core/models/models_e_ip_service.go
deleted file mode 100644
index 17ad391e5d6a1e1c792fa306d3b786f60f3d8e67..0000000000000000000000000000000000000000
--- a/vendor/0xacab.org/leap/bitmask-core/models/models_e_ip_service.go
+++ /dev/null
@@ -1,62 +0,0 @@
-// Code generated by go-swagger; DO NOT EDIT.
-
-package models
-
-// This file was generated by the swagger tool.
-// Editing this file might prove futile when you re-run the swagger generate command
-
-import (
-	"context"
-
-	"github.com/go-openapi/strfmt"
-	"github.com/go-openapi/swag"
-)
-
-// ModelsEIPService models e IP service
-//
-// swagger:model models.EIPService
-type ModelsEIPService struct {
-
-	// auth
-	Auth string `json:"auth,omitempty"`
-
-	// locations
-	Locations interface{} `json:"locations,omitempty"`
-
-	// openvpn configuration
-	OpenvpnConfiguration map[string]interface{} `json:"openvpn_configuration,omitempty"`
-
-	// serial
-	Serial int64 `json:"serial,omitempty"`
-
-	// version
-	Version int64 `json:"version,omitempty"`
-}
-
-// Validate validates this models e IP service
-func (m *ModelsEIPService) Validate(formats strfmt.Registry) error {
-	return nil
-}
-
-// ContextValidate validates this models e IP service based on context it is used
-func (m *ModelsEIPService) ContextValidate(ctx context.Context, formats strfmt.Registry) error {
-	return nil
-}
-
-// MarshalBinary interface implementation
-func (m *ModelsEIPService) MarshalBinary() ([]byte, error) {
-	if m == nil {
-		return nil, nil
-	}
-	return swag.WriteJSON(m)
-}
-
-// UnmarshalBinary interface implementation
-func (m *ModelsEIPService) UnmarshalBinary(b []byte) error {
-	var res ModelsEIPService
-	if err := swag.ReadJSON(b, &res); err != nil {
-		return err
-	}
-	*m = res
-	return nil
-}
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/bootstrap/init.go b/vendor/0xacab.org/leap/bitmask-core/pkg/bootstrap/init.go
index 68041247bd541c95aafe95c308dd3f81973d1c8b..e03a4199833753eb835b5a2828eea37714108aef 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/bootstrap/init.go
+++ b/vendor/0xacab.org/leap/bitmask-core/pkg/bootstrap/init.go
@@ -12,17 +12,17 @@ import (
 	"github.com/rs/zerolog/log"
 	"golang.org/x/net/proxy"
 
-	"0xacab.org/leap/bitmask-core/models"
-	"0xacab.org/leap/bitmask-core/pkg/client"
-	"0xacab.org/leap/bitmask-core/pkg/client/provisioning"
 	"0xacab.org/leap/bitmask-core/pkg/introducer"
 	bitmask_storage "0xacab.org/leap/bitmask-core/pkg/storage"
+	"0xacab.org/leap/menshen/client"
+	"0xacab.org/leap/menshen/client/provisioning"
+	"0xacab.org/leap/menshen/models"
+
 	"0xacab.org/leap/tunnel-telemetry/pkg/geolocate"
 )
 
 type Config struct {
-	// country code used to fetch gateways/bridges. If geolocation lookup fails,
-	// api.config.FallbackCountryCode is used
+	// country code used to fetch gateways/bridges.
 	CountryCode string
 	// Host we will connect to for API operations.
 	Host string
@@ -150,7 +150,7 @@ func NewAPI(cfg *Config) (*API, error) {
 func getSocksProxyClient(proxyString string) (*http.Client, error) {
 	proxyURL, err := url.Parse(proxyString)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("failed to parse proxy URL: %w", err)
 	}
 
 	dialer, err := proxy.FromURL(proxyURL, proxy.Direct)
@@ -221,8 +221,6 @@ func (api *API) GetProvider() (*models.ModelsProvider, error) {
 }
 
 // call menshen endpoint /service and return response
-// TODO: rename endpoint and and this function
-// TODO: split /service into multiple endpoints:
 // locations, openvpn arguments, serial+version, auth
 func (api *API) GetService() (*models.ModelsEIPService, error) {
 	params := provisioning.NewGetAPI5ServiceParams()
@@ -230,13 +228,6 @@ func (api *API) GetService() (*models.ModelsEIPService, error) {
 		params = params.WithHTTPClient(api.httpClient)
 	}
 
-	// TODO: menshen needs to accept cc as param too.
-	/*
-		if api.config.CountryCode != "" {
-			params.Cc = api.config.CountryCode
-		}
-	*/
-
 	service, err := api.client.Provisioning.GetAPI5Service(params)
 	if err != nil {
 		return nil, err
@@ -244,21 +235,11 @@ func (api *API) GetService() (*models.ModelsEIPService, error) {
 	return service.Payload, nil
 }
 
-// TODO: split /service endpoint into multiple endpoints
-// then call this endpoint and return locations
-// do not use an internal variable to store it
-/*
-func (api *API) Locations() interface{} {
-	panic("TODO")
-}
-*/
-
 // GatewayParams contains the fields that can be used to filter the listing of available gateways.
 type GatewayParams struct {
 	Location  string
 	Port      string
 	Transport string
-	CC        string
 }
 
 type BridgeParams struct {
@@ -273,18 +254,29 @@ type BridgeParams struct {
 // API). It optionally accepts a GatewayParams object where you can set
 // different filters.
 func (api *API) GetGateways(p *GatewayParams) ([]*models.ModelsGateway, error) {
-	params := provisioning.NewGetAPI5GatewaysParams()
+	params := provisioning.NewGetAPI5GatewayParams()
 	if p != nil {
 		params.Loc = &p.Location
-		params.Port = &p.Port
-		params.Tr = &p.Transport
-		params.Cc = &p.CC
+		// /api/5/gateway curently only supports filtering by cc and location
+		//params.Port = &p.Port
+		//params.Transport = &p.Transport
+	}
+
+	if api.config.CountryCode != "" {
+		log.Debug().
+			Str("countryCode", api.config.CountryCode).
+			Msg("Setting country code")
+		params.Cc = &api.config.CountryCode
 	}
+
 	if api.httpClient != nil {
 		params = params.WithHTTPClient(api.httpClient)
 	}
+	authHeader := api.getInviteTokenAuth()
 
-	gateways, err := api.client.Provisioning.GetAPI5Gateways(params)
+	// TODO: only temporary - call api/5/gateway because it supports filtering
+	// will be merged in the future https://0xacab.org/leap/bitmask-core/-/issues/32
+	gateways, err := api.client.Provisioning.GetAPI5Gateway(params, authHeader)
 	if err != nil {
 		return nil, err
 	}
@@ -302,7 +294,8 @@ func (api *API) GetAllBridges(p *BridgeParams) ([]*models.ModelsBridge, error) {
 	if api.httpClient != nil {
 		params = params.WithHTTPClient(api.httpClient)
 	}
-	bridges, err := api.client.Provisioning.GetAPI5Bridges(params)
+	authHeader := api.getInviteTokenAuth()
+	bridges, err := api.client.Provisioning.GetAPI5Bridges(params, authHeader)
 	if err != nil {
 		return nil, err
 	}
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/bootstrap/utils.go b/vendor/0xacab.org/leap/bitmask-core/pkg/bootstrap/utils.go
index bb87325fa15ff4623256ab9f0f149666674aca15..4347bca69e178442a632875baf0910c0394216b3 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/bootstrap/utils.go
+++ b/vendor/0xacab.org/leap/bitmask-core/pkg/bootstrap/utils.go
@@ -9,6 +9,9 @@ import (
 	"strconv"
 	"time"
 
+	bitmask_storage "0xacab.org/leap/bitmask-core/pkg/storage"
+	"github.com/go-openapi/runtime"
+	openapi "github.com/go-openapi/runtime/client"
 	utls "github.com/refraction-networking/utls"
 	"github.com/rs/zerolog/log"
 	"golang.org/x/net/http2"
@@ -86,3 +89,37 @@ func (c *Config) getAPIClient() *http.Client {
 		return &http.Client{Timeout: time.Duration(30) * time.Second}
 	}
 }
+
+// Returns authentication header (invite token) from database
+// Returns nil if no introducer is saved or an error occurs
+func (api *API) getInviteTokenAuth() runtime.ClientAuthInfoWriter {
+	if len(api.config.Introducer) == 0 {
+		return nil
+	}
+
+	log.Trace().Msg("Getting invite token from db")
+	storage, err := bitmask_storage.GetStorage()
+	if err != nil {
+		log.Warn().
+			Err(err).
+			Msg("Could not get storage to load invite token")
+		return nil
+	}
+
+	introducer, err := storage.GetIntroducerByFQDN(api.config.Host)
+	if err != nil {
+		log.Debug().
+			Str("err", err.Error()).
+			Str("fqdn", api.config.Host).
+			Msg("Could not get introducer by fqdn")
+		return nil
+	}
+
+	if len(introducer.Auth) == 0 {
+		log.Warn().Msg("An introducer was found for this fqdn, but the invite token is empty")
+		return nil
+	}
+
+	log.Debug().Msg("Sending invite token")
+	return openapi.APIKeyAuth("x-menshen-auth-token", "header", introducer.Auth)
+}
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/bridge/bridge.go b/vendor/0xacab.org/leap/bitmask-core/pkg/bridge/bridge.go
new file mode 100644
index 0000000000000000000000000000000000000000..3b657687f2a09573603ac6a6388a57e3cda5a8d4
--- /dev/null
+++ b/vendor/0xacab.org/leap/bitmask-core/pkg/bridge/bridge.go
@@ -0,0 +1,14 @@
+package bridge
+
+import "time"
+
+// Bridge is a private bridge.
+type Bridge struct {
+	Name     string
+	Location string
+	Type     string
+	// Raw is the raw JSON serialization of the bridge. We could also use the menshen model as a nested struct.
+	Raw       string
+	CreatedAt time.Time
+	LastUsed  time.Time
+}
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/introducer/httpClient.go b/vendor/0xacab.org/leap/bitmask-core/pkg/introducer/httpClient.go
index 167a2a2abe05972eae35616a41a13cb9ccb18dd2..4c3904635c2ff8e72b805a9b17034fa05d698d3e 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/introducer/httpClient.go
+++ b/vendor/0xacab.org/leap/bitmask-core/pkg/introducer/httpClient.go
@@ -8,26 +8,10 @@ import (
 
 	"github.com/rs/zerolog/log"
 
-	"0xacab.org/leap/bitmask-core/pkg/storage"
 	"0xacab.org/leap/obfsvpn/obfsvpn"
 	"github.com/xtaci/kcp-go"
 )
 
-// CallbackTransport calls a callback function after a successful 200 OK response.
-type CallbackTransport struct {
-	OriginalTransport http.RoundTripper
-	Callback          func(*http.Response)
-}
-
-// RoundTrip performs the request and calls the callback if we get a 200 OK.
-func (c *CallbackTransport) RoundTrip(req *http.Request) (*http.Response, error) {
-	resp, err := c.OriginalTransport.RoundTrip(req)
-	if err == nil && resp.StatusCode == http.StatusOK {
-		c.Callback(resp)
-	}
-	return resp, err
-}
-
 // NewHTTPClientFromIntroducer returns an http.Client that will use the passed introducer.
 func NewHTTPClientFromIntroducer(introducer *Introducer) (*http.Client, error) {
 
@@ -73,23 +57,7 @@ func NewHTTPClientFromIntroducer(introducer *Introducer) (*http.Client, error) {
 	}
 
 	client := &http.Client{
-		Transport: &CallbackTransport{
-			OriginalTransport: transport,
-			Callback: func(r *http.Response) {
-				store, err := storage.GetStorage()
-				if err != nil {
-					log.Warn().
-						Err(err).
-						Msg("Could not load DB to update lastUsed timestamp for introducer")
-					return
-				}
-				if err := store.UpdateLastUsedForIntroducer(introducer.URL()); err != nil {
-					log.Warn().
-						Err(err).
-						Msg("Could not update lastUsed timestamp for introducer")
-				}
-			},
-		},
+		Transport: transport,
 	}
 	return client, nil
 }
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/introducer/introducer.go b/vendor/0xacab.org/leap/bitmask-core/pkg/introducer/introducer.go
index 0a244acf09178f36e1b73a5970ea29721a2c6160..1c9ec007d47fa963b8d7788a08a58d38ec4029b8 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/introducer/introducer.go
+++ b/vendor/0xacab.org/leap/bitmask-core/pkg/introducer/introducer.go
@@ -6,15 +6,18 @@ import (
 	"fmt"
 	"net/url"
 	"strings"
+
+	"github.com/rs/zerolog/log"
 )
 
 // Introducer has everything we need to instantiate a http.Client that dials to the obfuscated introducer.
 type Introducer struct {
-	Type string
-	Addr string
-	Cert string
-	FQDN string
-	KCP  bool
+	Type string // currently only obfsvpnintro is supported
+	Addr string // host/ip of proxy - can have a port, e.g. proxy.menshen.net:4430
+	Cert string // obfs4 proxy cert
+	FQDN string // hostname of menshen hostname the proxy server should connect to
+	KCP  bool   // use obfs4 with kcp
+	Auth string // credentials for menshen authentication to get private gateways/bridges
 }
 
 // Validate returns true if all the fields in this Introducer are like expected.
@@ -31,6 +34,9 @@ func (i *Introducer) Validate() error {
 	if i.FQDN != "localhost" && len(strings.Split(i.FQDN, ".")) < 2 {
 		return fmt.Errorf("expected a FQDN, got: %s", i.FQDN)
 	}
+	if len(i.Auth) == 0 {
+		log.Warn().Msg("Invite token of introducer url is empty")
+	}
 	return nil
 }
 
@@ -44,19 +50,20 @@ func (i *Introducer) URL() string {
 	default:
 		kcp = "0"
 	}
-	return fmt.Sprintf("%s://%s/?cert=%s&fqdn=%s&kcp=%s",
+	return fmt.Sprintf("%s://%s/?cert=%s&fqdn=%s&kcp=%s&auth=%s",
 		i.Type,
 		i.Addr,
-		i.Cert,
-		i.FQDN,
+		url.QueryEscape(i.Cert),
+		url.QueryEscape(i.FQDN),
 		kcp,
+		url.QueryEscape(i.Auth),
 	)
 }
 
 // NewIntroducerFromURL returns a new Introducer after parsing the passed URL. It will also return an error
 // if it was not possible to parse the URL correctly.
 func NewIntroducerFromURL(introducerURL string) (*Introducer, error) {
-	// Parse the introducer URL string
+	// Parse the introducer URL string. Get parameters are automatically url decoded
 	u, err := url.Parse(introducerURL)
 	if err != nil {
 		return nil, fmt.Errorf("failed to parse introducer URL: %w", err)
@@ -80,12 +87,16 @@ func NewIntroducerFromURL(introducerURL string) (*Introducer, error) {
 		return nil, fmt.Errorf("cert not found in the introducer URL")
 	}
 
+	// Extract Auth from query parameters (can be empty)
+	auth := u.Query().Get("auth")
+
 	introducer := &Introducer{
 		Type: u.Scheme,
 		Addr: u.Host,
 		FQDN: fqdn,
 		KCP:  kcp,
 		Cert: cert,
+		Auth: auth,
 	}
 
 	return introducer, nil
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/models/models.go b/vendor/0xacab.org/leap/bitmask-core/pkg/models/models.go
deleted file mode 100644
index d7073625fd5c12501f78f1d18a873dfc2af81f5a..0000000000000000000000000000000000000000
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/models/models.go
+++ /dev/null
@@ -1,29 +0,0 @@
-// models contains models that relate to client-only concepts, like objects we keep in the internal storage.
-// For models coming from the menshen API Spec, see ../../models
-package models
-
-import "time"
-
-// TODO - can simplify into a single model. URL and Raw are equivalent to "Blob"
-// TODO - introduce "type" field.
-
-// Introducer keeps metadata about introducers that the user has added to the Bitmask application. Introducers are expected to be transmitted off-band.
-type Introducer struct {
-	FQDN string
-	// URL is the canonical URL. It should be stored after validation and writing in the canonical order, since
-	// we will check for uniqueness.
-	URL       string
-	CreatedAt time.Time
-	LastUsed  time.Time
-}
-
-// Bridge is a private bridge.
-type Bridge struct {
-	Name     string
-	Location string
-	Type     string
-	// Raw is the raw JSON serialization of the bridge. We could also use the menshen model as a nested struct.
-	Raw       string
-	CreatedAt time.Time
-	LastUsed  time.Time
-}
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/storage/storage.go b/vendor/0xacab.org/leap/bitmask-core/pkg/storage/storage.go
index b0a66d3ac0c1f47fa24ec07c36b08fb68d2fdd60..b4bd54f93f5a2eb7cc2920efcbd66703f48e6f26 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/storage/storage.go
+++ b/vendor/0xacab.org/leap/bitmask-core/pkg/storage/storage.go
@@ -19,14 +19,14 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
-	"net/url"
 	"os"
 	"path/filepath"
 	"time"
 
 	homedir "github.com/mitchellh/go-homedir"
 
-	"0xacab.org/leap/bitmask-core/pkg/models"
+	"0xacab.org/leap/bitmask-core/pkg/bridge"
+	"0xacab.org/leap/bitmask-core/pkg/introducer"
 )
 
 var AppName = "bitmask"
@@ -43,10 +43,10 @@ type Storage struct {
 }
 
 // CompareIntroducer is a function type for the comparison of introducers
-type CompareIntroducer func(introducer models.Introducer) bool
+type CompareIntroducer func(introducer introducer.Introducer) bool
 
 // CompareBridge is a function type for the comparison of bridges
-type CompareBridge func(bridge models.Bridge) bool
+type CompareBridge func(bridge bridge.Bridge) bool
 
 // InitAppStorage initializes the global storage with a given storage instance
 func InitAppStorageWith(store Store) {
@@ -115,50 +115,43 @@ func NewStorageWithDefaultDir() (*Storage, error) {
 	return NewStorage(configPath)
 }
 
-// AddIntroducers adds a new Introducer model from URL
-// to the storage
-func (s *Storage) AddIntroducer(url string) error {
-	fqdn, err := getFQDNFromIntroducerURL(url)
+// Add new introducer to storage. Before, delete all existing
+// introducers with the same fqdn
+func (s *Storage) AddIntroducer(intro *introducer.Introducer) error {
+	err := intro.Validate()
 	if err != nil {
 		return err
 	}
 
-	item := &models.Introducer{
-		FQDN:      *fqdn,
-		URL:       url,
-		CreatedAt: time.Now(),
-	}
-
 	// delete existing introducer with same FQDN as we only want
 	// to allow one for per provider for now
-	err = s.DeleteIntroducer(*fqdn)
+	err = s.DeleteIntroducer(intro.FQDN)
 	if err != nil {
 		return err
 	}
 
 	introducers, err := s.getAllIntroducers()
-
 	if err != nil {
 		return err
 	}
-	introducers = append(introducers, *item)
+	introducers = append(introducers, *intro)
 	return s.saveIntroducers(introducers)
 }
 
-func (s *Storage) getAllIntroducers() ([]models.Introducer, error) {
+func (s *Storage) getAllIntroducers() ([]introducer.Introducer, error) {
 	// Create an empty slice of Introducer
-	emptySlice := []models.Introducer{}
+	emptySlice := []introducer.Introducer{}
 	bytes, _ := json.Marshal(emptySlice)
 
 	introducerString := s.store.GetByteArrayWithDefault(INTRODUCER, bytes)
-	introducers, err := unmarshalJSON[[]models.Introducer](introducerString)
+	introducers, err := unmarshalJSON[[]introducer.Introducer](introducerString)
 	if err != nil {
 		return nil, err
 	}
-	return *introducers, err
+	return *introducers, nil
 }
 
-func (s *Storage) saveIntroducers(introducers []models.Introducer) error {
+func (s *Storage) saveIntroducers(introducers []introducer.Introducer) error {
 	bytes, err := json.Marshal(introducers)
 	if err != nil {
 		return err
@@ -177,27 +170,19 @@ func unmarshalJSON[T any](data []byte) (*T, error) {
 }
 
 // ListIntroducers returns an array of all introducers
-func (s *Storage) ListIntroducers() ([]models.Introducer, error) {
+func (s *Storage) ListIntroducers() ([]introducer.Introducer, error) {
 	return s.getAllIntroducers()
 }
 
 // GetIntroducerByFQDN returns the first introducer for a given fqdn.
-func (s *Storage) GetIntroducerByFQDN(fqdn string) (*models.Introducer, error) {
-	compare := func(intro models.Introducer) bool {
+func (s *Storage) GetIntroducerByFQDN(fqdn string) (*introducer.Introducer, error) {
+	compare := func(intro introducer.Introducer) bool {
 		return intro.FQDN == fqdn
 	}
 	return s.getFirstIntroducer(compare)
 }
 
-// GetIntroducerByURL will return the Introducer with the given URL, if found, and an error.
-func (s *Storage) GetIntroducerByURL(url string) (*models.Introducer, error) {
-	compare := func(intro models.Introducer) bool {
-		return intro.URL == url
-	}
-	return s.getFirstIntroducer(compare)
-}
-
-func (s *Storage) getFirstIntroducer(compare CompareIntroducer) (*models.Introducer, error) {
+func (s *Storage) getFirstIntroducer(compare CompareIntroducer) (*introducer.Introducer, error) {
 	introducers, err := s.getAllIntroducers()
 	if err != nil {
 		return nil, err
@@ -210,28 +195,6 @@ func (s *Storage) getFirstIntroducer(compare CompareIntroducer) (*models.Introdu
 	return nil, fmt.Errorf("introducer not found")
 }
 
-func (s *Storage) updateFirstIntroducer(compare CompareIntroducer, lastUsed time.Time) ([]models.Introducer, error) {
-	introducers, err := s.getAllIntroducers()
-	if err != nil {
-		return nil, err
-	}
-
-	found := false
-	for _, intro := range introducers {
-		if compare(intro) {
-			intro.LastUsed = lastUsed
-			found = true
-			break
-		}
-	}
-
-	if !found {
-		return nil, fmt.Errorf("introducer not found")
-	}
-
-	return introducers, nil
-}
-
 // DeleteIntroducer deletes all introducers for a given fqdn.
 func (s *Storage) DeleteIntroducer(fqdn string) error {
 	if fqdn == "" {
@@ -242,11 +205,11 @@ func (s *Storage) DeleteIntroducer(fqdn string) error {
 		return err
 	}
 
-	compare := func(intro models.Introducer) bool {
+	compare := func(intro introducer.Introducer) bool {
 		return fqdn == intro.FQDN
 	}
 
-	updatedIntroducers := func() []models.Introducer {
+	updatedIntroducers := func() []introducer.Introducer {
 		for i, intro := range introducers {
 			if compare(intro) {
 				// Swap with the last element
@@ -263,7 +226,7 @@ func (s *Storage) DeleteIntroducer(fqdn string) error {
 
 // NewBridge creates a new Bridge from the passed parameters.
 func (s *Storage) NewBridge(name, bridgeType, location, raw string) error {
-	item := &models.Bridge{
+	item := &bridge.Bridge{
 		Name:     name,
 		Type:     bridgeType,
 		Location: location,
@@ -275,7 +238,7 @@ func (s *Storage) NewBridge(name, bridgeType, location, raw string) error {
 		return err
 	}
 
-	comparison := func(intro models.Bridge) bool {
+	comparison := func(intro bridge.Bridge) bool {
 		return intro.Name == name || intro.Raw == raw
 	}
 	bridge, _ := s.getFirstBridge(comparison)
@@ -287,20 +250,20 @@ func (s *Storage) NewBridge(name, bridgeType, location, raw string) error {
 	return s.saveBridges(bridges)
 }
 
-func (s *Storage) getAllBridges() ([]models.Bridge, error) {
+func (s *Storage) getAllBridges() ([]bridge.Bridge, error) {
 	// Create an empty slice of Introducer
-	emptySlice := []models.Bridge{}
+	emptySlice := []bridge.Bridge{}
 	bytes, _ := json.Marshal(emptySlice)
 
 	bridgeBytes := s.store.GetByteArrayWithDefault(BRIDGE, bytes)
-	bridges, err := unmarshalJSON[[]models.Bridge](bridgeBytes)
+	bridges, err := unmarshalJSON[[]bridge.Bridge](bridgeBytes)
 	if err != nil {
 		return nil, err
 	}
 	return *bridges, err
 }
 
-func (s *Storage) saveBridges(bridges []models.Bridge) error {
+func (s *Storage) saveBridges(bridges []bridge.Bridge) error {
 	bytes, err := json.Marshal(bridges)
 	if err != nil {
 		return err
@@ -310,11 +273,11 @@ func (s *Storage) saveBridges(bridges []models.Bridge) error {
 }
 
 // ListBridges returns an array of all the bridges.
-func (s *Storage) ListBridges() ([]models.Bridge, error) {
+func (s *Storage) ListBridges() ([]bridge.Bridge, error) {
 	return s.getAllBridges()
 }
 
-func (s *Storage) getFirstBridge(compare CompareBridge) (*models.Bridge, error) {
+func (s *Storage) getFirstBridge(compare CompareBridge) (*bridge.Bridge, error) {
 	bridges, err := s.getAllBridges()
 	if err != nil {
 		return nil, err
@@ -327,12 +290,12 @@ func (s *Storage) getFirstBridge(compare CompareBridge) (*models.Bridge, error)
 	return nil, fmt.Errorf("bridge not found")
 }
 
-func (s *Storage) getBridges(compare CompareBridge) ([]models.Bridge, error) {
+func (s *Storage) getBridges(compare CompareBridge) ([]bridge.Bridge, error) {
 	bridges, err := s.getAllBridges()
 	if err != nil {
 		return nil, err
 	}
-	var result []models.Bridge
+	var result []bridge.Bridge
 
 	for _, bridge := range bridges {
 		if compare(bridge) {
@@ -348,24 +311,24 @@ func (s *Storage) getBridges(compare CompareBridge) ([]models.Bridge, error) {
 }
 
 // GetBridgeByName will return the Bridge with the given Name, if found, and an error.
-func (s *Storage) GetBridgeByName(name string) (*models.Bridge, error) {
-	compare := func(bridge models.Bridge) bool {
+func (s *Storage) GetBridgeByName(name string) (*bridge.Bridge, error) {
+	compare := func(bridge bridge.Bridge) bool {
 		return bridge.Name == name
 	}
 	return s.getFirstBridge(compare)
 }
 
 // GetBridgesByType will return all Bridges with the given Type, if found, and an error.
-func (s *Storage) GetBridgesByType(bridgeType string) ([]models.Bridge, error) {
-	compare := func(bridge models.Bridge) bool {
+func (s *Storage) GetBridgesByType(bridgeType string) ([]bridge.Bridge, error) {
+	compare := func(bridge bridge.Bridge) bool {
 		return bridge.Type == bridgeType
 	}
 	return s.getBridges(compare)
 }
 
 // GetBridgesByLocation will return all Bridges with the given Location, if found, and an error.
-func (s *Storage) GetBridgesByLocation(location string) ([]models.Bridge, error) {
-	compare := func(bridge models.Bridge) bool {
+func (s *Storage) GetBridgesByLocation(location string) ([]bridge.Bridge, error) {
+	compare := func(bridge bridge.Bridge) bool {
 		return bridge.Location == location
 	}
 	return s.getBridges(compare)
@@ -379,11 +342,11 @@ func (s *Storage) DeleteBridge(name string) error {
 		return err
 	}
 
-	compare := func(bridge models.Bridge) bool {
+	compare := func(bridge bridge.Bridge) bool {
 		return bridge.Name == name
 	}
 
-	updatedBridges := func() []models.Bridge {
+	updatedBridges := func() []bridge.Bridge {
 		for i, bridge := range bridges {
 			if compare(bridge) {
 				// Swap with the last element
@@ -421,36 +384,6 @@ func GetStorage() (*Storage, error) {
 
 }
 
-func getFQDNFromIntroducerURL(introducerURL string) (*string, error) {
-	// Parse the introducer URL string
-	u, err := url.Parse(introducerURL)
-	if err != nil {
-		return nil, fmt.Errorf("failed to parse introducer URL: %w", err)
-	}
-
-	// Extract FQDN from query parameters
-	fqdn := u.Query().Get("fqdn")
-	if fqdn == "" {
-		return nil, fmt.Errorf("FQDN not found in the introducer URL")
-	}
-
-	return &fqdn, nil
-}
-
-// UpdateLastUsedForIntroducer will attempt to update the LastUsed timestamp for the introducer
-// that matches the passed URL.
-func (s *Storage) UpdateLastUsedForIntroducer(url string) error {
-	var compare CompareIntroducer = func(introducer models.Introducer) bool {
-		return introducer.URL == url
-	}
-	introducers, err := s.updateFirstIntroducer(compare, time.Now())
-	if err != nil {
-		return err
-	}
-
-	return s.saveIntroducers(introducers)
-}
-
 func (s *Storage) SaveFallbackCountryCode(cc string) {
 	s.store.SetString(COUNTRYCODE, cc)
 }
diff --git a/vendor/0xacab.org/leap/menshen/LICENSE b/vendor/0xacab.org/leap/menshen/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..94a9ed024d3859793618152ea559a168bbcbb5e2
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/vendor/0xacab.org/leap/menshen/client/agent/agent_client.go b/vendor/0xacab.org/leap/menshen/client/agent/agent_client.go
new file mode 100644
index 0000000000000000000000000000000000000000..8268a0066de7576d1b08c84c3d33f59ab2ef5bce
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/client/agent/agent_client.go
@@ -0,0 +1,151 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package agent
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"fmt"
+
+	"github.com/go-openapi/runtime"
+	httptransport "github.com/go-openapi/runtime/client"
+	"github.com/go-openapi/strfmt"
+)
+
+// New creates a new agent API client.
+func New(transport runtime.ClientTransport, formats strfmt.Registry) ClientService {
+	return &Client{transport: transport, formats: formats}
+}
+
+// New creates a new agent API client with basic auth credentials.
+// It takes the following parameters:
+// - host: http host (github.com).
+// - basePath: any base path for the API client ("/v1", "/v3").
+// - scheme: http scheme ("http", "https").
+// - user: user for basic authentication header.
+// - password: password for basic authentication header.
+func NewClientWithBasicAuth(host, basePath, scheme, user, password string) ClientService {
+	transport := httptransport.New(host, basePath, []string{scheme})
+	transport.DefaultAuthentication = httptransport.BasicAuth(user, password)
+	return &Client{transport: transport, formats: strfmt.Default}
+}
+
+// New creates a new agent API client with a bearer token for authentication.
+// It takes the following parameters:
+// - host: http host (github.com).
+// - basePath: any base path for the API client ("/v1", "/v3").
+// - scheme: http scheme ("http", "https").
+// - bearerToken: bearer token for Bearer authentication header.
+func NewClientWithBearerToken(host, basePath, scheme, bearerToken string) ClientService {
+	transport := httptransport.New(host, basePath, []string{scheme})
+	transport.DefaultAuthentication = httptransport.BearerToken(bearerToken)
+	return &Client{transport: transport, formats: strfmt.Default}
+}
+
+/*
+Client for agent API
+*/
+type Client struct {
+	transport runtime.ClientTransport
+	formats   strfmt.Registry
+}
+
+// ClientOption may be used to customize the behavior of Client methods.
+type ClientOption func(*runtime.ClientOperation)
+
+// ClientService is the interface for Client methods
+type ClientService interface {
+	PutAPI5AgentBridge(params *PutAPI5AgentBridgeParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*PutApi5AgentBridgeOK, error)
+
+	PutAPI5AgentGateway(params *PutAPI5AgentGatewayParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*PutApi5AgentGatewayOK, error)
+
+	SetTransport(transport runtime.ClientTransport)
+}
+
+/*
+PutAPI5AgentBridge registers a bridge
+
+Register a bridge. This endpoint allows "menshen agent" processes running on bridges to register themselves.
+*/
+func (a *Client) PutAPI5AgentBridge(params *PutAPI5AgentBridgeParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*PutApi5AgentBridgeOK, error) {
+	// TODO: Validate the params before sending
+	if params == nil {
+		params = NewPutAPI5AgentBridgeParams()
+	}
+	op := &runtime.ClientOperation{
+		ID:                 "PutAPI5AgentBridge",
+		Method:             "PUT",
+		PathPattern:        "/api/5/agent/bridge",
+		ProducesMediaTypes: []string{"application/json"},
+		ConsumesMediaTypes: []string{"application/json"},
+		Schemes:            []string{"http"},
+		Params:             params,
+		Reader:             &PutAPI5AgentBridgeReader{formats: a.formats},
+		AuthInfo:           authInfo,
+		Context:            params.Context,
+		Client:             params.HTTPClient,
+	}
+	for _, opt := range opts {
+		opt(op)
+	}
+
+	result, err := a.transport.Submit(op)
+	if err != nil {
+		return nil, err
+	}
+	success, ok := result.(*PutApi5AgentBridgeOK)
+	if ok {
+		return success, nil
+	}
+	// unexpected success response
+	// safeguard: normally, absent a default response, unknown success responses return an error above: so this is a codegen issue
+	msg := fmt.Sprintf("unexpected success response for PutAPI5AgentBridge: API contract not enforced by server. Client expected to get an error, but got: %T", result)
+	panic(msg)
+}
+
+/*
+PutAPI5AgentGateway registers a gateway
+
+Register a gateway. This endpoint allows "menshen agent" processes running on gateways to register themselves.
+*/
+func (a *Client) PutAPI5AgentGateway(params *PutAPI5AgentGatewayParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*PutApi5AgentGatewayOK, error) {
+	// TODO: Validate the params before sending
+	if params == nil {
+		params = NewPutAPI5AgentGatewayParams()
+	}
+	op := &runtime.ClientOperation{
+		ID:                 "PutAPI5AgentGateway",
+		Method:             "PUT",
+		PathPattern:        "/api/5/agent/gateway",
+		ProducesMediaTypes: []string{"application/json"},
+		ConsumesMediaTypes: []string{"application/json"},
+		Schemes:            []string{"http"},
+		Params:             params,
+		Reader:             &PutAPI5AgentGatewayReader{formats: a.formats},
+		AuthInfo:           authInfo,
+		Context:            params.Context,
+		Client:             params.HTTPClient,
+	}
+	for _, opt := range opts {
+		opt(op)
+	}
+
+	result, err := a.transport.Submit(op)
+	if err != nil {
+		return nil, err
+	}
+	success, ok := result.(*PutApi5AgentGatewayOK)
+	if ok {
+		return success, nil
+	}
+	// unexpected success response
+	// safeguard: normally, absent a default response, unknown success responses return an error above: so this is a codegen issue
+	msg := fmt.Sprintf("unexpected success response for PutAPI5AgentGateway: API contract not enforced by server. Client expected to get an error, but got: %T", result)
+	panic(msg)
+}
+
+// SetTransport changes the transport on the client
+func (a *Client) SetTransport(transport runtime.ClientTransport) {
+	a.transport = transport
+}
diff --git a/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_bridge_parameters.go b/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_bridge_parameters.go
new file mode 100644
index 0000000000000000000000000000000000000000..8456dc19227122711ecd791400eec3157357c42c
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_bridge_parameters.go
@@ -0,0 +1,153 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package agent
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"context"
+	"net/http"
+	"time"
+
+	"github.com/go-openapi/errors"
+	"github.com/go-openapi/runtime"
+	cr "github.com/go-openapi/runtime/client"
+	"github.com/go-openapi/strfmt"
+
+	"0xacab.org/leap/menshen/models"
+)
+
+// NewPutAPI5AgentBridgeParams creates a new PutAPI5AgentBridgeParams object,
+// with the default timeout for this client.
+//
+// Default values are not hydrated, since defaults are normally applied by the API server side.
+//
+// To enforce default values in parameter, use SetDefaults or WithDefaults.
+func NewPutAPI5AgentBridgeParams() *PutAPI5AgentBridgeParams {
+	return &PutAPI5AgentBridgeParams{
+		timeout: cr.DefaultTimeout,
+	}
+}
+
+// NewPutAPI5AgentBridgeParamsWithTimeout creates a new PutAPI5AgentBridgeParams object
+// with the ability to set a timeout on a request.
+func NewPutAPI5AgentBridgeParamsWithTimeout(timeout time.Duration) *PutAPI5AgentBridgeParams {
+	return &PutAPI5AgentBridgeParams{
+		timeout: timeout,
+	}
+}
+
+// NewPutAPI5AgentBridgeParamsWithContext creates a new PutAPI5AgentBridgeParams object
+// with the ability to set a context for a request.
+func NewPutAPI5AgentBridgeParamsWithContext(ctx context.Context) *PutAPI5AgentBridgeParams {
+	return &PutAPI5AgentBridgeParams{
+		Context: ctx,
+	}
+}
+
+// NewPutAPI5AgentBridgeParamsWithHTTPClient creates a new PutAPI5AgentBridgeParams object
+// with the ability to set a custom HTTPClient for a request.
+func NewPutAPI5AgentBridgeParamsWithHTTPClient(client *http.Client) *PutAPI5AgentBridgeParams {
+	return &PutAPI5AgentBridgeParams{
+		HTTPClient: client,
+	}
+}
+
+/*
+PutAPI5AgentBridgeParams contains all the parameters to send to the API endpoint
+
+	for the put API 5 agent bridge operation.
+
+	Typically these are written to a http.Request.
+*/
+type PutAPI5AgentBridgeParams struct {
+
+	/* Bridge.
+
+	   bridge
+	*/
+	Bridge *models.ModelsBridge
+
+	timeout    time.Duration
+	Context    context.Context
+	HTTPClient *http.Client
+}
+
+// WithDefaults hydrates default values in the put API 5 agent bridge params (not the query body).
+//
+// All values with no default are reset to their zero value.
+func (o *PutAPI5AgentBridgeParams) WithDefaults() *PutAPI5AgentBridgeParams {
+	o.SetDefaults()
+	return o
+}
+
+// SetDefaults hydrates default values in the put API 5 agent bridge params (not the query body).
+//
+// All values with no default are reset to their zero value.
+func (o *PutAPI5AgentBridgeParams) SetDefaults() {
+	// no default values defined for this parameter
+}
+
+// WithTimeout adds the timeout to the put API 5 agent bridge params
+func (o *PutAPI5AgentBridgeParams) WithTimeout(timeout time.Duration) *PutAPI5AgentBridgeParams {
+	o.SetTimeout(timeout)
+	return o
+}
+
+// SetTimeout adds the timeout to the put API 5 agent bridge params
+func (o *PutAPI5AgentBridgeParams) SetTimeout(timeout time.Duration) {
+	o.timeout = timeout
+}
+
+// WithContext adds the context to the put API 5 agent bridge params
+func (o *PutAPI5AgentBridgeParams) WithContext(ctx context.Context) *PutAPI5AgentBridgeParams {
+	o.SetContext(ctx)
+	return o
+}
+
+// SetContext adds the context to the put API 5 agent bridge params
+func (o *PutAPI5AgentBridgeParams) SetContext(ctx context.Context) {
+	o.Context = ctx
+}
+
+// WithHTTPClient adds the HTTPClient to the put API 5 agent bridge params
+func (o *PutAPI5AgentBridgeParams) WithHTTPClient(client *http.Client) *PutAPI5AgentBridgeParams {
+	o.SetHTTPClient(client)
+	return o
+}
+
+// SetHTTPClient adds the HTTPClient to the put API 5 agent bridge params
+func (o *PutAPI5AgentBridgeParams) SetHTTPClient(client *http.Client) {
+	o.HTTPClient = client
+}
+
+// WithBridge adds the bridge to the put API 5 agent bridge params
+func (o *PutAPI5AgentBridgeParams) WithBridge(bridge *models.ModelsBridge) *PutAPI5AgentBridgeParams {
+	o.SetBridge(bridge)
+	return o
+}
+
+// SetBridge adds the bridge to the put API 5 agent bridge params
+func (o *PutAPI5AgentBridgeParams) SetBridge(bridge *models.ModelsBridge) {
+	o.Bridge = bridge
+}
+
+// WriteToRequest writes these params to a swagger request
+func (o *PutAPI5AgentBridgeParams) WriteToRequest(r runtime.ClientRequest, reg strfmt.Registry) error {
+
+	if err := r.SetTimeout(o.timeout); err != nil {
+		return err
+	}
+	var res []error
+	if o.Bridge != nil {
+		if err := r.SetBodyParam(o.Bridge); err != nil {
+			return err
+		}
+	}
+
+	if len(res) > 0 {
+		return errors.CompositeValidationError(res...)
+	}
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_bridge_responses.go b/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_bridge_responses.go
new file mode 100644
index 0000000000000000000000000000000000000000..9f9489f6537784566311703d439d784094ca55a6
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_bridge_responses.go
@@ -0,0 +1,328 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package agent
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+
+	"github.com/go-openapi/runtime"
+	"github.com/go-openapi/strfmt"
+
+	"0xacab.org/leap/menshen/models"
+)
+
+// PutAPI5AgentBridgeReader is a Reader for the PutAPI5AgentBridge structure.
+type PutAPI5AgentBridgeReader struct {
+	formats strfmt.Registry
+}
+
+// ReadResponse reads a server response into the received o.
+func (o *PutAPI5AgentBridgeReader) ReadResponse(response runtime.ClientResponse, consumer runtime.Consumer) (interface{}, error) {
+	switch response.Code() {
+	case 200:
+		result := NewPutApi5AgentBridgeOK()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return result, nil
+	case 400:
+		result := NewPutApi5AgentBridgeBadRequest()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return nil, result
+	case 404:
+		result := NewPutApi5AgentBridgeNotFound()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return nil, result
+	case 500:
+		result := NewPutApi5AgentBridgeInternalServerError()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return nil, result
+	default:
+		return nil, runtime.NewAPIError("[PUT /api/5/agent/bridge] PutAPI5AgentBridge", response, response.Code())
+	}
+}
+
+// NewPutApi5AgentBridgeOK creates a PutApi5AgentBridgeOK with default headers values
+func NewPutApi5AgentBridgeOK() *PutApi5AgentBridgeOK {
+	return &PutApi5AgentBridgeOK{}
+}
+
+/*
+PutApi5AgentBridgeOK describes a response with status code 200, with default header values.
+
+OK
+*/
+type PutApi5AgentBridgeOK struct {
+	Payload *models.ModelsBridge
+}
+
+// IsSuccess returns true when this put api5 agent bridge o k response has a 2xx status code
+func (o *PutApi5AgentBridgeOK) IsSuccess() bool {
+	return true
+}
+
+// IsRedirect returns true when this put api5 agent bridge o k response has a 3xx status code
+func (o *PutApi5AgentBridgeOK) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this put api5 agent bridge o k response has a 4xx status code
+func (o *PutApi5AgentBridgeOK) IsClientError() bool {
+	return false
+}
+
+// IsServerError returns true when this put api5 agent bridge o k response has a 5xx status code
+func (o *PutApi5AgentBridgeOK) IsServerError() bool {
+	return false
+}
+
+// IsCode returns true when this put api5 agent bridge o k response a status code equal to that given
+func (o *PutApi5AgentBridgeOK) IsCode(code int) bool {
+	return code == 200
+}
+
+// Code gets the status code for the put api5 agent bridge o k response
+func (o *PutApi5AgentBridgeOK) Code() int {
+	return 200
+}
+
+func (o *PutApi5AgentBridgeOK) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/bridge][%d] putApi5AgentBridgeOK %s", 200, payload)
+}
+
+func (o *PutApi5AgentBridgeOK) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/bridge][%d] putApi5AgentBridgeOK %s", 200, payload)
+}
+
+func (o *PutApi5AgentBridgeOK) GetPayload() *models.ModelsBridge {
+	return o.Payload
+}
+
+func (o *PutApi5AgentBridgeOK) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	o.Payload = new(models.ModelsBridge)
+
+	// response payload
+	if err := consumer.Consume(response.Body(), o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
+
+// NewPutApi5AgentBridgeBadRequest creates a PutApi5AgentBridgeBadRequest with default headers values
+func NewPutApi5AgentBridgeBadRequest() *PutApi5AgentBridgeBadRequest {
+	return &PutApi5AgentBridgeBadRequest{}
+}
+
+/*
+PutApi5AgentBridgeBadRequest describes a response with status code 400, with default header values.
+
+Bad Request
+*/
+type PutApi5AgentBridgeBadRequest struct {
+	Payload interface{}
+}
+
+// IsSuccess returns true when this put api5 agent bridge bad request response has a 2xx status code
+func (o *PutApi5AgentBridgeBadRequest) IsSuccess() bool {
+	return false
+}
+
+// IsRedirect returns true when this put api5 agent bridge bad request response has a 3xx status code
+func (o *PutApi5AgentBridgeBadRequest) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this put api5 agent bridge bad request response has a 4xx status code
+func (o *PutApi5AgentBridgeBadRequest) IsClientError() bool {
+	return true
+}
+
+// IsServerError returns true when this put api5 agent bridge bad request response has a 5xx status code
+func (o *PutApi5AgentBridgeBadRequest) IsServerError() bool {
+	return false
+}
+
+// IsCode returns true when this put api5 agent bridge bad request response a status code equal to that given
+func (o *PutApi5AgentBridgeBadRequest) IsCode(code int) bool {
+	return code == 400
+}
+
+// Code gets the status code for the put api5 agent bridge bad request response
+func (o *PutApi5AgentBridgeBadRequest) Code() int {
+	return 400
+}
+
+func (o *PutApi5AgentBridgeBadRequest) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/bridge][%d] putApi5AgentBridgeBadRequest %s", 400, payload)
+}
+
+func (o *PutApi5AgentBridgeBadRequest) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/bridge][%d] putApi5AgentBridgeBadRequest %s", 400, payload)
+}
+
+func (o *PutApi5AgentBridgeBadRequest) GetPayload() interface{} {
+	return o.Payload
+}
+
+func (o *PutApi5AgentBridgeBadRequest) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
+
+// NewPutApi5AgentBridgeNotFound creates a PutApi5AgentBridgeNotFound with default headers values
+func NewPutApi5AgentBridgeNotFound() *PutApi5AgentBridgeNotFound {
+	return &PutApi5AgentBridgeNotFound{}
+}
+
+/*
+PutApi5AgentBridgeNotFound describes a response with status code 404, with default header values.
+
+Not Found
+*/
+type PutApi5AgentBridgeNotFound struct {
+	Payload interface{}
+}
+
+// IsSuccess returns true when this put api5 agent bridge not found response has a 2xx status code
+func (o *PutApi5AgentBridgeNotFound) IsSuccess() bool {
+	return false
+}
+
+// IsRedirect returns true when this put api5 agent bridge not found response has a 3xx status code
+func (o *PutApi5AgentBridgeNotFound) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this put api5 agent bridge not found response has a 4xx status code
+func (o *PutApi5AgentBridgeNotFound) IsClientError() bool {
+	return true
+}
+
+// IsServerError returns true when this put api5 agent bridge not found response has a 5xx status code
+func (o *PutApi5AgentBridgeNotFound) IsServerError() bool {
+	return false
+}
+
+// IsCode returns true when this put api5 agent bridge not found response a status code equal to that given
+func (o *PutApi5AgentBridgeNotFound) IsCode(code int) bool {
+	return code == 404
+}
+
+// Code gets the status code for the put api5 agent bridge not found response
+func (o *PutApi5AgentBridgeNotFound) Code() int {
+	return 404
+}
+
+func (o *PutApi5AgentBridgeNotFound) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/bridge][%d] putApi5AgentBridgeNotFound %s", 404, payload)
+}
+
+func (o *PutApi5AgentBridgeNotFound) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/bridge][%d] putApi5AgentBridgeNotFound %s", 404, payload)
+}
+
+func (o *PutApi5AgentBridgeNotFound) GetPayload() interface{} {
+	return o.Payload
+}
+
+func (o *PutApi5AgentBridgeNotFound) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
+
+// NewPutApi5AgentBridgeInternalServerError creates a PutApi5AgentBridgeInternalServerError with default headers values
+func NewPutApi5AgentBridgeInternalServerError() *PutApi5AgentBridgeInternalServerError {
+	return &PutApi5AgentBridgeInternalServerError{}
+}
+
+/*
+PutApi5AgentBridgeInternalServerError describes a response with status code 500, with default header values.
+
+Internal Server Error
+*/
+type PutApi5AgentBridgeInternalServerError struct {
+	Payload interface{}
+}
+
+// IsSuccess returns true when this put api5 agent bridge internal server error response has a 2xx status code
+func (o *PutApi5AgentBridgeInternalServerError) IsSuccess() bool {
+	return false
+}
+
+// IsRedirect returns true when this put api5 agent bridge internal server error response has a 3xx status code
+func (o *PutApi5AgentBridgeInternalServerError) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this put api5 agent bridge internal server error response has a 4xx status code
+func (o *PutApi5AgentBridgeInternalServerError) IsClientError() bool {
+	return false
+}
+
+// IsServerError returns true when this put api5 agent bridge internal server error response has a 5xx status code
+func (o *PutApi5AgentBridgeInternalServerError) IsServerError() bool {
+	return true
+}
+
+// IsCode returns true when this put api5 agent bridge internal server error response a status code equal to that given
+func (o *PutApi5AgentBridgeInternalServerError) IsCode(code int) bool {
+	return code == 500
+}
+
+// Code gets the status code for the put api5 agent bridge internal server error response
+func (o *PutApi5AgentBridgeInternalServerError) Code() int {
+	return 500
+}
+
+func (o *PutApi5AgentBridgeInternalServerError) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/bridge][%d] putApi5AgentBridgeInternalServerError %s", 500, payload)
+}
+
+func (o *PutApi5AgentBridgeInternalServerError) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/bridge][%d] putApi5AgentBridgeInternalServerError %s", 500, payload)
+}
+
+func (o *PutApi5AgentBridgeInternalServerError) GetPayload() interface{} {
+	return o.Payload
+}
+
+func (o *PutApi5AgentBridgeInternalServerError) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_gateway_parameters.go b/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_gateway_parameters.go
new file mode 100644
index 0000000000000000000000000000000000000000..bda91c3e25de209547bfed420f59b3955282e160
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_gateway_parameters.go
@@ -0,0 +1,153 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package agent
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"context"
+	"net/http"
+	"time"
+
+	"github.com/go-openapi/errors"
+	"github.com/go-openapi/runtime"
+	cr "github.com/go-openapi/runtime/client"
+	"github.com/go-openapi/strfmt"
+
+	"0xacab.org/leap/menshen/models"
+)
+
+// NewPutAPI5AgentGatewayParams creates a new PutAPI5AgentGatewayParams object,
+// with the default timeout for this client.
+//
+// Default values are not hydrated, since defaults are normally applied by the API server side.
+//
+// To enforce default values in parameter, use SetDefaults or WithDefaults.
+func NewPutAPI5AgentGatewayParams() *PutAPI5AgentGatewayParams {
+	return &PutAPI5AgentGatewayParams{
+		timeout: cr.DefaultTimeout,
+	}
+}
+
+// NewPutAPI5AgentGatewayParamsWithTimeout creates a new PutAPI5AgentGatewayParams object
+// with the ability to set a timeout on a request.
+func NewPutAPI5AgentGatewayParamsWithTimeout(timeout time.Duration) *PutAPI5AgentGatewayParams {
+	return &PutAPI5AgentGatewayParams{
+		timeout: timeout,
+	}
+}
+
+// NewPutAPI5AgentGatewayParamsWithContext creates a new PutAPI5AgentGatewayParams object
+// with the ability to set a context for a request.
+func NewPutAPI5AgentGatewayParamsWithContext(ctx context.Context) *PutAPI5AgentGatewayParams {
+	return &PutAPI5AgentGatewayParams{
+		Context: ctx,
+	}
+}
+
+// NewPutAPI5AgentGatewayParamsWithHTTPClient creates a new PutAPI5AgentGatewayParams object
+// with the ability to set a custom HTTPClient for a request.
+func NewPutAPI5AgentGatewayParamsWithHTTPClient(client *http.Client) *PutAPI5AgentGatewayParams {
+	return &PutAPI5AgentGatewayParams{
+		HTTPClient: client,
+	}
+}
+
+/*
+PutAPI5AgentGatewayParams contains all the parameters to send to the API endpoint
+
+	for the put API 5 agent gateway operation.
+
+	Typically these are written to a http.Request.
+*/
+type PutAPI5AgentGatewayParams struct {
+
+	/* Gateway.
+
+	   gateway
+	*/
+	Gateway *models.ModelsGateway
+
+	timeout    time.Duration
+	Context    context.Context
+	HTTPClient *http.Client
+}
+
+// WithDefaults hydrates default values in the put API 5 agent gateway params (not the query body).
+//
+// All values with no default are reset to their zero value.
+func (o *PutAPI5AgentGatewayParams) WithDefaults() *PutAPI5AgentGatewayParams {
+	o.SetDefaults()
+	return o
+}
+
+// SetDefaults hydrates default values in the put API 5 agent gateway params (not the query body).
+//
+// All values with no default are reset to their zero value.
+func (o *PutAPI5AgentGatewayParams) SetDefaults() {
+	// no default values defined for this parameter
+}
+
+// WithTimeout adds the timeout to the put API 5 agent gateway params
+func (o *PutAPI5AgentGatewayParams) WithTimeout(timeout time.Duration) *PutAPI5AgentGatewayParams {
+	o.SetTimeout(timeout)
+	return o
+}
+
+// SetTimeout adds the timeout to the put API 5 agent gateway params
+func (o *PutAPI5AgentGatewayParams) SetTimeout(timeout time.Duration) {
+	o.timeout = timeout
+}
+
+// WithContext adds the context to the put API 5 agent gateway params
+func (o *PutAPI5AgentGatewayParams) WithContext(ctx context.Context) *PutAPI5AgentGatewayParams {
+	o.SetContext(ctx)
+	return o
+}
+
+// SetContext adds the context to the put API 5 agent gateway params
+func (o *PutAPI5AgentGatewayParams) SetContext(ctx context.Context) {
+	o.Context = ctx
+}
+
+// WithHTTPClient adds the HTTPClient to the put API 5 agent gateway params
+func (o *PutAPI5AgentGatewayParams) WithHTTPClient(client *http.Client) *PutAPI5AgentGatewayParams {
+	o.SetHTTPClient(client)
+	return o
+}
+
+// SetHTTPClient adds the HTTPClient to the put API 5 agent gateway params
+func (o *PutAPI5AgentGatewayParams) SetHTTPClient(client *http.Client) {
+	o.HTTPClient = client
+}
+
+// WithGateway adds the gateway to the put API 5 agent gateway params
+func (o *PutAPI5AgentGatewayParams) WithGateway(gateway *models.ModelsGateway) *PutAPI5AgentGatewayParams {
+	o.SetGateway(gateway)
+	return o
+}
+
+// SetGateway adds the gateway to the put API 5 agent gateway params
+func (o *PutAPI5AgentGatewayParams) SetGateway(gateway *models.ModelsGateway) {
+	o.Gateway = gateway
+}
+
+// WriteToRequest writes these params to a swagger request
+func (o *PutAPI5AgentGatewayParams) WriteToRequest(r runtime.ClientRequest, reg strfmt.Registry) error {
+
+	if err := r.SetTimeout(o.timeout); err != nil {
+		return err
+	}
+	var res []error
+	if o.Gateway != nil {
+		if err := r.SetBodyParam(o.Gateway); err != nil {
+			return err
+		}
+	}
+
+	if len(res) > 0 {
+		return errors.CompositeValidationError(res...)
+	}
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_gateway_responses.go b/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_gateway_responses.go
new file mode 100644
index 0000000000000000000000000000000000000000..6a1dbbff208cfeaa3939b532cf406af367bb26c1
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/client/agent/put_api_5_agent_gateway_responses.go
@@ -0,0 +1,328 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package agent
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+
+	"github.com/go-openapi/runtime"
+	"github.com/go-openapi/strfmt"
+
+	"0xacab.org/leap/menshen/models"
+)
+
+// PutAPI5AgentGatewayReader is a Reader for the PutAPI5AgentGateway structure.
+type PutAPI5AgentGatewayReader struct {
+	formats strfmt.Registry
+}
+
+// ReadResponse reads a server response into the received o.
+func (o *PutAPI5AgentGatewayReader) ReadResponse(response runtime.ClientResponse, consumer runtime.Consumer) (interface{}, error) {
+	switch response.Code() {
+	case 200:
+		result := NewPutApi5AgentGatewayOK()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return result, nil
+	case 400:
+		result := NewPutApi5AgentGatewayBadRequest()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return nil, result
+	case 404:
+		result := NewPutApi5AgentGatewayNotFound()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return nil, result
+	case 500:
+		result := NewPutApi5AgentGatewayInternalServerError()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return nil, result
+	default:
+		return nil, runtime.NewAPIError("[PUT /api/5/agent/gateway] PutAPI5AgentGateway", response, response.Code())
+	}
+}
+
+// NewPutApi5AgentGatewayOK creates a PutApi5AgentGatewayOK with default headers values
+func NewPutApi5AgentGatewayOK() *PutApi5AgentGatewayOK {
+	return &PutApi5AgentGatewayOK{}
+}
+
+/*
+PutApi5AgentGatewayOK describes a response with status code 200, with default header values.
+
+OK
+*/
+type PutApi5AgentGatewayOK struct {
+	Payload *models.ModelsGateway
+}
+
+// IsSuccess returns true when this put api5 agent gateway o k response has a 2xx status code
+func (o *PutApi5AgentGatewayOK) IsSuccess() bool {
+	return true
+}
+
+// IsRedirect returns true when this put api5 agent gateway o k response has a 3xx status code
+func (o *PutApi5AgentGatewayOK) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this put api5 agent gateway o k response has a 4xx status code
+func (o *PutApi5AgentGatewayOK) IsClientError() bool {
+	return false
+}
+
+// IsServerError returns true when this put api5 agent gateway o k response has a 5xx status code
+func (o *PutApi5AgentGatewayOK) IsServerError() bool {
+	return false
+}
+
+// IsCode returns true when this put api5 agent gateway o k response a status code equal to that given
+func (o *PutApi5AgentGatewayOK) IsCode(code int) bool {
+	return code == 200
+}
+
+// Code gets the status code for the put api5 agent gateway o k response
+func (o *PutApi5AgentGatewayOK) Code() int {
+	return 200
+}
+
+func (o *PutApi5AgentGatewayOK) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/gateway][%d] putApi5AgentGatewayOK %s", 200, payload)
+}
+
+func (o *PutApi5AgentGatewayOK) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/gateway][%d] putApi5AgentGatewayOK %s", 200, payload)
+}
+
+func (o *PutApi5AgentGatewayOK) GetPayload() *models.ModelsGateway {
+	return o.Payload
+}
+
+func (o *PutApi5AgentGatewayOK) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	o.Payload = new(models.ModelsGateway)
+
+	// response payload
+	if err := consumer.Consume(response.Body(), o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
+
+// NewPutApi5AgentGatewayBadRequest creates a PutApi5AgentGatewayBadRequest with default headers values
+func NewPutApi5AgentGatewayBadRequest() *PutApi5AgentGatewayBadRequest {
+	return &PutApi5AgentGatewayBadRequest{}
+}
+
+/*
+PutApi5AgentGatewayBadRequest describes a response with status code 400, with default header values.
+
+Bad Request
+*/
+type PutApi5AgentGatewayBadRequest struct {
+	Payload interface{}
+}
+
+// IsSuccess returns true when this put api5 agent gateway bad request response has a 2xx status code
+func (o *PutApi5AgentGatewayBadRequest) IsSuccess() bool {
+	return false
+}
+
+// IsRedirect returns true when this put api5 agent gateway bad request response has a 3xx status code
+func (o *PutApi5AgentGatewayBadRequest) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this put api5 agent gateway bad request response has a 4xx status code
+func (o *PutApi5AgentGatewayBadRequest) IsClientError() bool {
+	return true
+}
+
+// IsServerError returns true when this put api5 agent gateway bad request response has a 5xx status code
+func (o *PutApi5AgentGatewayBadRequest) IsServerError() bool {
+	return false
+}
+
+// IsCode returns true when this put api5 agent gateway bad request response a status code equal to that given
+func (o *PutApi5AgentGatewayBadRequest) IsCode(code int) bool {
+	return code == 400
+}
+
+// Code gets the status code for the put api5 agent gateway bad request response
+func (o *PutApi5AgentGatewayBadRequest) Code() int {
+	return 400
+}
+
+func (o *PutApi5AgentGatewayBadRequest) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/gateway][%d] putApi5AgentGatewayBadRequest %s", 400, payload)
+}
+
+func (o *PutApi5AgentGatewayBadRequest) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/gateway][%d] putApi5AgentGatewayBadRequest %s", 400, payload)
+}
+
+func (o *PutApi5AgentGatewayBadRequest) GetPayload() interface{} {
+	return o.Payload
+}
+
+func (o *PutApi5AgentGatewayBadRequest) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
+
+// NewPutApi5AgentGatewayNotFound creates a PutApi5AgentGatewayNotFound with default headers values
+func NewPutApi5AgentGatewayNotFound() *PutApi5AgentGatewayNotFound {
+	return &PutApi5AgentGatewayNotFound{}
+}
+
+/*
+PutApi5AgentGatewayNotFound describes a response with status code 404, with default header values.
+
+Not Found
+*/
+type PutApi5AgentGatewayNotFound struct {
+	Payload interface{}
+}
+
+// IsSuccess returns true when this put api5 agent gateway not found response has a 2xx status code
+func (o *PutApi5AgentGatewayNotFound) IsSuccess() bool {
+	return false
+}
+
+// IsRedirect returns true when this put api5 agent gateway not found response has a 3xx status code
+func (o *PutApi5AgentGatewayNotFound) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this put api5 agent gateway not found response has a 4xx status code
+func (o *PutApi5AgentGatewayNotFound) IsClientError() bool {
+	return true
+}
+
+// IsServerError returns true when this put api5 agent gateway not found response has a 5xx status code
+func (o *PutApi5AgentGatewayNotFound) IsServerError() bool {
+	return false
+}
+
+// IsCode returns true when this put api5 agent gateway not found response a status code equal to that given
+func (o *PutApi5AgentGatewayNotFound) IsCode(code int) bool {
+	return code == 404
+}
+
+// Code gets the status code for the put api5 agent gateway not found response
+func (o *PutApi5AgentGatewayNotFound) Code() int {
+	return 404
+}
+
+func (o *PutApi5AgentGatewayNotFound) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/gateway][%d] putApi5AgentGatewayNotFound %s", 404, payload)
+}
+
+func (o *PutApi5AgentGatewayNotFound) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/gateway][%d] putApi5AgentGatewayNotFound %s", 404, payload)
+}
+
+func (o *PutApi5AgentGatewayNotFound) GetPayload() interface{} {
+	return o.Payload
+}
+
+func (o *PutApi5AgentGatewayNotFound) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
+
+// NewPutApi5AgentGatewayInternalServerError creates a PutApi5AgentGatewayInternalServerError with default headers values
+func NewPutApi5AgentGatewayInternalServerError() *PutApi5AgentGatewayInternalServerError {
+	return &PutApi5AgentGatewayInternalServerError{}
+}
+
+/*
+PutApi5AgentGatewayInternalServerError describes a response with status code 500, with default header values.
+
+Internal Server Error
+*/
+type PutApi5AgentGatewayInternalServerError struct {
+	Payload interface{}
+}
+
+// IsSuccess returns true when this put api5 agent gateway internal server error response has a 2xx status code
+func (o *PutApi5AgentGatewayInternalServerError) IsSuccess() bool {
+	return false
+}
+
+// IsRedirect returns true when this put api5 agent gateway internal server error response has a 3xx status code
+func (o *PutApi5AgentGatewayInternalServerError) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this put api5 agent gateway internal server error response has a 4xx status code
+func (o *PutApi5AgentGatewayInternalServerError) IsClientError() bool {
+	return false
+}
+
+// IsServerError returns true when this put api5 agent gateway internal server error response has a 5xx status code
+func (o *PutApi5AgentGatewayInternalServerError) IsServerError() bool {
+	return true
+}
+
+// IsCode returns true when this put api5 agent gateway internal server error response a status code equal to that given
+func (o *PutApi5AgentGatewayInternalServerError) IsCode(code int) bool {
+	return code == 500
+}
+
+// Code gets the status code for the put api5 agent gateway internal server error response
+func (o *PutApi5AgentGatewayInternalServerError) Code() int {
+	return 500
+}
+
+func (o *PutApi5AgentGatewayInternalServerError) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/gateway][%d] putApi5AgentGatewayInternalServerError %s", 500, payload)
+}
+
+func (o *PutApi5AgentGatewayInternalServerError) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[PUT /api/5/agent/gateway][%d] putApi5AgentGatewayInternalServerError %s", 500, payload)
+}
+
+func (o *PutApi5AgentGatewayInternalServerError) GetPayload() interface{} {
+	return o.Payload
+}
+
+func (o *PutApi5AgentGatewayInternalServerError) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/menshen_api_client.go b/vendor/0xacab.org/leap/menshen/client/menshen_api_client.go
similarity index 94%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/menshen_api_client.go
rename to vendor/0xacab.org/leap/menshen/client/menshen_api_client.go
index 75424ae701b24a83deb68f41cd71a99852594fa1..14bc26b4802a64803ea626d3a6aa5bd2f6dc01c4 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/menshen_api_client.go
+++ b/vendor/0xacab.org/leap/menshen/client/menshen_api_client.go
@@ -10,7 +10,8 @@ import (
 	httptransport "github.com/go-openapi/runtime/client"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/pkg/client/provisioning"
+	"0xacab.org/leap/menshen/client/agent"
+	"0xacab.org/leap/menshen/client/provisioning"
 )
 
 // Default menshen API HTTP client.
@@ -55,6 +56,7 @@ func New(transport runtime.ClientTransport, formats strfmt.Registry) *MenshenAPI
 
 	cli := new(MenshenAPI)
 	cli.Transport = transport
+	cli.Agent = agent.New(transport, formats)
 	cli.Provisioning = provisioning.New(transport, formats)
 	return cli
 }
@@ -100,6 +102,8 @@ func (cfg *TransportConfig) WithSchemes(schemes []string) *TransportConfig {
 
 // MenshenAPI is a client for menshen API
 type MenshenAPI struct {
+	Agent agent.ClientService
+
 	Provisioning provisioning.ClientService
 
 	Transport runtime.ClientTransport
@@ -108,5 +112,6 @@ type MenshenAPI struct {
 // SetTransport changes the transport on the client and all its subresources
 func (c *MenshenAPI) SetTransport(transport runtime.ClientTransport) {
 	c.Transport = transport
+	c.Agent.SetTransport(transport)
 	c.Provisioning.SetTransport(transport)
 }
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridge_location_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridge_location_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridge_location_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridge_location_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridge_location_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridge_location_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridge_location_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridge_location_responses.go
index 3a40bfe21097b6e5c13a985b0f7156552777fac6..a4722d70909b16c774e43e2b3f52c497c4eae548 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridge_location_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridge_location_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // Get5BridgeLocationReader is a Reader for the Get5BridgeLocation structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridges_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridges_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridges_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridges_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridges_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridges_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridges_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridges_responses.go
index 5820ed853b5a73b6ddd18f643f850b4cd385cb50..fbec00d009b58f18a686cc098ce998661d44ed60 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_bridges_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_bridges_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // Get5BridgesReader is a Reader for the Get5Bridges structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateway_location_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateway_location_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateway_location_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateway_location_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateway_location_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateway_location_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateway_location_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateway_location_responses.go
index fa23e465a459b95cc90848e36e2fac12961bf7fa..69fef833dbe7557b7b3379a7e6228a1299e46c28 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateway_location_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateway_location_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // Get5GatewayLocationReader is a Reader for the Get5GatewayLocation structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateways_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateways_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateways_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateways_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateways_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateways_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateways_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateways_responses.go
index 43aa09708fc264be9a10dd93a236f66287d3a050..e47bca2595473d43be96351eab1147236167ba89 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_gateways_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_gateways_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // Get5GatewaysReader is a Reader for the Get5Gateways structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_openvpn_cert_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_openvpn_cert_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_openvpn_cert_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_openvpn_cert_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_openvpn_cert_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_openvpn_cert_responses.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_openvpn_cert_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_openvpn_cert_responses.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_openvpn_config_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_openvpn_config_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_openvpn_config_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_openvpn_config_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_openvpn_config_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_openvpn_config_responses.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_openvpn_config_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_openvpn_config_responses.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_service_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_service_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_service_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_service_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_service_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_service_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_service_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get5_service_responses.go
index e0f4acc08bd2a4bad88a596fddab84495bc2a717..34218d906ec4389499f782e0a3f7f7ba352fb68c 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get5_service_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get5_service_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // Get5ServiceReader is a Reader for the Get5Service structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridge_location_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridge_location_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridge_location_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridge_location_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridge_location_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridge_location_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridge_location_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridge_location_responses.go
index db05bbccd97a3547f41a183f7cad8b05a07b15f7..8ecccf91c1c598bbed3836a045c5fafd7efb9527 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridge_location_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridge_location_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // GetAPI5BridgeLocationReader is a Reader for the GetAPI5BridgeLocation structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridges_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridges_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridges_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridges_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridges_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridges_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridges_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridges_responses.go
index 08c654cdf23f1e9f1b89c06d77a2511a5146dec6..81b53d2b7c15855b5f9aad7aed1b09642e9d0492 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_bridges_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_bridges_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // GetAPI5BridgesReader is a Reader for the GetAPI5Bridges structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateway_location_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_location_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateway_location_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_location_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateway_location_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_location_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateway_location_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_location_responses.go
index 70830cd8abcfba57761896b60c91258883a9758d..4d9551820f48256457de5198da2020c16220c2fb 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateway_location_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_location_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // GetAPI5GatewayLocationReader is a Reader for the GetAPI5GatewayLocation structure.
diff --git a/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_parameters.go
new file mode 100644
index 0000000000000000000000000000000000000000..6f14fb03bd2713fb9c74cbbb2950f56dffaf6fa0
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_parameters.go
@@ -0,0 +1,197 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package provisioning
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"context"
+	"net/http"
+	"time"
+
+	"github.com/go-openapi/errors"
+	"github.com/go-openapi/runtime"
+	cr "github.com/go-openapi/runtime/client"
+	"github.com/go-openapi/strfmt"
+)
+
+// NewGetAPI5GatewayParams creates a new GetAPI5GatewayParams object,
+// with the default timeout for this client.
+//
+// Default values are not hydrated, since defaults are normally applied by the API server side.
+//
+// To enforce default values in parameter, use SetDefaults or WithDefaults.
+func NewGetAPI5GatewayParams() *GetAPI5GatewayParams {
+	return &GetAPI5GatewayParams{
+		timeout: cr.DefaultTimeout,
+	}
+}
+
+// NewGetAPI5GatewayParamsWithTimeout creates a new GetAPI5GatewayParams object
+// with the ability to set a timeout on a request.
+func NewGetAPI5GatewayParamsWithTimeout(timeout time.Duration) *GetAPI5GatewayParams {
+	return &GetAPI5GatewayParams{
+		timeout: timeout,
+	}
+}
+
+// NewGetAPI5GatewayParamsWithContext creates a new GetAPI5GatewayParams object
+// with the ability to set a context for a request.
+func NewGetAPI5GatewayParamsWithContext(ctx context.Context) *GetAPI5GatewayParams {
+	return &GetAPI5GatewayParams{
+		Context: ctx,
+	}
+}
+
+// NewGetAPI5GatewayParamsWithHTTPClient creates a new GetAPI5GatewayParams object
+// with the ability to set a custom HTTPClient for a request.
+func NewGetAPI5GatewayParamsWithHTTPClient(client *http.Client) *GetAPI5GatewayParams {
+	return &GetAPI5GatewayParams{
+		HTTPClient: client,
+	}
+}
+
+/*
+GetAPI5GatewayParams contains all the parameters to send to the API endpoint
+
+	for the get API 5 gateway operation.
+
+	Typically these are written to a http.Request.
+*/
+type GetAPI5GatewayParams struct {
+
+	/* Cc.
+
+	   Country code (ISO-2)
+	*/
+	Cc *string
+
+	/* Loc.
+
+	   location
+	*/
+	Loc *string
+
+	timeout    time.Duration
+	Context    context.Context
+	HTTPClient *http.Client
+}
+
+// WithDefaults hydrates default values in the get API 5 gateway params (not the query body).
+//
+// All values with no default are reset to their zero value.
+func (o *GetAPI5GatewayParams) WithDefaults() *GetAPI5GatewayParams {
+	o.SetDefaults()
+	return o
+}
+
+// SetDefaults hydrates default values in the get API 5 gateway params (not the query body).
+//
+// All values with no default are reset to their zero value.
+func (o *GetAPI5GatewayParams) SetDefaults() {
+	// no default values defined for this parameter
+}
+
+// WithTimeout adds the timeout to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) WithTimeout(timeout time.Duration) *GetAPI5GatewayParams {
+	o.SetTimeout(timeout)
+	return o
+}
+
+// SetTimeout adds the timeout to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) SetTimeout(timeout time.Duration) {
+	o.timeout = timeout
+}
+
+// WithContext adds the context to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) WithContext(ctx context.Context) *GetAPI5GatewayParams {
+	o.SetContext(ctx)
+	return o
+}
+
+// SetContext adds the context to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) SetContext(ctx context.Context) {
+	o.Context = ctx
+}
+
+// WithHTTPClient adds the HTTPClient to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) WithHTTPClient(client *http.Client) *GetAPI5GatewayParams {
+	o.SetHTTPClient(client)
+	return o
+}
+
+// SetHTTPClient adds the HTTPClient to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) SetHTTPClient(client *http.Client) {
+	o.HTTPClient = client
+}
+
+// WithCc adds the cc to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) WithCc(cc *string) *GetAPI5GatewayParams {
+	o.SetCc(cc)
+	return o
+}
+
+// SetCc adds the cc to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) SetCc(cc *string) {
+	o.Cc = cc
+}
+
+// WithLoc adds the loc to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) WithLoc(loc *string) *GetAPI5GatewayParams {
+	o.SetLoc(loc)
+	return o
+}
+
+// SetLoc adds the loc to the get API 5 gateway params
+func (o *GetAPI5GatewayParams) SetLoc(loc *string) {
+	o.Loc = loc
+}
+
+// WriteToRequest writes these params to a swagger request
+func (o *GetAPI5GatewayParams) WriteToRequest(r runtime.ClientRequest, reg strfmt.Registry) error {
+
+	if err := r.SetTimeout(o.timeout); err != nil {
+		return err
+	}
+	var res []error
+
+	if o.Cc != nil {
+
+		// query param cc
+		var qrCc string
+
+		if o.Cc != nil {
+			qrCc = *o.Cc
+		}
+		qCc := qrCc
+		if qCc != "" {
+
+			if err := r.SetQueryParam("cc", qCc); err != nil {
+				return err
+			}
+		}
+	}
+
+	if o.Loc != nil {
+
+		// query param loc
+		var qrLoc string
+
+		if o.Loc != nil {
+			qrLoc = *o.Loc
+		}
+		qLoc := qrLoc
+		if qLoc != "" {
+
+			if err := r.SetQueryParam("loc", qLoc); err != nil {
+				return err
+			}
+		}
+	}
+
+	if len(res) > 0 {
+		return errors.CompositeValidationError(res...)
+	}
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_responses.go
new file mode 100644
index 0000000000000000000000000000000000000000..635eb408ce750db739be6e406a38c707e38c1438
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateway_responses.go
@@ -0,0 +1,326 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package provisioning
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+
+	"github.com/go-openapi/runtime"
+	"github.com/go-openapi/strfmt"
+
+	"0xacab.org/leap/menshen/models"
+)
+
+// GetAPI5GatewayReader is a Reader for the GetAPI5Gateway structure.
+type GetAPI5GatewayReader struct {
+	formats strfmt.Registry
+}
+
+// ReadResponse reads a server response into the received o.
+func (o *GetAPI5GatewayReader) ReadResponse(response runtime.ClientResponse, consumer runtime.Consumer) (interface{}, error) {
+	switch response.Code() {
+	case 200:
+		result := NewGetApi5GatewayOK()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return result, nil
+	case 400:
+		result := NewGetApi5GatewayBadRequest()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return nil, result
+	case 404:
+		result := NewGetApi5GatewayNotFound()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return nil, result
+	case 500:
+		result := NewGetApi5GatewayInternalServerError()
+		if err := result.readResponse(response, consumer, o.formats); err != nil {
+			return nil, err
+		}
+		return nil, result
+	default:
+		return nil, runtime.NewAPIError("[GET /api/5/gateway] GetAPI5Gateway", response, response.Code())
+	}
+}
+
+// NewGetApi5GatewayOK creates a GetApi5GatewayOK with default headers values
+func NewGetApi5GatewayOK() *GetApi5GatewayOK {
+	return &GetApi5GatewayOK{}
+}
+
+/*
+GetApi5GatewayOK describes a response with status code 200, with default header values.
+
+OK
+*/
+type GetApi5GatewayOK struct {
+	Payload []*models.ModelsGateway
+}
+
+// IsSuccess returns true when this get api5 gateway o k response has a 2xx status code
+func (o *GetApi5GatewayOK) IsSuccess() bool {
+	return true
+}
+
+// IsRedirect returns true when this get api5 gateway o k response has a 3xx status code
+func (o *GetApi5GatewayOK) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this get api5 gateway o k response has a 4xx status code
+func (o *GetApi5GatewayOK) IsClientError() bool {
+	return false
+}
+
+// IsServerError returns true when this get api5 gateway o k response has a 5xx status code
+func (o *GetApi5GatewayOK) IsServerError() bool {
+	return false
+}
+
+// IsCode returns true when this get api5 gateway o k response a status code equal to that given
+func (o *GetApi5GatewayOK) IsCode(code int) bool {
+	return code == 200
+}
+
+// Code gets the status code for the get api5 gateway o k response
+func (o *GetApi5GatewayOK) Code() int {
+	return 200
+}
+
+func (o *GetApi5GatewayOK) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[GET /api/5/gateway][%d] getApi5GatewayOK %s", 200, payload)
+}
+
+func (o *GetApi5GatewayOK) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[GET /api/5/gateway][%d] getApi5GatewayOK %s", 200, payload)
+}
+
+func (o *GetApi5GatewayOK) GetPayload() []*models.ModelsGateway {
+	return o.Payload
+}
+
+func (o *GetApi5GatewayOK) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
+
+// NewGetApi5GatewayBadRequest creates a GetApi5GatewayBadRequest with default headers values
+func NewGetApi5GatewayBadRequest() *GetApi5GatewayBadRequest {
+	return &GetApi5GatewayBadRequest{}
+}
+
+/*
+GetApi5GatewayBadRequest describes a response with status code 400, with default header values.
+
+Bad Request
+*/
+type GetApi5GatewayBadRequest struct {
+	Payload interface{}
+}
+
+// IsSuccess returns true when this get api5 gateway bad request response has a 2xx status code
+func (o *GetApi5GatewayBadRequest) IsSuccess() bool {
+	return false
+}
+
+// IsRedirect returns true when this get api5 gateway bad request response has a 3xx status code
+func (o *GetApi5GatewayBadRequest) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this get api5 gateway bad request response has a 4xx status code
+func (o *GetApi5GatewayBadRequest) IsClientError() bool {
+	return true
+}
+
+// IsServerError returns true when this get api5 gateway bad request response has a 5xx status code
+func (o *GetApi5GatewayBadRequest) IsServerError() bool {
+	return false
+}
+
+// IsCode returns true when this get api5 gateway bad request response a status code equal to that given
+func (o *GetApi5GatewayBadRequest) IsCode(code int) bool {
+	return code == 400
+}
+
+// Code gets the status code for the get api5 gateway bad request response
+func (o *GetApi5GatewayBadRequest) Code() int {
+	return 400
+}
+
+func (o *GetApi5GatewayBadRequest) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[GET /api/5/gateway][%d] getApi5GatewayBadRequest %s", 400, payload)
+}
+
+func (o *GetApi5GatewayBadRequest) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[GET /api/5/gateway][%d] getApi5GatewayBadRequest %s", 400, payload)
+}
+
+func (o *GetApi5GatewayBadRequest) GetPayload() interface{} {
+	return o.Payload
+}
+
+func (o *GetApi5GatewayBadRequest) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
+
+// NewGetApi5GatewayNotFound creates a GetApi5GatewayNotFound with default headers values
+func NewGetApi5GatewayNotFound() *GetApi5GatewayNotFound {
+	return &GetApi5GatewayNotFound{}
+}
+
+/*
+GetApi5GatewayNotFound describes a response with status code 404, with default header values.
+
+Not Found
+*/
+type GetApi5GatewayNotFound struct {
+	Payload interface{}
+}
+
+// IsSuccess returns true when this get api5 gateway not found response has a 2xx status code
+func (o *GetApi5GatewayNotFound) IsSuccess() bool {
+	return false
+}
+
+// IsRedirect returns true when this get api5 gateway not found response has a 3xx status code
+func (o *GetApi5GatewayNotFound) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this get api5 gateway not found response has a 4xx status code
+func (o *GetApi5GatewayNotFound) IsClientError() bool {
+	return true
+}
+
+// IsServerError returns true when this get api5 gateway not found response has a 5xx status code
+func (o *GetApi5GatewayNotFound) IsServerError() bool {
+	return false
+}
+
+// IsCode returns true when this get api5 gateway not found response a status code equal to that given
+func (o *GetApi5GatewayNotFound) IsCode(code int) bool {
+	return code == 404
+}
+
+// Code gets the status code for the get api5 gateway not found response
+func (o *GetApi5GatewayNotFound) Code() int {
+	return 404
+}
+
+func (o *GetApi5GatewayNotFound) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[GET /api/5/gateway][%d] getApi5GatewayNotFound %s", 404, payload)
+}
+
+func (o *GetApi5GatewayNotFound) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[GET /api/5/gateway][%d] getApi5GatewayNotFound %s", 404, payload)
+}
+
+func (o *GetApi5GatewayNotFound) GetPayload() interface{} {
+	return o.Payload
+}
+
+func (o *GetApi5GatewayNotFound) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
+
+// NewGetApi5GatewayInternalServerError creates a GetApi5GatewayInternalServerError with default headers values
+func NewGetApi5GatewayInternalServerError() *GetApi5GatewayInternalServerError {
+	return &GetApi5GatewayInternalServerError{}
+}
+
+/*
+GetApi5GatewayInternalServerError describes a response with status code 500, with default header values.
+
+Internal Server Error
+*/
+type GetApi5GatewayInternalServerError struct {
+	Payload interface{}
+}
+
+// IsSuccess returns true when this get api5 gateway internal server error response has a 2xx status code
+func (o *GetApi5GatewayInternalServerError) IsSuccess() bool {
+	return false
+}
+
+// IsRedirect returns true when this get api5 gateway internal server error response has a 3xx status code
+func (o *GetApi5GatewayInternalServerError) IsRedirect() bool {
+	return false
+}
+
+// IsClientError returns true when this get api5 gateway internal server error response has a 4xx status code
+func (o *GetApi5GatewayInternalServerError) IsClientError() bool {
+	return false
+}
+
+// IsServerError returns true when this get api5 gateway internal server error response has a 5xx status code
+func (o *GetApi5GatewayInternalServerError) IsServerError() bool {
+	return true
+}
+
+// IsCode returns true when this get api5 gateway internal server error response a status code equal to that given
+func (o *GetApi5GatewayInternalServerError) IsCode(code int) bool {
+	return code == 500
+}
+
+// Code gets the status code for the get api5 gateway internal server error response
+func (o *GetApi5GatewayInternalServerError) Code() int {
+	return 500
+}
+
+func (o *GetApi5GatewayInternalServerError) Error() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[GET /api/5/gateway][%d] getApi5GatewayInternalServerError %s", 500, payload)
+}
+
+func (o *GetApi5GatewayInternalServerError) String() string {
+	payload, _ := json.Marshal(o.Payload)
+	return fmt.Sprintf("[GET /api/5/gateway][%d] getApi5GatewayInternalServerError %s", 500, payload)
+}
+
+func (o *GetApi5GatewayInternalServerError) GetPayload() interface{} {
+	return o.Payload
+}
+
+func (o *GetApi5GatewayInternalServerError) readResponse(response runtime.ClientResponse, consumer runtime.Consumer, formats strfmt.Registry) error {
+
+	// response payload
+	if err := consumer.Consume(response.Body(), &o.Payload); err != nil && err != io.EOF {
+		return err
+	}
+
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateways_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateways_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateways_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateways_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateways_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateways_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateways_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateways_responses.go
index 17a1587ca12f38c92acdfd77d9b67a225cf03710..66076bdbf1e3a6631a672220b7bdbaf460676d15 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_gateways_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_gateways_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // GetAPI5GatewaysReader is a Reader for the GetAPI5Gateways structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_openvpn_cert_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_openvpn_cert_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_openvpn_cert_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_openvpn_cert_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_openvpn_cert_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_openvpn_cert_responses.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_openvpn_cert_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_openvpn_cert_responses.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_openvpn_config_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_openvpn_config_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_openvpn_config_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_openvpn_config_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_openvpn_config_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_openvpn_config_responses.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_openvpn_config_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_openvpn_config_responses.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_service_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_service_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_service_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_service_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_service_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_service_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_service_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_service_responses.go
index 7424a128e9d8d544cabd253e20ab766e8ff293aa..37c1a2cb5aef594e0835ff3326b057f574b03d44 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_5_service_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_5_service_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // GetAPI5ServiceReader is a Reader for the GetAPI5Service structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_autoconf_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_autoconf_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_autoconf_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_autoconf_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_autoconf_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_api_autoconf_responses.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_api_autoconf_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_api_autoconf_responses.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_autoconf_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_autoconf_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_autoconf_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_autoconf_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_autoconf_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_autoconf_responses.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_autoconf_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_autoconf_responses.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_provider_json_parameters.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_provider_json_parameters.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_provider_json_parameters.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_provider_json_parameters.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_provider_json_responses.go b/vendor/0xacab.org/leap/menshen/client/provisioning/get_provider_json_responses.go
similarity index 99%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_provider_json_responses.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/get_provider_json_responses.go
index c2ea0bfa937c4e5c9331b77a72d0af3fb78793f8..c4cd55f3747bcef10a917af82b2a9dd607433bb3 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/get_provider_json_responses.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/get_provider_json_responses.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-openapi/runtime"
 	"github.com/go-openapi/strfmt"
 
-	"0xacab.org/leap/bitmask-core/models"
+	"0xacab.org/leap/menshen/models"
 )
 
 // GetProviderJSONReader is a Reader for the GetProviderJSON structure.
diff --git a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/provisioning_client.go b/vendor/0xacab.org/leap/menshen/client/provisioning/provisioning_client.go
similarity index 89%
rename from vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/provisioning_client.go
rename to vendor/0xacab.org/leap/menshen/client/provisioning/provisioning_client.go
index ca36757736a46b24015463dd3b13369b85a60e37..e425629df3e73ddd0d38de87467e0e579e64b4b7 100644
--- a/vendor/0xacab.org/leap/bitmask-core/pkg/client/provisioning/provisioning_client.go
+++ b/vendor/0xacab.org/leap/menshen/client/provisioning/provisioning_client.go
@@ -80,13 +80,13 @@ func WithAcceptTextPlain(r *runtime.ClientOperation) {
 
 // ClientService is the interface for Client methods
 type ClientService interface {
-	GetAPI5BridgeLocation(params *GetAPI5BridgeLocationParams, opts ...ClientOption) (*GetApi5BridgeLocationOK, error)
+	GetAPI5BridgeLocation(params *GetAPI5BridgeLocationParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*GetApi5BridgeLocationOK, error)
 
-	GetAPI5Bridges(params *GetAPI5BridgesParams, opts ...ClientOption) (*GetApi5BridgesOK, error)
+	GetAPI5Bridges(params *GetAPI5BridgesParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*GetApi5BridgesOK, error)
 
-	GetAPI5GatewayLocation(params *GetAPI5GatewayLocationParams, opts ...ClientOption) (*GetApi5GatewayLocationOK, error)
+	GetAPI5Gateway(params *GetAPI5GatewayParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*GetApi5GatewayOK, error)
 
-	GetAPI5Gateways(params *GetAPI5GatewaysParams, opts ...ClientOption) (*GetApi5GatewaysOK, error)
+	GetAPI5Gateways(params *GetAPI5GatewaysParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*GetApi5GatewaysOK, error)
 
 	GetAPI5OpenvpnCert(params *GetAPI5OpenvpnCertParams, opts ...ClientOption) (*GetApi5OpenvpnCertOK, error)
 
@@ -106,7 +106,7 @@ GetAPI5BridgeLocation gets bridges
 
 fetch bridges by location
 */
-func (a *Client) GetAPI5BridgeLocation(params *GetAPI5BridgeLocationParams, opts ...ClientOption) (*GetApi5BridgeLocationOK, error) {
+func (a *Client) GetAPI5BridgeLocation(params *GetAPI5BridgeLocationParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*GetApi5BridgeLocationOK, error) {
 	// TODO: Validate the params before sending
 	if params == nil {
 		params = NewGetAPI5BridgeLocationParams()
@@ -120,6 +120,7 @@ func (a *Client) GetAPI5BridgeLocation(params *GetAPI5BridgeLocationParams, opts
 		Schemes:            []string{"http"},
 		Params:             params,
 		Reader:             &GetAPI5BridgeLocationReader{formats: a.formats},
+		AuthInfo:           authInfo,
 		Context:            params.Context,
 		Client:             params.HTTPClient,
 	}
@@ -146,7 +147,7 @@ GetAPI5Bridges gets all bridges
 
 Fetch all bridges. This is an optional API endpoint for compatibility with vpnweb, but do not count on all the providers to have it enabled since it makes it easier to enumerate resources. On the other hand, if the service has "open" VPN endpoints, they can enumerate them here freely. Bridges, however, should be more restricted as a general rule.
 */
-func (a *Client) GetAPI5Bridges(params *GetAPI5BridgesParams, opts ...ClientOption) (*GetApi5BridgesOK, error) {
+func (a *Client) GetAPI5Bridges(params *GetAPI5BridgesParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*GetApi5BridgesOK, error) {
 	// TODO: Validate the params before sending
 	if params == nil {
 		params = NewGetAPI5BridgesParams()
@@ -160,6 +161,7 @@ func (a *Client) GetAPI5Bridges(params *GetAPI5BridgesParams, opts ...ClientOpti
 		Schemes:            []string{"http"},
 		Params:             params,
 		Reader:             &GetAPI5BridgesReader{formats: a.formats},
+		AuthInfo:           authInfo,
 		Context:            params.Context,
 		Client:             params.HTTPClient,
 	}
@@ -182,24 +184,25 @@ func (a *Client) GetAPI5Bridges(params *GetAPI5BridgesParams, opts ...ClientOpti
 }
 
 /*
-GetAPI5GatewayLocation gets gateways by location
+GetAPI5Gateway gets gateways by location countrycode or random
 
-fetch random gateways for a given location
+Get Gateways with param countrycode for nearest, or with param location to get a gateway in specific location, or a random one without params
 */
-func (a *Client) GetAPI5GatewayLocation(params *GetAPI5GatewayLocationParams, opts ...ClientOption) (*GetApi5GatewayLocationOK, error) {
+func (a *Client) GetAPI5Gateway(params *GetAPI5GatewayParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*GetApi5GatewayOK, error) {
 	// TODO: Validate the params before sending
 	if params == nil {
-		params = NewGetAPI5GatewayLocationParams()
+		params = NewGetAPI5GatewayParams()
 	}
 	op := &runtime.ClientOperation{
-		ID:                 "GetAPI5GatewayLocation",
+		ID:                 "GetAPI5Gateway",
 		Method:             "GET",
-		PathPattern:        "/api/5/gateway/{location}",
+		PathPattern:        "/api/5/gateway",
 		ProducesMediaTypes: []string{"application/json"},
 		ConsumesMediaTypes: []string{"application/json"},
 		Schemes:            []string{"http"},
 		Params:             params,
-		Reader:             &GetAPI5GatewayLocationReader{formats: a.formats},
+		Reader:             &GetAPI5GatewayReader{formats: a.formats},
+		AuthInfo:           authInfo,
 		Context:            params.Context,
 		Client:             params.HTTPClient,
 	}
@@ -211,13 +214,13 @@ func (a *Client) GetAPI5GatewayLocation(params *GetAPI5GatewayLocationParams, op
 	if err != nil {
 		return nil, err
 	}
-	success, ok := result.(*GetApi5GatewayLocationOK)
+	success, ok := result.(*GetApi5GatewayOK)
 	if ok {
 		return success, nil
 	}
 	// unexpected success response
 	// safeguard: normally, absent a default response, unknown success responses return an error above: so this is a codegen issue
-	msg := fmt.Sprintf("unexpected success response for GetAPI5GatewayLocation: API contract not enforced by server. Client expected to get an error, but got: %T", result)
+	msg := fmt.Sprintf("unexpected success response for GetAPI5Gateway: API contract not enforced by server. Client expected to get an error, but got: %T", result)
 	panic(msg)
 }
 
@@ -226,7 +229,7 @@ GetAPI5Gateways gets all gateways
 
 Fetch all gateways. This is an optional API endpoint for compatibility with vpnweb, but do not count on all the providers to have it enabled since it makes it easier to enumerate resources. On the other hand, if the service has "open" VPN endpoints, they can enumerate them here freely. Bridges, however, should be more restricted as a general rule.
 */
-func (a *Client) GetAPI5Gateways(params *GetAPI5GatewaysParams, opts ...ClientOption) (*GetApi5GatewaysOK, error) {
+func (a *Client) GetAPI5Gateways(params *GetAPI5GatewaysParams, authInfo runtime.ClientAuthInfoWriter, opts ...ClientOption) (*GetApi5GatewaysOK, error) {
 	// TODO: Validate the params before sending
 	if params == nil {
 		params = NewGetAPI5GatewaysParams()
@@ -240,6 +243,7 @@ func (a *Client) GetAPI5Gateways(params *GetAPI5GatewaysParams, opts ...ClientOp
 		Schemes:            []string{"http"},
 		Params:             params,
 		Reader:             &GetAPI5GatewaysReader{formats: a.formats},
+		AuthInfo:           authInfo,
 		Context:            params.Context,
 		Client:             params.HTTPClient,
 	}
diff --git a/vendor/0xacab.org/leap/bitmask-core/models/api_bridge.go b/vendor/0xacab.org/leap/menshen/models/api_bridge.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/models/api_bridge.go
rename to vendor/0xacab.org/leap/menshen/models/api_bridge.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/models/api_e_ip_service.go b/vendor/0xacab.org/leap/menshen/models/api_e_ip_service.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/models/api_e_ip_service.go
rename to vendor/0xacab.org/leap/menshen/models/api_e_ip_service.go
diff --git a/vendor/0xacab.org/leap/bitmask-core/models/api_gateway.go b/vendor/0xacab.org/leap/menshen/models/api_gateway.go
similarity index 100%
rename from vendor/0xacab.org/leap/bitmask-core/models/api_gateway.go
rename to vendor/0xacab.org/leap/menshen/models/api_gateway.go
diff --git a/vendor/0xacab.org/leap/menshen/models/api_gateways.go b/vendor/0xacab.org/leap/menshen/models/api_gateways.go
new file mode 100644
index 0000000000000000000000000000000000000000..950dd73b3ec4ff7736f1f9b85ff58571ed97c5b8
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/models/api_gateways.go
@@ -0,0 +1,53 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package models
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"context"
+
+	"github.com/go-openapi/strfmt"
+	"github.com/go-openapi/swag"
+)
+
+// APIGateways api gateways
+//
+// swagger:model api.Gateways
+type APIGateways struct {
+
+	// all
+	All string `json:"all,omitempty"`
+
+	// location
+	Location string `json:"location,omitempty"`
+}
+
+// Validate validates this api gateways
+func (m *APIGateways) Validate(formats strfmt.Registry) error {
+	return nil
+}
+
+// ContextValidate validates this api gateways based on context it is used
+func (m *APIGateways) ContextValidate(ctx context.Context, formats strfmt.Registry) error {
+	return nil
+}
+
+// MarshalBinary interface implementation
+func (m *APIGateways) MarshalBinary() ([]byte, error) {
+	if m == nil {
+		return nil, nil
+	}
+	return swag.WriteJSON(m)
+}
+
+// UnmarshalBinary interface implementation
+func (m *APIGateways) UnmarshalBinary(b []byte) error {
+	var res APIGateways
+	if err := swag.ReadJSON(b, &res); err != nil {
+		return err
+	}
+	*m = res
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/menshen/models/api_location.go b/vendor/0xacab.org/leap/menshen/models/api_location.go
new file mode 100644
index 0000000000000000000000000000000000000000..75c13f2c056180162dde9a4c9fc253afeba2c6e3
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/models/api_location.go
@@ -0,0 +1,65 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package models
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"context"
+
+	"github.com/go-openapi/strfmt"
+	"github.com/go-openapi/swag"
+)
+
+// APILocation api location
+//
+// swagger:model api.Location
+type APILocation struct {
+
+	// country code
+	CountryCode string `json:"country_code,omitempty"`
+
+	// hemisphere
+	Hemisphere string `json:"hemisphere,omitempty"`
+
+	// lat
+	Lat string `json:"lat,omitempty"`
+
+	// lon
+	Lon string `json:"lon,omitempty"`
+
+	// name
+	Name string `json:"name,omitempty"`
+
+	// timezone
+	Timezone string `json:"timezone,omitempty"`
+}
+
+// Validate validates this api location
+func (m *APILocation) Validate(formats strfmt.Registry) error {
+	return nil
+}
+
+// ContextValidate validates this api location based on context it is used
+func (m *APILocation) ContextValidate(ctx context.Context, formats strfmt.Registry) error {
+	return nil
+}
+
+// MarshalBinary interface implementation
+func (m *APILocation) MarshalBinary() ([]byte, error) {
+	if m == nil {
+		return nil, nil
+	}
+	return swag.WriteJSON(m)
+}
+
+// UnmarshalBinary interface implementation
+func (m *APILocation) UnmarshalBinary(b []byte) error {
+	var res APILocation
+	if err := swag.ReadJSON(b, &res); err != nil {
+		return err
+	}
+	*m = res
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/menshen/models/main_gateways.go b/vendor/0xacab.org/leap/menshen/models/main_gateways.go
new file mode 100644
index 0000000000000000000000000000000000000000..27f62bfc301b183c90cd0456db14b8076bb5891b
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/models/main_gateways.go
@@ -0,0 +1,53 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package models
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"context"
+
+	"github.com/go-openapi/strfmt"
+	"github.com/go-openapi/swag"
+)
+
+// MainGateways main gateways
+//
+// swagger:model main.Gateways
+type MainGateways struct {
+
+	// all
+	All string `json:"all,omitempty"`
+
+	// location
+	Location string `json:"location,omitempty"`
+}
+
+// Validate validates this main gateways
+func (m *MainGateways) Validate(formats strfmt.Registry) error {
+	return nil
+}
+
+// ContextValidate validates this main gateways based on context it is used
+func (m *MainGateways) ContextValidate(ctx context.Context, formats strfmt.Registry) error {
+	return nil
+}
+
+// MarshalBinary interface implementation
+func (m *MainGateways) MarshalBinary() ([]byte, error) {
+	if m == nil {
+		return nil, nil
+	}
+	return swag.WriteJSON(m)
+}
+
+// UnmarshalBinary interface implementation
+func (m *MainGateways) UnmarshalBinary(b []byte) error {
+	var res MainGateways
+	if err := swag.ReadJSON(b, &res); err != nil {
+		return err
+	}
+	*m = res
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/bitmask-core/models/models_bridge.go b/vendor/0xacab.org/leap/menshen/models/models_bridge.go
similarity index 93%
rename from vendor/0xacab.org/leap/bitmask-core/models/models_bridge.go
rename to vendor/0xacab.org/leap/menshen/models/models_bridge.go
index 486692bb872179b88987714ce6d5d91217584d55..d98817726e58133a4b03f390932931ec86d80167 100644
--- a/vendor/0xacab.org/leap/bitmask-core/models/models_bridge.go
+++ b/vendor/0xacab.org/leap/menshen/models/models_bridge.go
@@ -42,6 +42,9 @@ type ModelsBridge struct {
 	// IPAddr is the IPv4 address
 	IPAddr string `json:"ip_addr,omitempty"`
 
+	// LastSeenMillis is a unix time in milliseconds representing the last time we received a heartbeat update from this bridge
+	LastSeenMillis int64 `json:"lastSeenMillis,omitempty"`
+
 	// Load is the fractional load - but for now menshen agent is not measuring
 	// load in the bridges.
 	Load float64 `json:"load,omitempty"`
diff --git a/vendor/0xacab.org/leap/menshen/models/models_e_ip_service.go b/vendor/0xacab.org/leap/menshen/models/models_e_ip_service.go
new file mode 100644
index 0000000000000000000000000000000000000000..321de3916fffac31f31ec99a46a04b6ea541311e
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/models/models_e_ip_service.go
@@ -0,0 +1,123 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package models
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"context"
+
+	"github.com/go-openapi/errors"
+	"github.com/go-openapi/strfmt"
+	"github.com/go-openapi/swag"
+	"github.com/go-openapi/validate"
+)
+
+// ModelsEIPService models e IP service
+//
+// swagger:model models.EIPService
+type ModelsEIPService struct {
+
+	// auth
+	Auth string `json:"auth,omitempty"`
+
+	// locations
+	Locations map[string]ModelsLocation `json:"locations,omitempty"`
+
+	// openvpn configuration
+	OpenvpnConfiguration map[string]interface{} `json:"openvpn_configuration,omitempty"`
+
+	// serial
+	Serial int64 `json:"serial,omitempty"`
+
+	// version
+	Version int64 `json:"version,omitempty"`
+}
+
+// Validate validates this models e IP service
+func (m *ModelsEIPService) Validate(formats strfmt.Registry) error {
+	var res []error
+
+	if err := m.validateLocations(formats); err != nil {
+		res = append(res, err)
+	}
+
+	if len(res) > 0 {
+		return errors.CompositeValidationError(res...)
+	}
+	return nil
+}
+
+func (m *ModelsEIPService) validateLocations(formats strfmt.Registry) error {
+	if swag.IsZero(m.Locations) { // not required
+		return nil
+	}
+
+	for k := range m.Locations {
+
+		if err := validate.Required("locations"+"."+k, "body", m.Locations[k]); err != nil {
+			return err
+		}
+		if val, ok := m.Locations[k]; ok {
+			if err := val.Validate(formats); err != nil {
+				if ve, ok := err.(*errors.Validation); ok {
+					return ve.ValidateName("locations" + "." + k)
+				} else if ce, ok := err.(*errors.CompositeError); ok {
+					return ce.ValidateName("locations" + "." + k)
+				}
+				return err
+			}
+		}
+
+	}
+
+	return nil
+}
+
+// ContextValidate validate this models e IP service based on the context it is used
+func (m *ModelsEIPService) ContextValidate(ctx context.Context, formats strfmt.Registry) error {
+	var res []error
+
+	if err := m.contextValidateLocations(ctx, formats); err != nil {
+		res = append(res, err)
+	}
+
+	if len(res) > 0 {
+		return errors.CompositeValidationError(res...)
+	}
+	return nil
+}
+
+func (m *ModelsEIPService) contextValidateLocations(ctx context.Context, formats strfmt.Registry) error {
+
+	for k := range m.Locations {
+
+		if val, ok := m.Locations[k]; ok {
+			if err := val.ContextValidate(ctx, formats); err != nil {
+				return err
+			}
+		}
+
+	}
+
+	return nil
+}
+
+// MarshalBinary interface implementation
+func (m *ModelsEIPService) MarshalBinary() ([]byte, error) {
+	if m == nil {
+		return nil, nil
+	}
+	return swag.WriteJSON(m)
+}
+
+// UnmarshalBinary interface implementation
+func (m *ModelsEIPService) UnmarshalBinary(b []byte) error {
+	var res ModelsEIPService
+	if err := swag.ReadJSON(b, &res); err != nil {
+		return err
+	}
+	*m = res
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/bitmask-core/models/models_gateway.go b/vendor/0xacab.org/leap/menshen/models/models_gateway.go
similarity index 94%
rename from vendor/0xacab.org/leap/bitmask-core/models/models_gateway.go
rename to vendor/0xacab.org/leap/menshen/models/models_gateway.go
index b7015c16842715fc246109ab42e27f595a24b59c..9490477cdceb443ffd4f4675f932352ea2806033 100644
--- a/vendor/0xacab.org/leap/bitmask-core/models/models_gateway.go
+++ b/vendor/0xacab.org/leap/menshen/models/models_gateway.go
@@ -42,6 +42,9 @@ type ModelsGateway struct {
 	// IPAddr is the IPv4 address
 	IPAddr string `json:"ip_addr,omitempty"`
 
+	// LastSeenMillis is a unix time in milliseconds representing the last time we received a heartbeat update from this gateway
+	LastSeenMillis int64 `json:"lastSeenMillis,omitempty"`
+
 	// Load is the fractional load received from the menshen agent. For the
 	// time being it is a synthethic metric that takes into account number of clients
 	// and network information for the node.
diff --git a/vendor/0xacab.org/leap/menshen/models/models_location.go b/vendor/0xacab.org/leap/menshen/models/models_location.go
new file mode 100644
index 0000000000000000000000000000000000000000..3016d7a261bf37504f74bc81434a9850ed2a3c73
--- /dev/null
+++ b/vendor/0xacab.org/leap/menshen/models/models_location.go
@@ -0,0 +1,83 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package models
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	"context"
+
+	"github.com/go-openapi/strfmt"
+	"github.com/go-openapi/swag"
+)
+
+// ModelsLocation models location
+//
+// swagger:model models.Location
+type ModelsLocation struct {
+
+	// CountryCode is the two-character country ISO identifier (uppercase).
+	CountryCode string `json:"country_code,omitempty"`
+
+	// DisplayName is the user-facing string for a given location.
+	DisplayName string `json:"display_name,omitempty"`
+
+	// Any location that has at least one bridge configured will set this to true.
+	HasBridges bool `json:"has_bridges,omitempty"`
+
+	// TODO Not used right now, but intended to signal when a location has all of their
+	// nodes overwhelmed.
+	Healthy bool `json:"healthy,omitempty"`
+
+	// Hemisphere is a legacy label for a gateway. The rationale was once
+	// intended to be to allocate gateways for an hemisphere with certain
+	// regional "fairness", even if they're geographically located in a
+	// different region. We might want to set this on the Gateway or Bridge, not in the
+	// Location itself...
+	Hemisphere string `json:"hemisphere,omitempty"`
+
+	// Label is the short representation of a location, used internally.
+	Label string `json:"label,omitempty"`
+
+	// Lat is the latitude for the location.
+	Lat string `json:"lat,omitempty"`
+
+	// Lon is the longitude for the location.
+	Lon string `json:"lon,omitempty"`
+
+	// Region is the continental region this gateway is assigned to. Not used at the moment,
+	// intended to use a label from the 7-continent model.
+	Region string `json:"region,omitempty"`
+
+	// Timezone is the TZ for the location (-1, 0, +1, ...)
+	Timezone string `json:"timezone,omitempty"`
+}
+
+// Validate validates this models location
+func (m *ModelsLocation) Validate(formats strfmt.Registry) error {
+	return nil
+}
+
+// ContextValidate validates this models location based on context it is used
+func (m *ModelsLocation) ContextValidate(ctx context.Context, formats strfmt.Registry) error {
+	return nil
+}
+
+// MarshalBinary interface implementation
+func (m *ModelsLocation) MarshalBinary() ([]byte, error) {
+	if m == nil {
+		return nil, nil
+	}
+	return swag.WriteJSON(m)
+}
+
+// UnmarshalBinary interface implementation
+func (m *ModelsLocation) UnmarshalBinary(b []byte) error {
+	var res ModelsLocation
+	if err := swag.ReadJSON(b, &res); err != nil {
+		return err
+	}
+	*m = res
+	return nil
+}
diff --git a/vendor/0xacab.org/leap/bitmask-core/models/models_provider.go b/vendor/0xacab.org/leap/menshen/models/models_provider.go
similarity index 89%
rename from vendor/0xacab.org/leap/bitmask-core/models/models_provider.go
rename to vendor/0xacab.org/leap/menshen/models/models_provider.go
index 0ddcf2b5334916345db4e6fbee25076627f0d276..d022568be32d5e3efc0d9246e431c659a21719b4 100644
--- a/vendor/0xacab.org/leap/bitmask-core/models/models_provider.go
+++ b/vendor/0xacab.org/leap/menshen/models/models_provider.go
@@ -39,6 +39,10 @@ type ModelsProvider struct {
 	// deprecated: kept for backwards compatibility
 	CaCertURI string `json:"ca_cert_uri,omitempty"`
 
+	// URL of a service that returns a country code for an ip address. If empty,
+	// OONI backend is used
+	CountryCodeLookupURL string `json:"country_code_lookup_url,omitempty"`
+
 	// Default language this provider uses to show infos and provider messages
 	DefaultLanguage string `json:"default_language,omitempty"`
 
@@ -74,6 +78,11 @@ type ModelsProvider struct {
 	// List of services the provider offers, currently only openvpn
 	Services []string `json:"services"`
 
+	// list of STUN servers (format: ip/hostname:port) servers to get current ip address
+	// can consist of self hosted STUN servers, public ones or a combination of both.
+	// GeolocationLookup is only done when the list of STUNServers is not empty
+	StunServers []string `json:"stun_servers"`
+
 	// URL to Terms of Service website
 	TosURL string `json:"tos_url,omitempty"`
 }
diff --git a/vendor/0xacab.org/leap/bitmask-core/models/models_provider_service.go b/vendor/0xacab.org/leap/menshen/models/models_provider_service.go
similarity index 92%
rename from vendor/0xacab.org/leap/bitmask-core/models/models_provider_service.go
rename to vendor/0xacab.org/leap/menshen/models/models_provider_service.go
index be742c3f89610e5ff58b522a3ec26e1e6114a02a..646c56ecb0082da837b06251e32e9891a1aaf813 100644
--- a/vendor/0xacab.org/leap/bitmask-core/models/models_provider_service.go
+++ b/vendor/0xacab.org/leap/menshen/models/models_provider_service.go
@@ -19,11 +19,11 @@ type ModelsProviderService struct {
 
 	// Flag indicating if anonymous usage without registration is allowed
 	// deprecated: kept for backwards compatibility
-	AllowAnonymous bool `json:"allowAnonymous,omitempty"`
+	AllowAnonymous bool `json:"allow_anonymous,omitempty"`
 
 	// Flag indicating if the provider supports user registration
 	// deprecated: kept for backwards compatibility
-	AllowRegistration bool `json:"allowRegistration,omitempty"`
+	AllowRegistration bool `json:"allow_registration,omitempty"`
 }
 
 // Validate validates this models provider service
diff --git a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
index 9ae8206c201658163c4cb18b7bb4a73051a30652..f75162e039c10a29690c25dc895343c7aa1c6a3a 100644
--- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
@@ -1,722 +1,4517 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blake2bAVX2_amd64_asm.go -out ../../blake2bAVX2_amd64.s -pkg blake2b. DO NOT EDIT.
 
 //go:build amd64 && gc && !purego
 
 #include "textflag.h"
 
-DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
-DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
-GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
-DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
-DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
-DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
-GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
-
-#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
-#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
-#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
-#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
-#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
-
-#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
-	VPADDQ  m0, Y0, Y0;   \
-	VPADDQ  Y1, Y0, Y0;   \
-	VPXOR   Y0, Y3, Y3;   \
-	VPSHUFD $-79, Y3, Y3; \
-	VPADDQ  Y3, Y2, Y2;   \
-	VPXOR   Y2, Y1, Y1;   \
-	VPSHUFB c40, Y1, Y1;  \
-	VPADDQ  m1, Y0, Y0;   \
-	VPADDQ  Y1, Y0, Y0;   \
-	VPXOR   Y0, Y3, Y3;   \
-	VPSHUFB c48, Y3, Y3;  \
-	VPADDQ  Y3, Y2, Y2;   \
-	VPXOR   Y2, Y1, Y1;   \
-	VPADDQ  Y1, Y1, t;    \
-	VPSRLQ  $63, Y1, Y1;  \
-	VPXOR   t, Y1, Y1;    \
-	VPERMQ_0x39_Y1_Y1;    \
-	VPERMQ_0x4E_Y2_Y2;    \
-	VPERMQ_0x93_Y3_Y3;    \
-	VPADDQ  m2, Y0, Y0;   \
-	VPADDQ  Y1, Y0, Y0;   \
-	VPXOR   Y0, Y3, Y3;   \
-	VPSHUFD $-79, Y3, Y3; \
-	VPADDQ  Y3, Y2, Y2;   \
-	VPXOR   Y2, Y1, Y1;   \
-	VPSHUFB c40, Y1, Y1;  \
-	VPADDQ  m3, Y0, Y0;   \
-	VPADDQ  Y1, Y0, Y0;   \
-	VPXOR   Y0, Y3, Y3;   \
-	VPSHUFB c48, Y3, Y3;  \
-	VPADDQ  Y3, Y2, Y2;   \
-	VPXOR   Y2, Y1, Y1;   \
-	VPADDQ  Y1, Y1, t;    \
-	VPSRLQ  $63, Y1, Y1;  \
-	VPXOR   t, Y1, Y1;    \
-	VPERMQ_0x39_Y3_Y3;    \
-	VPERMQ_0x4E_Y2_Y2;    \
-	VPERMQ_0x93_Y1_Y1
-
-#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
-#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
-#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
-#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
-#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
-
-#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
-#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
-#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
-#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
-#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
-
-#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
-#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
-#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
-#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
-#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
-
-#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
-
-#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
-#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
-
-// load msg: Y12 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
-	VMOVQ_SI_X12(i0*8);           \
-	VMOVQ_SI_X11(i2*8);           \
-	VPINSRQ_1_SI_X12(i1*8);       \
-	VPINSRQ_1_SI_X11(i3*8);       \
-	VINSERTI128 $1, X11, Y12, Y12
-
-// load msg: Y13 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
-	VMOVQ_SI_X13(i0*8);           \
-	VMOVQ_SI_X11(i2*8);           \
-	VPINSRQ_1_SI_X13(i1*8);       \
-	VPINSRQ_1_SI_X11(i3*8);       \
-	VINSERTI128 $1, X11, Y13, Y13
-
-// load msg: Y14 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
-	VMOVQ_SI_X14(i0*8);           \
-	VMOVQ_SI_X11(i2*8);           \
-	VPINSRQ_1_SI_X14(i1*8);       \
-	VPINSRQ_1_SI_X11(i3*8);       \
-	VINSERTI128 $1, X11, Y14, Y14
-
-// load msg: Y15 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
-	VMOVQ_SI_X15(i0*8);           \
-	VMOVQ_SI_X11(i2*8);           \
-	VPINSRQ_1_SI_X15(i1*8);       \
-	VPINSRQ_1_SI_X11(i3*8);       \
-	VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
-	VMOVQ_SI_X12_0;                   \
-	VMOVQ_SI_X11(4*8);                \
-	VPINSRQ_1_SI_X12(2*8);            \
-	VPINSRQ_1_SI_X11(6*8);            \
-	VINSERTI128 $1, X11, Y12, Y12;    \
-	LOAD_MSG_AVX2_Y13(1, 3, 5, 7);    \
-	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
-	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
-
-#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
-	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
-	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
-	VMOVQ_SI_X11(11*8);              \
-	VPSHUFD     $0x4E, 0*8(SI), X14; \
-	VPINSRQ_1_SI_X11(5*8);           \
-	VINSERTI128 $1, X11, Y14, Y14;   \
-	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
-
-#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
-	VMOVQ_SI_X11(5*8);              \
-	VMOVDQU     11*8(SI), X12;      \
-	VPINSRQ_1_SI_X11(15*8);         \
-	VINSERTI128 $1, X11, Y12, Y12;  \
-	VMOVQ_SI_X13(8*8);              \
-	VMOVQ_SI_X11(2*8);              \
-	VPINSRQ_1_SI_X13_0;             \
-	VPINSRQ_1_SI_X11(13*8);         \
-	VINSERTI128 $1, X11, Y13, Y13;  \
-	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
-	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
-
-#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
-	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
-	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
-	LOAD_MSG_AVX2_Y14(2, 5, 4, 15);  \
-	VMOVQ_SI_X15(6*8);               \
-	VMOVQ_SI_X11_0;                  \
-	VPINSRQ_1_SI_X15(10*8);          \
-	VPINSRQ_1_SI_X11(8*8);           \
-	VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
-	LOAD_MSG_AVX2_Y12(9, 5, 2, 10);  \
-	VMOVQ_SI_X13_0;                  \
-	VMOVQ_SI_X11(4*8);               \
-	VPINSRQ_1_SI_X13(7*8);           \
-	VPINSRQ_1_SI_X11(15*8);          \
-	VINSERTI128 $1, X11, Y13, Y13;   \
-	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
-	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
-
-#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
-	VMOVQ_SI_X12(2*8);                \
-	VMOVQ_SI_X11_0;                   \
-	VPINSRQ_1_SI_X12(6*8);            \
-	VPINSRQ_1_SI_X11(8*8);            \
-	VINSERTI128 $1, X11, Y12, Y12;    \
-	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
-	LOAD_MSG_AVX2_Y14(4, 7, 15, 1);   \
-	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
-
-#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
-	LOAD_MSG_AVX2_Y12(12, 1, 14, 4);  \
-	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
-	VMOVQ_SI_X14_0;                   \
-	VPSHUFD     $0x4E, 8*8(SI), X11;  \
-	VPINSRQ_1_SI_X14(6*8);            \
-	VINSERTI128 $1, X11, Y14, Y14;    \
-	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
-
-#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
-	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
-	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
-	LOAD_MSG_AVX2_Y14(5, 15, 8, 2);  \
-	VMOVQ_SI_X15_0;                  \
-	VMOVQ_SI_X11(6*8);               \
-	VPINSRQ_1_SI_X15(4*8);           \
-	VPINSRQ_1_SI_X11(10*8);          \
-	VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
-	VMOVQ_SI_X12(6*8);              \
-	VMOVQ_SI_X11(11*8);             \
-	VPINSRQ_1_SI_X12(14*8);         \
-	VPINSRQ_1_SI_X11_0;             \
-	VINSERTI128 $1, X11, Y12, Y12;  \
-	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
-	VMOVQ_SI_X11(1*8);              \
-	VMOVDQU     12*8(SI), X14;      \
-	VPINSRQ_1_SI_X11(10*8);         \
-	VINSERTI128 $1, X11, Y14, Y14;  \
-	VMOVQ_SI_X15(2*8);              \
-	VMOVDQU     4*8(SI), X11;       \
-	VPINSRQ_1_SI_X15(7*8);          \
-	VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
-	LOAD_MSG_AVX2_Y12(10, 8, 7, 1);  \
-	VMOVQ_SI_X13(2*8);               \
-	VPSHUFD     $0x4E, 5*8(SI), X11; \
-	VPINSRQ_1_SI_X13(4*8);           \
-	VINSERTI128 $1, X11, Y13, Y13;   \
-	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
-	VMOVQ_SI_X15(11*8);              \
-	VMOVQ_SI_X11(12*8);              \
-	VPINSRQ_1_SI_X15(14*8);          \
-	VPINSRQ_1_SI_X11_0;              \
-	VINSERTI128 $1, X11, Y15, Y15
-
 // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
-	MOVQ h+0(FP), AX
-	MOVQ c+8(FP), BX
-	MOVQ flag+16(FP), CX
-	MOVQ blocks_base+24(FP), SI
-	MOVQ blocks_len+32(FP), DI
-
-	MOVQ SP, DX
-	ADDQ $31, DX
-	ANDQ $~31, DX
-
-	MOVQ CX, 16(DX)
-	XORQ CX, CX
-	MOVQ CX, 24(DX)
-
-	VMOVDQU ·AVX2_c40<>(SB), Y4
-	VMOVDQU ·AVX2_c48<>(SB), Y5
-
-	VMOVDQU 0(AX), Y8
+// Requires: AVX, AVX2
+TEXT ·hashBlocksAVX2(SB), NOSPLIT, $320-48
+	MOVQ    h+0(FP), AX
+	MOVQ    c+8(FP), BX
+	MOVQ    flag+16(FP), CX
+	MOVQ    blocks_base+24(FP), SI
+	MOVQ    blocks_len+32(FP), DI
+	MOVQ    SP, DX
+	ADDQ    $+31, DX
+	ANDQ    $-32, DX
+	MOVQ    CX, 16(DX)
+	XORQ    CX, CX
+	MOVQ    CX, 24(DX)
+	VMOVDQU ·AVX2_c40<>+0(SB), Y4
+	VMOVDQU ·AVX2_c48<>+0(SB), Y5
+	VMOVDQU (AX), Y8
 	VMOVDQU 32(AX), Y9
-	VMOVDQU ·AVX2_iv0<>(SB), Y6
-	VMOVDQU ·AVX2_iv1<>(SB), Y7
-
-	MOVQ 0(BX), R8
-	MOVQ 8(BX), R9
-	MOVQ R9, 8(DX)
+	VMOVDQU ·AVX2_iv0<>+0(SB), Y6
+	VMOVDQU ·AVX2_iv1<>+0(SB), Y7
+	MOVQ    (BX), R8
+	MOVQ    8(BX), R9
+	MOVQ    R9, 8(DX)
 
 loop:
-	ADDQ $128, R8
-	MOVQ R8, 0(DX)
-	CMPQ R8, $128
+	ADDQ $0x80, R8
+	MOVQ R8, (DX)
+	CMPQ R8, $0x80
 	JGE  noinc
 	INCQ R9
 	MOVQ R9, 8(DX)
 
 noinc:
-	VMOVDQA Y8, Y0
-	VMOVDQA Y9, Y1
-	VMOVDQA Y6, Y2
-	VPXOR   0(DX), Y7, Y3
-
-	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
-	VMOVDQA Y12, 32(DX)
-	VMOVDQA Y13, 64(DX)
-	VMOVDQA Y14, 96(DX)
-	VMOVDQA Y15, 128(DX)
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
-	VMOVDQA Y12, 160(DX)
-	VMOVDQA Y13, 192(DX)
-	VMOVDQA Y14, 224(DX)
-	VMOVDQA Y15, 256(DX)
-
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-
-	ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
-	ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
-
-	VPXOR Y0, Y8, Y8
-	VPXOR Y1, Y9, Y9
-	VPXOR Y2, Y8, Y8
-	VPXOR Y3, Y9, Y9
-
-	LEAQ 128(SI), SI
-	SUBQ $128, DI
-	JNE  loop
-
-	MOVQ R8, 0(BX)
-	MOVQ R9, 8(BX)
-
-	VMOVDQU Y8, 0(AX)
-	VMOVDQU Y9, 32(AX)
+	VMOVDQA     Y8, Y0
+	VMOVDQA     Y9, Y1
+	VMOVDQA     Y6, Y2
+	VPXOR       (DX), Y7, Y3
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x26
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x10
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x30
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x08
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x28
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x18
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x38
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x40
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x60
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x50
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x70
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x48
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x58
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VMOVDQA     Y12, 32(DX)
+	VMOVDQA     Y13, 64(DX)
+	VMOVDQA     Y14, 96(DX)
+	VMOVDQA     Y15, 128(DX)
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x70
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x48
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x20
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x50
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x40
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x30
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x58
+	VPSHUFD     $0x4e, (SI), X14
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x28
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x60
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x38
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x10
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VMOVDQA     Y12, 160(DX)
+	VMOVDQA     Y13, 192(DX)
+	VMOVDQA     Y14, 224(DX)
+	VMOVDQA     Y15, 256(DX)
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x28
+	VMOVDQU     88(SI), X12
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x40
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x10
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x2e
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x50
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x38
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x18
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x48
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x70
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x08
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x30
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x38
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x18
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x58
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x48
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x60
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x08
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x70
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x10
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x28
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x30
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x1e
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x50
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x48
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x10
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x28
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x50
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x2e
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x38
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x70
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x30
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x58
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x08
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x60
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x10
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x1e
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x30
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x60
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x58
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x50
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x20
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x38
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x08
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x68
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x70
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x28
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x48
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x60
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x70
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x08
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x28
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x78
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x50
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x36
+	VPSHUFD     $0x4e, 64(SI), X11
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x30
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x38
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x10
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x18
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x58
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x68
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x60
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x38
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x58
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x08
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x70
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x48
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x28
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x78
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x10
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x3e
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x30
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x20
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x50
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x30
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x58
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x70
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x1e
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x78
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x48
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x08
+	VMOVDQU     96(SI), X14
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x50
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x10
+	VMOVDQU     32(SI), X11
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x38
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x50
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x38
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x40
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x08
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x10
+	VPSHUFD     $0x4e, 40(SI), X11
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x20
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x78
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x48
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x58
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x60
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x70
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x1e
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	VPADDQ      32(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      64(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      96(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      128(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	VPADDQ      160(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      192(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      224(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      256(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	VPXOR       Y0, Y8, Y8
+	VPXOR       Y1, Y9, Y9
+	VPXOR       Y2, Y8, Y8
+	VPXOR       Y3, Y9, Y9
+	LEAQ        128(SI), SI
+	SUBQ        $0x80, DI
+	JNE         loop
+	MOVQ        R8, (BX)
+	MOVQ        R9, 8(BX)
+	VMOVDQU     Y8, (AX)
+	VMOVDQU     Y9, 32(AX)
 	VZEROUPPER
-
 	RET
 
-#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
-#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
-#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
-#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
-#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
-
-#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
-#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
-#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
-#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
-#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
-#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
-#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
-#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
-
-#define SHUFFLE_AVX() \
-	VMOVDQA X6, X13;         \
-	VMOVDQA X2, X14;         \
-	VMOVDQA X4, X6;          \
-	VPUNPCKLQDQ_X13_X13_X15; \
-	VMOVDQA X5, X4;          \
-	VMOVDQA X6, X5;          \
-	VPUNPCKHQDQ_X15_X7_X6;   \
-	VPUNPCKLQDQ_X7_X7_X15;   \
-	VPUNPCKHQDQ_X15_X13_X7;  \
-	VPUNPCKLQDQ_X3_X3_X15;   \
-	VPUNPCKHQDQ_X15_X2_X2;   \
-	VPUNPCKLQDQ_X14_X14_X15; \
-	VPUNPCKHQDQ_X15_X3_X3;   \
-
-#define SHUFFLE_AVX_INV() \
-	VMOVDQA X2, X13;         \
-	VMOVDQA X4, X14;         \
-	VPUNPCKLQDQ_X2_X2_X15;   \
-	VMOVDQA X5, X4;          \
-	VPUNPCKHQDQ_X15_X3_X2;   \
-	VMOVDQA X14, X5;         \
-	VPUNPCKLQDQ_X3_X3_X15;   \
-	VMOVDQA X6, X14;         \
-	VPUNPCKHQDQ_X15_X13_X3;  \
-	VPUNPCKLQDQ_X7_X7_X15;   \
-	VPUNPCKHQDQ_X15_X6_X6;   \
-	VPUNPCKLQDQ_X14_X14_X15; \
-	VPUNPCKHQDQ_X15_X7_X7;   \
-
-#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
-	VPADDQ  m0, v0, v0;   \
-	VPADDQ  v2, v0, v0;   \
-	VPADDQ  m1, v1, v1;   \
-	VPADDQ  v3, v1, v1;   \
-	VPXOR   v0, v6, v6;   \
-	VPXOR   v1, v7, v7;   \
-	VPSHUFD $-79, v6, v6; \
-	VPSHUFD $-79, v7, v7; \
-	VPADDQ  v6, v4, v4;   \
-	VPADDQ  v7, v5, v5;   \
-	VPXOR   v4, v2, v2;   \
-	VPXOR   v5, v3, v3;   \
-	VPSHUFB c40, v2, v2;  \
-	VPSHUFB c40, v3, v3;  \
-	VPADDQ  m2, v0, v0;   \
-	VPADDQ  v2, v0, v0;   \
-	VPADDQ  m3, v1, v1;   \
-	VPADDQ  v3, v1, v1;   \
-	VPXOR   v0, v6, v6;   \
-	VPXOR   v1, v7, v7;   \
-	VPSHUFB c48, v6, v6;  \
-	VPSHUFB c48, v7, v7;  \
-	VPADDQ  v6, v4, v4;   \
-	VPADDQ  v7, v5, v5;   \
-	VPXOR   v4, v2, v2;   \
-	VPXOR   v5, v3, v3;   \
-	VPADDQ  v2, v2, t0;   \
-	VPSRLQ  $63, v2, v2;  \
-	VPXOR   t0, v2, v2;   \
-	VPADDQ  v3, v3, t0;   \
-	VPSRLQ  $63, v3, v3;  \
-	VPXOR   t0, v3, v3
-
-// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
-// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
-#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
-	VMOVQ_SI_X12(i0*8);     \
-	VMOVQ_SI_X13(i2*8);     \
-	VMOVQ_SI_X14(i4*8);     \
-	VMOVQ_SI_X15(i6*8);     \
-	VPINSRQ_1_SI_X12(i1*8); \
-	VPINSRQ_1_SI_X13(i3*8); \
-	VPINSRQ_1_SI_X14(i5*8); \
-	VPINSRQ_1_SI_X15(i7*8)
-
-// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
-#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
-	VMOVQ_SI_X12_0;        \
-	VMOVQ_SI_X13(4*8);     \
-	VMOVQ_SI_X14(1*8);     \
-	VMOVQ_SI_X15(5*8);     \
-	VPINSRQ_1_SI_X12(2*8); \
-	VPINSRQ_1_SI_X13(6*8); \
-	VPINSRQ_1_SI_X14(3*8); \
-	VPINSRQ_1_SI_X15(7*8)
-
-// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
-#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
-	VPSHUFD $0x4E, 0*8(SI), X12; \
-	VMOVQ_SI_X13(11*8);          \
-	VMOVQ_SI_X14(12*8);          \
-	VMOVQ_SI_X15(7*8);           \
-	VPINSRQ_1_SI_X13(5*8);       \
-	VPINSRQ_1_SI_X14(2*8);       \
-	VPINSRQ_1_SI_X15(3*8)
-
-// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
-#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
-	VMOVDQU 11*8(SI), X12;  \
-	VMOVQ_SI_X13(5*8);      \
-	VMOVQ_SI_X14(8*8);      \
-	VMOVQ_SI_X15(2*8);      \
-	VPINSRQ_1_SI_X13(15*8); \
-	VPINSRQ_1_SI_X14_0;     \
-	VPINSRQ_1_SI_X15(13*8)
-
-// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
-#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
-	VMOVQ_SI_X12(2*8);      \
-	VMOVQ_SI_X13(4*8);      \
-	VMOVQ_SI_X14(6*8);      \
-	VMOVQ_SI_X15_0;         \
-	VPINSRQ_1_SI_X12(5*8);  \
-	VPINSRQ_1_SI_X13(15*8); \
-	VPINSRQ_1_SI_X14(10*8); \
-	VPINSRQ_1_SI_X15(8*8)
+DATA ·AVX2_c40<>+0(SB)/8, $0x0201000706050403
+DATA ·AVX2_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b
+DATA ·AVX2_c40<>+16(SB)/8, $0x0201000706050403
+DATA ·AVX2_c40<>+24(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·AVX2_c40<>(SB), RODATA|NOPTR, $32
 
-// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
-#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
-	VMOVQ_SI_X12(9*8);      \
-	VMOVQ_SI_X13(2*8);      \
-	VMOVQ_SI_X14_0;         \
-	VMOVQ_SI_X15(4*8);      \
-	VPINSRQ_1_SI_X12(5*8);  \
-	VPINSRQ_1_SI_X13(10*8); \
-	VPINSRQ_1_SI_X14(7*8);  \
-	VPINSRQ_1_SI_X15(15*8)
+DATA ·AVX2_c48<>+0(SB)/8, $0x0100070605040302
+DATA ·AVX2_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a
+DATA ·AVX2_c48<>+16(SB)/8, $0x0100070605040302
+DATA ·AVX2_c48<>+24(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·AVX2_c48<>(SB), RODATA|NOPTR, $32
 
-// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
-#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
-	VMOVQ_SI_X12(2*8);      \
-	VMOVQ_SI_X13_0;         \
-	VMOVQ_SI_X14(12*8);     \
-	VMOVQ_SI_X15(11*8);     \
-	VPINSRQ_1_SI_X12(6*8);  \
-	VPINSRQ_1_SI_X13(8*8);  \
-	VPINSRQ_1_SI_X14(10*8); \
-	VPINSRQ_1_SI_X15(3*8)
+DATA ·AVX2_iv0<>+0(SB)/8, $0x6a09e667f3bcc908
+DATA ·AVX2_iv0<>+8(SB)/8, $0xbb67ae8584caa73b
+DATA ·AVX2_iv0<>+16(SB)/8, $0x3c6ef372fe94f82b
+DATA ·AVX2_iv0<>+24(SB)/8, $0xa54ff53a5f1d36f1
+GLOBL ·AVX2_iv0<>(SB), RODATA|NOPTR, $32
 
-// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
-#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
-	MOVQ    0*8(SI), X12;        \
-	VPSHUFD $0x4E, 8*8(SI), X13; \
-	MOVQ    7*8(SI), X14;        \
-	MOVQ    2*8(SI), X15;        \
-	VPINSRQ_1_SI_X12(6*8);       \
-	VPINSRQ_1_SI_X14(3*8);       \
-	VPINSRQ_1_SI_X15(11*8)
-
-// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
-#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
-	MOVQ 6*8(SI), X12;      \
-	MOVQ 11*8(SI), X13;     \
-	MOVQ 15*8(SI), X14;     \
-	MOVQ 3*8(SI), X15;      \
-	VPINSRQ_1_SI_X12(14*8); \
-	VPINSRQ_1_SI_X13_0;     \
-	VPINSRQ_1_SI_X14(9*8);  \
-	VPINSRQ_1_SI_X15(8*8)
-
-// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
-#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
-	MOVQ 5*8(SI), X12;      \
-	MOVQ 8*8(SI), X13;      \
-	MOVQ 0*8(SI), X14;      \
-	MOVQ 6*8(SI), X15;      \
-	VPINSRQ_1_SI_X12(15*8); \
-	VPINSRQ_1_SI_X13(2*8);  \
-	VPINSRQ_1_SI_X14(4*8);  \
-	VPINSRQ_1_SI_X15(10*8)
-
-// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
-#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
-	VMOVDQU 12*8(SI), X12;  \
-	MOVQ    1*8(SI), X13;   \
-	MOVQ    2*8(SI), X14;   \
-	VPINSRQ_1_SI_X13(10*8); \
-	VPINSRQ_1_SI_X14(7*8);  \
-	VMOVDQU 4*8(SI), X15
-
-// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
-#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
-	MOVQ 15*8(SI), X12;     \
-	MOVQ 3*8(SI), X13;      \
-	MOVQ 11*8(SI), X14;     \
-	MOVQ 12*8(SI), X15;     \
-	VPINSRQ_1_SI_X12(9*8);  \
-	VPINSRQ_1_SI_X13(13*8); \
-	VPINSRQ_1_SI_X14(14*8); \
-	VPINSRQ_1_SI_X15_0
+DATA ·AVX2_iv1<>+0(SB)/8, $0x510e527fade682d1
+DATA ·AVX2_iv1<>+8(SB)/8, $0x9b05688c2b3e6c1f
+DATA ·AVX2_iv1<>+16(SB)/8, $0x1f83d9abfb41bd6b
+DATA ·AVX2_iv1<>+24(SB)/8, $0x5be0cd19137e2179
+GLOBL ·AVX2_iv1<>(SB), RODATA|NOPTR, $32
 
 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
-	MOVQ h+0(FP), AX
-	MOVQ c+8(FP), BX
-	MOVQ flag+16(FP), CX
-	MOVQ blocks_base+24(FP), SI
-	MOVQ blocks_len+32(FP), DI
-
-	MOVQ SP, R10
-	ADDQ $15, R10
-	ANDQ $~15, R10
-
-	VMOVDQU ·AVX_c40<>(SB), X0
-	VMOVDQU ·AVX_c48<>(SB), X1
+// Requires: AVX, SSE2
+TEXT ·hashBlocksAVX(SB), NOSPLIT, $288-48
+	MOVQ    h+0(FP), AX
+	MOVQ    c+8(FP), BX
+	MOVQ    flag+16(FP), CX
+	MOVQ    blocks_base+24(FP), SI
+	MOVQ    blocks_len+32(FP), DI
+	MOVQ    SP, R10
+	ADDQ    $0x0f, R10
+	ANDQ    $-16, R10
+	VMOVDQU ·AVX_c40<>+0(SB), X0
+	VMOVDQU ·AVX_c48<>+0(SB), X1
 	VMOVDQA X0, X8
 	VMOVDQA X1, X9
-
-	VMOVDQU ·AVX_iv3<>(SB), X0
-	VMOVDQA X0, 0(R10)
-	XORQ    CX, 0(R10)          // 0(R10) = ·AVX_iv3 ^ (CX || 0)
-
-	VMOVDQU 0(AX), X10
+	VMOVDQU ·AVX_iv3<>+0(SB), X0
+	VMOVDQA X0, (R10)
+	XORQ    CX, (R10)
+	VMOVDQU (AX), X10
 	VMOVDQU 16(AX), X11
 	VMOVDQU 32(AX), X2
 	VMOVDQU 48(AX), X3
-
-	MOVQ 0(BX), R8
-	MOVQ 8(BX), R9
+	MOVQ    (BX), R8
+	MOVQ    8(BX), R9
 
 loop:
-	ADDQ $128, R8
-	CMPQ R8, $128
+	ADDQ $0x80, R8
+	CMPQ R8, $0x80
 	JGE  noinc
 	INCQ R9
 
 noinc:
-	VMOVQ_R8_X15
-	VPINSRQ_1_R9_X15
-
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0xf9
+	BYTE    $0x6e
+	BYTE    $0xf8
+	BYTE    $0xc4
+	BYTE    $0x43
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0xf9
+	BYTE    $0x01
 	VMOVDQA X10, X0
 	VMOVDQA X11, X1
-	VMOVDQU ·AVX_iv0<>(SB), X4
-	VMOVDQU ·AVX_iv1<>(SB), X5
-	VMOVDQU ·AVX_iv2<>(SB), X6
-
+	VMOVDQU ·AVX_iv0<>+0(SB), X4
+	VMOVDQU ·AVX_iv1<>+0(SB), X5
+	VMOVDQU ·AVX_iv2<>+0(SB), X6
 	VPXOR   X15, X6, X6
-	VMOVDQA 0(R10), X7
-
-	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
+	VMOVDQA (R10), X7
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x26
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x20
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x08
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x28
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x10
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x30
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x38
+	BYTE    $0x01
 	VMOVDQA X12, 16(R10)
 	VMOVDQA X13, 32(R10)
 	VMOVDQA X14, 48(R10)
 	VMOVDQA X15, 64(R10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x40
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x48
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x68
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x70
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x58
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x78
+	BYTE    $0x01
 	VMOVDQA X12, 80(R10)
 	VMOVDQA X13, 96(R10)
 	VMOVDQA X14, 112(R10)
 	VMOVDQA X15, 128(R10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x70
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x48
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x50
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x78
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x20
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x68
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x40
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x30
+	BYTE    $0x01
 	VMOVDQA X12, 144(R10)
 	VMOVDQA X13, 160(R10)
 	VMOVDQA X14, 176(R10)
 	VMOVDQA X15, 192(R10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	VPSHUFD $0x4e, (SI), X12
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x58
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x38
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x28
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x10
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x18
+	BYTE    $0x01
 	VMOVDQA X12, 208(R10)
 	VMOVDQA X13, 224(R10)
 	VMOVDQA X14, 240(R10)
 	VMOVDQA X15, 256(R10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
-	SHUFFLE_AVX()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
-	SHUFFLE_AVX()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	VMOVDQU 88(SI), X12
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x28
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x40
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x10
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x78
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x36
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x68
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x50
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x38
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x70
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x08
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x48
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x30
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x20
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x38
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x68
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x48
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x60
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x58
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x08
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x70
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x10
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x20
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x30
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x3e
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x28
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x78
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x40
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x48
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x10
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x36
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x20
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x28
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x38
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x78
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x70
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x30
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x08
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x40
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x58
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x60
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x68
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x10
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x2e
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x58
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x30
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x40
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x18
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x20
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x78
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x68
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x70
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x38
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x08
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x28
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x48
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x70
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x28
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x68
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x08
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x20
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x78
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x50
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	MOVQ    (SI), X12
+	VPSHUFD $0x4e, 64(SI), X13
+	MOVQ    56(SI), X14
+	MOVQ    16(SI), X15
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x30
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x58
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x68
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x58
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x08
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x38
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x70
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x48
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	MOVQ    40(SI), X12
+	MOVQ    64(SI), X13
+	MOVQ    (SI), X14
+	MOVQ    48(SI), X15
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x78
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x10
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x20
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x50
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	MOVQ    48(SI), X12
+	MOVQ    88(SI), X13
+	MOVQ    120(SI), X14
+	MOVQ    24(SI), X15
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x70
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x2e
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x48
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x40
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	VMOVDQU 96(SI), X12
+	MOVQ    8(SI), X13
+	MOVQ    16(SI), X14
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x38
+	BYTE    $0x01
+	VMOVDQU 32(SI), X15
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x50
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x38
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x10
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x30
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x40
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x08
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x20
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x28
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	MOVQ    120(SI), X12
+	MOVQ    24(SI), X13
+	MOVQ    88(SI), X14
+	MOVQ    96(SI), X15
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x48
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x68
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x70
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x3e
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	VPADDQ  16(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  32(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  48(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  64(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	VPADDQ  80(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  96(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  112(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  128(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	VPADDQ  144(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  160(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  176(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  192(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	VPADDQ  208(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  224(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  240(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  256(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
 	VMOVDQU 32(AX), X14
 	VMOVDQU 48(AX), X15
 	VPXOR   X0, X10, X10
@@ -729,16 +4524,36 @@ noinc:
 	VPXOR   X7, X15, X3
 	VMOVDQU X2, 32(AX)
 	VMOVDQU X3, 48(AX)
+	LEAQ    128(SI), SI
+	SUBQ    $0x80, DI
+	JNE     loop
+	VMOVDQU X10, (AX)
+	VMOVDQU X11, 16(AX)
+	MOVQ    R8, (BX)
+	MOVQ    R9, 8(BX)
+	VZEROUPPER
+	RET
 
-	LEAQ 128(SI), SI
-	SUBQ $128, DI
-	JNE  loop
+DATA ·AVX_c40<>+0(SB)/8, $0x0201000706050403
+DATA ·AVX_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·AVX_c40<>(SB), RODATA|NOPTR, $16
 
-	VMOVDQU X10, 0(AX)
-	VMOVDQU X11, 16(AX)
+DATA ·AVX_c48<>+0(SB)/8, $0x0100070605040302
+DATA ·AVX_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·AVX_c48<>(SB), RODATA|NOPTR, $16
 
-	MOVQ R8, 0(BX)
-	MOVQ R9, 8(BX)
-	VZEROUPPER
+DATA ·AVX_iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b
+DATA ·AVX_iv3<>+8(SB)/8, $0x5be0cd19137e2179
+GLOBL ·AVX_iv3<>(SB), RODATA|NOPTR, $16
 
-	RET
+DATA ·AVX_iv0<>+0(SB)/8, $0x6a09e667f3bcc908
+DATA ·AVX_iv0<>+8(SB)/8, $0xbb67ae8584caa73b
+GLOBL ·AVX_iv0<>(SB), RODATA|NOPTR, $16
+
+DATA ·AVX_iv1<>+0(SB)/8, $0x3c6ef372fe94f82b
+DATA ·AVX_iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1
+GLOBL ·AVX_iv1<>(SB), RODATA|NOPTR, $16
+
+DATA ·AVX_iv2<>+0(SB)/8, $0x510e527fade682d1
+DATA ·AVX_iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f
+GLOBL ·AVX_iv2<>(SB), RODATA|NOPTR, $16
diff --git a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
index adfac00c15c9c5f89160f1a6526bb63d26a74091..9a0ce2124462f63b5b72532a12a999769ef9f2bd 100644
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
@@ -1,278 +1,1441 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT.
 
 //go:build amd64 && gc && !purego
 
 #include "textflag.h"
 
-DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
-DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
-GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
-
-DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
-
-DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
-
-#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
-	MOVO       v4, t1; \
-	MOVO       v5, v4; \
-	MOVO       t1, v5; \
-	MOVO       v6, t1; \
-	PUNPCKLQDQ v6, t2; \
-	PUNPCKHQDQ v7, v6; \
-	PUNPCKHQDQ t2, v6; \
-	PUNPCKLQDQ v7, t2; \
-	MOVO       t1, v7; \
-	MOVO       v2, t1; \
-	PUNPCKHQDQ t2, v7; \
-	PUNPCKLQDQ v3, t2; \
-	PUNPCKHQDQ t2, v2; \
-	PUNPCKLQDQ t1, t2; \
-	PUNPCKHQDQ t2, v3
-
-#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
-	MOVO       v4, t1; \
-	MOVO       v5, v4; \
-	MOVO       t1, v5; \
-	MOVO       v2, t1; \
-	PUNPCKLQDQ v2, t2; \
-	PUNPCKHQDQ v3, v2; \
-	PUNPCKHQDQ t2, v2; \
-	PUNPCKLQDQ v3, t2; \
-	MOVO       t1, v3; \
-	MOVO       v6, t1; \
-	PUNPCKHQDQ t2, v3; \
-	PUNPCKLQDQ v7, t2; \
-	PUNPCKHQDQ t2, v6; \
-	PUNPCKLQDQ t1, t2; \
-	PUNPCKHQDQ t2, v7
-
-#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
-	PADDQ  m0, v0;        \
-	PADDQ  m1, v1;        \
-	PADDQ  v2, v0;        \
-	PADDQ  v3, v1;        \
-	PXOR   v0, v6;        \
-	PXOR   v1, v7;        \
-	PSHUFD $0xB1, v6, v6; \
-	PSHUFD $0xB1, v7, v7; \
-	PADDQ  v6, v4;        \
-	PADDQ  v7, v5;        \
-	PXOR   v4, v2;        \
-	PXOR   v5, v3;        \
-	PSHUFB c40, v2;       \
-	PSHUFB c40, v3;       \
-	PADDQ  m2, v0;        \
-	PADDQ  m3, v1;        \
-	PADDQ  v2, v0;        \
-	PADDQ  v3, v1;        \
-	PXOR   v0, v6;        \
-	PXOR   v1, v7;        \
-	PSHUFB c48, v6;       \
-	PSHUFB c48, v7;       \
-	PADDQ  v6, v4;        \
-	PADDQ  v7, v5;        \
-	PXOR   v4, v2;        \
-	PXOR   v5, v3;        \
-	MOVOU  v2, t0;        \
-	PADDQ  v2, t0;        \
-	PSRLQ  $63, v2;       \
-	PXOR   t0, v2;        \
-	MOVOU  v3, t0;        \
-	PADDQ  v3, t0;        \
-	PSRLQ  $63, v3;       \
-	PXOR   t0, v3
-
-#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
-	MOVQ   i0*8(src), m0;     \
-	PINSRQ $1, i1*8(src), m0; \
-	MOVQ   i2*8(src), m1;     \
-	PINSRQ $1, i3*8(src), m1; \
-	MOVQ   i4*8(src), m2;     \
-	PINSRQ $1, i5*8(src), m2; \
-	MOVQ   i6*8(src), m3;     \
-	PINSRQ $1, i7*8(src), m3
-
 // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
-	MOVQ h+0(FP), AX
-	MOVQ c+8(FP), BX
-	MOVQ flag+16(FP), CX
-	MOVQ blocks_base+24(FP), SI
-	MOVQ blocks_len+32(FP), DI
-
-	MOVQ SP, R10
-	ADDQ $15, R10
-	ANDQ $~15, R10
-
-	MOVOU ·iv3<>(SB), X0
-	MOVO  X0, 0(R10)
-	XORQ  CX, 0(R10)     // 0(R10) = ·iv3 ^ (CX || 0)
-
-	MOVOU ·c40<>(SB), X13
-	MOVOU ·c48<>(SB), X14
-
-	MOVOU 0(AX), X12
+// Requires: SSE2, SSE4.1, SSSE3
+TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48
+	MOVQ  h+0(FP), AX
+	MOVQ  c+8(FP), BX
+	MOVQ  flag+16(FP), CX
+	MOVQ  blocks_base+24(FP), SI
+	MOVQ  blocks_len+32(FP), DI
+	MOVQ  SP, R10
+	ADDQ  $0x0f, R10
+	ANDQ  $-16, R10
+	MOVOU ·iv3<>+0(SB), X0
+	MOVO  X0, (R10)
+	XORQ  CX, (R10)
+	MOVOU ·c40<>+0(SB), X13
+	MOVOU ·c48<>+0(SB), X14
+	MOVOU (AX), X12
 	MOVOU 16(AX), X15
-
-	MOVQ 0(BX), R8
-	MOVQ 8(BX), R9
+	MOVQ  (BX), R8
+	MOVQ  8(BX), R9
 
 loop:
-	ADDQ $128, R8
-	CMPQ R8, $128
+	ADDQ $0x80, R8
+	CMPQ R8, $0x80
 	JGE  noinc
 	INCQ R9
 
 noinc:
-	MOVQ R8, X8
-	PINSRQ $1, R9, X8
-
-	MOVO X12, X0
-	MOVO X15, X1
-	MOVOU 32(AX), X2
-	MOVOU 48(AX), X3
-	MOVOU ·iv0<>(SB), X4
-	MOVOU ·iv1<>(SB), X5
-	MOVOU ·iv2<>(SB), X6
-
-	PXOR X8, X6
-	MOVO 0(R10), X7
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
-	MOVO X8, 16(R10)
-	MOVO X9, 32(R10)
-	MOVO X10, 48(R10)
-	MOVO X11, 64(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
-	MOVO X8, 80(R10)
-	MOVO X9, 96(R10)
-	MOVO X10, 112(R10)
-	MOVO X11, 128(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
-	MOVO X8, 144(R10)
-	MOVO X9, 160(R10)
-	MOVO X10, 176(R10)
-	MOVO X11, 192(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
-	MOVO X8, 208(R10)
-	MOVO X9, 224(R10)
-	MOVO X10, 240(R10)
-	MOVO X11, 256(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+	MOVQ       R8, X8
+	PINSRQ     $0x01, R9, X8
+	MOVO       X12, X0
+	MOVO       X15, X1
+	MOVOU      32(AX), X2
+	MOVOU      48(AX), X3
+	MOVOU      ·iv0<>+0(SB), X4
+	MOVOU      ·iv1<>+0(SB), X5
+	MOVOU      ·iv2<>+0(SB), X6
+	PXOR       X8, X6
+	MOVO       (R10), X7
+	MOVQ       (SI), X8
+	PINSRQ     $0x01, 16(SI), X8
+	MOVQ       32(SI), X9
+	PINSRQ     $0x01, 48(SI), X9
+	MOVQ       8(SI), X10
+	PINSRQ     $0x01, 24(SI), X10
+	MOVQ       40(SI), X11
+	PINSRQ     $0x01, 56(SI), X11
+	MOVO       X8, 16(R10)
+	MOVO       X9, 32(R10)
+	MOVO       X10, 48(R10)
+	MOVO       X11, 64(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       64(SI), X8
+	PINSRQ     $0x01, 80(SI), X8
+	MOVQ       96(SI), X9
+	PINSRQ     $0x01, 112(SI), X9
+	MOVQ       72(SI), X10
+	PINSRQ     $0x01, 88(SI), X10
+	MOVQ       104(SI), X11
+	PINSRQ     $0x01, 120(SI), X11
+	MOVO       X8, 80(R10)
+	MOVO       X9, 96(R10)
+	MOVO       X10, 112(R10)
+	MOVO       X11, 128(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       112(SI), X8
+	PINSRQ     $0x01, 32(SI), X8
+	MOVQ       72(SI), X9
+	PINSRQ     $0x01, 104(SI), X9
+	MOVQ       80(SI), X10
+	PINSRQ     $0x01, 64(SI), X10
+	MOVQ       120(SI), X11
+	PINSRQ     $0x01, 48(SI), X11
+	MOVO       X8, 144(R10)
+	MOVO       X9, 160(R10)
+	MOVO       X10, 176(R10)
+	MOVO       X11, 192(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       8(SI), X8
+	PINSRQ     $0x01, (SI), X8
+	MOVQ       88(SI), X9
+	PINSRQ     $0x01, 40(SI), X9
+	MOVQ       96(SI), X10
+	PINSRQ     $0x01, 16(SI), X10
+	MOVQ       56(SI), X11
+	PINSRQ     $0x01, 24(SI), X11
+	MOVO       X8, 208(R10)
+	MOVO       X9, 224(R10)
+	MOVO       X10, 240(R10)
+	MOVO       X11, 256(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       88(SI), X8
+	PINSRQ     $0x01, 96(SI), X8
+	MOVQ       40(SI), X9
+	PINSRQ     $0x01, 120(SI), X9
+	MOVQ       64(SI), X10
+	PINSRQ     $0x01, (SI), X10
+	MOVQ       16(SI), X11
+	PINSRQ     $0x01, 104(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       80(SI), X8
+	PINSRQ     $0x01, 24(SI), X8
+	MOVQ       56(SI), X9
+	PINSRQ     $0x01, 72(SI), X9
+	MOVQ       112(SI), X10
+	PINSRQ     $0x01, 48(SI), X10
+	MOVQ       8(SI), X11
+	PINSRQ     $0x01, 32(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       56(SI), X8
+	PINSRQ     $0x01, 24(SI), X8
+	MOVQ       104(SI), X9
+	PINSRQ     $0x01, 88(SI), X9
+	MOVQ       72(SI), X10
+	PINSRQ     $0x01, 8(SI), X10
+	MOVQ       96(SI), X11
+	PINSRQ     $0x01, 112(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       16(SI), X8
+	PINSRQ     $0x01, 40(SI), X8
+	MOVQ       32(SI), X9
+	PINSRQ     $0x01, 120(SI), X9
+	MOVQ       48(SI), X10
+	PINSRQ     $0x01, 80(SI), X10
+	MOVQ       (SI), X11
+	PINSRQ     $0x01, 64(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       72(SI), X8
+	PINSRQ     $0x01, 40(SI), X8
+	MOVQ       16(SI), X9
+	PINSRQ     $0x01, 80(SI), X9
+	MOVQ       (SI), X10
+	PINSRQ     $0x01, 56(SI), X10
+	MOVQ       32(SI), X11
+	PINSRQ     $0x01, 120(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       112(SI), X8
+	PINSRQ     $0x01, 88(SI), X8
+	MOVQ       48(SI), X9
+	PINSRQ     $0x01, 24(SI), X9
+	MOVQ       8(SI), X10
+	PINSRQ     $0x01, 96(SI), X10
+	MOVQ       64(SI), X11
+	PINSRQ     $0x01, 104(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       16(SI), X8
+	PINSRQ     $0x01, 48(SI), X8
+	MOVQ       (SI), X9
+	PINSRQ     $0x01, 64(SI), X9
+	MOVQ       96(SI), X10
+	PINSRQ     $0x01, 80(SI), X10
+	MOVQ       88(SI), X11
+	PINSRQ     $0x01, 24(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       32(SI), X8
+	PINSRQ     $0x01, 56(SI), X8
+	MOVQ       120(SI), X9
+	PINSRQ     $0x01, 8(SI), X9
+	MOVQ       104(SI), X10
+	PINSRQ     $0x01, 40(SI), X10
+	MOVQ       112(SI), X11
+	PINSRQ     $0x01, 72(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       96(SI), X8
+	PINSRQ     $0x01, 8(SI), X8
+	MOVQ       112(SI), X9
+	PINSRQ     $0x01, 32(SI), X9
+	MOVQ       40(SI), X10
+	PINSRQ     $0x01, 120(SI), X10
+	MOVQ       104(SI), X11
+	PINSRQ     $0x01, 80(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       (SI), X8
+	PINSRQ     $0x01, 48(SI), X8
+	MOVQ       72(SI), X9
+	PINSRQ     $0x01, 64(SI), X9
+	MOVQ       56(SI), X10
+	PINSRQ     $0x01, 24(SI), X10
+	MOVQ       16(SI), X11
+	PINSRQ     $0x01, 88(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       104(SI), X8
+	PINSRQ     $0x01, 56(SI), X8
+	MOVQ       96(SI), X9
+	PINSRQ     $0x01, 24(SI), X9
+	MOVQ       88(SI), X10
+	PINSRQ     $0x01, 112(SI), X10
+	MOVQ       8(SI), X11
+	PINSRQ     $0x01, 72(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       40(SI), X8
+	PINSRQ     $0x01, 120(SI), X8
+	MOVQ       64(SI), X9
+	PINSRQ     $0x01, 16(SI), X9
+	MOVQ       (SI), X10
+	PINSRQ     $0x01, 32(SI), X10
+	MOVQ       48(SI), X11
+	PINSRQ     $0x01, 80(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       48(SI), X8
+	PINSRQ     $0x01, 112(SI), X8
+	MOVQ       88(SI), X9
+	PINSRQ     $0x01, (SI), X9
+	MOVQ       120(SI), X10
+	PINSRQ     $0x01, 72(SI), X10
+	MOVQ       24(SI), X11
+	PINSRQ     $0x01, 64(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       96(SI), X8
+	PINSRQ     $0x01, 104(SI), X8
+	MOVQ       8(SI), X9
+	PINSRQ     $0x01, 80(SI), X9
+	MOVQ       16(SI), X10
+	PINSRQ     $0x01, 56(SI), X10
+	MOVQ       32(SI), X11
+	PINSRQ     $0x01, 40(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       80(SI), X8
+	PINSRQ     $0x01, 64(SI), X8
+	MOVQ       56(SI), X9
+	PINSRQ     $0x01, 8(SI), X9
+	MOVQ       16(SI), X10
+	PINSRQ     $0x01, 32(SI), X10
+	MOVQ       48(SI), X11
+	PINSRQ     $0x01, 40(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       120(SI), X8
+	PINSRQ     $0x01, 72(SI), X8
+	MOVQ       24(SI), X9
+	PINSRQ     $0x01, 104(SI), X9
+	MOVQ       88(SI), X10
+	PINSRQ     $0x01, 112(SI), X10
+	MOVQ       96(SI), X11
+	PINSRQ     $0x01, (SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	PADDQ      16(R10), X0
+	PADDQ      32(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      48(R10), X0
+	PADDQ      64(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	PADDQ      80(R10), X0
+	PADDQ      96(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      112(R10), X0
+	PADDQ      128(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	PADDQ      144(R10), X0
+	PADDQ      160(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      176(R10), X0
+	PADDQ      192(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	PADDQ      208(R10), X0
+	PADDQ      224(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      240(R10), X0
+	PADDQ      256(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVOU      32(AX), X10
+	MOVOU      48(AX), X11
+	PXOR       X0, X12
+	PXOR       X1, X15
+	PXOR       X2, X10
+	PXOR       X3, X11
+	PXOR       X4, X12
+	PXOR       X5, X15
+	PXOR       X6, X10
+	PXOR       X7, X11
+	MOVOU      X10, 32(AX)
+	MOVOU      X11, 48(AX)
+	LEAQ       128(SI), SI
+	SUBQ       $0x80, DI
+	JNE        loop
+	MOVOU      X12, (AX)
+	MOVOU      X15, 16(AX)
+	MOVQ       R8, (BX)
+	MOVQ       R9, 8(BX)
+	RET
 
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b
+DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179
+GLOBL ·iv3<>(SB), RODATA|NOPTR, $16
 
-	MOVOU 32(AX), X10
-	MOVOU 48(AX), X11
-	PXOR  X0, X12
-	PXOR  X1, X15
-	PXOR  X2, X10
-	PXOR  X3, X11
-	PXOR  X4, X12
-	PXOR  X5, X15
-	PXOR  X6, X10
-	PXOR  X7, X11
-	MOVOU X10, 32(AX)
-	MOVOU X11, 48(AX)
+DATA ·c40<>+0(SB)/8, $0x0201000706050403
+DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·c40<>(SB), RODATA|NOPTR, $16
 
-	LEAQ 128(SI), SI
-	SUBQ $128, DI
-	JNE  loop
+DATA ·c48<>+0(SB)/8, $0x0100070605040302
+DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·c48<>(SB), RODATA|NOPTR, $16
 
-	MOVOU X12, 0(AX)
-	MOVOU X15, 16(AX)
+DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908
+DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b
+GLOBL ·iv0<>(SB), RODATA|NOPTR, $16
 
-	MOVQ R8, 0(BX)
-	MOVQ R9, 8(BX)
+DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b
+DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1
+GLOBL ·iv1<>(SB), RODATA|NOPTR, $16
 
-	RET
+DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1
+DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f
+GLOBL ·iv2<>(SB), RODATA|NOPTR, $16
diff --git a/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s b/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s
index fe4b818a33ac7946e5053bd392729f908433120a..57d510fc08de53182b0ecb400dc1f2f68e977609 100644
--- a/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s
@@ -1,432 +1,2173 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blake2s_amd64_asm.go -out ../blake2s_amd64.s -pkg blake2s. DO NOT EDIT.
 
 //go:build amd64 && gc && !purego
 
 #include "textflag.h"
 
-DATA iv0<>+0x00(SB)/4, $0x6a09e667
-DATA iv0<>+0x04(SB)/4, $0xbb67ae85
-DATA iv0<>+0x08(SB)/4, $0x3c6ef372
-DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
-GLOBL iv0<>(SB), (NOPTR+RODATA), $16
-
-DATA iv1<>+0x00(SB)/4, $0x510e527f
-DATA iv1<>+0x04(SB)/4, $0x9b05688c
-DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
-DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
-GLOBL iv1<>(SB), (NOPTR+RODATA), $16
-
-DATA rol16<>+0x00(SB)/8, $0x0504070601000302
-DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-GLOBL rol16<>(SB), (NOPTR+RODATA), $16
-
-DATA rol8<>+0x00(SB)/8, $0x0407060500030201
-DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
-GLOBL rol8<>(SB), (NOPTR+RODATA), $16
-
-DATA counter<>+0x00(SB)/8, $0x40
-DATA counter<>+0x08(SB)/8, $0x0
-GLOBL counter<>(SB), (NOPTR+RODATA), $16
-
-#define ROTL_SSE2(n, t, v) \
-	MOVO  v, t;       \
-	PSLLL $n, t;      \
-	PSRLL $(32-n), v; \
-	PXOR  t, v
-
-#define ROTL_SSSE3(c, v) \
-	PSHUFB c, v
-
-#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
-	PADDL  m0, v0;        \
-	PADDL  v1, v0;        \
-	PXOR   v0, v3;        \
-	ROTL_SSE2(16, t, v3); \
-	PADDL  v3, v2;        \
-	PXOR   v2, v1;        \
-	ROTL_SSE2(20, t, v1); \
-	PADDL  m1, v0;        \
-	PADDL  v1, v0;        \
-	PXOR   v0, v3;        \
-	ROTL_SSE2(24, t, v3); \
-	PADDL  v3, v2;        \
-	PXOR   v2, v1;        \
-	ROTL_SSE2(25, t, v1); \
-	PSHUFL $0x39, v1, v1; \
-	PSHUFL $0x4E, v2, v2; \
-	PSHUFL $0x93, v3, v3; \
-	PADDL  m2, v0;        \
-	PADDL  v1, v0;        \
-	PXOR   v0, v3;        \
-	ROTL_SSE2(16, t, v3); \
-	PADDL  v3, v2;        \
-	PXOR   v2, v1;        \
-	ROTL_SSE2(20, t, v1); \
-	PADDL  m3, v0;        \
-	PADDL  v1, v0;        \
-	PXOR   v0, v3;        \
-	ROTL_SSE2(24, t, v3); \
-	PADDL  v3, v2;        \
-	PXOR   v2, v1;        \
-	ROTL_SSE2(25, t, v1); \
-	PSHUFL $0x39, v3, v3; \
-	PSHUFL $0x4E, v2, v2; \
-	PSHUFL $0x93, v1, v1
-
-#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
-	PADDL  m0, v0;        \
-	PADDL  v1, v0;        \
-	PXOR   v0, v3;        \
-	ROTL_SSSE3(c16, v3);  \
-	PADDL  v3, v2;        \
-	PXOR   v2, v1;        \
-	ROTL_SSE2(20, t, v1); \
-	PADDL  m1, v0;        \
-	PADDL  v1, v0;        \
-	PXOR   v0, v3;        \
-	ROTL_SSSE3(c8, v3);   \
-	PADDL  v3, v2;        \
-	PXOR   v2, v1;        \
-	ROTL_SSE2(25, t, v1); \
-	PSHUFL $0x39, v1, v1; \
-	PSHUFL $0x4E, v2, v2; \
-	PSHUFL $0x93, v3, v3; \
-	PADDL  m2, v0;        \
-	PADDL  v1, v0;        \
-	PXOR   v0, v3;        \
-	ROTL_SSSE3(c16, v3);  \
-	PADDL  v3, v2;        \
-	PXOR   v2, v1;        \
-	ROTL_SSE2(20, t, v1); \
-	PADDL  m3, v0;        \
-	PADDL  v1, v0;        \
-	PXOR   v0, v3;        \
-	ROTL_SSSE3(c8, v3);   \
-	PADDL  v3, v2;        \
-	PXOR   v2, v1;        \
-	ROTL_SSE2(25, t, v1); \
-	PSHUFL $0x39, v3, v3; \
-	PSHUFL $0x4E, v2, v2; \
-	PSHUFL $0x93, v1, v1
-
-
-#define LOAD_MSG_SSE4(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15) \
-	MOVL   i0*4(src), m0;      \
-	PINSRD $1, i1*4(src), m0;  \
-	PINSRD $2, i2*4(src), m0;  \
-	PINSRD $3, i3*4(src), m0;  \
-	MOVL   i4*4(src), m1;      \
-	PINSRD $1, i5*4(src), m1;  \
-	PINSRD $2, i6*4(src), m1;  \
-	PINSRD $3, i7*4(src), m1;  \
-	MOVL   i8*4(src), m2;      \
-	PINSRD $1, i9*4(src), m2;  \
-	PINSRD $2, i10*4(src), m2; \
-	PINSRD $3, i11*4(src), m2; \
-	MOVL   i12*4(src), m3;     \
-	PINSRD $1, i13*4(src), m3; \
-	PINSRD $2, i14*4(src), m3; \
-	PINSRD $3, i15*4(src), m3
+// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+// Requires: SSE2
+TEXT ·hashBlocksSSE2(SB), $672-48
+	MOVQ  h+0(FP), AX
+	MOVQ  c+8(FP), BX
+	MOVL  flag+16(FP), CX
+	MOVQ  blocks_base+24(FP), SI
+	MOVQ  blocks_len+32(FP), DX
+	MOVQ  SP, BP
+	ADDQ  $0x0f, BP
+	ANDQ  $-16, BP
+	MOVQ  (BX), R9
+	MOVQ  R9, (BP)
+	MOVQ  CX, 8(BP)
+	MOVOU (AX), X0
+	MOVOU 16(AX), X1
+	MOVOU iv0<>+0(SB), X2
+	MOVOU iv1<>+0(SB), X3
+	MOVOU counter<>+0(SB), X12
+	MOVOU rol16<>+0(SB), X13
+	MOVOU rol8<>+0(SB), X14
+	MOVO  (BP), X15
 
-#define PRECOMPUTE_MSG(dst, off, src, R8, R9, R10, R11, R12, R13, R14, R15) \
-	MOVQ 0*4(src), R8;           \
-	MOVQ 2*4(src), R9;           \
-	MOVQ 4*4(src), R10;          \
-	MOVQ 6*4(src), R11;          \
-	MOVQ 8*4(src), R12;          \
-	MOVQ 10*4(src), R13;         \
-	MOVQ 12*4(src), R14;         \
-	MOVQ 14*4(src), R15;         \
-	                             \
-	MOVL R8, 0*4+off+0(dst);     \
-	MOVL R8, 9*4+off+64(dst);    \
-	MOVL R8, 5*4+off+128(dst);   \
-	MOVL R8, 14*4+off+192(dst);  \
-	MOVL R8, 4*4+off+256(dst);   \
-	MOVL R8, 2*4+off+320(dst);   \
-	MOVL R8, 8*4+off+384(dst);   \
-	MOVL R8, 12*4+off+448(dst);  \
-	MOVL R8, 3*4+off+512(dst);   \
-	MOVL R8, 15*4+off+576(dst);  \
-	SHRQ $32, R8;                \
-	MOVL R8, 4*4+off+0(dst);     \
-	MOVL R8, 8*4+off+64(dst);    \
-	MOVL R8, 14*4+off+128(dst);  \
-	MOVL R8, 5*4+off+192(dst);   \
-	MOVL R8, 12*4+off+256(dst);  \
-	MOVL R8, 11*4+off+320(dst);  \
-	MOVL R8, 1*4+off+384(dst);   \
-	MOVL R8, 6*4+off+448(dst);   \
-	MOVL R8, 10*4+off+512(dst);  \
-	MOVL R8, 3*4+off+576(dst);   \
-	                             \
-	MOVL R9, 1*4+off+0(dst);     \
-	MOVL R9, 13*4+off+64(dst);   \
-	MOVL R9, 6*4+off+128(dst);   \
-	MOVL R9, 8*4+off+192(dst);   \
-	MOVL R9, 2*4+off+256(dst);   \
-	MOVL R9, 0*4+off+320(dst);   \
-	MOVL R9, 14*4+off+384(dst);  \
-	MOVL R9, 11*4+off+448(dst);  \
-	MOVL R9, 12*4+off+512(dst);  \
-	MOVL R9, 4*4+off+576(dst);   \
-	SHRQ $32, R9;                \
-	MOVL R9, 5*4+off+0(dst);     \
-	MOVL R9, 15*4+off+64(dst);   \
-	MOVL R9, 9*4+off+128(dst);   \
-	MOVL R9, 1*4+off+192(dst);   \
-	MOVL R9, 11*4+off+256(dst);  \
-	MOVL R9, 7*4+off+320(dst);   \
-	MOVL R9, 13*4+off+384(dst);  \
-	MOVL R9, 3*4+off+448(dst);   \
-	MOVL R9, 6*4+off+512(dst);   \
-	MOVL R9, 10*4+off+576(dst);  \
-	                             \
-	MOVL R10, 2*4+off+0(dst);    \
-	MOVL R10, 1*4+off+64(dst);   \
-	MOVL R10, 15*4+off+128(dst); \
-	MOVL R10, 10*4+off+192(dst); \
-	MOVL R10, 6*4+off+256(dst);  \
-	MOVL R10, 8*4+off+320(dst);  \
-	MOVL R10, 3*4+off+384(dst);  \
-	MOVL R10, 13*4+off+448(dst); \
-	MOVL R10, 14*4+off+512(dst); \
-	MOVL R10, 5*4+off+576(dst);  \
-	SHRQ $32, R10;               \
-	MOVL R10, 6*4+off+0(dst);    \
-	MOVL R10, 11*4+off+64(dst);  \
-	MOVL R10, 2*4+off+128(dst);  \
-	MOVL R10, 9*4+off+192(dst);  \
-	MOVL R10, 1*4+off+256(dst);  \
-	MOVL R10, 13*4+off+320(dst); \
-	MOVL R10, 4*4+off+384(dst);  \
-	MOVL R10, 8*4+off+448(dst);  \
-	MOVL R10, 15*4+off+512(dst); \
-	MOVL R10, 7*4+off+576(dst);  \
-	                             \
-	MOVL R11, 3*4+off+0(dst);    \
-	MOVL R11, 7*4+off+64(dst);   \
-	MOVL R11, 13*4+off+128(dst); \
-	MOVL R11, 12*4+off+192(dst); \
-	MOVL R11, 10*4+off+256(dst); \
-	MOVL R11, 1*4+off+320(dst);  \
-	MOVL R11, 9*4+off+384(dst);  \
-	MOVL R11, 14*4+off+448(dst); \
-	MOVL R11, 0*4+off+512(dst);  \
-	MOVL R11, 6*4+off+576(dst);  \
-	SHRQ $32, R11;               \
-	MOVL R11, 7*4+off+0(dst);    \
-	MOVL R11, 14*4+off+64(dst);  \
-	MOVL R11, 10*4+off+128(dst); \
-	MOVL R11, 0*4+off+192(dst);  \
-	MOVL R11, 5*4+off+256(dst);  \
-	MOVL R11, 9*4+off+320(dst);  \
-	MOVL R11, 12*4+off+384(dst); \
-	MOVL R11, 1*4+off+448(dst);  \
-	MOVL R11, 13*4+off+512(dst); \
-	MOVL R11, 2*4+off+576(dst);  \
-	                             \
-	MOVL R12, 8*4+off+0(dst);    \
-	MOVL R12, 5*4+off+64(dst);   \
-	MOVL R12, 4*4+off+128(dst);  \
-	MOVL R12, 15*4+off+192(dst); \
-	MOVL R12, 14*4+off+256(dst); \
-	MOVL R12, 3*4+off+320(dst);  \
-	MOVL R12, 11*4+off+384(dst); \
-	MOVL R12, 10*4+off+448(dst); \
-	MOVL R12, 7*4+off+512(dst);  \
-	MOVL R12, 1*4+off+576(dst);  \
-	SHRQ $32, R12;               \
-	MOVL R12, 12*4+off+0(dst);   \
-	MOVL R12, 2*4+off+64(dst);   \
-	MOVL R12, 11*4+off+128(dst); \
-	MOVL R12, 4*4+off+192(dst);  \
-	MOVL R12, 0*4+off+256(dst);  \
-	MOVL R12, 15*4+off+320(dst); \
-	MOVL R12, 10*4+off+384(dst); \
-	MOVL R12, 7*4+off+448(dst);  \
-	MOVL R12, 5*4+off+512(dst);  \
-	MOVL R12, 9*4+off+576(dst);  \
-	                             \
-	MOVL R13, 9*4+off+0(dst);    \
-	MOVL R13, 4*4+off+64(dst);   \
-	MOVL R13, 8*4+off+128(dst);  \
-	MOVL R13, 13*4+off+192(dst); \
-	MOVL R13, 3*4+off+256(dst);  \
-	MOVL R13, 5*4+off+320(dst);  \
-	MOVL R13, 7*4+off+384(dst);  \
-	MOVL R13, 15*4+off+448(dst); \
-	MOVL R13, 11*4+off+512(dst); \
-	MOVL R13, 0*4+off+576(dst);  \
-	SHRQ $32, R13;               \
-	MOVL R13, 13*4+off+0(dst);   \
-	MOVL R13, 10*4+off+64(dst);  \
-	MOVL R13, 0*4+off+128(dst);  \
-	MOVL R13, 3*4+off+192(dst);  \
-	MOVL R13, 9*4+off+256(dst);  \
-	MOVL R13, 6*4+off+320(dst);  \
-	MOVL R13, 15*4+off+384(dst); \
-	MOVL R13, 4*4+off+448(dst);  \
-	MOVL R13, 2*4+off+512(dst);  \
-	MOVL R13, 12*4+off+576(dst); \
-	                             \
-	MOVL R14, 10*4+off+0(dst);   \
-	MOVL R14, 12*4+off+64(dst);  \
-	MOVL R14, 1*4+off+128(dst);  \
-	MOVL R14, 6*4+off+192(dst);  \
-	MOVL R14, 13*4+off+256(dst); \
-	MOVL R14, 4*4+off+320(dst);  \
-	MOVL R14, 0*4+off+384(dst);  \
-	MOVL R14, 2*4+off+448(dst);  \
-	MOVL R14, 8*4+off+512(dst);  \
-	MOVL R14, 14*4+off+576(dst); \
-	SHRQ $32, R14;               \
-	MOVL R14, 14*4+off+0(dst);   \
-	MOVL R14, 3*4+off+64(dst);   \
-	MOVL R14, 7*4+off+128(dst);  \
-	MOVL R14, 2*4+off+192(dst);  \
-	MOVL R14, 15*4+off+256(dst); \
-	MOVL R14, 12*4+off+320(dst); \
-	MOVL R14, 6*4+off+384(dst);  \
-	MOVL R14, 0*4+off+448(dst);  \
-	MOVL R14, 9*4+off+512(dst);  \
-	MOVL R14, 11*4+off+576(dst); \
-	                             \
-	MOVL R15, 11*4+off+0(dst);   \
-	MOVL R15, 0*4+off+64(dst);   \
-	MOVL R15, 12*4+off+128(dst); \
-	MOVL R15, 7*4+off+192(dst);  \
-	MOVL R15, 8*4+off+256(dst);  \
-	MOVL R15, 14*4+off+320(dst); \
-	MOVL R15, 2*4+off+384(dst);  \
-	MOVL R15, 5*4+off+448(dst);  \
-	MOVL R15, 1*4+off+512(dst);  \
-	MOVL R15, 13*4+off+576(dst); \
-	SHRQ $32, R15;               \
-	MOVL R15, 15*4+off+0(dst);   \
-	MOVL R15, 6*4+off+64(dst);   \
-	MOVL R15, 3*4+off+128(dst);  \
-	MOVL R15, 11*4+off+192(dst); \
-	MOVL R15, 7*4+off+256(dst);  \
-	MOVL R15, 10*4+off+320(dst); \
-	MOVL R15, 5*4+off+384(dst);  \
-	MOVL R15, 9*4+off+448(dst);  \
-	MOVL R15, 4*4+off+512(dst);  \
-	MOVL R15, 8*4+off+576(dst)
+loop:
+	MOVO   X0, X4
+	MOVO   X1, X5
+	MOVO   X2, X6
+	MOVO   X3, X7
+	PADDQ  X12, X15
+	PXOR   X15, X7
+	MOVQ   (SI), R8
+	MOVQ   8(SI), R9
+	MOVQ   16(SI), R10
+	MOVQ   24(SI), R11
+	MOVQ   32(SI), R12
+	MOVQ   40(SI), R13
+	MOVQ   48(SI), R14
+	MOVQ   56(SI), R15
+	MOVL   R8, 16(BP)
+	MOVL   R8, 116(BP)
+	MOVL   R8, 164(BP)
+	MOVL   R8, 264(BP)
+	MOVL   R8, 288(BP)
+	MOVL   R8, 344(BP)
+	MOVL   R8, 432(BP)
+	MOVL   R8, 512(BP)
+	MOVL   R8, 540(BP)
+	MOVL   R8, 652(BP)
+	SHRQ   $0x20, R8
+	MOVL   R8, 32(BP)
+	MOVL   R8, 112(BP)
+	MOVL   R8, 200(BP)
+	MOVL   R8, 228(BP)
+	MOVL   R8, 320(BP)
+	MOVL   R8, 380(BP)
+	MOVL   R8, 404(BP)
+	MOVL   R8, 488(BP)
+	MOVL   R8, 568(BP)
+	MOVL   R8, 604(BP)
+	MOVL   R9, 20(BP)
+	MOVL   R9, 132(BP)
+	MOVL   R9, 168(BP)
+	MOVL   R9, 240(BP)
+	MOVL   R9, 280(BP)
+	MOVL   R9, 336(BP)
+	MOVL   R9, 456(BP)
+	MOVL   R9, 508(BP)
+	MOVL   R9, 576(BP)
+	MOVL   R9, 608(BP)
+	SHRQ   $0x20, R9
+	MOVL   R9, 36(BP)
+	MOVL   R9, 140(BP)
+	MOVL   R9, 180(BP)
+	MOVL   R9, 212(BP)
+	MOVL   R9, 316(BP)
+	MOVL   R9, 364(BP)
+	MOVL   R9, 452(BP)
+	MOVL   R9, 476(BP)
+	MOVL   R9, 552(BP)
+	MOVL   R9, 632(BP)
+	MOVL   R10, 24(BP)
+	MOVL   R10, 84(BP)
+	MOVL   R10, 204(BP)
+	MOVL   R10, 248(BP)
+	MOVL   R10, 296(BP)
+	MOVL   R10, 368(BP)
+	MOVL   R10, 412(BP)
+	MOVL   R10, 516(BP)
+	MOVL   R10, 584(BP)
+	MOVL   R10, 612(BP)
+	SHRQ   $0x20, R10
+	MOVL   R10, 40(BP)
+	MOVL   R10, 124(BP)
+	MOVL   R10, 152(BP)
+	MOVL   R10, 244(BP)
+	MOVL   R10, 276(BP)
+	MOVL   R10, 388(BP)
+	MOVL   R10, 416(BP)
+	MOVL   R10, 496(BP)
+	MOVL   R10, 588(BP)
+	MOVL   R10, 620(BP)
+	MOVL   R11, 28(BP)
+	MOVL   R11, 108(BP)
+	MOVL   R11, 196(BP)
+	MOVL   R11, 256(BP)
+	MOVL   R11, 312(BP)
+	MOVL   R11, 340(BP)
+	MOVL   R11, 436(BP)
+	MOVL   R11, 520(BP)
+	MOVL   R11, 528(BP)
+	MOVL   R11, 616(BP)
+	SHRQ   $0x20, R11
+	MOVL   R11, 44(BP)
+	MOVL   R11, 136(BP)
+	MOVL   R11, 184(BP)
+	MOVL   R11, 208(BP)
+	MOVL   R11, 292(BP)
+	MOVL   R11, 372(BP)
+	MOVL   R11, 448(BP)
+	MOVL   R11, 468(BP)
+	MOVL   R11, 580(BP)
+	MOVL   R11, 600(BP)
+	MOVL   R12, 48(BP)
+	MOVL   R12, 100(BP)
+	MOVL   R12, 160(BP)
+	MOVL   R12, 268(BP)
+	MOVL   R12, 328(BP)
+	MOVL   R12, 348(BP)
+	MOVL   R12, 444(BP)
+	MOVL   R12, 504(BP)
+	MOVL   R12, 556(BP)
+	MOVL   R12, 596(BP)
+	SHRQ   $0x20, R12
+	MOVL   R12, 64(BP)
+	MOVL   R12, 88(BP)
+	MOVL   R12, 188(BP)
+	MOVL   R12, 224(BP)
+	MOVL   R12, 272(BP)
+	MOVL   R12, 396(BP)
+	MOVL   R12, 440(BP)
+	MOVL   R12, 492(BP)
+	MOVL   R12, 548(BP)
+	MOVL   R12, 628(BP)
+	MOVL   R13, 52(BP)
+	MOVL   R13, 96(BP)
+	MOVL   R13, 176(BP)
+	MOVL   R13, 260(BP)
+	MOVL   R13, 284(BP)
+	MOVL   R13, 356(BP)
+	MOVL   R13, 428(BP)
+	MOVL   R13, 524(BP)
+	MOVL   R13, 572(BP)
+	MOVL   R13, 592(BP)
+	SHRQ   $0x20, R13
+	MOVL   R13, 68(BP)
+	MOVL   R13, 120(BP)
+	MOVL   R13, 144(BP)
+	MOVL   R13, 220(BP)
+	MOVL   R13, 308(BP)
+	MOVL   R13, 360(BP)
+	MOVL   R13, 460(BP)
+	MOVL   R13, 480(BP)
+	MOVL   R13, 536(BP)
+	MOVL   R13, 640(BP)
+	MOVL   R14, 56(BP)
+	MOVL   R14, 128(BP)
+	MOVL   R14, 148(BP)
+	MOVL   R14, 232(BP)
+	MOVL   R14, 324(BP)
+	MOVL   R14, 352(BP)
+	MOVL   R14, 400(BP)
+	MOVL   R14, 472(BP)
+	MOVL   R14, 560(BP)
+	MOVL   R14, 648(BP)
+	SHRQ   $0x20, R14
+	MOVL   R14, 72(BP)
+	MOVL   R14, 92(BP)
+	MOVL   R14, 172(BP)
+	MOVL   R14, 216(BP)
+	MOVL   R14, 332(BP)
+	MOVL   R14, 384(BP)
+	MOVL   R14, 424(BP)
+	MOVL   R14, 464(BP)
+	MOVL   R14, 564(BP)
+	MOVL   R14, 636(BP)
+	MOVL   R15, 60(BP)
+	MOVL   R15, 80(BP)
+	MOVL   R15, 192(BP)
+	MOVL   R15, 236(BP)
+	MOVL   R15, 304(BP)
+	MOVL   R15, 392(BP)
+	MOVL   R15, 408(BP)
+	MOVL   R15, 484(BP)
+	MOVL   R15, 532(BP)
+	MOVL   R15, 644(BP)
+	SHRQ   $0x20, R15
+	MOVL   R15, 76(BP)
+	MOVL   R15, 104(BP)
+	MOVL   R15, 156(BP)
+	MOVL   R15, 252(BP)
+	MOVL   R15, 300(BP)
+	MOVL   R15, 376(BP)
+	MOVL   R15, 420(BP)
+	MOVL   R15, 500(BP)
+	MOVL   R15, 544(BP)
+	MOVL   R15, 624(BP)
+	PADDL  16(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  32(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  48(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  64(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  80(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  96(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  112(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  128(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  144(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  160(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  176(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  192(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  208(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  224(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  240(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  256(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  272(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  288(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  304(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  320(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  336(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  352(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  368(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  384(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  400(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  416(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  432(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  448(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  464(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  480(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  496(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  512(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  528(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  544(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  560(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  576(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  592(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  608(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  624(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x10, X8
+	PSRLL  $0x10, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  640(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	MOVO   X7, X8
+	PSLLL  $0x18, X8
+	PSRLL  $0x08, X7
+	PXOR   X8, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PXOR   X4, X0
+	PXOR   X5, X1
+	PXOR   X6, X0
+	PXOR   X7, X1
+	LEAQ   64(SI), SI
+	SUBQ   $0x40, DX
+	JNE    loop
+	MOVO   X15, (BP)
+	MOVQ   (BP), R9
+	MOVQ   R9, (BX)
+	MOVOU  X0, (AX)
+	MOVOU  X1, 16(AX)
+	RET
 
-#define BLAKE2s_SSE2() \
-	PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15);               \
-	ROUND_SSE2(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8);                 \
-	ROUND_SSE2(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8);     \
-	ROUND_SSE2(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8); \
-	ROUND_SSE2(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8); \
-	ROUND_SSE2(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8); \
-	ROUND_SSE2(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8); \
-	ROUND_SSE2(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8); \
-	ROUND_SSE2(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8); \
-	ROUND_SSE2(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8); \
-	ROUND_SSE2(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8)
+DATA iv0<>+0(SB)/4, $0x6a09e667
+DATA iv0<>+4(SB)/4, $0xbb67ae85
+DATA iv0<>+8(SB)/4, $0x3c6ef372
+DATA iv0<>+12(SB)/4, $0xa54ff53a
+GLOBL iv0<>(SB), RODATA|NOPTR, $16
 
-#define BLAKE2s_SSSE3() \
-	PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15);                          \
-	ROUND_SSSE3(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8, X13, X14);                 \
-	ROUND_SSSE3(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8, X13, X14);     \
-	ROUND_SSSE3(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8, X13, X14); \
-	ROUND_SSSE3(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8, X13, X14); \
-	ROUND_SSSE3(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8, X13, X14); \
-	ROUND_SSSE3(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8, X13, X14); \
-	ROUND_SSSE3(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8, X13, X14); \
-	ROUND_SSSE3(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8, X13, X14); \
-	ROUND_SSSE3(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8, X13, X14); \
-	ROUND_SSSE3(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8, X13, X14)
+DATA iv1<>+0(SB)/4, $0x510e527f
+DATA iv1<>+4(SB)/4, $0x9b05688c
+DATA iv1<>+8(SB)/4, $0x1f83d9ab
+DATA iv1<>+12(SB)/4, $0x5be0cd19
+GLOBL iv1<>(SB), RODATA|NOPTR, $16
 
-#define BLAKE2s_SSE4() \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14);                               \
-	LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0); \
-	ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14)
+DATA counter<>+0(SB)/8, $0x0000000000000040
+DATA counter<>+8(SB)/8, $0x0000000000000000
+GLOBL counter<>(SB), RODATA|NOPTR, $16
 
-#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_FUNC) \
-	MOVQ  h, AX;                   \
-	MOVQ  c, BX;                   \
-	MOVL  flag, CX;                \
-	MOVQ  blocks_base, SI;         \
-	MOVQ  blocks_len, DX;          \
-	                               \
-	MOVQ  SP, BP;                  \
-	ADDQ  $15, BP;                 \
-	ANDQ  $~15, BP;                \
-	                               \
-	MOVQ  0(BX), R9;               \
-	MOVQ  R9, 0(BP);               \
-	MOVQ  CX, 8(BP);               \
-	                               \
-	MOVOU 0(AX), X0;               \
-	MOVOU 16(AX), X1;              \
-	MOVOU iv0<>(SB), X2;           \
-	MOVOU iv1<>(SB), X3            \
-	                               \
-	MOVOU counter<>(SB), X12;      \
-	MOVOU rol16<>(SB), X13;        \
-	MOVOU rol8<>(SB), X14;         \
-	MOVO  0(BP), X15;              \
-	                               \
-	loop:                          \
-	MOVO  X0, X4;                  \
-	MOVO  X1, X5;                  \
-	MOVO  X2, X6;                  \
-	MOVO  X3, X7;                  \
-	                               \
-	PADDQ X12, X15;                \
-	PXOR  X15, X7;                 \
-	                               \
-	BLAKE2s_FUNC();                \
-	                               \
-	PXOR  X4, X0;                  \
-	PXOR  X5, X1;                  \
-	PXOR  X6, X0;                  \
-	PXOR  X7, X1;                  \
-	                               \
-	LEAQ  64(SI), SI;              \
-	SUBQ  $64, DX;                 \
-	JNE   loop;                    \
-	                               \
-	MOVO  X15, 0(BP);              \
-	MOVQ  0(BP), R9;               \
-	MOVQ  R9, 0(BX);               \
-	                               \
-	MOVOU X0, 0(AX);               \
-	MOVOU X1, 16(AX)
+DATA rol16<>+0(SB)/8, $0x0504070601000302
+DATA rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
+GLOBL rol16<>(SB), RODATA|NOPTR, $16
 
-// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
-TEXT ·hashBlocksSSE2(SB), 0, $672-48 // frame = 656 + 16 byte alignment
-	HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE2)
-	RET
+DATA rol8<>+0(SB)/8, $0x0407060500030201
+DATA rol8<>+8(SB)/8, $0x0c0f0e0d080b0a09
+GLOBL rol8<>(SB), RODATA|NOPTR, $16
 
 // func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
-TEXT ·hashBlocksSSSE3(SB), 0, $672-48 // frame = 656 + 16 byte alignment
-	HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSSE3)
+// Requires: SSE2, SSSE3
+TEXT ·hashBlocksSSSE3(SB), $672-48
+	MOVQ  h+0(FP), AX
+	MOVQ  c+8(FP), BX
+	MOVL  flag+16(FP), CX
+	MOVQ  blocks_base+24(FP), SI
+	MOVQ  blocks_len+32(FP), DX
+	MOVQ  SP, BP
+	ADDQ  $0x0f, BP
+	ANDQ  $-16, BP
+	MOVQ  (BX), R9
+	MOVQ  R9, (BP)
+	MOVQ  CX, 8(BP)
+	MOVOU (AX), X0
+	MOVOU 16(AX), X1
+	MOVOU iv0<>+0(SB), X2
+	MOVOU iv1<>+0(SB), X3
+	MOVOU counter<>+0(SB), X12
+	MOVOU rol16<>+0(SB), X13
+	MOVOU rol8<>+0(SB), X14
+	MOVO  (BP), X15
+
+loop:
+	MOVO   X0, X4
+	MOVO   X1, X5
+	MOVO   X2, X6
+	MOVO   X3, X7
+	PADDQ  X12, X15
+	PXOR   X15, X7
+	MOVQ   (SI), R8
+	MOVQ   8(SI), R9
+	MOVQ   16(SI), R10
+	MOVQ   24(SI), R11
+	MOVQ   32(SI), R12
+	MOVQ   40(SI), R13
+	MOVQ   48(SI), R14
+	MOVQ   56(SI), R15
+	MOVL   R8, 16(BP)
+	MOVL   R8, 116(BP)
+	MOVL   R8, 164(BP)
+	MOVL   R8, 264(BP)
+	MOVL   R8, 288(BP)
+	MOVL   R8, 344(BP)
+	MOVL   R8, 432(BP)
+	MOVL   R8, 512(BP)
+	MOVL   R8, 540(BP)
+	MOVL   R8, 652(BP)
+	SHRQ   $0x20, R8
+	MOVL   R8, 32(BP)
+	MOVL   R8, 112(BP)
+	MOVL   R8, 200(BP)
+	MOVL   R8, 228(BP)
+	MOVL   R8, 320(BP)
+	MOVL   R8, 380(BP)
+	MOVL   R8, 404(BP)
+	MOVL   R8, 488(BP)
+	MOVL   R8, 568(BP)
+	MOVL   R8, 604(BP)
+	MOVL   R9, 20(BP)
+	MOVL   R9, 132(BP)
+	MOVL   R9, 168(BP)
+	MOVL   R9, 240(BP)
+	MOVL   R9, 280(BP)
+	MOVL   R9, 336(BP)
+	MOVL   R9, 456(BP)
+	MOVL   R9, 508(BP)
+	MOVL   R9, 576(BP)
+	MOVL   R9, 608(BP)
+	SHRQ   $0x20, R9
+	MOVL   R9, 36(BP)
+	MOVL   R9, 140(BP)
+	MOVL   R9, 180(BP)
+	MOVL   R9, 212(BP)
+	MOVL   R9, 316(BP)
+	MOVL   R9, 364(BP)
+	MOVL   R9, 452(BP)
+	MOVL   R9, 476(BP)
+	MOVL   R9, 552(BP)
+	MOVL   R9, 632(BP)
+	MOVL   R10, 24(BP)
+	MOVL   R10, 84(BP)
+	MOVL   R10, 204(BP)
+	MOVL   R10, 248(BP)
+	MOVL   R10, 296(BP)
+	MOVL   R10, 368(BP)
+	MOVL   R10, 412(BP)
+	MOVL   R10, 516(BP)
+	MOVL   R10, 584(BP)
+	MOVL   R10, 612(BP)
+	SHRQ   $0x20, R10
+	MOVL   R10, 40(BP)
+	MOVL   R10, 124(BP)
+	MOVL   R10, 152(BP)
+	MOVL   R10, 244(BP)
+	MOVL   R10, 276(BP)
+	MOVL   R10, 388(BP)
+	MOVL   R10, 416(BP)
+	MOVL   R10, 496(BP)
+	MOVL   R10, 588(BP)
+	MOVL   R10, 620(BP)
+	MOVL   R11, 28(BP)
+	MOVL   R11, 108(BP)
+	MOVL   R11, 196(BP)
+	MOVL   R11, 256(BP)
+	MOVL   R11, 312(BP)
+	MOVL   R11, 340(BP)
+	MOVL   R11, 436(BP)
+	MOVL   R11, 520(BP)
+	MOVL   R11, 528(BP)
+	MOVL   R11, 616(BP)
+	SHRQ   $0x20, R11
+	MOVL   R11, 44(BP)
+	MOVL   R11, 136(BP)
+	MOVL   R11, 184(BP)
+	MOVL   R11, 208(BP)
+	MOVL   R11, 292(BP)
+	MOVL   R11, 372(BP)
+	MOVL   R11, 448(BP)
+	MOVL   R11, 468(BP)
+	MOVL   R11, 580(BP)
+	MOVL   R11, 600(BP)
+	MOVL   R12, 48(BP)
+	MOVL   R12, 100(BP)
+	MOVL   R12, 160(BP)
+	MOVL   R12, 268(BP)
+	MOVL   R12, 328(BP)
+	MOVL   R12, 348(BP)
+	MOVL   R12, 444(BP)
+	MOVL   R12, 504(BP)
+	MOVL   R12, 556(BP)
+	MOVL   R12, 596(BP)
+	SHRQ   $0x20, R12
+	MOVL   R12, 64(BP)
+	MOVL   R12, 88(BP)
+	MOVL   R12, 188(BP)
+	MOVL   R12, 224(BP)
+	MOVL   R12, 272(BP)
+	MOVL   R12, 396(BP)
+	MOVL   R12, 440(BP)
+	MOVL   R12, 492(BP)
+	MOVL   R12, 548(BP)
+	MOVL   R12, 628(BP)
+	MOVL   R13, 52(BP)
+	MOVL   R13, 96(BP)
+	MOVL   R13, 176(BP)
+	MOVL   R13, 260(BP)
+	MOVL   R13, 284(BP)
+	MOVL   R13, 356(BP)
+	MOVL   R13, 428(BP)
+	MOVL   R13, 524(BP)
+	MOVL   R13, 572(BP)
+	MOVL   R13, 592(BP)
+	SHRQ   $0x20, R13
+	MOVL   R13, 68(BP)
+	MOVL   R13, 120(BP)
+	MOVL   R13, 144(BP)
+	MOVL   R13, 220(BP)
+	MOVL   R13, 308(BP)
+	MOVL   R13, 360(BP)
+	MOVL   R13, 460(BP)
+	MOVL   R13, 480(BP)
+	MOVL   R13, 536(BP)
+	MOVL   R13, 640(BP)
+	MOVL   R14, 56(BP)
+	MOVL   R14, 128(BP)
+	MOVL   R14, 148(BP)
+	MOVL   R14, 232(BP)
+	MOVL   R14, 324(BP)
+	MOVL   R14, 352(BP)
+	MOVL   R14, 400(BP)
+	MOVL   R14, 472(BP)
+	MOVL   R14, 560(BP)
+	MOVL   R14, 648(BP)
+	SHRQ   $0x20, R14
+	MOVL   R14, 72(BP)
+	MOVL   R14, 92(BP)
+	MOVL   R14, 172(BP)
+	MOVL   R14, 216(BP)
+	MOVL   R14, 332(BP)
+	MOVL   R14, 384(BP)
+	MOVL   R14, 424(BP)
+	MOVL   R14, 464(BP)
+	MOVL   R14, 564(BP)
+	MOVL   R14, 636(BP)
+	MOVL   R15, 60(BP)
+	MOVL   R15, 80(BP)
+	MOVL   R15, 192(BP)
+	MOVL   R15, 236(BP)
+	MOVL   R15, 304(BP)
+	MOVL   R15, 392(BP)
+	MOVL   R15, 408(BP)
+	MOVL   R15, 484(BP)
+	MOVL   R15, 532(BP)
+	MOVL   R15, 644(BP)
+	SHRQ   $0x20, R15
+	MOVL   R15, 76(BP)
+	MOVL   R15, 104(BP)
+	MOVL   R15, 156(BP)
+	MOVL   R15, 252(BP)
+	MOVL   R15, 300(BP)
+	MOVL   R15, 376(BP)
+	MOVL   R15, 420(BP)
+	MOVL   R15, 500(BP)
+	MOVL   R15, 544(BP)
+	MOVL   R15, 624(BP)
+	PADDL  16(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  32(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  48(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  64(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  80(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  96(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  112(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  128(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  144(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  160(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  176(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  192(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  208(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  224(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  240(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  256(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  272(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  288(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  304(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  320(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  336(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  352(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  368(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  384(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  400(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  416(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  432(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  448(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  464(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  480(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  496(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  512(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  528(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  544(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  560(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  576(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PADDL  592(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  608(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  624(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  640(BP), X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PXOR   X4, X0
+	PXOR   X5, X1
+	PXOR   X6, X0
+	PXOR   X7, X1
+	LEAQ   64(SI), SI
+	SUBQ   $0x40, DX
+	JNE    loop
+	MOVO   X15, (BP)
+	MOVQ   (BP), R9
+	MOVQ   R9, (BX)
+	MOVOU  X0, (AX)
+	MOVOU  X1, 16(AX)
 	RET
 
 // func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
-TEXT ·hashBlocksSSE4(SB), 0, $32-48 // frame = 16 + 16 byte alignment
-	HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE4)
+// Requires: SSE2, SSE4.1, SSSE3
+TEXT ·hashBlocksSSE4(SB), $32-48
+	MOVQ  h+0(FP), AX
+	MOVQ  c+8(FP), BX
+	MOVL  flag+16(FP), CX
+	MOVQ  blocks_base+24(FP), SI
+	MOVQ  blocks_len+32(FP), DX
+	MOVQ  SP, BP
+	ADDQ  $0x0f, BP
+	ANDQ  $-16, BP
+	MOVQ  (BX), R9
+	MOVQ  R9, (BP)
+	MOVQ  CX, 8(BP)
+	MOVOU (AX), X0
+	MOVOU 16(AX), X1
+	MOVOU iv0<>+0(SB), X2
+	MOVOU iv1<>+0(SB), X3
+	MOVOU counter<>+0(SB), X12
+	MOVOU rol16<>+0(SB), X13
+	MOVOU rol8<>+0(SB), X14
+	MOVO  (BP), X15
+
+loop:
+	MOVO   X0, X4
+	MOVO   X1, X5
+	MOVO   X2, X6
+	MOVO   X3, X7
+	PADDQ  X12, X15
+	PXOR   X15, X7
+	MOVL   (SI), X8
+	PINSRD $0x01, 8(SI), X8
+	PINSRD $0x02, 16(SI), X8
+	PINSRD $0x03, 24(SI), X8
+	MOVL   4(SI), X9
+	PINSRD $0x01, 12(SI), X9
+	PINSRD $0x02, 20(SI), X9
+	PINSRD $0x03, 28(SI), X9
+	MOVL   32(SI), X10
+	PINSRD $0x01, 40(SI), X10
+	PINSRD $0x02, 48(SI), X10
+	PINSRD $0x03, 56(SI), X10
+	MOVL   36(SI), X11
+	PINSRD $0x01, 44(SI), X11
+	PINSRD $0x02, 52(SI), X11
+	PINSRD $0x03, 60(SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	MOVL   56(SI), X8
+	PINSRD $0x01, 16(SI), X8
+	PINSRD $0x02, 36(SI), X8
+	PINSRD $0x03, 52(SI), X8
+	MOVL   40(SI), X9
+	PINSRD $0x01, 32(SI), X9
+	PINSRD $0x02, 60(SI), X9
+	PINSRD $0x03, 24(SI), X9
+	MOVL   4(SI), X10
+	PINSRD $0x01, (SI), X10
+	PINSRD $0x02, 44(SI), X10
+	PINSRD $0x03, 20(SI), X10
+	MOVL   48(SI), X11
+	PINSRD $0x01, 8(SI), X11
+	PINSRD $0x02, 28(SI), X11
+	PINSRD $0x03, 12(SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	MOVL   44(SI), X8
+	PINSRD $0x01, 48(SI), X8
+	PINSRD $0x02, 20(SI), X8
+	PINSRD $0x03, 60(SI), X8
+	MOVL   32(SI), X9
+	PINSRD $0x01, (SI), X9
+	PINSRD $0x02, 8(SI), X9
+	PINSRD $0x03, 52(SI), X9
+	MOVL   40(SI), X10
+	PINSRD $0x01, 12(SI), X10
+	PINSRD $0x02, 28(SI), X10
+	PINSRD $0x03, 36(SI), X10
+	MOVL   56(SI), X11
+	PINSRD $0x01, 24(SI), X11
+	PINSRD $0x02, 4(SI), X11
+	PINSRD $0x03, 16(SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	MOVL   28(SI), X8
+	PINSRD $0x01, 12(SI), X8
+	PINSRD $0x02, 52(SI), X8
+	PINSRD $0x03, 44(SI), X8
+	MOVL   36(SI), X9
+	PINSRD $0x01, 4(SI), X9
+	PINSRD $0x02, 48(SI), X9
+	PINSRD $0x03, 56(SI), X9
+	MOVL   8(SI), X10
+	PINSRD $0x01, 20(SI), X10
+	PINSRD $0x02, 16(SI), X10
+	PINSRD $0x03, 60(SI), X10
+	MOVL   24(SI), X11
+	PINSRD $0x01, 40(SI), X11
+	PINSRD $0x02, (SI), X11
+	PINSRD $0x03, 32(SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	MOVL   36(SI), X8
+	PINSRD $0x01, 20(SI), X8
+	PINSRD $0x02, 8(SI), X8
+	PINSRD $0x03, 40(SI), X8
+	MOVL   (SI), X9
+	PINSRD $0x01, 28(SI), X9
+	PINSRD $0x02, 16(SI), X9
+	PINSRD $0x03, 60(SI), X9
+	MOVL   56(SI), X10
+	PINSRD $0x01, 44(SI), X10
+	PINSRD $0x02, 24(SI), X10
+	PINSRD $0x03, 12(SI), X10
+	MOVL   4(SI), X11
+	PINSRD $0x01, 48(SI), X11
+	PINSRD $0x02, 32(SI), X11
+	PINSRD $0x03, 52(SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	MOVL   8(SI), X8
+	PINSRD $0x01, 24(SI), X8
+	PINSRD $0x02, (SI), X8
+	PINSRD $0x03, 32(SI), X8
+	MOVL   48(SI), X9
+	PINSRD $0x01, 40(SI), X9
+	PINSRD $0x02, 44(SI), X9
+	PINSRD $0x03, 12(SI), X9
+	MOVL   16(SI), X10
+	PINSRD $0x01, 28(SI), X10
+	PINSRD $0x02, 60(SI), X10
+	PINSRD $0x03, 4(SI), X10
+	MOVL   52(SI), X11
+	PINSRD $0x01, 20(SI), X11
+	PINSRD $0x02, 56(SI), X11
+	PINSRD $0x03, 36(SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	MOVL   48(SI), X8
+	PINSRD $0x01, 4(SI), X8
+	PINSRD $0x02, 56(SI), X8
+	PINSRD $0x03, 16(SI), X8
+	MOVL   20(SI), X9
+	PINSRD $0x01, 60(SI), X9
+	PINSRD $0x02, 52(SI), X9
+	PINSRD $0x03, 40(SI), X9
+	MOVL   (SI), X10
+	PINSRD $0x01, 24(SI), X10
+	PINSRD $0x02, 36(SI), X10
+	PINSRD $0x03, 32(SI), X10
+	MOVL   28(SI), X11
+	PINSRD $0x01, 12(SI), X11
+	PINSRD $0x02, 8(SI), X11
+	PINSRD $0x03, 44(SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	MOVL   52(SI), X8
+	PINSRD $0x01, 28(SI), X8
+	PINSRD $0x02, 48(SI), X8
+	PINSRD $0x03, 12(SI), X8
+	MOVL   44(SI), X9
+	PINSRD $0x01, 56(SI), X9
+	PINSRD $0x02, 4(SI), X9
+	PINSRD $0x03, 36(SI), X9
+	MOVL   20(SI), X10
+	PINSRD $0x01, 60(SI), X10
+	PINSRD $0x02, 32(SI), X10
+	PINSRD $0x03, 8(SI), X10
+	MOVL   (SI), X11
+	PINSRD $0x01, 16(SI), X11
+	PINSRD $0x02, 24(SI), X11
+	PINSRD $0x03, 40(SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	MOVL   24(SI), X8
+	PINSRD $0x01, 56(SI), X8
+	PINSRD $0x02, 44(SI), X8
+	PINSRD $0x03, (SI), X8
+	MOVL   60(SI), X9
+	PINSRD $0x01, 36(SI), X9
+	PINSRD $0x02, 12(SI), X9
+	PINSRD $0x03, 32(SI), X9
+	MOVL   48(SI), X10
+	PINSRD $0x01, 52(SI), X10
+	PINSRD $0x02, 4(SI), X10
+	PINSRD $0x03, 40(SI), X10
+	MOVL   8(SI), X11
+	PINSRD $0x01, 28(SI), X11
+	PINSRD $0x02, 16(SI), X11
+	PINSRD $0x03, 20(SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	MOVL   40(SI), X8
+	PINSRD $0x01, 32(SI), X8
+	PINSRD $0x02, 28(SI), X8
+	PINSRD $0x03, 4(SI), X8
+	MOVL   8(SI), X9
+	PINSRD $0x01, 16(SI), X9
+	PINSRD $0x02, 24(SI), X9
+	PINSRD $0x03, 20(SI), X9
+	MOVL   60(SI), X10
+	PINSRD $0x01, 36(SI), X10
+	PINSRD $0x02, 12(SI), X10
+	PINSRD $0x03, 52(SI), X10
+	MOVL   44(SI), X11
+	PINSRD $0x01, 56(SI), X11
+	PINSRD $0x02, 48(SI), X11
+	PINSRD $0x03, (SI), X11
+	PADDL  X8, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X9, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X7, X7
+	PADDL  X10, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X13, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x14, X8
+	PSRLL  $0x0c, X5
+	PXOR   X8, X5
+	PADDL  X11, X4
+	PADDL  X5, X4
+	PXOR   X4, X7
+	PSHUFB X14, X7
+	PADDL  X7, X6
+	PXOR   X6, X5
+	MOVO   X5, X8
+	PSLLL  $0x19, X8
+	PSRLL  $0x07, X5
+	PXOR   X8, X5
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x4e, X6, X6
+	PSHUFL $0x93, X5, X5
+	PXOR   X4, X0
+	PXOR   X5, X1
+	PXOR   X6, X0
+	PXOR   X7, X1
+	LEAQ   64(SI), SI
+	SUBQ   $0x40, DX
+	JNE    loop
+	MOVO   X15, (BP)
+	MOVQ   (BP), R9
+	MOVQ   R9, (BX)
+	MOVOU  X0, (AX)
+	MOVOU  X1, 16(AX)
 	RET
diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
index db42e6676ab35b735626c5bee7ce4630d450975d..c709b728477d10fc8738d6e254584e766c86b0a8 100644
--- a/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego
+//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego
 
 package chacha20
 
diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
similarity index 89%
rename from vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
rename to vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
index 3a4287f9900e40622ca39fc2e3580fb9190b07fc..bd183d9ba1243e4d0245a1b3b586fe16ebe8c2f7 100644
--- a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 package chacha20
 
diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
similarity index 76%
rename from vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
rename to vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
index c672ccf6986b08dee9c60f4405d6d66cafbc3e98..a660b4112fafd9830ec74abad6f1f57aaa990fdb 100644
--- a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
@@ -19,7 +19,7 @@
 // The differences in this and the original implementation are
 // due to the calling conventions and initialization of constants.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 #include "textflag.h"
 
@@ -36,32 +36,68 @@
 // for VPERMXOR
 #define MASK  R18
 
-DATA consts<>+0x00(SB)/8, $0x3320646e61707865
-DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
-DATA consts<>+0x10(SB)/8, $0x0000000000000001
-DATA consts<>+0x18(SB)/8, $0x0000000000000000
-DATA consts<>+0x20(SB)/8, $0x0000000000000004
-DATA consts<>+0x28(SB)/8, $0x0000000000000000
-DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
-DATA consts<>+0x38(SB)/8, $0x0203000106070405
-DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
-DATA consts<>+0x48(SB)/8, $0x0102030005060704
-DATA consts<>+0x50(SB)/8, $0x6170786561707865
-DATA consts<>+0x58(SB)/8, $0x6170786561707865
-DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
-DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
-DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
-DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
-DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
-DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
-DATA consts<>+0x90(SB)/8, $0x0000000100000000
-DATA consts<>+0x98(SB)/8, $0x0000000300000002
-DATA consts<>+0xa0(SB)/8, $0x5566774411223300
-DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
-DATA consts<>+0xb0(SB)/8, $0x6677445522330011
-DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
+DATA consts<>+0x00(SB)/4, $0x61707865
+DATA consts<>+0x04(SB)/4, $0x3320646e
+DATA consts<>+0x08(SB)/4, $0x79622d32
+DATA consts<>+0x0c(SB)/4, $0x6b206574
+DATA consts<>+0x10(SB)/4, $0x00000001
+DATA consts<>+0x14(SB)/4, $0x00000000
+DATA consts<>+0x18(SB)/4, $0x00000000
+DATA consts<>+0x1c(SB)/4, $0x00000000
+DATA consts<>+0x20(SB)/4, $0x00000004
+DATA consts<>+0x24(SB)/4, $0x00000000
+DATA consts<>+0x28(SB)/4, $0x00000000
+DATA consts<>+0x2c(SB)/4, $0x00000000
+DATA consts<>+0x30(SB)/4, $0x0e0f0c0d
+DATA consts<>+0x34(SB)/4, $0x0a0b0809
+DATA consts<>+0x38(SB)/4, $0x06070405
+DATA consts<>+0x3c(SB)/4, $0x02030001
+DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
+DATA consts<>+0x44(SB)/4, $0x090a0b08
+DATA consts<>+0x48(SB)/4, $0x05060704
+DATA consts<>+0x4c(SB)/4, $0x01020300
+DATA consts<>+0x50(SB)/4, $0x61707865
+DATA consts<>+0x54(SB)/4, $0x61707865
+DATA consts<>+0x58(SB)/4, $0x61707865
+DATA consts<>+0x5c(SB)/4, $0x61707865
+DATA consts<>+0x60(SB)/4, $0x3320646e
+DATA consts<>+0x64(SB)/4, $0x3320646e
+DATA consts<>+0x68(SB)/4, $0x3320646e
+DATA consts<>+0x6c(SB)/4, $0x3320646e
+DATA consts<>+0x70(SB)/4, $0x79622d32
+DATA consts<>+0x74(SB)/4, $0x79622d32
+DATA consts<>+0x78(SB)/4, $0x79622d32
+DATA consts<>+0x7c(SB)/4, $0x79622d32
+DATA consts<>+0x80(SB)/4, $0x6b206574
+DATA consts<>+0x84(SB)/4, $0x6b206574
+DATA consts<>+0x88(SB)/4, $0x6b206574
+DATA consts<>+0x8c(SB)/4, $0x6b206574
+DATA consts<>+0x90(SB)/4, $0x00000000
+DATA consts<>+0x94(SB)/4, $0x00000001
+DATA consts<>+0x98(SB)/4, $0x00000002
+DATA consts<>+0x9c(SB)/4, $0x00000003
+DATA consts<>+0xa0(SB)/4, $0x11223300
+DATA consts<>+0xa4(SB)/4, $0x55667744
+DATA consts<>+0xa8(SB)/4, $0x99aabb88
+DATA consts<>+0xac(SB)/4, $0xddeeffcc
+DATA consts<>+0xb0(SB)/4, $0x22330011
+DATA consts<>+0xb4(SB)/4, $0x66774455
+DATA consts<>+0xb8(SB)/4, $0xaabb8899
+DATA consts<>+0xbc(SB)/4, $0xeeffccdd
 GLOBL consts<>(SB), RODATA, $0xc0
 
+#ifdef GOARCH_ppc64
+#define BE_XXBRW_INIT() \
+		LVSL (R0)(R0), V24 \
+		VSPLTISB $3, V25   \
+		VXOR V24, V25, V24 \
+
+#define BE_XXBRW(vr) VPERM vr, vr, V24, vr
+#else
+#define BE_XXBRW_INIT()
+#define BE_XXBRW(vr)
+#endif
+
 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
 TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
 	MOVD out+0(FP), OUT
@@ -94,6 +130,8 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
 	// Clear V27
 	VXOR V27, V27, V27
 
+	BE_XXBRW_INIT()
+
 	// V28
 	LXVW4X (CONSTBASE)(R11), VS60
 
@@ -299,6 +337,11 @@ loop_vsx:
 	VADDUWM V8, V18, V8
 	VADDUWM V12, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU LEN, $64
 	BLT tail_vsx
 
@@ -327,6 +370,11 @@ loop_vsx:
 	VADDUWM V9, V18, V8
 	VADDUWM V13, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU  LEN, $64
 	BLT   tail_vsx
 
@@ -334,8 +382,8 @@ loop_vsx:
 	LXVW4X (INP)(R8), VS60
 	LXVW4X (INP)(R9), VS61
 	LXVW4X (INP)(R10), VS62
-	VXOR   V27, V0, V27
 
+	VXOR V27, V0, V27
 	VXOR V28, V4, V28
 	VXOR V29, V8, V29
 	VXOR V30, V12, V30
@@ -354,6 +402,11 @@ loop_vsx:
 	VADDUWM V10, V18, V8
 	VADDUWM V14, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU LEN, $64
 	BLT  tail_vsx
 
@@ -381,6 +434,11 @@ loop_vsx:
 	VADDUWM V11, V18, V8
 	VADDUWM V15, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU  LEN, $64
 	BLT   tail_vsx
 
@@ -408,9 +466,9 @@ loop_vsx:
 
 done_vsx:
 	// Increment counter by number of 64 byte blocks
-	MOVD (CNT), R14
+	MOVWZ (CNT), R14
 	ADD  BLOCKS, R14
-	MOVD R14, (CNT)
+	MOVWZ R14, (CNT)
 	RET
 
 tail_vsx:
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
index 731d2ac6dbc11d4a92295f39d9adc963364a368b..fd5ee845f9faacf742693334acd48ea1c9f45c9a 100644
--- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
@@ -1,2715 +1,9762 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
+// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
 
 //go:build gc && !purego
 
 #include "textflag.h"
-// General register allocation
-#define oup DI
-#define inp SI
-#define inl BX
-#define adp CX // free to reuse, after we hash the additional data
-#define keyp R8 // free to reuse, when we copy the key to stack
-#define itr2 R9 // general iterator
-#define itr1 CX // general iterator
-#define acc0 R10
-#define acc1 R11
-#define acc2 R12
-#define t0 R13
-#define t1 R14
-#define t2 R15
-#define t3 R8
-// Register and stack allocation for the SSE code
-#define rStore (0*16)(BP)
-#define sStore (1*16)(BP)
-#define state1Store (2*16)(BP)
-#define state2Store (3*16)(BP)
-#define tmpStore (4*16)(BP)
-#define ctr0Store (5*16)(BP)
-#define ctr1Store (6*16)(BP)
-#define ctr2Store (7*16)(BP)
-#define ctr3Store (8*16)(BP)
-#define A0 X0
-#define A1 X1
-#define A2 X2
-#define B0 X3
-#define B1 X4
-#define B2 X5
-#define C0 X6
-#define C1 X7
-#define C2 X8
-#define D0 X9
-#define D1 X10
-#define D2 X11
-#define T0 X12
-#define T1 X13
-#define T2 X14
-#define T3 X15
-#define A3 T0
-#define B3 T1
-#define C3 T2
-#define D3 T3
-// Register and stack allocation for the AVX2 code
-#define rsStoreAVX2 (0*32)(BP)
-#define state1StoreAVX2 (1*32)(BP)
-#define state2StoreAVX2 (2*32)(BP)
-#define ctr0StoreAVX2 (3*32)(BP)
-#define ctr1StoreAVX2 (4*32)(BP)
-#define ctr2StoreAVX2 (5*32)(BP)
-#define ctr3StoreAVX2 (6*32)(BP)
-#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
-#define AA0 Y0
-#define AA1 Y5
-#define AA2 Y6
-#define AA3 Y7
-#define BB0 Y14
-#define BB1 Y9
-#define BB2 Y10
-#define BB3 Y11
-#define CC0 Y12
-#define CC1 Y13
-#define CC2 Y8
-#define CC3 Y15
-#define DD0 Y4
-#define DD1 Y1
-#define DD2 Y2
-#define DD3 Y3
-#define TT0 DD3
-#define TT1 AA3
-#define TT2 BB3
-#define TT3 CC3
-// ChaCha20 constants
-DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
-DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
-// <<< 16 with PSHUFB
-DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
-// <<< 8 with PSHUFB
-DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
-
-DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
-DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
-
-DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
-DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
-// Poly1305 key clamp
-DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
-
-DATA ·sseIncMask<>+0x00(SB)/8, $0x1
-DATA ·sseIncMask<>+0x08(SB)/8, $0x0
-// To load/store the last < 16 bytes in a buffer
-DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
-DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
-
-GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
-GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
-GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
-// No PALIGNR in Go ASM yet (but VPALIGNR is present).
-#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
-#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
-#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
-#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
-#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
-#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
-#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
-#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
-#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
-#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
-#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
-#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
-#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
-#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
-#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
-#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
-#define shiftC0Right shiftC0Left
-#define shiftC1Right shiftC1Left
-#define shiftC2Right shiftC2Left
-#define shiftC3Right shiftC3Left
-#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
-#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
-#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
-#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
-
-// Some macros
-
-// ROL rotates the uint32s in register R left by N bits, using temporary T.
-#define ROL(N, R, T) \
-	MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
-
-// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
-#else
-#define ROL16(R, T) ROL(16, R, T)
-#endif
-
-// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
-#else
-#define ROL8(R, T) ROL(8, R, T)
-#endif
-
-#define chachaQR(A, B, C, D, T) \
-	PADDD B, A; PXOR A, D; ROL16(D, T) \
-	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
-	PADDD B, A; PXOR A, D; ROL8(D, T) \
-	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
-
-#define chachaQR_AVX2(A, B, C, D, T) \
-	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
-	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
-	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
-	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
-
-#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
-#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
-#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
-#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
-
-#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
-#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
-
-#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
-#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
-// ----------------------------------------------------------------------------
+
+// func polyHashADInternal<>()
 TEXT polyHashADInternal<>(SB), NOSPLIT, $0
-	// adp points to beginning of additional data
-	// itr2 holds ad length
-	XORQ acc0, acc0
-	XORQ acc1, acc1
-	XORQ acc2, acc2
-	CMPQ itr2, $13
-	JNE  hashADLoop
-
-openFastTLSAD:
-	// Special treatment for the TLS case of 13 bytes
-	MOVQ (adp), acc0
-	MOVQ 5(adp), acc1
-	SHRQ $24, acc1
-	MOVQ $1, acc2
-	polyMul
+	// Hack: Must declare #define macros inside of a function due to Avo constraints
+	// ROL rotates the uint32s in register R left by N bits, using temporary T.
+	#define ROL(N, R, T) \
+		MOVO R, T; \
+		PSLLL $(N), T; \
+		PSRLL $(32-(N)), R; \
+		PXOR T, R
+
+	// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
+	#ifdef GOAMD64_v2
+		#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
+	#else
+		#define ROL8(R, T) ROL(8, R, T)
+	#endif
+
+	// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
+	#ifdef GOAMD64_v2
+		#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
+	#else
+		#define ROL16(R, T) ROL(16, R, T)
+	#endif
+	XORQ  R10, R10
+	XORQ  R11, R11
+	XORQ  R12, R12
+	CMPQ  R9, $0x0d
+	JNE   hashADLoop
+	MOVQ  (CX), R10
+	MOVQ  5(CX), R11
+	SHRQ  $0x18, R11
+	MOVQ  $0x00000001, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 	RET
 
 hashADLoop:
 	// Hash in 16 byte chunks
-	CMPQ itr2, $16
-	JB   hashADTail
-	polyAdd(0(adp))
-	LEAQ (1*16)(adp), adp
-	SUBQ $16, itr2
-	polyMul
-	JMP  hashADLoop
+	CMPQ  R9, $0x10
+	JB    hashADTail
+	ADDQ  (CX), R10
+	ADCQ  8(CX), R11
+	ADCQ  $0x01, R12
+	LEAQ  16(CX), CX
+	SUBQ  $0x10, R9
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	JMP   hashADLoop
 
 hashADTail:
-	CMPQ itr2, $0
+	CMPQ R9, $0x00
 	JE   hashADDone
 
 	// Hash last < 16 byte tail
-	XORQ t0, t0
-	XORQ t1, t1
-	XORQ t2, t2
-	ADDQ itr2, adp
+	XORQ R13, R13
+	XORQ R14, R14
+	XORQ R15, R15
+	ADDQ R9, CX
 
 hashADTailLoop:
-	SHLQ $8, t0, t1
-	SHLQ $8, t0
-	MOVB -1(adp), t2
-	XORQ t2, t0
-	DECQ adp
-	DECQ itr2
-	JNE  hashADTailLoop
-
-hashADTailFinish:
-	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
-
-	// Finished AD
+	SHLQ  $0x08, R13, R14
+	SHLQ  $0x08, R13
+	MOVB  -1(CX), R15
+	XORQ  R15, R13
+	DECQ  CX
+	DECQ  R9
+	JNE   hashADTailLoop
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+
 hashADDone:
 	RET
 
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
-TEXT ·chacha20Poly1305Open(SB), 0, $288-97
+// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Open(SB), $288-97
 	// For aligned stack access
 	MOVQ SP, BP
-	ADDQ $32, BP
+	ADDQ $0x20, BP
 	ANDQ $-32, BP
-	MOVQ dst+0(FP), oup
-	MOVQ key+24(FP), keyp
-	MOVQ src+48(FP), inp
-	MOVQ src_len+56(FP), inl
-	MOVQ ad+72(FP), adp
+	MOVQ dst_base+0(FP), DI
+	MOVQ key_base+24(FP), R8
+	MOVQ src_base+48(FP), SI
+	MOVQ src_len+56(FP), BX
+	MOVQ ad_base+72(FP), CX
 
 	// Check for AVX2 support
-	CMPB ·useAVX2(SB), $1
+	CMPB ·useAVX2+0(SB), $0x01
 	JE   chacha20Poly1305Open_AVX2
 
 	// Special optimization, for very short buffers
-	CMPQ inl, $128
-	JBE  openSSE128 // About 16% faster
+	CMPQ BX, $0x80
+	JBE  openSSE128
 
 	// For long buffers, prepare the poly key first
-	MOVOU ·chacha20Constants<>(SB), A0
-	MOVOU (1*16)(keyp), B0
-	MOVOU (2*16)(keyp), C0
-	MOVOU (3*16)(keyp), D0
-	MOVO  D0, T1
+	MOVOU ·chacha20Constants<>+0(SB), X0
+	MOVOU 16(R8), X3
+	MOVOU 32(R8), X6
+	MOVOU 48(R8), X9
+	MOVO  X9, X13
 
 	// Store state on stack for future use
-	MOVO B0, state1Store
-	MOVO C0, state2Store
-	MOVO D0, ctr3Store
-	MOVQ $10, itr2
+	MOVO X3, 32(BP)
+	MOVO X6, 48(BP)
+	MOVO X9, 128(BP)
+	MOVQ $0x0000000a, R9
 
 openSSEPreparePolyKey:
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	DECQ          itr2
-	JNE           openSSEPreparePolyKey
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	DECQ  R9
+	JNE   openSSEPreparePolyKey
 
 	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL 32(BP), X3
 
 	// Clamp and store the key
-	PAND ·polyClampMask<>(SB), A0
-	MOVO A0, rStore; MOVO B0, sStore
+	PAND ·polyClampMask<>+0(SB), X0
+	MOVO X0, (BP)
+	MOVO X3, 16(BP)
 
 	// Hash AAD
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
 
 openSSEMainLoop:
-	CMPQ inl, $256
+	CMPQ BX, $0x00000100
 	JB   openSSEMainLoopDone
 
 	// Load state, increment counter blocks
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X2, X12
+	MOVO  X5, X13
+	MOVO  X8, X14
+	MOVO  X11, X15
+	PADDL ·sseIncMask<>+0(SB), X15
 
 	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+	MOVO X9, 80(BP)
+	MOVO X10, 96(BP)
+	MOVO X11, 112(BP)
+	MOVO X15, 128(BP)
 
-	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
-	MOVQ $4, itr1
-	MOVQ inp, itr2
+	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
+	// 2 blocks, and for the remaining 4 only 1 block - for a total of 16
+	MOVQ $0x00000004, CX
+	MOVQ SI, R9
 
 openSSEInternalLoop:
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyAdd(0(itr2))
-	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
-	polyMulStage1
-	polyMulStage2
-	LEAQ          (2*8)(itr2), itr2
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	polyMulStage3
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyMulReduceStage
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	DECQ          itr1
-	JGE           openSSEInternalLoop
-
-	polyAdd(0(itr2))
-	polyMul
-	LEAQ (2*8)(itr2), itr2
-
-	CMPQ itr1, $-6
-	JG   openSSEInternalLoop
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x0c
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	LEAQ  16(R9), R9
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x04
+	DECQ  CX
+	JGE   openSSEInternalLoop
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
+	CMPQ  CX, $-6
+	JG    openSSEInternalLoop
 
 	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+	PADDD ·chacha20Constants<>+0(SB), X0
+	PADDD ·chacha20Constants<>+0(SB), X1
+	PADDD ·chacha20Constants<>+0(SB), X2
+	PADDD ·chacha20Constants<>+0(SB), X12
+	PADDD 32(BP), X3
+	PADDD 32(BP), X4
+	PADDD 32(BP), X5
+	PADDD 32(BP), X13
+	PADDD 48(BP), X6
+	PADDD 48(BP), X7
+	PADDD 48(BP), X8
+	PADDD 48(BP), X14
+	PADDD 80(BP), X9
+	PADDD 96(BP), X10
+	PADDD 112(BP), X11
+	PADDD 128(BP), X15
 
 	// Load - xor - store
-	MOVO  D3, tmpStore
-	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
-	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
-	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
-	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
-	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
-	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
-	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
-	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
-	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
-	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
-	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
-	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
-	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
-	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
-	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
-	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
-	LEAQ  256(inp), inp
-	LEAQ  256(oup), oup
-	SUBQ  $256, inl
+	MOVO  X15, 64(BP)
+	MOVOU (SI), X15
+	PXOR  X15, X0
+	MOVOU X0, (DI)
+	MOVOU 16(SI), X15
+	PXOR  X15, X3
+	MOVOU X3, 16(DI)
+	MOVOU 32(SI), X15
+	PXOR  X15, X6
+	MOVOU X6, 32(DI)
+	MOVOU 48(SI), X15
+	PXOR  X15, X9
+	MOVOU X9, 48(DI)
+	MOVOU 64(SI), X9
+	PXOR  X9, X1
+	MOVOU X1, 64(DI)
+	MOVOU 80(SI), X9
+	PXOR  X9, X4
+	MOVOU X4, 80(DI)
+	MOVOU 96(SI), X9
+	PXOR  X9, X7
+	MOVOU X7, 96(DI)
+	MOVOU 112(SI), X9
+	PXOR  X9, X10
+	MOVOU X10, 112(DI)
+	MOVOU 128(SI), X9
+	PXOR  X9, X2
+	MOVOU X2, 128(DI)
+	MOVOU 144(SI), X9
+	PXOR  X9, X5
+	MOVOU X5, 144(DI)
+	MOVOU 160(SI), X9
+	PXOR  X9, X8
+	MOVOU X8, 160(DI)
+	MOVOU 176(SI), X9
+	PXOR  X9, X11
+	MOVOU X11, 176(DI)
+	MOVOU 192(SI), X9
+	PXOR  X9, X12
+	MOVOU X12, 192(DI)
+	MOVOU 208(SI), X9
+	PXOR  X9, X13
+	MOVOU X13, 208(DI)
+	MOVOU 224(SI), X9
+	PXOR  X9, X14
+	MOVOU X14, 224(DI)
+	MOVOU 240(SI), X9
+	PXOR  64(BP), X9
+	MOVOU X9, 240(DI)
+	LEAQ  256(SI), SI
+	LEAQ  256(DI), DI
+	SUBQ  $0x00000100, BX
 	JMP   openSSEMainLoop
 
 openSSEMainLoopDone:
 	// Handle the various tail sizes efficiently
-	TESTQ inl, inl
+	TESTQ BX, BX
 	JE    openSSEFinalize
-	CMPQ  inl, $64
+	CMPQ  BX, $0x40
 	JBE   openSSETail64
-	CMPQ  inl, $128
+	CMPQ  BX, $0x80
 	JBE   openSSETail128
-	CMPQ  inl, $192
+	CMPQ  BX, $0xc0
 	JBE   openSSETail192
 	JMP   openSSETail256
 
 openSSEFinalize:
 	// Hash in the PT, AAD lengths
-	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
-	polyMul
+	ADDQ  ad_len+80(FP), R10
+	ADCQ  src_len+56(FP), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Final reduce
-	MOVQ    acc0, t0
-	MOVQ    acc1, t1
-	MOVQ    acc2, t2
-	SUBQ    $-5, acc0
-	SBBQ    $-1, acc1
-	SBBQ    $3, acc2
-	CMOVQCS t0, acc0
-	CMOVQCS t1, acc1
-	CMOVQCS t2, acc2
+	MOVQ    R10, R13
+	MOVQ    R11, R14
+	MOVQ    R12, R15
+	SUBQ    $-5, R10
+	SBBQ    $-1, R11
+	SBBQ    $0x03, R12
+	CMOVQCS R13, R10
+	CMOVQCS R14, R11
+	CMOVQCS R15, R12
 
 	// Add in the "s" part of the key
-	ADDQ 0+sStore, acc0
-	ADCQ 8+sStore, acc1
+	ADDQ 16(BP), R10
+	ADCQ 24(BP), R11
 
 	// Finally, constant time compare to the tag at the end of the message
 	XORQ    AX, AX
-	MOVQ    $1, DX
-	XORQ    (0*8)(inp), acc0
-	XORQ    (1*8)(inp), acc1
-	ORQ     acc1, acc0
+	MOVQ    $0x00000001, DX
+	XORQ    (SI), R10
+	XORQ    8(SI), R11
+	ORQ     R11, R10
 	CMOVQEQ DX, AX
 
 	// Return true iff tags are equal
 	MOVB AX, ret+96(FP)
 	RET
 
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 129 bytes
 openSSE128:
-	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
-	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
-	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
-	MOVQ  $10, itr2
+	MOVOU ·chacha20Constants<>+0(SB), X0
+	MOVOU 16(R8), X3
+	MOVOU 32(R8), X6
+	MOVOU 48(R8), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X3, X13
+	MOVO  X6, X14
+	MOVO  X10, X15
+	MOVQ  $0x0000000a, R9
 
 openSSE128InnerCipherLoop:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left;  shiftB1Left; shiftB2Left
-	shiftC0Left;  shiftC1Left; shiftC2Left
-	shiftD0Left;  shiftD1Left; shiftD2Left
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftB1Right; shiftB2Right
-	shiftC0Right; shiftC1Right; shiftC2Right
-	shiftD0Right; shiftD1Right; shiftD2Right
-	DECQ          itr2
-	JNE           openSSE128InnerCipherLoop
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	DECQ  R9
+	JNE   openSSE128InnerCipherLoop
 
 	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
-	PADDL T2, C1; PADDL T2, C2
-	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL ·chacha20Constants<>+0(SB), X2
+	PADDL X13, X3
+	PADDL X13, X4
+	PADDL X13, X5
+	PADDL X14, X7
+	PADDL X14, X8
+	PADDL X15, X10
+	PADDL ·sseIncMask<>+0(SB), X15
+	PADDL X15, X11
 
 	// Clamp and store the key
-	PAND  ·polyClampMask<>(SB), A0
-	MOVOU A0, rStore; MOVOU B0, sStore
+	PAND  ·polyClampMask<>+0(SB), X0
+	MOVOU X0, (BP)
+	MOVOU X3, 16(BP)
 
 	// Hash
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
 
 openSSE128Open:
-	CMPQ inl, $16
+	CMPQ BX, $0x10
 	JB   openSSETail16
-	SUBQ $16, inl
+	SUBQ $0x10, BX
 
 	// Load for hashing
-	polyAdd(0(inp))
+	ADDQ (SI), R10
+	ADCQ 8(SI), R11
+	ADCQ $0x01, R12
 
 	// Load for decryption
-	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
-	LEAQ  (1*16)(inp), inp
-	LEAQ  (1*16)(oup), oup
-	polyMul
+	MOVOU (SI), X12
+	PXOR  X12, X1
+	MOVOU X1, (DI)
+	LEAQ  16(SI), SI
+	LEAQ  16(DI), DI
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Shift the stream "left"
-	MOVO B1, A1
-	MOVO C1, B1
-	MOVO D1, C1
-	MOVO A2, D1
-	MOVO B2, A2
-	MOVO C2, B2
-	MOVO D2, C2
+	MOVO X4, X1
+	MOVO X7, X4
+	MOVO X10, X7
+	MOVO X2, X10
+	MOVO X5, X2
+	MOVO X8, X5
+	MOVO X11, X8
 	JMP  openSSE128Open
 
 openSSETail16:
-	TESTQ inl, inl
+	TESTQ BX, BX
 	JE    openSSEFinalize
 
 	// We can safely load the CT from the end, because it is padded with the MAC
-	MOVQ   inl, itr2
-	SHLQ   $4, itr2
-	LEAQ   ·andMask<>(SB), t0
-	MOVOU  (inp), T0
-	ADDQ   inl, inp
-	PAND   -16(t0)(itr2*1), T0
-	MOVO   T0, 0+tmpStore
-	MOVQ   T0, t0
-	MOVQ   8+tmpStore, t1
-	PXOR   A1, T0
+	MOVQ  BX, R9
+	SHLQ  $0x04, R9
+	LEAQ  ·andMask<>+0(SB), R13
+	MOVOU (SI), X12
+	ADDQ  BX, SI
+	PAND  -16(R13)(R9*1), X12
+	MOVO  X12, 64(BP)
+	MOVQ  X12, R13
+	MOVQ  72(BP), R14
+	PXOR  X1, X12
 
 	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
 openSSETail16Store:
-	MOVQ T0, t3
-	MOVB t3, (oup)
-	PSRLDQ $1, T0
-	INCQ   oup
-	DECQ   inl
+	MOVQ   X12, R8
+	MOVB   R8, (DI)
+	PSRLDQ $0x01, X12
+	INCQ   DI
+	DECQ   BX
 	JNE    openSSETail16Store
-	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x01, R12
+	MOVQ   (BP), AX
+	MOVQ   AX, R15
+	MULQ   R10
+	MOVQ   AX, R13
+	MOVQ   DX, R14
+	MOVQ   (BP), AX
+	MULQ   R11
+	IMULQ  R12, R15
+	ADDQ   AX, R14
+	ADCQ   DX, R15
+	MOVQ   8(BP), AX
+	MOVQ   AX, R8
+	MULQ   R10
+	ADDQ   AX, R14
+	ADCQ   $0x00, DX
+	MOVQ   DX, R10
+	MOVQ   8(BP), AX
+	MULQ   R11
+	ADDQ   AX, R15
+	ADCQ   $0x00, DX
+	IMULQ  R12, R8
+	ADDQ   R10, R15
+	ADCQ   DX, R8
+	MOVQ   R13, R10
+	MOVQ   R14, R11
+	MOVQ   R15, R12
+	ANDQ   $0x03, R12
+	MOVQ   R15, R13
+	ANDQ   $-4, R13
+	MOVQ   R8, R14
+	SHRQ   $0x02, R8, R15
+	SHRQ   $0x02, R8
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x00, R12
+	ADDQ   R15, R10
+	ADCQ   R8, R11
+	ADCQ   $0x00, R12
 	JMP    openSSEFinalize
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of ciphertext
 openSSETail64:
-	// Need to decrypt up to 64 bytes - prepare single block
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
-	XORQ itr2, itr2
-	MOVQ inl, itr1
-	CMPQ itr1, $16
-	JB   openSSETail64LoopB
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X9, 80(BP)
+	XORQ  R9, R9
+	MOVQ  BX, CX
+	CMPQ  CX, $0x10
+	JB    openSSETail64LoopB
 
 openSSETail64LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMul
-	SUBQ $16, itr1
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	SUBQ  $0x10, CX
 
 openSSETail64LoopB:
-	ADDQ          $16, itr2
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-
-	CMPQ itr1, $16
-	JAE  openSSETail64LoopA
-
-	CMPQ itr2, $160
-	JNE  openSSETail64LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
+	ADDQ  $0x10, R9
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	CMPQ  CX, $0x10
+	JAE   openSSETail64LoopA
+	CMPQ  R9, $0xa0
+	JNE   openSSETail64LoopB
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL 32(BP), X3
+	PADDL 48(BP), X6
+	PADDL 80(BP), X9
 
 openSSETail64DecLoop:
-	CMPQ  inl, $16
+	CMPQ  BX, $0x10
 	JB    openSSETail64DecLoopDone
-	SUBQ  $16, inl
-	MOVOU (inp), T0
-	PXOR  T0, A0
-	MOVOU A0, (oup)
-	LEAQ  16(inp), inp
-	LEAQ  16(oup), oup
-	MOVO  B0, A0
-	MOVO  C0, B0
-	MOVO  D0, C0
+	SUBQ  $0x10, BX
+	MOVOU (SI), X12
+	PXOR  X12, X0
+	MOVOU X0, (DI)
+	LEAQ  16(SI), SI
+	LEAQ  16(DI), DI
+	MOVO  X3, X0
+	MOVO  X6, X3
+	MOVO  X9, X6
 	JMP   openSSETail64DecLoop
 
 openSSETail64DecLoopDone:
-	MOVO A0, A1
+	MOVO X0, X1
 	JMP  openSSETail16
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
 openSSETail128:
-	// Need to decrypt up to 128 bytes - prepare two blocks
-	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
-	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
-	XORQ itr2, itr2
-	MOVQ inl, itr1
-	ANDQ $-16, itr1
+	MOVO  ·chacha20Constants<>+0(SB), X1
+	MOVO  32(BP), X4
+	MOVO  48(BP), X7
+	MOVO  128(BP), X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X10, 80(BP)
+	MOVO  X1, X0
+	MOVO  X4, X3
+	MOVO  X7, X6
+	MOVO  X10, X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X9, 96(BP)
+	XORQ  R9, R9
+	MOVQ  BX, CX
+	ANDQ  $-16, CX
 
 openSSETail128LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMul
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 openSSETail128LoopB:
-	ADDQ          $16, itr2
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	shiftB1Left;  shiftC1Left; shiftD1Left
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-
-	CMPQ itr2, itr1
-	JB   openSSETail128LoopA
-
-	CMPQ itr2, $160
-	JNE  openSSETail128LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
-	PADDL state1Store, B0; PADDL state1Store, B1
-	PADDL state2Store, C0; PADDL state2Store, C1
-	PADDL ctr1Store, D0; PADDL ctr0Store, D1
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
-	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
-
-	SUBQ $64, inl
-	LEAQ 64(inp), inp
-	LEAQ 64(oup), oup
-	JMP  openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of ciphertext
+	ADDQ  $0x10, R9
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	CMPQ  R9, CX
+	JB    openSSETail128LoopA
+	CMPQ  R9, $0xa0
+	JNE   openSSETail128LoopB
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL 32(BP), X3
+	PADDL 32(BP), X4
+	PADDL 48(BP), X6
+	PADDL 48(BP), X7
+	PADDL 96(BP), X9
+	PADDL 80(BP), X10
+	MOVOU (SI), X12
+	MOVOU 16(SI), X13
+	MOVOU 32(SI), X14
+	MOVOU 48(SI), X15
+	PXOR  X12, X1
+	PXOR  X13, X4
+	PXOR  X14, X7
+	PXOR  X15, X10
+	MOVOU X1, (DI)
+	MOVOU X4, 16(DI)
+	MOVOU X7, 32(DI)
+	MOVOU X10, 48(DI)
+	SUBQ  $0x40, BX
+	LEAQ  64(SI), SI
+	LEAQ  64(DI), DI
+	JMP   openSSETail64DecLoop
+
 openSSETail192:
-	// Need to decrypt up to 192 bytes - prepare three blocks
-	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
-	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
-	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
-
-	MOVQ    inl, itr1
-	MOVQ    $160, itr2
-	CMPQ    itr1, $160
-	CMOVQGT itr2, itr1
-	ANDQ    $-16, itr1
-	XORQ    itr2, itr2
+	MOVO    ·chacha20Constants<>+0(SB), X2
+	MOVO    32(BP), X5
+	MOVO    48(BP), X8
+	MOVO    128(BP), X11
+	PADDL   ·sseIncMask<>+0(SB), X11
+	MOVO    X11, 80(BP)
+	MOVO    X2, X1
+	MOVO    X5, X4
+	MOVO    X8, X7
+	MOVO    X11, X10
+	PADDL   ·sseIncMask<>+0(SB), X10
+	MOVO    X10, 96(BP)
+	MOVO    X1, X0
+	MOVO    X4, X3
+	MOVO    X7, X6
+	MOVO    X10, X9
+	PADDL   ·sseIncMask<>+0(SB), X9
+	MOVO    X9, 112(BP)
+	MOVQ    BX, CX
+	MOVQ    $0x000000a0, R9
+	CMPQ    CX, $0xa0
+	CMOVQGT R9, CX
+	ANDQ    $-16, CX
+	XORQ    R9, R9
 
 openSSLTail192LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMul
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 openSSLTail192LoopB:
-	ADDQ         $16, itr2
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left; shiftC0Left; shiftD0Left
-	shiftB1Left; shiftC1Left; shiftD1Left
-	shiftB2Left; shiftC2Left; shiftD2Left
-
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-	shiftB2Right; shiftC2Right; shiftD2Right
-
-	CMPQ itr2, itr1
-	JB   openSSLTail192LoopA
-
-	CMPQ itr2, $160
-	JNE  openSSLTail192LoopB
-
-	CMPQ inl, $176
-	JB   openSSLTail192Store
-
-	polyAdd(160(inp))
-	polyMul
-
-	CMPQ inl, $192
-	JB   openSSLTail192Store
-
-	polyAdd(176(inp))
-	polyMul
+	ADDQ  $0x10, R9
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	CMPQ  R9, CX
+	JB    openSSLTail192LoopA
+	CMPQ  R9, $0xa0
+	JNE   openSSLTail192LoopB
+	CMPQ  BX, $0xb0
+	JB    openSSLTail192Store
+	ADDQ  160(SI), R10
+	ADCQ  168(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	CMPQ  BX, $0xc0
+	JB    openSSLTail192Store
+	ADDQ  176(SI), R10
+	ADCQ  184(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 openSSLTail192Store:
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
-	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
-	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
-	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
-
-	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
-	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
-	SUBQ $128, inl
-	LEAQ 128(inp), inp
-	LEAQ 128(oup), oup
-	JMP  openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL ·chacha20Constants<>+0(SB), X2
+	PADDL 32(BP), X3
+	PADDL 32(BP), X4
+	PADDL 32(BP), X5
+	PADDL 48(BP), X6
+	PADDL 48(BP), X7
+	PADDL 48(BP), X8
+	PADDL 112(BP), X9
+	PADDL 96(BP), X10
+	PADDL 80(BP), X11
+	MOVOU (SI), X12
+	MOVOU 16(SI), X13
+	MOVOU 32(SI), X14
+	MOVOU 48(SI), X15
+	PXOR  X12, X2
+	PXOR  X13, X5
+	PXOR  X14, X8
+	PXOR  X15, X11
+	MOVOU X2, (DI)
+	MOVOU X5, 16(DI)
+	MOVOU X8, 32(DI)
+	MOVOU X11, 48(DI)
+	MOVOU 64(SI), X12
+	MOVOU 80(SI), X13
+	MOVOU 96(SI), X14
+	MOVOU 112(SI), X15
+	PXOR  X12, X1
+	PXOR  X13, X4
+	PXOR  X14, X7
+	PXOR  X15, X10
+	MOVOU X1, 64(DI)
+	MOVOU X4, 80(DI)
+	MOVOU X7, 96(DI)
+	MOVOU X10, 112(DI)
+	SUBQ  $0x80, BX
+	LEAQ  128(SI), SI
+	LEAQ  128(DI), DI
+	JMP   openSSETail64DecLoop
+
 openSSETail256:
-	// Need to decrypt up to 256 bytes - prepare four blocks
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X2, X12
+	MOVO  X5, X13
+	MOVO  X8, X14
+	MOVO  X11, X15
+	PADDL ·sseIncMask<>+0(SB), X15
 
 	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
-	XORQ itr2, itr2
+	MOVO X9, 80(BP)
+	MOVO X10, 96(BP)
+	MOVO X11, 112(BP)
+	MOVO X15, 128(BP)
+	XORQ R9, R9
 
 openSSETail256Loop:
-	// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication
-	polyAdd(0(inp)(itr2*1))
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
-	polyMulStage1
-	polyMulStage2
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyMulStage3
-	polyMulReduceStage
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	ADDQ          $2*8, itr2
-	CMPQ          itr2, $160
-	JB            openSSETail256Loop
-	MOVQ          inl, itr1
-	ANDQ          $-16, itr1
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x0c
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x04
+	ADDQ  $0x10, R9
+	CMPQ  R9, $0xa0
+	JB    openSSETail256Loop
+	MOVQ  BX, CX
+	ANDQ  $-16, CX
 
 openSSETail256HashLoop:
-	polyAdd(0(inp)(itr2*1))
-	polyMul
-	ADDQ $2*8, itr2
-	CMPQ itr2, itr1
-	JB   openSSETail256HashLoop
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	ADDQ  $0x10, R9
+	CMPQ  R9, CX
+	JB    openSSETail256HashLoop
 
 	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
-	MOVO  D3, tmpStore
+	PADDD ·chacha20Constants<>+0(SB), X0
+	PADDD ·chacha20Constants<>+0(SB), X1
+	PADDD ·chacha20Constants<>+0(SB), X2
+	PADDD ·chacha20Constants<>+0(SB), X12
+	PADDD 32(BP), X3
+	PADDD 32(BP), X4
+	PADDD 32(BP), X5
+	PADDD 32(BP), X13
+	PADDD 48(BP), X6
+	PADDD 48(BP), X7
+	PADDD 48(BP), X8
+	PADDD 48(BP), X14
+	PADDD 80(BP), X9
+	PADDD 96(BP), X10
+	PADDD 112(BP), X11
+	PADDD 128(BP), X15
+	MOVO  X15, 64(BP)
 
 	// Load - xor - store
-	MOVOU (0*16)(inp), D3; PXOR D3, A0
-	MOVOU (1*16)(inp), D3; PXOR D3, B0
-	MOVOU (2*16)(inp), D3; PXOR D3, C0
-	MOVOU (3*16)(inp), D3; PXOR D3, D0
-	MOVOU A0, (0*16)(oup)
-	MOVOU B0, (1*16)(oup)
-	MOVOU C0, (2*16)(oup)
-	MOVOU D0, (3*16)(oup)
-	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
-	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
-	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
-	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
-	LEAQ  192(inp), inp
-	LEAQ  192(oup), oup
-	SUBQ  $192, inl
-	MOVO  A3, A0
-	MOVO  B3, B0
-	MOVO  C3, C0
-	MOVO  tmpStore, D0
-
-	JMP openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
+	MOVOU (SI), X15
+	PXOR  X15, X0
+	MOVOU 16(SI), X15
+	PXOR  X15, X3
+	MOVOU 32(SI), X15
+	PXOR  X15, X6
+	MOVOU 48(SI), X15
+	PXOR  X15, X9
+	MOVOU X0, (DI)
+	MOVOU X3, 16(DI)
+	MOVOU X6, 32(DI)
+	MOVOU X9, 48(DI)
+	MOVOU 64(SI), X0
+	MOVOU 80(SI), X3
+	MOVOU 96(SI), X6
+	MOVOU 112(SI), X9
+	PXOR  X0, X1
+	PXOR  X3, X4
+	PXOR  X6, X7
+	PXOR  X9, X10
+	MOVOU X1, 64(DI)
+	MOVOU X4, 80(DI)
+	MOVOU X7, 96(DI)
+	MOVOU X10, 112(DI)
+	MOVOU 128(SI), X0
+	MOVOU 144(SI), X3
+	MOVOU 160(SI), X6
+	MOVOU 176(SI), X9
+	PXOR  X0, X2
+	PXOR  X3, X5
+	PXOR  X6, X8
+	PXOR  X9, X11
+	MOVOU X2, 128(DI)
+	MOVOU X5, 144(DI)
+	MOVOU X8, 160(DI)
+	MOVOU X11, 176(DI)
+	LEAQ  192(SI), SI
+	LEAQ  192(DI), DI
+	SUBQ  $0xc0, BX
+	MOVO  X12, X0
+	MOVO  X13, X3
+	MOVO  X14, X6
+	MOVO  64(BP), X9
+	JMP   openSSETail64DecLoop
+
 chacha20Poly1305Open_AVX2:
 	VZEROUPPER
-	VMOVDQU ·chacha20Constants<>(SB), AA0
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
-	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
-	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	BYTE    $0xc4
+	BYTE    $0x42
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x70
+	BYTE    $0x10
+	BYTE    $0xc4
+	BYTE    $0x42
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x60
+	BYTE    $0x20
+	BYTE    $0xc4
+	BYTE    $0xc2
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x60
+	BYTE    $0x30
+	VPADDD  ·avx2InitMask<>+0(SB), Y4, Y4
 
 	// Special optimization, for very short buffers
-	CMPQ inl, $192
+	CMPQ BX, $0xc0
 	JBE  openAVX2192
-	CMPQ inl, $320
+	CMPQ BX, $0x00000140
 	JBE  openAVX2320
 
 	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
-	VMOVDQA BB0, state1StoreAVX2
-	VMOVDQA CC0, state2StoreAVX2
-	VMOVDQA DD0, ctr3StoreAVX2
-	MOVQ    $10, itr2
+	VMOVDQA Y14, 32(BP)
+	VMOVDQA Y12, 64(BP)
+	VMOVDQA Y4, 192(BP)
+	MOVQ    $0x0000000a, R9
 
 openAVX2PreparePolyKey:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
-	DECQ     itr2
-	JNE      openAVX2PreparePolyKey
-
-	VPADDD ·chacha20Constants<>(SB), AA0, AA0
-	VPADDD state1StoreAVX2, BB0, BB0
-	VPADDD state2StoreAVX2, CC0, CC0
-	VPADDD ctr3StoreAVX2, DD0, DD0
-
-	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	DECQ       R9
+	JNE        openAVX2PreparePolyKey
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     192(BP), Y4, Y4
+	VPERM2I128 $0x02, Y0, Y14, Y3
 
 	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA TT0, rsStoreAVX2
+	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA Y3, (BP)
 
 	// Stream for the first 64 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
+	VPERM2I128 $0x13, Y0, Y14, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y14
 
 	// Hash AD + first 64 bytes
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
-	XORQ itr1, itr1
+	XORQ CX, CX
 
 openAVX2InitialHash64:
-	polyAdd(0(inp)(itr1*1))
-	polyMulAVX2
-	ADDQ $16, itr1
-	CMPQ itr1, $64
-	JNE  openAVX2InitialHash64
+	ADDQ  (SI)(CX*1), R10
+	ADCQ  8(SI)(CX*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	ADDQ  $0x10, CX
+	CMPQ  CX, $0x40
+	JNE   openAVX2InitialHash64
 
 	// Decrypt the first 64 bytes
-	VPXOR   (0*32)(inp), AA0, AA0
-	VPXOR   (1*32)(inp), BB0, BB0
-	VMOVDQU AA0, (0*32)(oup)
-	VMOVDQU BB0, (1*32)(oup)
-	LEAQ    (2*32)(inp), inp
-	LEAQ    (2*32)(oup), oup
-	SUBQ    $64, inl
+	VPXOR   (SI), Y0, Y0
+	VPXOR   32(SI), Y14, Y14
+	VMOVDQU Y0, (DI)
+	VMOVDQU Y14, 32(DI)
+	LEAQ    64(SI), SI
+	LEAQ    64(DI), DI
+	SUBQ    $0x40, BX
 
 openAVX2MainLoop:
-	CMPQ inl, $512
+	CMPQ BX, $0x00000200
 	JB   openAVX2MainLoopDone
 
 	// Load state, increment counter blocks, store the incremented counters
-	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-	XORQ    itr1, itr1
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
+	XORQ    CX, CX
 
 openAVX2InternalLoop:
-	// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
-	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
-	polyAdd(0*8(inp)(itr1*1))
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage1_AVX2
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulStage2_AVX2
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyMulStage3_AVX2
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulReduceStage
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	polyAdd(2*8(inp)(itr1*1))
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage1_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage2_AVX2
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage3_AVX2
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulReduceStage
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(4*8(inp)(itr1*1))
-	LEAQ     (6*8)(itr1), itr1
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage1_AVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	polyMulStage2_AVX2
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage3_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulReduceStage
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-	CMPQ     itr1, $480
+	ADDQ     (SI)(CX*1), R10
+	ADCQ     8(SI)(CX*1), R11
+	ADCQ     $0x01, R12
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	ADDQ     16(SI)(CX*1), R10
+	ADCQ     24(SI)(CX*1), R11
+	ADCQ     $0x01, R12
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x04, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPALIGNR $0x0c, Y3, Y3, Y3
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	ADDQ     32(SI)(CX*1), R10
+	ADCQ     40(SI)(CX*1), R11
+	ADCQ     $0x01, R12
+	LEAQ     48(CX), CX
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x0c, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	VPALIGNR $0x04, Y3, Y3, Y3
+	CMPQ     CX, $0x000001e0
 	JNE      openAVX2InternalLoop
-
-	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA CC3, tmpStoreAVX2
+	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD   32(BP), Y14, Y14
+	VPADDD   32(BP), Y9, Y9
+	VPADDD   32(BP), Y10, Y10
+	VPADDD   32(BP), Y11, Y11
+	VPADDD   64(BP), Y12, Y12
+	VPADDD   64(BP), Y13, Y13
+	VPADDD   64(BP), Y8, Y8
+	VPADDD   64(BP), Y15, Y15
+	VPADDD   96(BP), Y4, Y4
+	VPADDD   128(BP), Y1, Y1
+	VPADDD   160(BP), Y2, Y2
+	VPADDD   192(BP), Y3, Y3
+	VMOVDQA  Y15, 224(BP)
 
 	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
-	polyAdd(480(inp))
-	polyMulAVX2
-	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
-	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
-	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+	ADDQ       480(SI), R10
+	ADCQ       488(SI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPERM2I128 $0x02, Y0, Y14, Y15
+	VPERM2I128 $0x13, Y0, Y14, Y14
+	VPERM2I128 $0x02, Y12, Y4, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y12
+	VPXOR      (SI), Y15, Y15
+	VPXOR      32(SI), Y0, Y0
+	VPXOR      64(SI), Y14, Y14
+	VPXOR      96(SI), Y12, Y12
+	VMOVDQU    Y15, (DI)
+	VMOVDQU    Y0, 32(DI)
+	VMOVDQU    Y14, 64(DI)
+	VMOVDQU    Y12, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      128(SI), Y0, Y0
+	VPXOR      160(SI), Y14, Y14
+	VPXOR      192(SI), Y12, Y12
+	VPXOR      224(SI), Y4, Y4
+	VMOVDQU    Y0, 128(DI)
+	VMOVDQU    Y14, 160(DI)
+	VMOVDQU    Y12, 192(DI)
+	VMOVDQU    Y4, 224(DI)
 
 	// and here
-	polyAdd(496(inp))
-	polyMulAVX2
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
-	LEAQ       (32*16)(inp), inp
-	LEAQ       (32*16)(oup), oup
-	SUBQ       $(32*16), inl
+	ADDQ       496(SI), R10
+	ADCQ       504(SI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      256(SI), Y0, Y0
+	VPXOR      288(SI), Y14, Y14
+	VPXOR      320(SI), Y12, Y12
+	VPXOR      352(SI), Y4, Y4
+	VMOVDQU    Y0, 256(DI)
+	VMOVDQU    Y14, 288(DI)
+	VMOVDQU    Y12, 320(DI)
+	VMOVDQU    Y4, 352(DI)
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, 224(BP), Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, 224(BP), Y3, Y4
+	VPXOR      384(SI), Y0, Y0
+	VPXOR      416(SI), Y14, Y14
+	VPXOR      448(SI), Y12, Y12
+	VPXOR      480(SI), Y4, Y4
+	VMOVDQU    Y0, 384(DI)
+	VMOVDQU    Y14, 416(DI)
+	VMOVDQU    Y12, 448(DI)
+	VMOVDQU    Y4, 480(DI)
+	LEAQ       512(SI), SI
+	LEAQ       512(DI), DI
+	SUBQ       $0x00000200, BX
 	JMP        openAVX2MainLoop
 
 openAVX2MainLoopDone:
 	// Handle the various tail sizes efficiently
-	TESTQ inl, inl
+	TESTQ BX, BX
 	JE    openSSEFinalize
-	CMPQ  inl, $128
+	CMPQ  BX, $0x80
 	JBE   openAVX2Tail128
-	CMPQ  inl, $256
+	CMPQ  BX, $0x00000100
 	JBE   openAVX2Tail256
-	CMPQ  inl, $384
+	CMPQ  BX, $0x00000180
 	JBE   openAVX2Tail384
 	JMP   openAVX2Tail512
 
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
 openAVX2192:
-	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
-	VMOVDQA AA0, AA1
-	VMOVDQA BB0, BB1
-	VMOVDQA CC0, CC1
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2
-	VMOVDQA BB0, BB2
-	VMOVDQA CC0, CC2
-	VMOVDQA DD0, DD2
-	VMOVDQA DD1, TT3
-	MOVQ    $10, itr2
+	VMOVDQA Y0, Y5
+	VMOVDQA Y14, Y9
+	VMOVDQA Y12, Y13
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VMOVDQA Y0, Y6
+	VMOVDQA Y14, Y10
+	VMOVDQA Y12, Y8
+	VMOVDQA Y4, Y2
+	VMOVDQA Y1, Y15
+	MOVQ    $0x0000000a, R9
 
 openAVX2192InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	DECQ       itr2
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	DECQ       R9
 	JNE        openAVX2192InnerCipherLoop
-	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
-	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
-	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
-	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPADDD     Y6, Y0, Y0
+	VPADDD     Y6, Y5, Y5
+	VPADDD     Y10, Y14, Y14
+	VPADDD     Y10, Y9, Y9
+	VPADDD     Y8, Y12, Y12
+	VPADDD     Y8, Y13, Y13
+	VPADDD     Y2, Y4, Y4
+	VPADDD     Y15, Y1, Y1
+	VPERM2I128 $0x02, Y0, Y14, Y3
 
 	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA TT0, rsStoreAVX2
+	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA Y3, (BP)
 
 	// Stream for up to 192 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
+	VPERM2I128 $0x13, Y0, Y14, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y14
+	VPERM2I128 $0x02, Y5, Y9, Y12
+	VPERM2I128 $0x02, Y13, Y1, Y4
+	VPERM2I128 $0x13, Y5, Y9, Y5
+	VPERM2I128 $0x13, Y13, Y1, Y9
 
 openAVX2ShortOpen:
 	// Hash
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
 
 openAVX2ShortOpenLoop:
-	CMPQ inl, $32
+	CMPQ BX, $0x20
 	JB   openAVX2ShortTail32
-	SUBQ $32, inl
+	SUBQ $0x20, BX
 
 	// Load for hashing
-	polyAdd(0*8(inp))
-	polyMulAVX2
-	polyAdd(2*8(inp))
-	polyMulAVX2
+	ADDQ  (SI), R10
+	ADCQ  8(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	ADDQ  16(SI), R10
+	ADCQ  24(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Load for decryption
-	VPXOR   (inp), AA0, AA0
-	VMOVDQU AA0, (oup)
-	LEAQ    (1*32)(inp), inp
-	LEAQ    (1*32)(oup), oup
+	VPXOR   (SI), Y0, Y0
+	VMOVDQU Y0, (DI)
+	LEAQ    32(SI), SI
+	LEAQ    32(DI), DI
 
 	// Shift stream left
-	VMOVDQA BB0, AA0
-	VMOVDQA CC0, BB0
-	VMOVDQA DD0, CC0
-	VMOVDQA AA1, DD0
-	VMOVDQA BB1, AA1
-	VMOVDQA CC1, BB1
-	VMOVDQA DD1, CC1
-	VMOVDQA AA2, DD1
-	VMOVDQA BB2, AA2
+	VMOVDQA Y14, Y0
+	VMOVDQA Y12, Y14
+	VMOVDQA Y4, Y12
+	VMOVDQA Y5, Y4
+	VMOVDQA Y9, Y5
+	VMOVDQA Y13, Y9
+	VMOVDQA Y1, Y13
+	VMOVDQA Y6, Y1
+	VMOVDQA Y10, Y6
 	JMP     openAVX2ShortOpenLoop
 
 openAVX2ShortTail32:
-	CMPQ    inl, $16
-	VMOVDQA A0, A1
+	CMPQ    BX, $0x10
+	VMOVDQA X0, X1
 	JB      openAVX2ShortDone
-
-	SUBQ $16, inl
+	SUBQ    $0x10, BX
 
 	// Load for hashing
-	polyAdd(0*8(inp))
-	polyMulAVX2
+	ADDQ  (SI), R10
+	ADCQ  8(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Load for decryption
-	VPXOR      (inp), A0, T0
-	VMOVDQU    T0, (oup)
-	LEAQ       (1*16)(inp), inp
-	LEAQ       (1*16)(oup), oup
-	VPERM2I128 $0x11, AA0, AA0, AA0
-	VMOVDQA    A0, A1
+	VPXOR      (SI), X0, X12
+	VMOVDQU    X12, (DI)
+	LEAQ       16(SI), SI
+	LEAQ       16(DI), DI
+	VPERM2I128 $0x11, Y0, Y0, Y0
+	VMOVDQA    X0, X1
 
 openAVX2ShortDone:
 	VZEROUPPER
 	JMP openSSETail16
 
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
 openAVX2320:
-	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
-	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
-	MOVQ    $10, itr2
+	VMOVDQA Y0, Y5
+	VMOVDQA Y14, Y9
+	VMOVDQA Y12, Y13
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VMOVDQA Y0, Y6
+	VMOVDQA Y14, Y10
+	VMOVDQA Y12, Y8
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VMOVDQA Y14, Y7
+	VMOVDQA Y12, Y11
+	VMOVDQA Y4, Y15
+	MOVQ    $0x0000000a, R9
 
 openAVX2320InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-	DECQ     itr2
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	DECQ     R9
 	JNE      openAVX2320InnerCipherLoop
-
-	VMOVDQA ·chacha20Constants<>(SB), TT0
-	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
-	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
-	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
-	VMOVDQA ·avx2IncMask<>(SB), TT0
-	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD2, DD2
+	VMOVDQA  ·chacha20Constants<>+0(SB), Y3
+	VPADDD   Y3, Y0, Y0
+	VPADDD   Y3, Y5, Y5
+	VPADDD   Y3, Y6, Y6
+	VPADDD   Y7, Y14, Y14
+	VPADDD   Y7, Y9, Y9
+	VPADDD   Y7, Y10, Y10
+	VPADDD   Y11, Y12, Y12
+	VPADDD   Y11, Y13, Y13
+	VPADDD   Y11, Y8, Y8
+	VMOVDQA  ·avx2IncMask<>+0(SB), Y3
+	VPADDD   Y15, Y4, Y4
+	VPADDD   Y3, Y15, Y15
+	VPADDD   Y15, Y1, Y1
+	VPADDD   Y3, Y15, Y15
+	VPADDD   Y15, Y2, Y2
 
 	// Clamp and store poly key
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPAND      ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA    TT0, rsStoreAVX2
+	VPERM2I128 $0x02, Y0, Y14, Y3
+	VPAND      ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA    Y3, (BP)
 
 	// Stream for up to 320 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
-	VPERM2I128 $0x02, AA2, BB2, CC1
-	VPERM2I128 $0x02, CC2, DD2, DD1
-	VPERM2I128 $0x13, AA2, BB2, AA2
-	VPERM2I128 $0x13, CC2, DD2, BB2
+	VPERM2I128 $0x13, Y0, Y14, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y14
+	VPERM2I128 $0x02, Y5, Y9, Y12
+	VPERM2I128 $0x02, Y13, Y1, Y4
+	VPERM2I128 $0x13, Y5, Y9, Y5
+	VPERM2I128 $0x13, Y13, Y1, Y9
+	VPERM2I128 $0x02, Y6, Y10, Y13
+	VPERM2I128 $0x02, Y8, Y2, Y1
+	VPERM2I128 $0x13, Y6, Y10, Y6
+	VPERM2I128 $0x13, Y8, Y2, Y10
 	JMP        openAVX2ShortOpen
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
 openAVX2Tail128:
 	// Need to decrypt up to 128 bytes - prepare two blocks
-	VMOVDQA ·chacha20Constants<>(SB), AA1
-	VMOVDQA state1StoreAVX2, BB1
-	VMOVDQA state2StoreAVX2, CC1
-	VMOVDQA ctr3StoreAVX2, DD1
-	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
-	VMOVDQA DD1, DD0
-
-	XORQ  itr2, itr2
-	MOVQ  inl, itr1
-	ANDQ  $-16, itr1
-	TESTQ itr1, itr1
-	JE    openAVX2Tail128LoopB
+	VMOVDQA ·chacha20Constants<>+0(SB), Y5
+	VMOVDQA 32(BP), Y9
+	VMOVDQA 64(BP), Y13
+	VMOVDQA 192(BP), Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y1
+	VMOVDQA Y1, Y4
+	XORQ    R9, R9
+	MOVQ    BX, CX
+	ANDQ    $-16, CX
+	TESTQ   CX, CX
+	JE      openAVX2Tail128LoopB
 
 openAVX2Tail128LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMulAVX2
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 openAVX2Tail128LoopB:
-	ADDQ     $16, itr2
-	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $4, DD1, DD1, DD1
-	CMPQ     itr2, itr1
-	JB       openAVX2Tail128LoopA
-	CMPQ     itr2, $160
-	JNE      openAVX2Tail128LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
-	VPADDD     state1StoreAVX2, BB1, BB1
-	VPADDD     state2StoreAVX2, CC1, CC1
-	VPADDD     DD0, DD1, DD1
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+	ADDQ       $0x10, R9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	CMPQ       R9, CX
+	JB         openAVX2Tail128LoopA
+	CMPQ       R9, $0xa0
+	JNE        openAVX2Tail128LoopB
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     Y4, Y1, Y1
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
 
 openAVX2TailLoop:
-	CMPQ inl, $32
+	CMPQ BX, $0x20
 	JB   openAVX2Tail
-	SUBQ $32, inl
+	SUBQ $0x20, BX
 
 	// Load for decryption
-	VPXOR   (inp), AA0, AA0
-	VMOVDQU AA0, (oup)
-	LEAQ    (1*32)(inp), inp
-	LEAQ    (1*32)(oup), oup
-	VMOVDQA BB0, AA0
-	VMOVDQA CC0, BB0
-	VMOVDQA DD0, CC0
+	VPXOR   (SI), Y0, Y0
+	VMOVDQU Y0, (DI)
+	LEAQ    32(SI), SI
+	LEAQ    32(DI), DI
+	VMOVDQA Y14, Y0
+	VMOVDQA Y12, Y14
+	VMOVDQA Y4, Y12
 	JMP     openAVX2TailLoop
 
 openAVX2Tail:
-	CMPQ    inl, $16
-	VMOVDQA A0, A1
+	CMPQ    BX, $0x10
+	VMOVDQA X0, X1
 	JB      openAVX2TailDone
-	SUBQ    $16, inl
+	SUBQ    $0x10, BX
 
 	// Load for decryption
-	VPXOR      (inp), A0, T0
-	VMOVDQU    T0, (oup)
-	LEAQ       (1*16)(inp), inp
-	LEAQ       (1*16)(oup), oup
-	VPERM2I128 $0x11, AA0, AA0, AA0
-	VMOVDQA    A0, A1
+	VPXOR      (SI), X0, X12
+	VMOVDQU    X12, (DI)
+	LEAQ       16(SI), SI
+	LEAQ       16(DI), DI
+	VPERM2I128 $0x11, Y0, Y0, Y0
+	VMOVDQA    X0, X1
 
 openAVX2TailDone:
 	VZEROUPPER
 	JMP openSSETail16
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
 openAVX2Tail256:
-	// Need to decrypt up to 256 bytes - prepare four blocks
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA DD0, TT1
-	VMOVDQA DD1, TT2
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VMOVDQA Y4, Y7
+	VMOVDQA Y1, Y11
 
 	// Compute the number of iterations that will hash data
-	MOVQ    inl, tmpStoreAVX2
-	MOVQ    inl, itr1
-	SUBQ    $128, itr1
-	SHRQ    $4, itr1
-	MOVQ    $10, itr2
-	CMPQ    itr1, $10
-	CMOVQGT itr2, itr1
-	MOVQ    inp, inl
-	XORQ    itr2, itr2
+	MOVQ    BX, 224(BP)
+	MOVQ    BX, CX
+	SUBQ    $0x80, CX
+	SHRQ    $0x04, CX
+	MOVQ    $0x0000000a, R9
+	CMPQ    CX, $0x0a
+	CMOVQGT R9, CX
+	MOVQ    SI, BX
+	XORQ    R9, R9
 
 openAVX2Tail256LoopA:
-	polyAdd(0(inl))
-	polyMulAVX2
-	LEAQ 16(inl), inl
+	ADDQ  (BX), R10
+	ADCQ  8(BX), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(BX), BX
 
-	// Perform ChaCha rounds, while hashing the remaining input
 openAVX2Tail256LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	INCQ     itr2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	CMPQ     itr2, itr1
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	INCQ     R9
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	CMPQ     R9, CX
 	JB       openAVX2Tail256LoopA
+	CMPQ     R9, $0x0a
+	JNE      openAVX2Tail256LoopB
+	MOVQ     BX, R9
+	SUBQ     SI, BX
+	MOVQ     BX, CX
+	MOVQ     224(BP), BX
 
-	CMPQ itr2, $10
-	JNE  openAVX2Tail256LoopB
-
-	MOVQ inl, itr2
-	SUBQ inp, inl
-	MOVQ inl, itr1
-	MOVQ tmpStoreAVX2, inl
-
-	// Hash the remainder of data (if any)
 openAVX2Tail256Hash:
-	ADDQ $16, itr1
-	CMPQ itr1, inl
-	JGT  openAVX2Tail256HashEnd
-	polyAdd (0(itr2))
-	polyMulAVX2
-	LEAQ 16(itr2), itr2
-	JMP  openAVX2Tail256Hash
-
-// Store 128 bytes safely, then go to store loop
+	ADDQ  $0x10, CX
+	CMPQ  CX, BX
+	JGT   openAVX2Tail256HashEnd
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
+	JMP   openAVX2Tail256Hash
+
 openAVX2Tail256HashEnd:
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
-	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-
-	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
-	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
-	LEAQ    (4*32)(inp), inp
-	LEAQ    (4*32)(oup), oup
-	SUBQ    $4*32, inl
-
-	JMP openAVX2TailLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     Y7, Y4, Y4
+	VPADDD     Y11, Y1, Y1
+	VPERM2I128 $0x02, Y0, Y14, Y6
+	VPERM2I128 $0x02, Y12, Y4, Y10
+	VPERM2I128 $0x13, Y0, Y14, Y8
+	VPERM2I128 $0x13, Y12, Y4, Y2
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      (SI), Y6, Y6
+	VPXOR      32(SI), Y10, Y10
+	VPXOR      64(SI), Y8, Y8
+	VPXOR      96(SI), Y2, Y2
+	VMOVDQU    Y6, (DI)
+	VMOVDQU    Y10, 32(DI)
+	VMOVDQU    Y8, 64(DI)
+	VMOVDQU    Y2, 96(DI)
+	LEAQ       128(SI), SI
+	LEAQ       128(DI), DI
+	SUBQ       $0x80, BX
+	JMP        openAVX2TailLoop
+
 openAVX2Tail384:
 	// Need to decrypt up to 384 bytes - prepare six blocks
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA DD0, ctr0StoreAVX2
-	VMOVDQA DD1, ctr1StoreAVX2
-	VMOVDQA DD2, ctr2StoreAVX2
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
 
 	// Compute the number of iterations that will hash two blocks of data
-	MOVQ    inl, tmpStoreAVX2
-	MOVQ    inl, itr1
-	SUBQ    $256, itr1
-	SHRQ    $4, itr1
-	ADDQ    $6, itr1
-	MOVQ    $10, itr2
-	CMPQ    itr1, $10
-	CMOVQGT itr2, itr1
-	MOVQ    inp, inl
-	XORQ    itr2, itr2
-
-	// Perform ChaCha rounds, while hashing the remaining input
+	MOVQ    BX, 224(BP)
+	MOVQ    BX, CX
+	SUBQ    $0x00000100, CX
+	SHRQ    $0x04, CX
+	ADDQ    $0x06, CX
+	MOVQ    $0x0000000a, R9
+	CMPQ    CX, $0x0a
+	CMOVQGT R9, CX
+	MOVQ    SI, BX
+	XORQ    R9, R9
+
 openAVX2Tail384LoopB:
-	polyAdd(0(inl))
-	polyMulAVX2
-	LEAQ 16(inl), inl
+	ADDQ  (BX), R10
+	ADCQ  8(BX), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(BX), BX
 
 openAVX2Tail384LoopA:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	polyAdd(0(inl))
-	polyMulAVX2
-	LEAQ     16(inl), inl
-	INCQ     itr2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-
-	CMPQ itr2, itr1
-	JB   openAVX2Tail384LoopB
-
-	CMPQ itr2, $10
-	JNE  openAVX2Tail384LoopA
-
-	MOVQ inl, itr2
-	SUBQ inp, inl
-	MOVQ inl, itr1
-	MOVQ tmpStoreAVX2, inl
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	ADDQ     (BX), R10
+	ADCQ     8(BX), R11
+	ADCQ     $0x01, R12
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	LEAQ     16(BX), BX
+	INCQ     R9
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	CMPQ     R9, CX
+	JB       openAVX2Tail384LoopB
+	CMPQ     R9, $0x0a
+	JNE      openAVX2Tail384LoopA
+	MOVQ     BX, R9
+	SUBQ     SI, BX
+	MOVQ     BX, CX
+	MOVQ     224(BP), BX
 
 openAVX2Tail384Hash:
-	ADDQ $16, itr1
-	CMPQ itr1, inl
-	JGT  openAVX2Tail384HashEnd
-	polyAdd(0(itr2))
-	polyMulAVX2
-	LEAQ 16(itr2), itr2
-	JMP  openAVX2Tail384Hash
-
-// Store 256 bytes safely, then go to store loop
+	ADDQ  $0x10, CX
+	CMPQ  CX, BX
+	JGT   openAVX2Tail384HashEnd
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
+	JMP   openAVX2Tail384Hash
+
 openAVX2Tail384HashEnd:
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
-	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
-	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
-	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
-	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	LEAQ       (8*32)(inp), inp
-	LEAQ       (8*32)(oup), oup
-	SUBQ       $8*32, inl
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     96(BP), Y4, Y4
+	VPADDD     128(BP), Y1, Y1
+	VPADDD     160(BP), Y2, Y2
+	VPERM2I128 $0x02, Y0, Y14, Y3
+	VPERM2I128 $0x02, Y12, Y4, Y7
+	VPERM2I128 $0x13, Y0, Y14, Y11
+	VPERM2I128 $0x13, Y12, Y4, Y15
+	VPXOR      (SI), Y3, Y3
+	VPXOR      32(SI), Y7, Y7
+	VPXOR      64(SI), Y11, Y11
+	VPXOR      96(SI), Y15, Y15
+	VMOVDQU    Y3, (DI)
+	VMOVDQU    Y7, 32(DI)
+	VMOVDQU    Y11, 64(DI)
+	VMOVDQU    Y15, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y3
+	VPERM2I128 $0x02, Y13, Y1, Y7
+	VPERM2I128 $0x13, Y5, Y9, Y11
+	VPERM2I128 $0x13, Y13, Y1, Y15
+	VPXOR      128(SI), Y3, Y3
+	VPXOR      160(SI), Y7, Y7
+	VPXOR      192(SI), Y11, Y11
+	VPXOR      224(SI), Y15, Y15
+	VMOVDQU    Y3, 128(DI)
+	VMOVDQU    Y7, 160(DI)
+	VMOVDQU    Y11, 192(DI)
+	VMOVDQU    Y15, 224(DI)
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	LEAQ       256(SI), SI
+	LEAQ       256(DI), DI
+	SUBQ       $0x00000100, BX
 	JMP        openAVX2TailLoop
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
 openAVX2Tail512:
-	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-	XORQ    itr1, itr1
-	MOVQ    inp, itr2
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
+	XORQ    CX, CX
+	MOVQ    SI, R9
 
 openAVX2Tail512LoopB:
-	polyAdd(0(itr2))
-	polyMulAVX2
-	LEAQ (2*8)(itr2), itr2
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
 
 openAVX2Tail512LoopA:
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyAdd(0*8(itr2))
-	polyMulAVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(2*8(itr2))
-	polyMulAVX2
-	LEAQ     (4*8)(itr2), itr2
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-	INCQ     itr1
-	CMPQ     itr1, $4
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	ADDQ     (R9), R10
+	ADCQ     8(R9), R11
+	ADCQ     $0x01, R12
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x04, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPALIGNR $0x0c, Y3, Y3, Y3
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	ADDQ     16(R9), R10
+	ADCQ     24(R9), R11
+	ADCQ     $0x01, R12
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	LEAQ     32(R9), R9
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x0c, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	VPALIGNR $0x04, Y3, Y3, Y3
+	INCQ     CX
+	CMPQ     CX, $0x04
 	JLT      openAVX2Tail512LoopB
-
-	CMPQ itr1, $10
-	JNE  openAVX2Tail512LoopA
-
-	MOVQ inl, itr1
-	SUBQ $384, itr1
-	ANDQ $-16, itr1
+	CMPQ     CX, $0x0a
+	JNE      openAVX2Tail512LoopA
+	MOVQ     BX, CX
+	SUBQ     $0x00000180, CX
+	ANDQ     $-16, CX
 
 openAVX2Tail512HashLoop:
-	TESTQ itr1, itr1
+	TESTQ CX, CX
 	JE    openAVX2Tail512HashEnd
-	polyAdd(0(itr2))
-	polyMulAVX2
-	LEAQ  16(itr2), itr2
-	SUBQ  $16, itr1
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
+	SUBQ  $0x10, CX
 	JMP   openAVX2Tail512HashLoop
 
 openAVX2Tail512HashEnd:
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA    CC3, tmpStoreAVX2
-	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
-	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
-	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-
-	LEAQ (12*32)(inp), inp
-	LEAQ (12*32)(oup), oup
-	SUBQ $12*32, inl
-
-	JMP openAVX2TailLoop
-
-// ----------------------------------------------------------------------------
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Seal(dst, key, src, ad []byte)
-TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
-	// For aligned stack access
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     32(BP), Y11, Y11
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     64(BP), Y15, Y15
+	VPADDD     96(BP), Y4, Y4
+	VPADDD     128(BP), Y1, Y1
+	VPADDD     160(BP), Y2, Y2
+	VPADDD     192(BP), Y3, Y3
+	VMOVDQA    Y15, 224(BP)
+	VPERM2I128 $0x02, Y0, Y14, Y15
+	VPERM2I128 $0x13, Y0, Y14, Y14
+	VPERM2I128 $0x02, Y12, Y4, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y12
+	VPXOR      (SI), Y15, Y15
+	VPXOR      32(SI), Y0, Y0
+	VPXOR      64(SI), Y14, Y14
+	VPXOR      96(SI), Y12, Y12
+	VMOVDQU    Y15, (DI)
+	VMOVDQU    Y0, 32(DI)
+	VMOVDQU    Y14, 64(DI)
+	VMOVDQU    Y12, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      128(SI), Y0, Y0
+	VPXOR      160(SI), Y14, Y14
+	VPXOR      192(SI), Y12, Y12
+	VPXOR      224(SI), Y4, Y4
+	VMOVDQU    Y0, 128(DI)
+	VMOVDQU    Y14, 160(DI)
+	VMOVDQU    Y12, 192(DI)
+	VMOVDQU    Y4, 224(DI)
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      256(SI), Y0, Y0
+	VPXOR      288(SI), Y14, Y14
+	VPXOR      320(SI), Y12, Y12
+	VPXOR      352(SI), Y4, Y4
+	VMOVDQU    Y0, 256(DI)
+	VMOVDQU    Y14, 288(DI)
+	VMOVDQU    Y12, 320(DI)
+	VMOVDQU    Y4, 352(DI)
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, 224(BP), Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, 224(BP), Y3, Y4
+	LEAQ       384(SI), SI
+	LEAQ       384(DI), DI
+	SUBQ       $0x00000180, BX
+	JMP        openAVX2TailLoop
+
+DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
+DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
+GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32
+
+DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
+DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
+DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
+DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
+GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32
+
+DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001
+DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000
+GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16
+
+DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+8(SB)/8, $0x0000000000000000
+DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+24(SB)/8, $0x0000000000000000
+DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+40(SB)/8, $0x0000000000000000
+DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+56(SB)/8, $0x0000000000000000
+DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+72(SB)/8, $0x0000000000000000
+DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+88(SB)/8, $0x0000000000000000
+DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
+DATA ·andMask<>+104(SB)/8, $0x0000000000000000
+DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+120(SB)/8, $0x0000000000000000
+DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
+GLOBL ·andMask<>(SB), RODATA|NOPTR, $240
+
+DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
+DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
+DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32
+
+DATA ·rol16<>+0(SB)/8, $0x0504070601000302
+DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
+DATA ·rol16<>+16(SB)/8, $0x0504070601000302
+DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
+GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
+
+DATA ·rol8<>+0(SB)/8, $0x0605040702010003
+DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
+DATA ·rol8<>+16(SB)/8, $0x0605040702010003
+DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
+GLOBL ·rol8<>(SB), RODATA|NOPTR, $32
+
+DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
+DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
+DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32
+
+// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Seal(SB), $288-96
 	MOVQ SP, BP
-	ADDQ $32, BP
+	ADDQ $0x20, BP
 	ANDQ $-32, BP
-	MOVQ dst+0(FP), oup
-	MOVQ key+24(FP), keyp
-	MOVQ src+48(FP), inp
-	MOVQ src_len+56(FP), inl
-	MOVQ ad+72(FP), adp
-
-	CMPB ·useAVX2(SB), $1
+	MOVQ dst_base+0(FP), DI
+	MOVQ key_base+24(FP), R8
+	MOVQ src_base+48(FP), SI
+	MOVQ src_len+56(FP), BX
+	MOVQ ad_base+72(FP), CX
+	CMPB ·useAVX2+0(SB), $0x01
 	JE   chacha20Poly1305Seal_AVX2
 
 	// Special optimization, for very short buffers
-	CMPQ inl, $128
-	JBE  sealSSE128 // About 15% faster
+	CMPQ BX, $0x80
+	JBE  sealSSE128
 
 	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
-	MOVOU ·chacha20Constants<>(SB), A0
-	MOVOU (1*16)(keyp), B0
-	MOVOU (2*16)(keyp), C0
-	MOVOU (3*16)(keyp), D0
+	MOVOU ·chacha20Constants<>+0(SB), X0
+	MOVOU 16(R8), X3
+	MOVOU 32(R8), X6
+	MOVOU 48(R8), X9
 
 	// Store state on stack for future use
-	MOVO B0, state1Store
-	MOVO C0, state2Store
+	MOVO X3, 32(BP)
+	MOVO X6, 48(BP)
 
 	// Load state, increment counter blocks
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X2, X12
+	MOVO  X5, X13
+	MOVO  X8, X14
+	MOVO  X11, X15
+	PADDL ·sseIncMask<>+0(SB), X15
 
 	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
-	MOVQ $10, itr2
+	MOVO X9, 80(BP)
+	MOVO X10, 96(BP)
+	MOVO X11, 112(BP)
+	MOVO X15, 128(BP)
+	MOVQ $0x0000000a, R9
 
 sealSSEIntroLoop:
-	MOVO         C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO         tmpStore, C3
-	MOVO         C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO         tmpStore, C1
-	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
-
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	DECQ          itr2
-	JNE           sealSSEIntroLoop
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x0c
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x04
+	DECQ  R9
+	JNE   sealSSEIntroLoop
 
 	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+	PADDD ·chacha20Constants<>+0(SB), X0
+	PADDD ·chacha20Constants<>+0(SB), X1
+	PADDD ·chacha20Constants<>+0(SB), X2
+	PADDD ·chacha20Constants<>+0(SB), X12
+	PADDD 32(BP), X3
+	PADDD 32(BP), X4
+	PADDD 32(BP), X5
+	PADDD 32(BP), X13
+	PADDD 48(BP), X7
+	PADDD 48(BP), X8
+	PADDD 48(BP), X14
+	PADDD 96(BP), X10
+	PADDD 112(BP), X11
+	PADDD 128(BP), X15
 
 	// Clamp and store the key
-	PAND ·polyClampMask<>(SB), A0
-	MOVO A0, rStore
-	MOVO B0, sStore
+	PAND ·polyClampMask<>+0(SB), X0
+	MOVO X0, (BP)
+	MOVO X3, 16(BP)
 
 	// Hash AAD
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-
-	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
-	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
-	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
-	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
-	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
-	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
-
-	MOVQ $128, itr1
-	SUBQ $128, inl
-	LEAQ 128(inp), inp
-
-	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
-
-	CMPQ inl, $64
-	JBE  sealSSE128SealHash
-
-	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
-	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
-	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
-
-	ADDQ $64, itr1
-	SUBQ $64, inl
-	LEAQ 64(inp), inp
-
-	MOVQ $2, itr1
-	MOVQ $8, itr2
-
-	CMPQ inl, $64
-	JBE  sealSSETail64
-	CMPQ inl, $128
-	JBE  sealSSETail128
-	CMPQ inl, $192
-	JBE  sealSSETail192
+	MOVQ  ad_len+80(FP), R9
+	CALL  polyHashADInternal<>(SB)
+	MOVOU (SI), X0
+	MOVOU 16(SI), X3
+	MOVOU 32(SI), X6
+	MOVOU 48(SI), X9
+	PXOR  X0, X1
+	PXOR  X3, X4
+	PXOR  X6, X7
+	PXOR  X9, X10
+	MOVOU X1, (DI)
+	MOVOU X4, 16(DI)
+	MOVOU X7, 32(DI)
+	MOVOU X10, 48(DI)
+	MOVOU 64(SI), X0
+	MOVOU 80(SI), X3
+	MOVOU 96(SI), X6
+	MOVOU 112(SI), X9
+	PXOR  X0, X2
+	PXOR  X3, X5
+	PXOR  X6, X8
+	PXOR  X9, X11
+	MOVOU X2, 64(DI)
+	MOVOU X5, 80(DI)
+	MOVOU X8, 96(DI)
+	MOVOU X11, 112(DI)
+	MOVQ  $0x00000080, CX
+	SUBQ  $0x80, BX
+	LEAQ  128(SI), SI
+	MOVO  X12, X1
+	MOVO  X13, X4
+	MOVO  X14, X7
+	MOVO  X15, X10
+	CMPQ  BX, $0x40
+	JBE   sealSSE128SealHash
+	MOVOU (SI), X0
+	MOVOU 16(SI), X3
+	MOVOU 32(SI), X6
+	MOVOU 48(SI), X9
+	PXOR  X0, X12
+	PXOR  X3, X13
+	PXOR  X6, X14
+	PXOR  X9, X15
+	MOVOU X12, 128(DI)
+	MOVOU X13, 144(DI)
+	MOVOU X14, 160(DI)
+	MOVOU X15, 176(DI)
+	ADDQ  $0x40, CX
+	SUBQ  $0x40, BX
+	LEAQ  64(SI), SI
+	MOVQ  $0x00000002, CX
+	MOVQ  $0x00000008, R9
+	CMPQ  BX, $0x40
+	JBE   sealSSETail64
+	CMPQ  BX, $0x80
+	JBE   sealSSETail128
+	CMPQ  BX, $0xc0
+	JBE   sealSSETail192
 
 sealSSEMainLoop:
 	// Load state, increment counter blocks
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X2, X12
+	MOVO  X5, X13
+	MOVO  X8, X14
+	MOVO  X11, X15
+	PADDL ·sseIncMask<>+0(SB), X15
 
 	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+	MOVO X9, 80(BP)
+	MOVO X10, 96(BP)
+	MOVO X11, 112(BP)
+	MOVO X15, 128(BP)
 
 sealSSEInnerLoop:
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyAdd(0(oup))
-	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
-	polyMulStage1
-	polyMulStage2
-	LEAQ          (2*8)(oup), oup
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	polyMulStage3
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyMulReduceStage
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	DECQ          itr2
-	JGE           sealSSEInnerLoop
-	polyAdd(0(oup))
-	polyMul
-	LEAQ          (2*8)(oup), oup
-	DECQ          itr1
-	JG            sealSSEInnerLoop
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x0c
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	LEAQ  16(DI), DI
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x04
+	DECQ  R9
+	JGE   sealSSEInnerLoop
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
+	DECQ  CX
+	JG    sealSSEInnerLoop
 
 	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
-	MOVO  D3, tmpStore
+	PADDD ·chacha20Constants<>+0(SB), X0
+	PADDD ·chacha20Constants<>+0(SB), X1
+	PADDD ·chacha20Constants<>+0(SB), X2
+	PADDD ·chacha20Constants<>+0(SB), X12
+	PADDD 32(BP), X3
+	PADDD 32(BP), X4
+	PADDD 32(BP), X5
+	PADDD 32(BP), X13
+	PADDD 48(BP), X6
+	PADDD 48(BP), X7
+	PADDD 48(BP), X8
+	PADDD 48(BP), X14
+	PADDD 80(BP), X9
+	PADDD 96(BP), X10
+	PADDD 112(BP), X11
+	PADDD 128(BP), X15
+	MOVO  X15, 64(BP)
 
 	// Load - xor - store
-	MOVOU (0*16)(inp), D3; PXOR D3, A0
-	MOVOU (1*16)(inp), D3; PXOR D3, B0
-	MOVOU (2*16)(inp), D3; PXOR D3, C0
-	MOVOU (3*16)(inp), D3; PXOR D3, D0
-	MOVOU A0, (0*16)(oup)
-	MOVOU B0, (1*16)(oup)
-	MOVOU C0, (2*16)(oup)
-	MOVOU D0, (3*16)(oup)
-	MOVO  tmpStore, D3
-
-	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
-	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
-	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
-	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
-	ADDQ  $192, inp
-	MOVQ  $192, itr1
-	SUBQ  $192, inl
-	MOVO  A3, A1
-	MOVO  B3, B1
-	MOVO  C3, C1
-	MOVO  D3, D1
-	CMPQ  inl, $64
+	MOVOU (SI), X15
+	PXOR  X15, X0
+	MOVOU 16(SI), X15
+	PXOR  X15, X3
+	MOVOU 32(SI), X15
+	PXOR  X15, X6
+	MOVOU 48(SI), X15
+	PXOR  X15, X9
+	MOVOU X0, (DI)
+	MOVOU X3, 16(DI)
+	MOVOU X6, 32(DI)
+	MOVOU X9, 48(DI)
+	MOVO  64(BP), X15
+	MOVOU 64(SI), X0
+	MOVOU 80(SI), X3
+	MOVOU 96(SI), X6
+	MOVOU 112(SI), X9
+	PXOR  X0, X1
+	PXOR  X3, X4
+	PXOR  X6, X7
+	PXOR  X9, X10
+	MOVOU X1, 64(DI)
+	MOVOU X4, 80(DI)
+	MOVOU X7, 96(DI)
+	MOVOU X10, 112(DI)
+	MOVOU 128(SI), X0
+	MOVOU 144(SI), X3
+	MOVOU 160(SI), X6
+	MOVOU 176(SI), X9
+	PXOR  X0, X2
+	PXOR  X3, X5
+	PXOR  X6, X8
+	PXOR  X9, X11
+	MOVOU X2, 128(DI)
+	MOVOU X5, 144(DI)
+	MOVOU X8, 160(DI)
+	MOVOU X11, 176(DI)
+	ADDQ  $0xc0, SI
+	MOVQ  $0x000000c0, CX
+	SUBQ  $0xc0, BX
+	MOVO  X12, X1
+	MOVO  X13, X4
+	MOVO  X14, X7
+	MOVO  X15, X10
+	CMPQ  BX, $0x40
 	JBE   sealSSE128SealHash
-	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
-	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
-	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
-	LEAQ  64(inp), inp
-	SUBQ  $64, inl
-	MOVQ  $6, itr1
-	MOVQ  $4, itr2
-	CMPQ  inl, $192
+	MOVOU (SI), X0
+	MOVOU 16(SI), X3
+	MOVOU 32(SI), X6
+	MOVOU 48(SI), X9
+	PXOR  X0, X12
+	PXOR  X3, X13
+	PXOR  X6, X14
+	PXOR  X9, X15
+	MOVOU X12, 192(DI)
+	MOVOU X13, 208(DI)
+	MOVOU X14, 224(DI)
+	MOVOU X15, 240(DI)
+	LEAQ  64(SI), SI
+	SUBQ  $0x40, BX
+	MOVQ  $0x00000006, CX
+	MOVQ  $0x00000004, R9
+	CMPQ  BX, $0xc0
 	JG    sealSSEMainLoop
-
-	MOVQ  inl, itr1
-	TESTQ inl, inl
+	MOVQ  BX, CX
+	TESTQ BX, BX
 	JE    sealSSE128SealHash
-	MOVQ  $6, itr1
-	CMPQ  inl, $64
+	MOVQ  $0x00000006, CX
+	CMPQ  BX, $0x40
 	JBE   sealSSETail64
-	CMPQ  inl, $128
+	CMPQ  BX, $0x80
 	JBE   sealSSETail128
 	JMP   sealSSETail192
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of plaintext
 sealSSETail64:
-	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
-	MOVO  ·chacha20Constants<>(SB), A1
-	MOVO  state1Store, B1
-	MOVO  state2Store, C1
-	MOVO  ctr3Store, D1
-	PADDL ·sseIncMask<>(SB), D1
-	MOVO  D1, ctr0Store
+	MOVO  ·chacha20Constants<>+0(SB), X1
+	MOVO  32(BP), X4
+	MOVO  48(BP), X7
+	MOVO  128(BP), X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X10, 80(BP)
 
 sealSSETail64LoopA:
-	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealSSETail64LoopB:
-	chachaQR(A1, B1, C1, D1, T1)
-	shiftB1Left;  shiftC1Left; shiftD1Left
-	chachaQR(A1, B1, C1, D1, T1)
-	shiftB1Right; shiftC1Right; shiftD1Right
-	polyAdd(0(oup))
-	polyMul
-	LEAQ          16(oup), oup
-
-	DECQ itr1
-	JG   sealSSETail64LoopA
-
-	DECQ  itr2
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X13)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X13
+	PSLLL $0x0c, X13
+	PSRLL $0x14, X4
+	PXOR  X13, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X13)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X13
+	PSLLL $0x07, X13
+	PSRLL $0x19, X4
+	PXOR  X13, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X13)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X13
+	PSLLL $0x0c, X13
+	PSRLL $0x14, X4
+	PXOR  X13, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X13)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X13
+	PSLLL $0x07, X13
+	PSRLL $0x19, X4
+	PXOR  X13, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
+	DECQ  CX
+	JG    sealSSETail64LoopA
+	DECQ  R9
 	JGE   sealSSETail64LoopB
-	PADDL ·chacha20Constants<>(SB), A1
-	PADDL state1Store, B1
-	PADDL state2Store, C1
-	PADDL ctr0Store, D1
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL 32(BP), X4
+	PADDL 48(BP), X7
+	PADDL 80(BP), X10
+	JMP   sealSSE128Seal
 
-	JMP sealSSE128Seal
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of plaintext
 sealSSETail128:
-	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X9, 80(BP)
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X10, 96(BP)
 
 sealSSETail128LoopA:
-	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealSSETail128LoopB:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	shiftB1Left;  shiftC1Left; shiftD1Left
-	polyAdd(0(oup))
-	polyMul
-	LEAQ          16(oup), oup
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-
-	DECQ itr1
-	JG   sealSSETail128LoopA
-
-	DECQ itr2
-	JGE  sealSSETail128LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
-	PADDL state1Store, B0; PADDL state1Store, B1
-	PADDL state2Store, C0; PADDL state2Store, C1
-	PADDL ctr0Store, D0; PADDL ctr1Store, D1
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
-	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
-
-	MOVQ $64, itr1
-	LEAQ 64(inp), inp
-	SUBQ $64, inl
-
-	JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of plaintext
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	DECQ  CX
+	JG    sealSSETail128LoopA
+	DECQ  R9
+	JGE   sealSSETail128LoopB
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL 32(BP), X3
+	PADDL 32(BP), X4
+	PADDL 48(BP), X6
+	PADDL 48(BP), X7
+	PADDL 80(BP), X9
+	PADDL 96(BP), X10
+	MOVOU (SI), X12
+	MOVOU 16(SI), X13
+	MOVOU 32(SI), X14
+	MOVOU 48(SI), X15
+	PXOR  X12, X0
+	PXOR  X13, X3
+	PXOR  X14, X6
+	PXOR  X15, X9
+	MOVOU X0, (DI)
+	MOVOU X3, 16(DI)
+	MOVOU X6, 32(DI)
+	MOVOU X9, 48(DI)
+	MOVQ  $0x00000040, CX
+	LEAQ  64(SI), SI
+	SUBQ  $0x40, BX
+	JMP   sealSSE128SealHash
+
 sealSSETail192:
-	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X9, 80(BP)
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X10, 96(BP)
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X11, 112(BP)
 
 sealSSETail192LoopA:
-	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealSSETail192LoopB:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left; shiftC0Left; shiftD0Left
-	shiftB1Left; shiftC1Left; shiftD1Left
-	shiftB2Left; shiftC2Left; shiftD2Left
-
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-	shiftB2Right; shiftC2Right; shiftD2Right
-
-	DECQ itr1
-	JG   sealSSETail192LoopA
-
-	DECQ itr2
-	JGE  sealSSETail192LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
-	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
-	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
-	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
-	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
-	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
-	MOVO A2, A1
-	MOVO B2, B1
-	MOVO C2, C1
-	MOVO D2, D1
-	MOVQ $128, itr1
-	LEAQ 128(inp), inp
-	SUBQ $128, inl
-
-	JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special seal optimization for buffers smaller than 129 bytes
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	DECQ  CX
+	JG    sealSSETail192LoopA
+	DECQ  R9
+	JGE   sealSSETail192LoopB
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL ·chacha20Constants<>+0(SB), X2
+	PADDL 32(BP), X3
+	PADDL 32(BP), X4
+	PADDL 32(BP), X5
+	PADDL 48(BP), X6
+	PADDL 48(BP), X7
+	PADDL 48(BP), X8
+	PADDL 80(BP), X9
+	PADDL 96(BP), X10
+	PADDL 112(BP), X11
+	MOVOU (SI), X12
+	MOVOU 16(SI), X13
+	MOVOU 32(SI), X14
+	MOVOU 48(SI), X15
+	PXOR  X12, X0
+	PXOR  X13, X3
+	PXOR  X14, X6
+	PXOR  X15, X9
+	MOVOU X0, (DI)
+	MOVOU X3, 16(DI)
+	MOVOU X6, 32(DI)
+	MOVOU X9, 48(DI)
+	MOVOU 64(SI), X12
+	MOVOU 80(SI), X13
+	MOVOU 96(SI), X14
+	MOVOU 112(SI), X15
+	PXOR  X12, X1
+	PXOR  X13, X4
+	PXOR  X14, X7
+	PXOR  X15, X10
+	MOVOU X1, 64(DI)
+	MOVOU X4, 80(DI)
+	MOVOU X7, 96(DI)
+	MOVOU X10, 112(DI)
+	MOVO  X2, X1
+	MOVO  X5, X4
+	MOVO  X8, X7
+	MOVO  X11, X10
+	MOVQ  $0x00000080, CX
+	LEAQ  128(SI), SI
+	SUBQ  $0x80, BX
+	JMP   sealSSE128SealHash
+
 sealSSE128:
-	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
-	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
-	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
-	MOVQ  $10, itr2
+	MOVOU ·chacha20Constants<>+0(SB), X0
+	MOVOU 16(R8), X3
+	MOVOU 32(R8), X6
+	MOVOU 48(R8), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X3, X13
+	MOVO  X6, X14
+	MOVO  X10, X15
+	MOVQ  $0x0000000a, R9
 
 sealSSE128InnerCipherLoop:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left;  shiftB1Left; shiftB2Left
-	shiftC0Left;  shiftC1Left; shiftC2Left
-	shiftD0Left;  shiftD1Left; shiftD2Left
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftB1Right; shiftB2Right
-	shiftC0Right; shiftC1Right; shiftC2Right
-	shiftD0Right; shiftD1Right; shiftD2Right
-	DECQ          itr2
-	JNE           sealSSE128InnerCipherLoop
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	DECQ  R9
+	JNE   sealSSE128InnerCipherLoop
 
 	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
-	PADDL T2, C1; PADDL T2, C2
-	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
-	PAND  ·polyClampMask<>(SB), A0
-	MOVOU A0, rStore
-	MOVOU B0, sStore
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL ·chacha20Constants<>+0(SB), X2
+	PADDL X13, X3
+	PADDL X13, X4
+	PADDL X13, X5
+	PADDL X14, X7
+	PADDL X14, X8
+	PADDL X15, X10
+	PADDL ·sseIncMask<>+0(SB), X15
+	PADDL X15, X11
+	PAND  ·polyClampMask<>+0(SB), X0
+	MOVOU X0, (BP)
+	MOVOU X3, 16(BP)
 
 	// Hash
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
-	XORQ itr1, itr1
+	XORQ CX, CX
 
 sealSSE128SealHash:
-	// itr1 holds the number of bytes encrypted but not yet hashed
-	CMPQ itr1, $16
-	JB   sealSSE128Seal
-	polyAdd(0(oup))
-	polyMul
-
-	SUBQ $16, itr1
-	ADDQ $16, oup
-
-	JMP sealSSE128SealHash
+	CMPQ  CX, $0x10
+	JB    sealSSE128Seal
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	SUBQ  $0x10, CX
+	ADDQ  $0x10, DI
+	JMP   sealSSE128SealHash
 
 sealSSE128Seal:
-	CMPQ inl, $16
+	CMPQ BX, $0x10
 	JB   sealSSETail
-	SUBQ $16, inl
+	SUBQ $0x10, BX
 
 	// Load for decryption
-	MOVOU (inp), T0
-	PXOR  T0, A1
-	MOVOU A1, (oup)
-	LEAQ  (1*16)(inp), inp
-	LEAQ  (1*16)(oup), oup
+	MOVOU (SI), X12
+	PXOR  X12, X1
+	MOVOU X1, (DI)
+	LEAQ  16(SI), SI
+	LEAQ  16(DI), DI
 
 	// Extract for hashing
-	MOVQ   A1, t0
-	PSRLDQ $8, A1
-	MOVQ A1, t1
-	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
+	MOVQ   X1, R13
+	PSRLDQ $0x08, X1
+	MOVQ   X1, R14
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x01, R12
+	MOVQ   (BP), AX
+	MOVQ   AX, R15
+	MULQ   R10
+	MOVQ   AX, R13
+	MOVQ   DX, R14
+	MOVQ   (BP), AX
+	MULQ   R11
+	IMULQ  R12, R15
+	ADDQ   AX, R14
+	ADCQ   DX, R15
+	MOVQ   8(BP), AX
+	MOVQ   AX, R8
+	MULQ   R10
+	ADDQ   AX, R14
+	ADCQ   $0x00, DX
+	MOVQ   DX, R10
+	MOVQ   8(BP), AX
+	MULQ   R11
+	ADDQ   AX, R15
+	ADCQ   $0x00, DX
+	IMULQ  R12, R8
+	ADDQ   R10, R15
+	ADCQ   DX, R8
+	MOVQ   R13, R10
+	MOVQ   R14, R11
+	MOVQ   R15, R12
+	ANDQ   $0x03, R12
+	MOVQ   R15, R13
+	ANDQ   $-4, R13
+	MOVQ   R8, R14
+	SHRQ   $0x02, R8, R15
+	SHRQ   $0x02, R8
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x00, R12
+	ADDQ   R15, R10
+	ADCQ   R8, R11
+	ADCQ   $0x00, R12
 
 	// Shift the stream "left"
-	MOVO B1, A1
-	MOVO C1, B1
-	MOVO D1, C1
-	MOVO A2, D1
-	MOVO B2, A2
-	MOVO C2, B2
-	MOVO D2, C2
+	MOVO X4, X1
+	MOVO X7, X4
+	MOVO X10, X7
+	MOVO X2, X10
+	MOVO X5, X2
+	MOVO X8, X5
+	MOVO X11, X8
 	JMP  sealSSE128Seal
 
 sealSSETail:
-	TESTQ inl, inl
+	TESTQ BX, BX
 	JE    sealSSEFinalize
 
 	// We can only load the PT one byte at a time to avoid read after end of buffer
-	MOVQ inl, itr2
-	SHLQ $4, itr2
-	LEAQ ·andMask<>(SB), t0
-	MOVQ inl, itr1
-	LEAQ -1(inp)(inl*1), inp
-	XORQ t2, t2
-	XORQ t3, t3
+	MOVQ BX, R9
+	SHLQ $0x04, R9
+	LEAQ ·andMask<>+0(SB), R13
+	MOVQ BX, CX
+	LEAQ -1(SI)(BX*1), SI
+	XORQ R15, R15
+	XORQ R8, R8
 	XORQ AX, AX
 
 sealSSETailLoadLoop:
-	SHLQ $8, t2, t3
-	SHLQ $8, t2
-	MOVB (inp), AX
-	XORQ AX, t2
-	LEAQ   -1(inp), inp
-	DECQ   itr1
+	SHLQ   $0x08, R15, R8
+	SHLQ   $0x08, R15
+	MOVB   (SI), AX
+	XORQ   AX, R15
+	LEAQ   -1(SI), SI
+	DECQ   CX
 	JNE    sealSSETailLoadLoop
-	MOVQ t2, 0+tmpStore
-	MOVQ t3, 8+tmpStore
-	PXOR 0+tmpStore, A1
-	MOVOU  A1, (oup)
-	MOVOU  -16(t0)(itr2*1), T0
-	PAND   T0, A1
-	MOVQ   A1, t0
-	PSRLDQ $8, A1
-	MOVQ   A1, t1
-	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
-
-	ADDQ inl, oup
+	MOVQ   R15, 64(BP)
+	MOVQ   R8, 72(BP)
+	PXOR   64(BP), X1
+	MOVOU  X1, (DI)
+	MOVOU  -16(R13)(R9*1), X12
+	PAND   X12, X1
+	MOVQ   X1, R13
+	PSRLDQ $0x08, X1
+	MOVQ   X1, R14
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x01, R12
+	MOVQ   (BP), AX
+	MOVQ   AX, R15
+	MULQ   R10
+	MOVQ   AX, R13
+	MOVQ   DX, R14
+	MOVQ   (BP), AX
+	MULQ   R11
+	IMULQ  R12, R15
+	ADDQ   AX, R14
+	ADCQ   DX, R15
+	MOVQ   8(BP), AX
+	MOVQ   AX, R8
+	MULQ   R10
+	ADDQ   AX, R14
+	ADCQ   $0x00, DX
+	MOVQ   DX, R10
+	MOVQ   8(BP), AX
+	MULQ   R11
+	ADDQ   AX, R15
+	ADCQ   $0x00, DX
+	IMULQ  R12, R8
+	ADDQ   R10, R15
+	ADCQ   DX, R8
+	MOVQ   R13, R10
+	MOVQ   R14, R11
+	MOVQ   R15, R12
+	ANDQ   $0x03, R12
+	MOVQ   R15, R13
+	ANDQ   $-4, R13
+	MOVQ   R8, R14
+	SHRQ   $0x02, R8, R15
+	SHRQ   $0x02, R8
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x00, R12
+	ADDQ   R15, R10
+	ADCQ   R8, R11
+	ADCQ   $0x00, R12
+	ADDQ   BX, DI
 
 sealSSEFinalize:
 	// Hash in the buffer lengths
-	ADDQ ad_len+80(FP), acc0
-	ADCQ src_len+56(FP), acc1
-	ADCQ $1, acc2
-	polyMul
+	ADDQ  ad_len+80(FP), R10
+	ADCQ  src_len+56(FP), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Final reduce
-	MOVQ    acc0, t0
-	MOVQ    acc1, t1
-	MOVQ    acc2, t2
-	SUBQ    $-5, acc0
-	SBBQ    $-1, acc1
-	SBBQ    $3, acc2
-	CMOVQCS t0, acc0
-	CMOVQCS t1, acc1
-	CMOVQCS t2, acc2
+	MOVQ    R10, R13
+	MOVQ    R11, R14
+	MOVQ    R12, R15
+	SUBQ    $-5, R10
+	SBBQ    $-1, R11
+	SBBQ    $0x03, R12
+	CMOVQCS R13, R10
+	CMOVQCS R14, R11
+	CMOVQCS R15, R12
 
 	// Add in the "s" part of the key
-	ADDQ 0+sStore, acc0
-	ADCQ 8+sStore, acc1
+	ADDQ 16(BP), R10
+	ADCQ 24(BP), R11
 
 	// Finally store the tag at the end of the message
-	MOVQ acc0, (0*8)(oup)
-	MOVQ acc1, (1*8)(oup)
+	MOVQ R10, (DI)
+	MOVQ R11, 8(DI)
 	RET
 
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
 chacha20Poly1305Seal_AVX2:
 	VZEROUPPER
-	VMOVDQU ·chacha20Constants<>(SB), AA0
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
-	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
-	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	BYTE    $0xc4
+	BYTE    $0x42
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x70
+	BYTE    $0x10
+	BYTE    $0xc4
+	BYTE    $0x42
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x60
+	BYTE    $0x20
+	BYTE    $0xc4
+	BYTE    $0xc2
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x60
+	BYTE    $0x30
+	VPADDD  ·avx2InitMask<>+0(SB), Y4, Y4
 
 	// Special optimizations, for very short buffers
-	CMPQ inl, $192
-	JBE  seal192AVX2 // 33% faster
-	CMPQ inl, $320
-	JBE  seal320AVX2 // 17% faster
+	CMPQ BX, $0x000000c0
+	JBE  seal192AVX2
+	CMPQ BX, $0x00000140
+	JBE  seal320AVX2
 
 	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
-	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
-	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
-	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
-	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
-	VMOVDQA DD3, ctr3StoreAVX2
-	MOVQ    $10, itr2
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA Y14, 32(BP)
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA Y12, 64(BP)
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VMOVDQA Y4, 96(BP)
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VMOVDQA Y1, 128(BP)
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
+	MOVQ    $0x0000000a, R9
 
 sealAVX2IntroLoop:
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
-	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
-	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
-	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
-	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
-	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
-	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
-	DECQ     itr2
-	JNE      sealAVX2IntroLoop
-
-	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-
-	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
-	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
-	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
+	VMOVDQA    Y15, 224(BP)
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y15
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y15
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y15
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y15
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x0c, Y10, Y15
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x07, Y10, Y15
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VMOVDQA    224(BP), Y15
+	VMOVDQA    Y13, 224(BP)
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y15, Y11, Y11
+	VPSLLD     $0x0c, Y11, Y13
+	VPSRLD     $0x14, Y11, Y11
+	VPXOR      Y13, Y11, Y11
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y15, Y11, Y11
+	VPSLLD     $0x07, Y11, Y13
+	VPSRLD     $0x19, Y11, Y11
+	VPXOR      Y13, Y11, Y11
+	VMOVDQA    224(BP), Y13
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPALIGNR   $0x04, Y10, Y10, Y10
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x0c, Y2, Y2, Y2
+	VPALIGNR   $0x04, Y11, Y11, Y11
+	VPALIGNR   $0x08, Y15, Y15, Y15
+	VPALIGNR   $0x0c, Y3, Y3, Y3
+	VMOVDQA    Y15, 224(BP)
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y15
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y15
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y15
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y15
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x0c, Y10, Y15
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x07, Y10, Y15
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VMOVDQA    224(BP), Y15
+	VMOVDQA    Y13, 224(BP)
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y15, Y11, Y11
+	VPSLLD     $0x0c, Y11, Y13
+	VPSRLD     $0x14, Y11, Y11
+	VPXOR      Y13, Y11, Y11
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y15, Y11, Y11
+	VPSLLD     $0x07, Y11, Y13
+	VPSRLD     $0x19, Y11, Y11
+	VPXOR      Y13, Y11, Y11
+	VMOVDQA    224(BP), Y13
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	VPALIGNR   $0x0c, Y10, Y10, Y10
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x04, Y2, Y2, Y2
+	VPALIGNR   $0x0c, Y11, Y11, Y11
+	VPALIGNR   $0x08, Y15, Y15, Y15
+	VPALIGNR   $0x04, Y3, Y3, Y3
+	DECQ       R9
+	JNE        sealAVX2IntroLoop
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     32(BP), Y11, Y11
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     64(BP), Y15, Y15
+	VPADDD     96(BP), Y4, Y4
+	VPADDD     128(BP), Y1, Y1
+	VPADDD     160(BP), Y2, Y2
+	VPADDD     192(BP), Y3, Y3
+	VPERM2I128 $0x13, Y12, Y4, Y12
+	VPERM2I128 $0x02, Y0, Y14, Y4
+	VPERM2I128 $0x13, Y0, Y14, Y0
 
 	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), DD0, DD0
-	VMOVDQA DD0, rsStoreAVX2
+	VPAND   ·polyClampMask<>+0(SB), Y4, Y4
+	VMOVDQA Y4, (BP)
 
 	// Hash AD
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
 
 	// Can store at least 320 bytes
-	VPXOR   (0*32)(inp), AA0, AA0
-	VPXOR   (1*32)(inp), CC0, CC0
-	VMOVDQU AA0, (0*32)(oup)
-	VMOVDQU CC0, (1*32)(oup)
-
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
-
-	MOVQ $320, itr1
-	SUBQ $320, inl
-	LEAQ 320(inp), inp
-
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
-	CMPQ       inl, $128
+	VPXOR      (SI), Y0, Y0
+	VPXOR      32(SI), Y12, Y12
+	VMOVDQU    Y0, (DI)
+	VMOVDQU    Y12, 32(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      64(SI), Y0, Y0
+	VPXOR      96(SI), Y14, Y14
+	VPXOR      128(SI), Y12, Y12
+	VPXOR      160(SI), Y4, Y4
+	VMOVDQU    Y0, 64(DI)
+	VMOVDQU    Y14, 96(DI)
+	VMOVDQU    Y12, 128(DI)
+	VMOVDQU    Y4, 160(DI)
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      192(SI), Y0, Y0
+	VPXOR      224(SI), Y14, Y14
+	VPXOR      256(SI), Y12, Y12
+	VPXOR      288(SI), Y4, Y4
+	VMOVDQU    Y0, 192(DI)
+	VMOVDQU    Y14, 224(DI)
+	VMOVDQU    Y12, 256(DI)
+	VMOVDQU    Y4, 288(DI)
+	MOVQ       $0x00000140, CX
+	SUBQ       $0x00000140, BX
+	LEAQ       320(SI), SI
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, Y15, Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, Y15, Y3, Y4
+	CMPQ       BX, $0x80
 	JBE        sealAVX2SealHash
-
-	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
-	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
-	SUBQ    $128, inl
-	LEAQ    128(inp), inp
-
-	MOVQ $8, itr1
-	MOVQ $2, itr2
-
-	CMPQ inl, $128
-	JBE  sealAVX2Tail128
-	CMPQ inl, $256
-	JBE  sealAVX2Tail256
-	CMPQ inl, $384
-	JBE  sealAVX2Tail384
-	CMPQ inl, $512
-	JBE  sealAVX2Tail512
+	VPXOR      (SI), Y0, Y0
+	VPXOR      32(SI), Y14, Y14
+	VPXOR      64(SI), Y12, Y12
+	VPXOR      96(SI), Y4, Y4
+	VMOVDQU    Y0, 320(DI)
+	VMOVDQU    Y14, 352(DI)
+	VMOVDQU    Y12, 384(DI)
+	VMOVDQU    Y4, 416(DI)
+	SUBQ       $0x80, BX
+	LEAQ       128(SI), SI
+	MOVQ       $0x00000008, CX
+	MOVQ       $0x00000002, R9
+	CMPQ       BX, $0x80
+	JBE        sealAVX2Tail128
+	CMPQ       BX, $0x00000100
+	JBE        sealAVX2Tail256
+	CMPQ       BX, $0x00000180
+	JBE        sealAVX2Tail384
+	CMPQ       BX, $0x00000200
+	JBE        sealAVX2Tail512
 
 	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
-	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
-	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
-	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
-	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
-	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
-	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-
-	SUBQ $16, oup                  // Adjust the pointer
-	MOVQ $9, itr1
-	JMP  sealAVX2InternalLoopStart
+	VMOVDQA  ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA  Y0, Y5
+	VMOVDQA  Y0, Y6
+	VMOVDQA  Y0, Y7
+	VMOVDQA  32(BP), Y14
+	VMOVDQA  Y14, Y9
+	VMOVDQA  Y14, Y10
+	VMOVDQA  Y14, Y11
+	VMOVDQA  64(BP), Y12
+	VMOVDQA  Y12, Y13
+	VMOVDQA  Y12, Y8
+	VMOVDQA  Y12, Y15
+	VMOVDQA  192(BP), Y4
+	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD   ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD   ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA  Y4, 96(BP)
+	VMOVDQA  Y1, 128(BP)
+	VMOVDQA  Y2, 160(BP)
+	VMOVDQA  Y3, 192(BP)
+	VMOVDQA  Y15, 224(BP)
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VMOVDQA  224(BP), Y15
+	VMOVDQA  Y13, 224(BP)
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y15, Y11, Y11
+	VPSLLD   $0x0c, Y11, Y13
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y13, Y11, Y11
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y15, Y11, Y11
+	VPSLLD   $0x07, Y11, Y13
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y13, Y11, Y11
+	VMOVDQA  224(BP), Y13
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPALIGNR $0x04, Y11, Y11, Y11
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x0c, Y3, Y3, Y3
+	VMOVDQA  Y15, 224(BP)
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VMOVDQA  224(BP), Y15
+	VMOVDQA  Y13, 224(BP)
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y15, Y11, Y11
+	VPSLLD   $0x0c, Y11, Y13
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y13, Y11, Y11
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y15, Y11, Y11
+	VPSLLD   $0x07, Y11, Y13
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y13, Y11, Y11
+	VMOVDQA  224(BP), Y13
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x04, Y2, Y2, Y2
+	VPALIGNR $0x0c, Y11, Y11, Y11
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x04, Y3, Y3, Y3
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	SUBQ     $0x10, DI
+	MOVQ     $0x00000009, CX
+	JMP      sealAVX2InternalLoopStart
 
 sealAVX2MainLoop:
-	// Load state, increment counter blocks, store the incremented counters
-	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-	MOVQ    $10, itr1
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
+	MOVQ    $0x0000000a, CX
 
 sealAVX2InternalLoop:
-	polyAdd(0*8(oup))
-	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage1_AVX2
-	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulStage2_AVX2
-	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyMulStage3_AVX2
-	VMOVDQA CC3, tmpStoreAVX2
-	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA tmpStoreAVX2, CC3
-	polyMulReduceStage
+	ADDQ    (DI), R10
+	ADCQ    8(DI), R11
+	ADCQ    $0x01, R12
+	VPADDD  Y14, Y0, Y0
+	VPADDD  Y9, Y5, Y5
+	VPADDD  Y10, Y6, Y6
+	VPADDD  Y11, Y7, Y7
+	MOVQ    (BP), DX
+	MOVQ    DX, R15
+	MULXQ   R10, R13, R14
+	IMULQ   R12, R15
+	MULXQ   R11, AX, DX
+	ADDQ    AX, R14
+	ADCQ    DX, R15
+	VPXOR   Y0, Y4, Y4
+	VPXOR   Y5, Y1, Y1
+	VPXOR   Y6, Y2, Y2
+	VPXOR   Y7, Y3, Y3
+	VPSHUFB ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB ·rol16<>+0(SB), Y3, Y3
+	MOVQ    8(BP), DX
+	MULXQ   R10, R10, AX
+	ADDQ    R10, R14
+	MULXQ   R11, R11, R8
+	ADCQ    R11, R15
+	ADCQ    $0x00, R8
+	VPADDD  Y4, Y12, Y12
+	VPADDD  Y1, Y13, Y13
+	VPADDD  Y2, Y8, Y8
+	VPADDD  Y3, Y15, Y15
+	VPXOR   Y12, Y14, Y14
+	VPXOR   Y13, Y9, Y9
+	VPXOR   Y8, Y10, Y10
+	VPXOR   Y15, Y11, Y11
+	IMULQ   R12, DX
+	ADDQ    AX, R15
+	ADCQ    DX, R8
+	VMOVDQA Y15, 224(BP)
+	VPSLLD  $0x0c, Y14, Y15
+	VPSRLD  $0x14, Y14, Y14
+	VPXOR   Y15, Y14, Y14
+	VPSLLD  $0x0c, Y9, Y15
+	VPSRLD  $0x14, Y9, Y9
+	VPXOR   Y15, Y9, Y9
+	VPSLLD  $0x0c, Y10, Y15
+	VPSRLD  $0x14, Y10, Y10
+	VPXOR   Y15, Y10, Y10
+	VPSLLD  $0x0c, Y11, Y15
+	VPSRLD  $0x14, Y11, Y11
+	VPXOR   Y15, Y11, Y11
+	VMOVDQA 224(BP), Y15
+	MOVQ    R13, R10
+	MOVQ    R14, R11
+	MOVQ    R15, R12
+	ANDQ    $0x03, R12
+	MOVQ    R15, R13
+	ANDQ    $-4, R13
+	MOVQ    R8, R14
+	SHRQ    $0x02, R8, R15
+	SHRQ    $0x02, R8
+	ADDQ    R13, R10
+	ADCQ    R14, R11
+	ADCQ    $0x00, R12
+	ADDQ    R15, R10
+	ADCQ    R8, R11
+	ADCQ    $0x00, R12
 
 sealAVX2InternalLoopStart:
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	polyAdd(2*8(oup))
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage1_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage2_AVX2
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage3_AVX2
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulReduceStage
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(4*8(oup))
-	LEAQ     (6*8)(oup), oup
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage1_AVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	polyMulStage2_AVX2
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage3_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulReduceStage
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-	DECQ     itr1
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	ADDQ     16(DI), R10
+	ADCQ     24(DI), R11
+	ADCQ     $0x01, R12
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x04, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPALIGNR $0x0c, Y3, Y3, Y3
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	ADDQ     32(DI), R10
+	ADCQ     40(DI), R11
+	ADCQ     $0x01, R12
+	LEAQ     48(DI), DI
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x0c, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	VPALIGNR $0x04, Y3, Y3, Y3
+	DECQ     CX
 	JNE      sealAVX2InternalLoop
-
-	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA CC3, tmpStoreAVX2
+	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD   32(BP), Y14, Y14
+	VPADDD   32(BP), Y9, Y9
+	VPADDD   32(BP), Y10, Y10
+	VPADDD   32(BP), Y11, Y11
+	VPADDD   64(BP), Y12, Y12
+	VPADDD   64(BP), Y13, Y13
+	VPADDD   64(BP), Y8, Y8
+	VPADDD   64(BP), Y15, Y15
+	VPADDD   96(BP), Y4, Y4
+	VPADDD   128(BP), Y1, Y1
+	VPADDD   160(BP), Y2, Y2
+	VPADDD   192(BP), Y3, Y3
+	VMOVDQA  Y15, 224(BP)
 
 	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	LEAQ       (4*8)(oup), oup
-	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
-	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
-	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	VPERM2I128 $0x02, Y0, Y14, Y15
+	VPERM2I128 $0x13, Y0, Y14, Y14
+	VPERM2I128 $0x02, Y12, Y4, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y12
+	VPXOR      (SI), Y15, Y15
+	VPXOR      32(SI), Y0, Y0
+	VPXOR      64(SI), Y14, Y14
+	VPXOR      96(SI), Y12, Y12
+	VMOVDQU    Y15, (DI)
+	VMOVDQU    Y0, 32(DI)
+	VMOVDQU    Y14, 64(DI)
+	VMOVDQU    Y12, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      128(SI), Y0, Y0
+	VPXOR      160(SI), Y14, Y14
+	VPXOR      192(SI), Y12, Y12
+	VPXOR      224(SI), Y4, Y4
+	VMOVDQU    Y0, 128(DI)
+	VMOVDQU    Y14, 160(DI)
+	VMOVDQU    Y12, 192(DI)
+	VMOVDQU    Y4, 224(DI)
 
 	// and here
-	polyAdd(-2*8(oup))
-	polyMulAVX2
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
-	LEAQ       (32*16)(inp), inp
-	SUBQ       $(32*16), inl
-	CMPQ       inl, $512
+	ADDQ       -16(DI), R10
+	ADCQ       -8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      256(SI), Y0, Y0
+	VPXOR      288(SI), Y14, Y14
+	VPXOR      320(SI), Y12, Y12
+	VPXOR      352(SI), Y4, Y4
+	VMOVDQU    Y0, 256(DI)
+	VMOVDQU    Y14, 288(DI)
+	VMOVDQU    Y12, 320(DI)
+	VMOVDQU    Y4, 352(DI)
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, 224(BP), Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, 224(BP), Y3, Y4
+	VPXOR      384(SI), Y0, Y0
+	VPXOR      416(SI), Y14, Y14
+	VPXOR      448(SI), Y12, Y12
+	VPXOR      480(SI), Y4, Y4
+	VMOVDQU    Y0, 384(DI)
+	VMOVDQU    Y14, 416(DI)
+	VMOVDQU    Y12, 448(DI)
+	VMOVDQU    Y4, 480(DI)
+	LEAQ       512(SI), SI
+	SUBQ       $0x00000200, BX
+	CMPQ       BX, $0x00000200
 	JG         sealAVX2MainLoop
 
 	// Tail can only hash 480 bytes
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	polyAdd(2*8(oup))
-	polyMulAVX2
-	LEAQ 32(oup), oup
-
-	MOVQ $10, itr1
-	MOVQ $0, itr2
-	CMPQ inl, $128
-	JBE  sealAVX2Tail128
-	CMPQ inl, $256
-	JBE  sealAVX2Tail256
-	CMPQ inl, $384
-	JBE  sealAVX2Tail384
-	JMP  sealAVX2Tail512
-
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	ADDQ  16(DI), R10
+	ADCQ  24(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  32(DI), DI
+	MOVQ  $0x0000000a, CX
+	MOVQ  $0x00000000, R9
+	CMPQ  BX, $0x80
+	JBE   sealAVX2Tail128
+	CMPQ  BX, $0x00000100
+	JBE   sealAVX2Tail256
+	CMPQ  BX, $0x00000180
+	JBE   sealAVX2Tail384
+	JMP   sealAVX2Tail512
+
 seal192AVX2:
-	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
-	VMOVDQA AA0, AA1
-	VMOVDQA BB0, BB1
-	VMOVDQA CC0, CC1
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2
-	VMOVDQA BB0, BB2
-	VMOVDQA CC0, CC2
-	VMOVDQA DD0, DD2
-	VMOVDQA DD1, TT3
-	MOVQ    $10, itr2
+	VMOVDQA Y0, Y5
+	VMOVDQA Y14, Y9
+	VMOVDQA Y12, Y13
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VMOVDQA Y0, Y6
+	VMOVDQA Y14, Y10
+	VMOVDQA Y12, Y8
+	VMOVDQA Y4, Y2
+	VMOVDQA Y1, Y15
+	MOVQ    $0x0000000a, R9
 
 sealAVX2192InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	DECQ       itr2
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	DECQ       R9
 	JNE        sealAVX2192InnerCipherLoop
-	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
-	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
-	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
-	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPADDD     Y6, Y0, Y0
+	VPADDD     Y6, Y5, Y5
+	VPADDD     Y10, Y14, Y14
+	VPADDD     Y10, Y9, Y9
+	VPADDD     Y8, Y12, Y12
+	VPADDD     Y8, Y13, Y13
+	VPADDD     Y2, Y4, Y4
+	VPADDD     Y15, Y1, Y1
+	VPERM2I128 $0x02, Y0, Y14, Y3
 
 	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA TT0, rsStoreAVX2
+	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA Y3, (BP)
 
 	// Stream for up to 192 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
+	VPERM2I128 $0x13, Y0, Y14, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y14
+	VPERM2I128 $0x02, Y5, Y9, Y12
+	VPERM2I128 $0x02, Y13, Y1, Y4
+	VPERM2I128 $0x13, Y5, Y9, Y5
+	VPERM2I128 $0x13, Y13, Y1, Y9
 
 sealAVX2ShortSeal:
 	// Hash aad
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
-	XORQ itr1, itr1
+	XORQ CX, CX
 
 sealAVX2SealHash:
 	// itr1 holds the number of bytes encrypted but not yet hashed
-	CMPQ itr1, $16
-	JB   sealAVX2ShortSealLoop
-	polyAdd(0(oup))
-	polyMul
-	SUBQ $16, itr1
-	ADDQ $16, oup
-	JMP  sealAVX2SealHash
+	CMPQ  CX, $0x10
+	JB    sealAVX2ShortSealLoop
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	SUBQ  $0x10, CX
+	ADDQ  $0x10, DI
+	JMP   sealAVX2SealHash
 
 sealAVX2ShortSealLoop:
-	CMPQ inl, $32
+	CMPQ BX, $0x20
 	JB   sealAVX2ShortTail32
-	SUBQ $32, inl
+	SUBQ $0x20, BX
 
 	// Load for encryption
-	VPXOR   (inp), AA0, AA0
-	VMOVDQU AA0, (oup)
-	LEAQ    (1*32)(inp), inp
+	VPXOR   (SI), Y0, Y0
+	VMOVDQU Y0, (DI)
+	LEAQ    32(SI), SI
 
 	// Now can hash
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	polyAdd(2*8(oup))
-	polyMulAVX2
-	LEAQ (1*32)(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	ADDQ  16(DI), R10
+	ADCQ  24(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  32(DI), DI
 
 	// Shift stream left
-	VMOVDQA BB0, AA0
-	VMOVDQA CC0, BB0
-	VMOVDQA DD0, CC0
-	VMOVDQA AA1, DD0
-	VMOVDQA BB1, AA1
-	VMOVDQA CC1, BB1
-	VMOVDQA DD1, CC1
-	VMOVDQA AA2, DD1
-	VMOVDQA BB2, AA2
+	VMOVDQA Y14, Y0
+	VMOVDQA Y12, Y14
+	VMOVDQA Y4, Y12
+	VMOVDQA Y5, Y4
+	VMOVDQA Y9, Y5
+	VMOVDQA Y13, Y9
+	VMOVDQA Y1, Y13
+	VMOVDQA Y6, Y1
+	VMOVDQA Y10, Y6
 	JMP     sealAVX2ShortSealLoop
 
 sealAVX2ShortTail32:
-	CMPQ    inl, $16
-	VMOVDQA A0, A1
+	CMPQ    BX, $0x10
+	VMOVDQA X0, X1
 	JB      sealAVX2ShortDone
-
-	SUBQ $16, inl
+	SUBQ    $0x10, BX
 
 	// Load for encryption
-	VPXOR   (inp), A0, T0
-	VMOVDQU T0, (oup)
-	LEAQ    (1*16)(inp), inp
+	VPXOR   (SI), X0, X12
+	VMOVDQU X12, (DI)
+	LEAQ    16(SI), SI
 
 	// Hash
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	LEAQ       (1*16)(oup), oup
-	VPERM2I128 $0x11, AA0, AA0, AA0
-	VMOVDQA    A0, A1
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       16(DI), DI
+	VPERM2I128 $0x11, Y0, Y0, Y0
+	VMOVDQA    X0, X1
 
 sealAVX2ShortDone:
 	VZEROUPPER
 	JMP sealSSETail
 
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
 seal320AVX2:
-	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
-	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
-	MOVQ    $10, itr2
+	VMOVDQA Y0, Y5
+	VMOVDQA Y14, Y9
+	VMOVDQA Y12, Y13
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VMOVDQA Y0, Y6
+	VMOVDQA Y14, Y10
+	VMOVDQA Y12, Y8
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VMOVDQA Y14, Y7
+	VMOVDQA Y12, Y11
+	VMOVDQA Y4, Y15
+	MOVQ    $0x0000000a, R9
 
 sealAVX2320InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-	DECQ     itr2
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	DECQ     R9
 	JNE      sealAVX2320InnerCipherLoop
-
-	VMOVDQA ·chacha20Constants<>(SB), TT0
-	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
-	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
-	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
-	VMOVDQA ·avx2IncMask<>(SB), TT0
-	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD2, DD2
+	VMOVDQA  ·chacha20Constants<>+0(SB), Y3
+	VPADDD   Y3, Y0, Y0
+	VPADDD   Y3, Y5, Y5
+	VPADDD   Y3, Y6, Y6
+	VPADDD   Y7, Y14, Y14
+	VPADDD   Y7, Y9, Y9
+	VPADDD   Y7, Y10, Y10
+	VPADDD   Y11, Y12, Y12
+	VPADDD   Y11, Y13, Y13
+	VPADDD   Y11, Y8, Y8
+	VMOVDQA  ·avx2IncMask<>+0(SB), Y3
+	VPADDD   Y15, Y4, Y4
+	VPADDD   Y3, Y15, Y15
+	VPADDD   Y15, Y1, Y1
+	VPADDD   Y3, Y15, Y15
+	VPADDD   Y15, Y2, Y2
 
 	// Clamp and store poly key
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPAND      ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA    TT0, rsStoreAVX2
+	VPERM2I128 $0x02, Y0, Y14, Y3
+	VPAND      ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA    Y3, (BP)
 
 	// Stream for up to 320 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
-	VPERM2I128 $0x02, AA2, BB2, CC1
-	VPERM2I128 $0x02, CC2, DD2, DD1
-	VPERM2I128 $0x13, AA2, BB2, AA2
-	VPERM2I128 $0x13, CC2, DD2, BB2
+	VPERM2I128 $0x13, Y0, Y14, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y14
+	VPERM2I128 $0x02, Y5, Y9, Y12
+	VPERM2I128 $0x02, Y13, Y1, Y4
+	VPERM2I128 $0x13, Y5, Y9, Y5
+	VPERM2I128 $0x13, Y13, Y1, Y9
+	VPERM2I128 $0x02, Y6, Y10, Y13
+	VPERM2I128 $0x02, Y8, Y2, Y1
+	VPERM2I128 $0x13, Y6, Y10, Y6
+	VPERM2I128 $0x13, Y8, Y2, Y10
 	JMP        sealAVX2ShortSeal
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
 sealAVX2Tail128:
-	// Need to decrypt up to 128 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0
-	VMOVDQA state1StoreAVX2, BB0
-	VMOVDQA state2StoreAVX2, CC0
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VMOVDQA DD0, DD1
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA 32(BP), Y14
+	VMOVDQA 64(BP), Y12
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VMOVDQA Y4, Y1
 
 sealAVX2Tail128LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealAVX2Tail128LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	polyAdd(0(oup))
-	polyMul
-	VPALIGNR $4, BB0, BB0, BB0
-	VPALIGNR $8, CC0, CC0, CC0
-	VPALIGNR $12, DD0, DD0, DD0
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	polyAdd(16(oup))
-	polyMul
-	LEAQ     32(oup), oup
-	VPALIGNR $12, BB0, BB0, BB0
-	VPALIGNR $8, CC0, CC0, CC0
-	VPALIGNR $4, DD0, DD0, DD0
-	DECQ     itr1
-	JG       sealAVX2Tail128LoopA
-	DECQ     itr2
-	JGE      sealAVX2Tail128LoopB
-
-	VPADDD ·chacha20Constants<>(SB), AA0, AA1
-	VPADDD state1StoreAVX2, BB0, BB1
-	VPADDD state2StoreAVX2, CC0, CC1
-	VPADDD DD1, DD0, DD1
-
-	VPERM2I128 $0x02, AA1, BB1, AA0
-	VPERM2I128 $0x02, CC1, DD1, BB0
-	VPERM2I128 $0x13, AA1, BB1, CC0
-	VPERM2I128 $0x13, CC1, DD1, DD0
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	ADDQ       16(DI), R10
+	ADCQ       24(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	DECQ       CX
+	JG         sealAVX2Tail128LoopA
+	DECQ       R9
+	JGE        sealAVX2Tail128LoopB
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y5
+	VPADDD     32(BP), Y14, Y9
+	VPADDD     64(BP), Y12, Y13
+	VPADDD     Y1, Y4, Y1
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
 	JMP        sealAVX2ShortSealLoop
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
 sealAVX2Tail256:
-	// Need to decrypt up to 256 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA DD0, TT1
-	VMOVDQA DD1, TT2
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA ·chacha20Constants<>+0(SB), Y5
+	VMOVDQA 32(BP), Y14
+	VMOVDQA 32(BP), Y9
+	VMOVDQA 64(BP), Y12
+	VMOVDQA 64(BP), Y13
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VMOVDQA Y4, Y7
+	VMOVDQA Y1, Y11
 
 sealAVX2Tail256LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealAVX2Tail256LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	polyAdd(0(oup))
-	polyMul
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	polyAdd(16(oup))
-	polyMul
-	LEAQ     32(oup), oup
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	DECQ     itr1
-	JG       sealAVX2Tail256LoopA
-	DECQ     itr2
-	JGE      sealAVX2Tail256LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
-	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPERM2I128 $0x02, CC0, DD0, TT1
-	VPERM2I128 $0x13, AA0, BB0, TT2
-	VPERM2I128 $0x13, CC0, DD0, TT3
-	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
-	MOVQ       $128, itr1
-	LEAQ       128(inp), inp
-	SUBQ       $128, inl
-	VPERM2I128 $0x02, AA1, BB1, AA0
-	VPERM2I128 $0x02, CC1, DD1, BB0
-	VPERM2I128 $0x13, AA1, BB1, CC0
-	VPERM2I128 $0x13, CC1, DD1, DD0
-
-	JMP sealAVX2SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	ADDQ       16(DI), R10
+	ADCQ       24(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	DECQ       CX
+	JG         sealAVX2Tail256LoopA
+	DECQ       R9
+	JGE        sealAVX2Tail256LoopB
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     Y7, Y4, Y4
+	VPADDD     Y11, Y1, Y1
+	VPERM2I128 $0x02, Y0, Y14, Y3
+	VPERM2I128 $0x02, Y12, Y4, Y7
+	VPERM2I128 $0x13, Y0, Y14, Y11
+	VPERM2I128 $0x13, Y12, Y4, Y15
+	VPXOR      (SI), Y3, Y3
+	VPXOR      32(SI), Y7, Y7
+	VPXOR      64(SI), Y11, Y11
+	VPXOR      96(SI), Y15, Y15
+	VMOVDQU    Y3, (DI)
+	VMOVDQU    Y7, 32(DI)
+	VMOVDQU    Y11, 64(DI)
+	VMOVDQU    Y15, 96(DI)
+	MOVQ       $0x00000080, CX
+	LEAQ       128(SI), SI
+	SUBQ       $0x80, BX
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	JMP        sealAVX2SealHash
+
 sealAVX2Tail384:
-	// Need to decrypt up to 384 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VMOVDQA Y4, Y7
+	VMOVDQA Y1, Y11
+	VMOVDQA Y2, Y15
 
 sealAVX2Tail384LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealAVX2Tail384LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	polyAdd(0(oup))
-	polyMul
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	polyAdd(16(oup))
-	polyMul
-	LEAQ     32(oup), oup
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-	DECQ     itr1
-	JG       sealAVX2Tail384LoopA
-	DECQ     itr2
-	JGE      sealAVX2Tail384LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
-	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPERM2I128 $0x02, CC0, DD0, TT1
-	VPERM2I128 $0x13, AA0, BB0, TT2
-	VPERM2I128 $0x13, CC0, DD0, TT3
-	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, TT0
-	VPERM2I128 $0x02, CC1, DD1, TT1
-	VPERM2I128 $0x13, AA1, BB1, TT2
-	VPERM2I128 $0x13, CC1, DD1, TT3
-	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
-	MOVQ       $256, itr1
-	LEAQ       256(inp), inp
-	SUBQ       $256, inl
-	VPERM2I128 $0x02, AA2, BB2, AA0
-	VPERM2I128 $0x02, CC2, DD2, BB0
-	VPERM2I128 $0x13, AA2, BB2, CC0
-	VPERM2I128 $0x13, CC2, DD2, DD0
-
-	JMP sealAVX2SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x0c, Y10, Y3
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y3, Y10, Y10
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x07, Y10, Y3
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y3, Y10, Y10
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x04, Y10, Y10, Y10
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPALIGNR   $0x0c, Y2, Y2, Y2
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x0c, Y10, Y3
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y3, Y10, Y10
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x07, Y10, Y3
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y3, Y10, Y10
+	ADDQ       16(DI), R10
+	ADCQ       24(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x0c, Y10, Y10, Y10
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	VPALIGNR   $0x04, Y2, Y2, Y2
+	DECQ       CX
+	JG         sealAVX2Tail384LoopA
+	DECQ       R9
+	JGE        sealAVX2Tail384LoopB
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     Y7, Y4, Y4
+	VPADDD     Y11, Y1, Y1
+	VPADDD     Y15, Y2, Y2
+	VPERM2I128 $0x02, Y0, Y14, Y3
+	VPERM2I128 $0x02, Y12, Y4, Y7
+	VPERM2I128 $0x13, Y0, Y14, Y11
+	VPERM2I128 $0x13, Y12, Y4, Y15
+	VPXOR      (SI), Y3, Y3
+	VPXOR      32(SI), Y7, Y7
+	VPXOR      64(SI), Y11, Y11
+	VPXOR      96(SI), Y15, Y15
+	VMOVDQU    Y3, (DI)
+	VMOVDQU    Y7, 32(DI)
+	VMOVDQU    Y11, 64(DI)
+	VMOVDQU    Y15, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y3
+	VPERM2I128 $0x02, Y13, Y1, Y7
+	VPERM2I128 $0x13, Y5, Y9, Y11
+	VPERM2I128 $0x13, Y13, Y1, Y15
+	VPXOR      128(SI), Y3, Y3
+	VPXOR      160(SI), Y7, Y7
+	VPXOR      192(SI), Y11, Y11
+	VPXOR      224(SI), Y15, Y15
+	VMOVDQU    Y3, 128(DI)
+	VMOVDQU    Y7, 160(DI)
+	VMOVDQU    Y11, 192(DI)
+	VMOVDQU    Y15, 224(DI)
+	MOVQ       $0x00000100, CX
+	LEAQ       256(SI), SI
+	SUBQ       $0x00000100, BX
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	JMP        sealAVX2SealHash
+
 sealAVX2Tail512:
-	// Need to decrypt up to 512 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
 
 sealAVX2Tail512LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealAVX2Tail512LoopB:
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(2*8(oup))
-	polyMulAVX2
-	LEAQ     (4*8)(oup), oup
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-
-	DECQ itr1
-	JG   sealAVX2Tail512LoopA
-	DECQ itr2
-	JGE  sealAVX2Tail512LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA    CC3, tmpStoreAVX2
-	VPERM2I128 $0x02, AA0, BB0, CC3
-	VPXOR      (0*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (0*32)(oup)
-	VPERM2I128 $0x02, CC0, DD0, CC3
-	VPXOR      (1*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (1*32)(oup)
-	VPERM2I128 $0x13, AA0, BB0, CC3
-	VPXOR      (2*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (2*32)(oup)
-	VPERM2I128 $0x13, CC0, DD0, CC3
-	VPXOR      (3*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (3*32)(oup)
-
-	VPERM2I128 $0x02, AA1, BB1, AA0
-	VPERM2I128 $0x02, CC1, DD1, BB0
-	VPERM2I128 $0x13, AA1, BB1, CC0
-	VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-
-	VPERM2I128 $0x02, AA2, BB2, AA0
-	VPERM2I128 $0x02, CC2, DD2, BB0
-	VPERM2I128 $0x13, AA2, BB2, CC0
-	VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-
-	MOVQ       $384, itr1
-	LEAQ       384(inp), inp
-	SUBQ       $384, inl
-	VPERM2I128 $0x02, AA3, BB3, AA0
-	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
-	VPERM2I128 $0x13, AA3, BB3, CC0
-	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-
-	JMP sealAVX2SealHash
+	VPADDD     Y14, Y0, Y0
+	VPADDD     Y9, Y5, Y5
+	VPADDD     Y10, Y6, Y6
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y0, Y4, Y4
+	VPXOR      Y5, Y1, Y1
+	VPXOR      Y6, Y2, Y2
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
+	VPADDD     Y4, Y12, Y12
+	VPADDD     Y1, Y13, Y13
+	VPADDD     Y2, Y8, Y8
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y12, Y14, Y14
+	VPXOR      Y13, Y9, Y9
+	VPXOR      Y8, Y10, Y10
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    Y15, 224(BP)
+	VPSLLD     $0x0c, Y14, Y15
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPSLLD     $0x0c, Y9, Y15
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPSLLD     $0x0c, Y10, Y15
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPSLLD     $0x0c, Y11, Y15
+	VPSRLD     $0x14, Y11, Y11
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    224(BP), Y15
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPADDD     Y14, Y0, Y0
+	VPADDD     Y9, Y5, Y5
+	VPADDD     Y10, Y6, Y6
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y0, Y4, Y4
+	VPXOR      Y5, Y1, Y1
+	VPXOR      Y6, Y2, Y2
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
+	VPADDD     Y4, Y12, Y12
+	VPADDD     Y1, Y13, Y13
+	VPADDD     Y2, Y8, Y8
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y12, Y14, Y14
+	VPXOR      Y13, Y9, Y9
+	VPXOR      Y8, Y10, Y10
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    Y15, 224(BP)
+	VPSLLD     $0x07, Y14, Y15
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPSLLD     $0x07, Y9, Y15
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPSLLD     $0x07, Y10, Y15
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPSLLD     $0x07, Y11, Y15
+	VPSRLD     $0x19, Y11, Y11
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    224(BP), Y15
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x04, Y10, Y10, Y10
+	VPALIGNR   $0x04, Y11, Y11, Y11
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x08, Y15, Y15, Y15
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPALIGNR   $0x0c, Y2, Y2, Y2
+	VPALIGNR   $0x0c, Y3, Y3, Y3
+	VPADDD     Y14, Y0, Y0
+	VPADDD     Y9, Y5, Y5
+	VPADDD     Y10, Y6, Y6
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y0, Y4, Y4
+	VPXOR      Y5, Y1, Y1
+	VPXOR      Y6, Y2, Y2
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
+	VPADDD     Y4, Y12, Y12
+	VPADDD     Y1, Y13, Y13
+	VPADDD     Y2, Y8, Y8
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y12, Y14, Y14
+	VPXOR      Y13, Y9, Y9
+	VPXOR      Y8, Y10, Y10
+	VPXOR      Y15, Y11, Y11
+	ADDQ       16(DI), R10
+	ADCQ       24(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	VMOVDQA    Y15, 224(BP)
+	VPSLLD     $0x0c, Y14, Y15
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPSLLD     $0x0c, Y9, Y15
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPSLLD     $0x0c, Y10, Y15
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPSLLD     $0x0c, Y11, Y15
+	VPSRLD     $0x14, Y11, Y11
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    224(BP), Y15
+	VPADDD     Y14, Y0, Y0
+	VPADDD     Y9, Y5, Y5
+	VPADDD     Y10, Y6, Y6
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y0, Y4, Y4
+	VPXOR      Y5, Y1, Y1
+	VPXOR      Y6, Y2, Y2
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
+	VPADDD     Y4, Y12, Y12
+	VPADDD     Y1, Y13, Y13
+	VPADDD     Y2, Y8, Y8
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y12, Y14, Y14
+	VPXOR      Y13, Y9, Y9
+	VPXOR      Y8, Y10, Y10
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    Y15, 224(BP)
+	VPSLLD     $0x07, Y14, Y15
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPSLLD     $0x07, Y9, Y15
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPSLLD     $0x07, Y10, Y15
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPSLLD     $0x07, Y11, Y15
+	VPSRLD     $0x19, Y11, Y11
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    224(BP), Y15
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x0c, Y10, Y10, Y10
+	VPALIGNR   $0x0c, Y11, Y11, Y11
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x08, Y15, Y15, Y15
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	VPALIGNR   $0x04, Y2, Y2, Y2
+	VPALIGNR   $0x04, Y3, Y3, Y3
+	DECQ       CX
+	JG         sealAVX2Tail512LoopA
+	DECQ       R9
+	JGE        sealAVX2Tail512LoopB
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     32(BP), Y11, Y11
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     64(BP), Y15, Y15
+	VPADDD     96(BP), Y4, Y4
+	VPADDD     128(BP), Y1, Y1
+	VPADDD     160(BP), Y2, Y2
+	VPADDD     192(BP), Y3, Y3
+	VMOVDQA    Y15, 224(BP)
+	VPERM2I128 $0x02, Y0, Y14, Y15
+	VPXOR      (SI), Y15, Y15
+	VMOVDQU    Y15, (DI)
+	VPERM2I128 $0x02, Y12, Y4, Y15
+	VPXOR      32(SI), Y15, Y15
+	VMOVDQU    Y15, 32(DI)
+	VPERM2I128 $0x13, Y0, Y14, Y15
+	VPXOR      64(SI), Y15, Y15
+	VMOVDQU    Y15, 64(DI)
+	VPERM2I128 $0x13, Y12, Y4, Y15
+	VPXOR      96(SI), Y15, Y15
+	VMOVDQU    Y15, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      128(SI), Y0, Y0
+	VPXOR      160(SI), Y14, Y14
+	VPXOR      192(SI), Y12, Y12
+	VPXOR      224(SI), Y4, Y4
+	VMOVDQU    Y0, 128(DI)
+	VMOVDQU    Y14, 160(DI)
+	VMOVDQU    Y12, 192(DI)
+	VMOVDQU    Y4, 224(DI)
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      256(SI), Y0, Y0
+	VPXOR      288(SI), Y14, Y14
+	VPXOR      320(SI), Y12, Y12
+	VPXOR      352(SI), Y4, Y4
+	VMOVDQU    Y0, 256(DI)
+	VMOVDQU    Y14, 288(DI)
+	VMOVDQU    Y12, 320(DI)
+	VMOVDQU    Y4, 352(DI)
+	MOVQ       $0x00000180, CX
+	LEAQ       384(SI), SI
+	SUBQ       $0x00000180, BX
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, 224(BP), Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, 224(BP), Y3, Y4
+	JMP        sealAVX2SealHash
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go b/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
index 333da285b32a34f2c085b394da5ce2fea6d31f7c..bd896bdc76d1caa288cb7896bcbdb0e51436410d 100644
--- a/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
+//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
 
 package poly1305
 
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
index e0d3c64756692b68aa1179398fdbc03cde0113a2..133757384b787ff854f2367112ecd1480ac5b1b6 100644
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
@@ -1,108 +1,93 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.
 
 //go:build gc && !purego
 
-#include "textflag.h"
-
-#define POLY1305_ADD(msg, h0, h1, h2) \
-	ADDQ 0(msg), h0;  \
-	ADCQ 8(msg), h1;  \
-	ADCQ $1, h2;      \
-	LEAQ 16(msg), msg
-
-#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
-	MOVQ  r0, AX;                  \
-	MULQ  h0;                      \
-	MOVQ  AX, t0;                  \
-	MOVQ  DX, t1;                  \
-	MOVQ  r0, AX;                  \
-	MULQ  h1;                      \
-	ADDQ  AX, t1;                  \
-	ADCQ  $0, DX;                  \
-	MOVQ  r0, t2;                  \
-	IMULQ h2, t2;                  \
-	ADDQ  DX, t2;                  \
-	                               \
-	MOVQ  r1, AX;                  \
-	MULQ  h0;                      \
-	ADDQ  AX, t1;                  \
-	ADCQ  $0, DX;                  \
-	MOVQ  DX, h0;                  \
-	MOVQ  r1, t3;                  \
-	IMULQ h2, t3;                  \
-	MOVQ  r1, AX;                  \
-	MULQ  h1;                      \
-	ADDQ  AX, t2;                  \
-	ADCQ  DX, t3;                  \
-	ADDQ  h0, t2;                  \
-	ADCQ  $0, t3;                  \
-	                               \
-	MOVQ  t0, h0;                  \
-	MOVQ  t1, h1;                  \
-	MOVQ  t2, h2;                  \
-	ANDQ  $3, h2;                  \
-	MOVQ  t2, t0;                  \
-	ANDQ  $0xFFFFFFFFFFFFFFFC, t0; \
-	ADDQ  t0, h0;                  \
-	ADCQ  t3, h1;                  \
-	ADCQ  $0, h2;                  \
-	SHRQ  $2, t3, t2;              \
-	SHRQ  $2, t3;                  \
-	ADDQ  t2, h0;                  \
-	ADCQ  t3, h1;                  \
-	ADCQ  $0, h2
-
-// func update(state *[7]uint64, msg []byte)
+// func update(state *macState, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVQ state+0(FP), DI
 	MOVQ msg_base+8(FP), SI
 	MOVQ msg_len+16(FP), R15
-
-	MOVQ 0(DI), R8   // h0
-	MOVQ 8(DI), R9   // h1
-	MOVQ 16(DI), R10 // h2
-	MOVQ 24(DI), R11 // r0
-	MOVQ 32(DI), R12 // r1
-
-	CMPQ R15, $16
+	MOVQ (DI), R8
+	MOVQ 8(DI), R9
+	MOVQ 16(DI), R10
+	MOVQ 24(DI), R11
+	MOVQ 32(DI), R12
+	CMPQ R15, $0x10
 	JB   bytes_between_0_and_15
 
 loop:
-	POLY1305_ADD(SI, R8, R9, R10)
+	ADDQ (SI), R8
+	ADCQ 8(SI), R9
+	ADCQ $0x01, R10
+	LEAQ 16(SI), SI
 
 multiply:
-	POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
-	SUBQ $16, R15
-	CMPQ R15, $16
-	JAE  loop
+	MOVQ  R11, AX
+	MULQ  R8
+	MOVQ  AX, BX
+	MOVQ  DX, CX
+	MOVQ  R11, AX
+	MULQ  R9
+	ADDQ  AX, CX
+	ADCQ  $0x00, DX
+	MOVQ  R11, R13
+	IMULQ R10, R13
+	ADDQ  DX, R13
+	MOVQ  R12, AX
+	MULQ  R8
+	ADDQ  AX, CX
+	ADCQ  $0x00, DX
+	MOVQ  DX, R8
+	MOVQ  R12, R14
+	IMULQ R10, R14
+	MOVQ  R12, AX
+	MULQ  R9
+	ADDQ  AX, R13
+	ADCQ  DX, R14
+	ADDQ  R8, R13
+	ADCQ  $0x00, R14
+	MOVQ  BX, R8
+	MOVQ  CX, R9
+	MOVQ  R13, R10
+	ANDQ  $0x03, R10
+	MOVQ  R13, BX
+	ANDQ  $-4, BX
+	ADDQ  BX, R8
+	ADCQ  R14, R9
+	ADCQ  $0x00, R10
+	SHRQ  $0x02, R14, R13
+	SHRQ  $0x02, R14
+	ADDQ  R13, R8
+	ADCQ  R14, R9
+	ADCQ  $0x00, R10
+	SUBQ  $0x10, R15
+	CMPQ  R15, $0x10
+	JAE   loop
 
 bytes_between_0_and_15:
 	TESTQ R15, R15
 	JZ    done
-	MOVQ  $1, BX
+	MOVQ  $0x00000001, BX
 	XORQ  CX, CX
 	XORQ  R13, R13
 	ADDQ  R15, SI
 
 flush_buffer:
-	SHLQ $8, BX, CX
-	SHLQ $8, BX
+	SHLQ $0x08, BX, CX
+	SHLQ $0x08, BX
 	MOVB -1(SI), R13
 	XORQ R13, BX
 	DECQ SI
 	DECQ R15
 	JNZ  flush_buffer
-
 	ADDQ BX, R8
 	ADCQ CX, R9
-	ADCQ $0, R10
-	MOVQ $16, R15
+	ADCQ $0x00, R10
+	MOVQ $0x00000010, R15
 	JMP  multiply
 
 done:
-	MOVQ R8, 0(DI)
+	MOVQ R8, (DI)
 	MOVQ R9, 8(DI)
 	MOVQ R10, 16(DI)
 	RET
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
similarity index 95%
rename from vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
rename to vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
index 4aec4874b507e887af5cd578396aab8cc90e41d5..1a1679aaad9c476c4c2cdbbfd5e08f5656ed0f01 100644
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 package poly1305
 
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
similarity index 89%
rename from vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
rename to vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
index b3c1699bff51a27428651bf5885d7cd0b660ed19..6899a1dabc0b3e4de5fd66f63b3b1afd140f97ce 100644
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
@@ -2,15 +2,25 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 #include "textflag.h"
 
 // This was ported from the amd64 implementation.
 
+#ifdef GOARCH_ppc64le
+#define LE_MOVD MOVD
+#define LE_MOVWZ MOVWZ
+#define LE_MOVHZ MOVHZ
+#else
+#define LE_MOVD MOVDBR
+#define LE_MOVWZ MOVWBR
+#define LE_MOVHZ MOVHBR
+#endif
+
 #define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \
-	MOVD (msg), t0;  \
-	MOVD 8(msg), t1; \
+	LE_MOVD (msg)( R0), t0; \
+	LE_MOVD (msg)(R24), t1; \
 	MOVD $1, t2;     \
 	ADDC t0, h0, h0; \
 	ADDE t1, h1, h1; \
@@ -50,10 +60,6 @@
 	ADDE   t3, h1, h1;  \
 	ADDZE  h2
 
-DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-GLOBL ·poly1305Mask<>(SB), RODATA, $16
-
 // func update(state *[7]uint64, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVD state+0(FP), R3
@@ -66,6 +72,8 @@ TEXT ·update(SB), $0-32
 	MOVD 24(R3), R11 // r0
 	MOVD 32(R3), R12 // r1
 
+	MOVD $8, R24
+
 	CMP R5, $16
 	BLT bytes_between_0_and_15
 
@@ -94,7 +102,7 @@ flush_buffer:
 
 	// Greater than 8 -- load the rightmost remaining bytes in msg
 	// and put into R17 (h1)
-	MOVD (R4)(R21), R17
+	LE_MOVD (R4)(R21), R17
 	MOVD $16, R22
 
 	// Find the offset to those bytes
@@ -118,7 +126,7 @@ just1:
 	BLT less8
 
 	// Exactly 8
-	MOVD (R4), R16
+	LE_MOVD (R4), R16
 
 	CMP R17, $0
 
@@ -133,7 +141,7 @@ less8:
 	MOVD  $0, R22   // shift count
 	CMP   R5, $4
 	BLT   less4
-	MOVWZ (R4), R16
+	LE_MOVWZ (R4), R16
 	ADD   $4, R4
 	ADD   $-4, R5
 	MOVD  $32, R22
@@ -141,7 +149,7 @@ less8:
 less4:
 	CMP   R5, $2
 	BLT   less2
-	MOVHZ (R4), R21
+	LE_MOVHZ (R4), R21
 	SLD   R22, R21, R21
 	OR    R16, R21, R16
 	ADD   $16, R22
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s
index fcce0234b6902cf75941af4767affb7d35ae41eb..3883e0ec22987baed3e0236ae47f080798e23c9a 100644
--- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s
+++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s
@@ -1,880 +1,880 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run salsa20_amd64_asm.go -out ../salsa20_amd64.s -pkg salsa. DO NOT EDIT.
 
 //go:build amd64 && !purego && gc
 
-// This code was translated into a form compatible with 6a from the public
-// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
+// func salsa2020XORKeyStream(out *byte, in *byte, n uint64, nonce *byte, key *byte)
+// Requires: SSE2
+TEXT ·salsa2020XORKeyStream(SB), $456-40
+	// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.
+	MOVQ   out+0(FP), DI
+	MOVQ   in+8(FP), SI
+	MOVQ   n+16(FP), DX
+	MOVQ   nonce+24(FP), CX
+	MOVQ   key+32(FP), R8
+	MOVQ   SP, R12
+	ADDQ   $0x1f, R12
+	ANDQ   $-32, R12
+	MOVQ   DX, R9
+	MOVQ   CX, DX
+	MOVQ   R8, R10
+	CMPQ   R9, $0x00
+	JBE    DONE
+	MOVL   20(R10), CX
+	MOVL   (R10), R8
+	MOVL   (DX), AX
+	MOVL   16(R10), R11
+	MOVL   CX, (R12)
+	MOVL   R8, 4(R12)
+	MOVL   AX, 8(R12)
+	MOVL   R11, 12(R12)
+	MOVL   8(DX), CX
+	MOVL   24(R10), R8
+	MOVL   4(R10), AX
+	MOVL   4(DX), R11
+	MOVL   CX, 16(R12)
+	MOVL   R8, 20(R12)
+	MOVL   AX, 24(R12)
+	MOVL   R11, 28(R12)
+	MOVL   12(DX), CX
+	MOVL   12(R10), DX
+	MOVL   28(R10), R8
+	MOVL   8(R10), AX
+	MOVL   DX, 32(R12)
+	MOVL   CX, 36(R12)
+	MOVL   R8, 40(R12)
+	MOVL   AX, 44(R12)
+	MOVQ   $0x61707865, DX
+	MOVQ   $0x3320646e, CX
+	MOVQ   $0x79622d32, R8
+	MOVQ   $0x6b206574, AX
+	MOVL   DX, 48(R12)
+	MOVL   CX, 52(R12)
+	MOVL   R8, 56(R12)
+	MOVL   AX, 60(R12)
+	CMPQ   R9, $0x00000100
+	JB     BYTESBETWEEN1AND255
+	MOVOA  48(R12), X0
+	PSHUFL $0x55, X0, X1
+	PSHUFL $0xaa, X0, X2
+	PSHUFL $0xff, X0, X3
+	PSHUFL $0x00, X0, X0
+	MOVOA  X1, 64(R12)
+	MOVOA  X2, 80(R12)
+	MOVOA  X3, 96(R12)
+	MOVOA  X0, 112(R12)
+	MOVOA  (R12), X0
+	PSHUFL $0xaa, X0, X1
+	PSHUFL $0xff, X0, X2
+	PSHUFL $0x00, X0, X3
+	PSHUFL $0x55, X0, X0
+	MOVOA  X1, 128(R12)
+	MOVOA  X2, 144(R12)
+	MOVOA  X3, 160(R12)
+	MOVOA  X0, 176(R12)
+	MOVOA  16(R12), X0
+	PSHUFL $0xff, X0, X1
+	PSHUFL $0x55, X0, X2
+	PSHUFL $0xaa, X0, X0
+	MOVOA  X1, 192(R12)
+	MOVOA  X2, 208(R12)
+	MOVOA  X0, 224(R12)
+	MOVOA  32(R12), X0
+	PSHUFL $0x00, X0, X1
+	PSHUFL $0xaa, X0, X2
+	PSHUFL $0xff, X0, X0
+	MOVOA  X1, 240(R12)
+	MOVOA  X2, 256(R12)
+	MOVOA  X0, 272(R12)
 
-// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
-// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.
-TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
-	MOVQ out+0(FP),DI
-	MOVQ in+8(FP),SI
-	MOVQ n+16(FP),DX
-	MOVQ nonce+24(FP),CX
-	MOVQ key+32(FP),R8
+BYTESATLEAST256:
+	MOVL  16(R12), DX
+	MOVL  36(R12), CX
+	MOVL  DX, 288(R12)
+	MOVL  CX, 304(R12)
+	SHLQ  $0x20, CX
+	ADDQ  CX, DX
+	ADDQ  $0x01, DX
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 292(R12)
+	MOVL  CX, 308(R12)
+	ADDQ  $0x01, DX
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 296(R12)
+	MOVL  CX, 312(R12)
+	ADDQ  $0x01, DX
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 300(R12)
+	MOVL  CX, 316(R12)
+	ADDQ  $0x01, DX
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 16(R12)
+	MOVL  CX, 36(R12)
+	MOVQ  R9, 352(R12)
+	MOVQ  $0x00000014, DX
+	MOVOA 64(R12), X0
+	MOVOA 80(R12), X1
+	MOVOA 96(R12), X2
+	MOVOA 256(R12), X3
+	MOVOA 272(R12), X4
+	MOVOA 128(R12), X5
+	MOVOA 144(R12), X6
+	MOVOA 176(R12), X7
+	MOVOA 192(R12), X8
+	MOVOA 208(R12), X9
+	MOVOA 224(R12), X10
+	MOVOA 304(R12), X11
+	MOVOA 112(R12), X12
+	MOVOA 160(R12), X13
+	MOVOA 240(R12), X14
+	MOVOA 288(R12), X15
 
-	MOVQ SP,R12
-	ADDQ $31, R12
-	ANDQ $~31, R12
+MAINLOOP1:
+	MOVOA  X1, 320(R12)
+	MOVOA  X2, 336(R12)
+	MOVOA  X13, X1
+	PADDL  X12, X1
+	MOVOA  X1, X2
+	PSLLL  $0x07, X1
+	PXOR   X1, X14
+	PSRLL  $0x19, X2
+	PXOR   X2, X14
+	MOVOA  X7, X1
+	PADDL  X0, X1
+	MOVOA  X1, X2
+	PSLLL  $0x07, X1
+	PXOR   X1, X11
+	PSRLL  $0x19, X2
+	PXOR   X2, X11
+	MOVOA  X12, X1
+	PADDL  X14, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X15
+	PSRLL  $0x17, X2
+	PXOR   X2, X15
+	MOVOA  X0, X1
+	PADDL  X11, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X9
+	PSRLL  $0x17, X2
+	PXOR   X2, X9
+	MOVOA  X14, X1
+	PADDL  X15, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X13
+	PSRLL  $0x13, X2
+	PXOR   X2, X13
+	MOVOA  X11, X1
+	PADDL  X9, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X7
+	PSRLL  $0x13, X2
+	PXOR   X2, X7
+	MOVOA  X15, X1
+	PADDL  X13, X1
+	MOVOA  X1, X2
+	PSLLL  $0x12, X1
+	PXOR   X1, X12
+	PSRLL  $0x0e, X2
+	PXOR   X2, X12
+	MOVOA  320(R12), X1
+	MOVOA  X12, 320(R12)
+	MOVOA  X9, X2
+	PADDL  X7, X2
+	MOVOA  X2, X12
+	PSLLL  $0x12, X2
+	PXOR   X2, X0
+	PSRLL  $0x0e, X12
+	PXOR   X12, X0
+	MOVOA  X5, X2
+	PADDL  X1, X2
+	MOVOA  X2, X12
+	PSLLL  $0x07, X2
+	PXOR   X2, X3
+	PSRLL  $0x19, X12
+	PXOR   X12, X3
+	MOVOA  336(R12), X2
+	MOVOA  X0, 336(R12)
+	MOVOA  X6, X0
+	PADDL  X2, X0
+	MOVOA  X0, X12
+	PSLLL  $0x07, X0
+	PXOR   X0, X4
+	PSRLL  $0x19, X12
+	PXOR   X12, X4
+	MOVOA  X1, X0
+	PADDL  X3, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X10
+	PSRLL  $0x17, X12
+	PXOR   X12, X10
+	MOVOA  X2, X0
+	PADDL  X4, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X8
+	PSRLL  $0x17, X12
+	PXOR   X12, X8
+	MOVOA  X3, X0
+	PADDL  X10, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X5
+	PSRLL  $0x13, X12
+	PXOR   X12, X5
+	MOVOA  X4, X0
+	PADDL  X8, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X6
+	PSRLL  $0x13, X12
+	PXOR   X12, X6
+	MOVOA  X10, X0
+	PADDL  X5, X0
+	MOVOA  X0, X12
+	PSLLL  $0x12, X0
+	PXOR   X0, X1
+	PSRLL  $0x0e, X12
+	PXOR   X12, X1
+	MOVOA  320(R12), X0
+	MOVOA  X1, 320(R12)
+	MOVOA  X4, X1
+	PADDL  X0, X1
+	MOVOA  X1, X12
+	PSLLL  $0x07, X1
+	PXOR   X1, X7
+	PSRLL  $0x19, X12
+	PXOR   X12, X7
+	MOVOA  X8, X1
+	PADDL  X6, X1
+	MOVOA  X1, X12
+	PSLLL  $0x12, X1
+	PXOR   X1, X2
+	PSRLL  $0x0e, X12
+	PXOR   X12, X2
+	MOVOA  336(R12), X12
+	MOVOA  X2, 336(R12)
+	MOVOA  X14, X1
+	PADDL  X12, X1
+	MOVOA  X1, X2
+	PSLLL  $0x07, X1
+	PXOR   X1, X5
+	PSRLL  $0x19, X2
+	PXOR   X2, X5
+	MOVOA  X0, X1
+	PADDL  X7, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X10
+	PSRLL  $0x17, X2
+	PXOR   X2, X10
+	MOVOA  X12, X1
+	PADDL  X5, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X8
+	PSRLL  $0x17, X2
+	PXOR   X2, X8
+	MOVOA  X7, X1
+	PADDL  X10, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X4
+	PSRLL  $0x13, X2
+	PXOR   X2, X4
+	MOVOA  X5, X1
+	PADDL  X8, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X14
+	PSRLL  $0x13, X2
+	PXOR   X2, X14
+	MOVOA  X10, X1
+	PADDL  X4, X1
+	MOVOA  X1, X2
+	PSLLL  $0x12, X1
+	PXOR   X1, X0
+	PSRLL  $0x0e, X2
+	PXOR   X2, X0
+	MOVOA  320(R12), X1
+	MOVOA  X0, 320(R12)
+	MOVOA  X8, X0
+	PADDL  X14, X0
+	MOVOA  X0, X2
+	PSLLL  $0x12, X0
+	PXOR   X0, X12
+	PSRLL  $0x0e, X2
+	PXOR   X2, X12
+	MOVOA  X11, X0
+	PADDL  X1, X0
+	MOVOA  X0, X2
+	PSLLL  $0x07, X0
+	PXOR   X0, X6
+	PSRLL  $0x19, X2
+	PXOR   X2, X6
+	MOVOA  336(R12), X2
+	MOVOA  X12, 336(R12)
+	MOVOA  X3, X0
+	PADDL  X2, X0
+	MOVOA  X0, X12
+	PSLLL  $0x07, X0
+	PXOR   X0, X13
+	PSRLL  $0x19, X12
+	PXOR   X12, X13
+	MOVOA  X1, X0
+	PADDL  X6, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X15
+	PSRLL  $0x17, X12
+	PXOR   X12, X15
+	MOVOA  X2, X0
+	PADDL  X13, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X9
+	PSRLL  $0x17, X12
+	PXOR   X12, X9
+	MOVOA  X6, X0
+	PADDL  X15, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X11
+	PSRLL  $0x13, X12
+	PXOR   X12, X11
+	MOVOA  X13, X0
+	PADDL  X9, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X3
+	PSRLL  $0x13, X12
+	PXOR   X12, X3
+	MOVOA  X15, X0
+	PADDL  X11, X0
+	MOVOA  X0, X12
+	PSLLL  $0x12, X0
+	PXOR   X0, X1
+	PSRLL  $0x0e, X12
+	PXOR   X12, X1
+	MOVOA  X9, X0
+	PADDL  X3, X0
+	MOVOA  X0, X12
+	PSLLL  $0x12, X0
+	PXOR   X0, X2
+	PSRLL  $0x0e, X12
+	PXOR   X12, X2
+	MOVOA  320(R12), X12
+	MOVOA  336(R12), X0
+	SUBQ   $0x02, DX
+	JA     MAINLOOP1
+	PADDL  112(R12), X12
+	PADDL  176(R12), X7
+	PADDL  224(R12), X10
+	PADDL  272(R12), X4
+	MOVD   X12, DX
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	PSHUFL $0x39, X12, X12
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x39, X10, X10
+	PSHUFL $0x39, X4, X4
+	XORL   (SI), DX
+	XORL   4(SI), CX
+	XORL   8(SI), R8
+	XORL   12(SI), R9
+	MOVL   DX, (DI)
+	MOVL   CX, 4(DI)
+	MOVL   R8, 8(DI)
+	MOVL   R9, 12(DI)
+	MOVD   X12, DX
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	PSHUFL $0x39, X12, X12
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x39, X10, X10
+	PSHUFL $0x39, X4, X4
+	XORL   64(SI), DX
+	XORL   68(SI), CX
+	XORL   72(SI), R8
+	XORL   76(SI), R9
+	MOVL   DX, 64(DI)
+	MOVL   CX, 68(DI)
+	MOVL   R8, 72(DI)
+	MOVL   R9, 76(DI)
+	MOVD   X12, DX
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	PSHUFL $0x39, X12, X12
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x39, X10, X10
+	PSHUFL $0x39, X4, X4
+	XORL   128(SI), DX
+	XORL   132(SI), CX
+	XORL   136(SI), R8
+	XORL   140(SI), R9
+	MOVL   DX, 128(DI)
+	MOVL   CX, 132(DI)
+	MOVL   R8, 136(DI)
+	MOVL   R9, 140(DI)
+	MOVD   X12, DX
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	XORL   192(SI), DX
+	XORL   196(SI), CX
+	XORL   200(SI), R8
+	XORL   204(SI), R9
+	MOVL   DX, 192(DI)
+	MOVL   CX, 196(DI)
+	MOVL   R8, 200(DI)
+	MOVL   R9, 204(DI)
+	PADDL  240(R12), X14
+	PADDL  64(R12), X0
+	PADDL  128(R12), X5
+	PADDL  192(R12), X8
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	PSHUFL $0x39, X14, X14
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x39, X8, X8
+	XORL   16(SI), DX
+	XORL   20(SI), CX
+	XORL   24(SI), R8
+	XORL   28(SI), R9
+	MOVL   DX, 16(DI)
+	MOVL   CX, 20(DI)
+	MOVL   R8, 24(DI)
+	MOVL   R9, 28(DI)
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	PSHUFL $0x39, X14, X14
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x39, X8, X8
+	XORL   80(SI), DX
+	XORL   84(SI), CX
+	XORL   88(SI), R8
+	XORL   92(SI), R9
+	MOVL   DX, 80(DI)
+	MOVL   CX, 84(DI)
+	MOVL   R8, 88(DI)
+	MOVL   R9, 92(DI)
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	PSHUFL $0x39, X14, X14
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x39, X8, X8
+	XORL   144(SI), DX
+	XORL   148(SI), CX
+	XORL   152(SI), R8
+	XORL   156(SI), R9
+	MOVL   DX, 144(DI)
+	MOVL   CX, 148(DI)
+	MOVL   R8, 152(DI)
+	MOVL   R9, 156(DI)
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	XORL   208(SI), DX
+	XORL   212(SI), CX
+	XORL   216(SI), R8
+	XORL   220(SI), R9
+	MOVL   DX, 208(DI)
+	MOVL   CX, 212(DI)
+	MOVL   R8, 216(DI)
+	MOVL   R9, 220(DI)
+	PADDL  288(R12), X15
+	PADDL  304(R12), X11
+	PADDL  80(R12), X1
+	PADDL  144(R12), X6
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	PSHUFL $0x39, X15, X15
+	PSHUFL $0x39, X11, X11
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X6, X6
+	XORL   32(SI), DX
+	XORL   36(SI), CX
+	XORL   40(SI), R8
+	XORL   44(SI), R9
+	MOVL   DX, 32(DI)
+	MOVL   CX, 36(DI)
+	MOVL   R8, 40(DI)
+	MOVL   R9, 44(DI)
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	PSHUFL $0x39, X15, X15
+	PSHUFL $0x39, X11, X11
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X6, X6
+	XORL   96(SI), DX
+	XORL   100(SI), CX
+	XORL   104(SI), R8
+	XORL   108(SI), R9
+	MOVL   DX, 96(DI)
+	MOVL   CX, 100(DI)
+	MOVL   R8, 104(DI)
+	MOVL   R9, 108(DI)
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	PSHUFL $0x39, X15, X15
+	PSHUFL $0x39, X11, X11
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X6, X6
+	XORL   160(SI), DX
+	XORL   164(SI), CX
+	XORL   168(SI), R8
+	XORL   172(SI), R9
+	MOVL   DX, 160(DI)
+	MOVL   CX, 164(DI)
+	MOVL   R8, 168(DI)
+	MOVL   R9, 172(DI)
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	XORL   224(SI), DX
+	XORL   228(SI), CX
+	XORL   232(SI), R8
+	XORL   236(SI), R9
+	MOVL   DX, 224(DI)
+	MOVL   CX, 228(DI)
+	MOVL   R8, 232(DI)
+	MOVL   R9, 236(DI)
+	PADDL  160(R12), X13
+	PADDL  208(R12), X9
+	PADDL  256(R12), X3
+	PADDL  96(R12), X2
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	PSHUFL $0x39, X13, X13
+	PSHUFL $0x39, X9, X9
+	PSHUFL $0x39, X3, X3
+	PSHUFL $0x39, X2, X2
+	XORL   48(SI), DX
+	XORL   52(SI), CX
+	XORL   56(SI), R8
+	XORL   60(SI), R9
+	MOVL   DX, 48(DI)
+	MOVL   CX, 52(DI)
+	MOVL   R8, 56(DI)
+	MOVL   R9, 60(DI)
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	PSHUFL $0x39, X13, X13
+	PSHUFL $0x39, X9, X9
+	PSHUFL $0x39, X3, X3
+	PSHUFL $0x39, X2, X2
+	XORL   112(SI), DX
+	XORL   116(SI), CX
+	XORL   120(SI), R8
+	XORL   124(SI), R9
+	MOVL   DX, 112(DI)
+	MOVL   CX, 116(DI)
+	MOVL   R8, 120(DI)
+	MOVL   R9, 124(DI)
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	PSHUFL $0x39, X13, X13
+	PSHUFL $0x39, X9, X9
+	PSHUFL $0x39, X3, X3
+	PSHUFL $0x39, X2, X2
+	XORL   176(SI), DX
+	XORL   180(SI), CX
+	XORL   184(SI), R8
+	XORL   188(SI), R9
+	MOVL   DX, 176(DI)
+	MOVL   CX, 180(DI)
+	MOVL   R8, 184(DI)
+	MOVL   R9, 188(DI)
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	XORL   240(SI), DX
+	XORL   244(SI), CX
+	XORL   248(SI), R8
+	XORL   252(SI), R9
+	MOVL   DX, 240(DI)
+	MOVL   CX, 244(DI)
+	MOVL   R8, 248(DI)
+	MOVL   R9, 252(DI)
+	MOVQ   352(R12), R9
+	SUBQ   $0x00000100, R9
+	ADDQ   $0x00000100, SI
+	ADDQ   $0x00000100, DI
+	CMPQ   R9, $0x00000100
+	JAE    BYTESATLEAST256
+	CMPQ   R9, $0x00
+	JBE    DONE
 
-	MOVQ DX,R9
-	MOVQ CX,DX
-	MOVQ R8,R10
-	CMPQ R9,$0
-	JBE DONE
-	START:
-	MOVL 20(R10),CX
-	MOVL 0(R10),R8
-	MOVL 0(DX),AX
-	MOVL 16(R10),R11
-	MOVL CX,0(R12)
-	MOVL R8, 4 (R12)
-	MOVL AX, 8 (R12)
-	MOVL R11, 12 (R12)
-	MOVL 8(DX),CX
-	MOVL 24(R10),R8
-	MOVL 4(R10),AX
-	MOVL 4(DX),R11
-	MOVL CX,16(R12)
-	MOVL R8, 20 (R12)
-	MOVL AX, 24 (R12)
-	MOVL R11, 28 (R12)
-	MOVL 12(DX),CX
-	MOVL 12(R10),DX
-	MOVL 28(R10),R8
-	MOVL 8(R10),AX
-	MOVL DX,32(R12)
-	MOVL CX, 36 (R12)
-	MOVL R8, 40 (R12)
-	MOVL AX, 44 (R12)
-	MOVQ $1634760805,DX
-	MOVQ $857760878,CX
-	MOVQ $2036477234,R8
-	MOVQ $1797285236,AX
-	MOVL DX,48(R12)
-	MOVL CX, 52 (R12)
-	MOVL R8, 56 (R12)
-	MOVL AX, 60 (R12)
-	CMPQ R9,$256
-	JB BYTESBETWEEN1AND255
-	MOVOA 48(R12),X0
-	PSHUFL $0X55,X0,X1
-	PSHUFL $0XAA,X0,X2
-	PSHUFL $0XFF,X0,X3
-	PSHUFL $0X00,X0,X0
-	MOVOA X1,64(R12)
-	MOVOA X2,80(R12)
-	MOVOA X3,96(R12)
-	MOVOA X0,112(R12)
-	MOVOA 0(R12),X0
-	PSHUFL $0XAA,X0,X1
-	PSHUFL $0XFF,X0,X2
-	PSHUFL $0X00,X0,X3
-	PSHUFL $0X55,X0,X0
-	MOVOA X1,128(R12)
-	MOVOA X2,144(R12)
-	MOVOA X3,160(R12)
-	MOVOA X0,176(R12)
-	MOVOA 16(R12),X0
-	PSHUFL $0XFF,X0,X1
-	PSHUFL $0X55,X0,X2
-	PSHUFL $0XAA,X0,X0
-	MOVOA X1,192(R12)
-	MOVOA X2,208(R12)
-	MOVOA X0,224(R12)
-	MOVOA 32(R12),X0
-	PSHUFL $0X00,X0,X1
-	PSHUFL $0XAA,X0,X2
-	PSHUFL $0XFF,X0,X0
-	MOVOA X1,240(R12)
-	MOVOA X2,256(R12)
-	MOVOA X0,272(R12)
-	BYTESATLEAST256:
-	MOVL 16(R12),DX
-	MOVL  36 (R12),CX
-	MOVL DX,288(R12)
-	MOVL CX,304(R12)
-	SHLQ $32,CX
-	ADDQ CX,DX
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 292 (R12)
-	MOVL CX, 308 (R12)
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 296 (R12)
-	MOVL CX, 312 (R12)
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 300 (R12)
-	MOVL CX, 316 (R12)
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX,16(R12)
-	MOVL CX, 36 (R12)
-	MOVQ R9,352(R12)
-	MOVQ $20,DX
-	MOVOA 64(R12),X0
-	MOVOA 80(R12),X1
-	MOVOA 96(R12),X2
-	MOVOA 256(R12),X3
-	MOVOA 272(R12),X4
-	MOVOA 128(R12),X5
-	MOVOA 144(R12),X6
-	MOVOA 176(R12),X7
-	MOVOA 192(R12),X8
-	MOVOA 208(R12),X9
-	MOVOA 224(R12),X10
-	MOVOA 304(R12),X11
-	MOVOA 112(R12),X12
-	MOVOA 160(R12),X13
-	MOVOA 240(R12),X14
-	MOVOA 288(R12),X15
-	MAINLOOP1:
-	MOVOA X1,320(R12)
-	MOVOA X2,336(R12)
-	MOVOA X13,X1
-	PADDL X12,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X14
-	PSRLL $25,X2
-	PXOR X2,X14
-	MOVOA X7,X1
-	PADDL X0,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X11
-	PSRLL $25,X2
-	PXOR X2,X11
-	MOVOA X12,X1
-	PADDL X14,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X15
-	PSRLL $23,X2
-	PXOR X2,X15
-	MOVOA X0,X1
-	PADDL X11,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X9
-	PSRLL $23,X2
-	PXOR X2,X9
-	MOVOA X14,X1
-	PADDL X15,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X13
-	PSRLL $19,X2
-	PXOR X2,X13
-	MOVOA X11,X1
-	PADDL X9,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X7
-	PSRLL $19,X2
-	PXOR X2,X7
-	MOVOA X15,X1
-	PADDL X13,X1
-	MOVOA X1,X2
-	PSLLL $18,X1
-	PXOR X1,X12
-	PSRLL $14,X2
-	PXOR X2,X12
-	MOVOA 320(R12),X1
-	MOVOA X12,320(R12)
-	MOVOA X9,X2
-	PADDL X7,X2
-	MOVOA X2,X12
-	PSLLL $18,X2
-	PXOR X2,X0
-	PSRLL $14,X12
-	PXOR X12,X0
-	MOVOA X5,X2
-	PADDL X1,X2
-	MOVOA X2,X12
-	PSLLL $7,X2
-	PXOR X2,X3
-	PSRLL $25,X12
-	PXOR X12,X3
-	MOVOA 336(R12),X2
-	MOVOA X0,336(R12)
-	MOVOA X6,X0
-	PADDL X2,X0
-	MOVOA X0,X12
-	PSLLL $7,X0
-	PXOR X0,X4
-	PSRLL $25,X12
-	PXOR X12,X4
-	MOVOA X1,X0
-	PADDL X3,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X10
-	PSRLL $23,X12
-	PXOR X12,X10
-	MOVOA X2,X0
-	PADDL X4,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X8
-	PSRLL $23,X12
-	PXOR X12,X8
-	MOVOA X3,X0
-	PADDL X10,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X5
-	PSRLL $19,X12
-	PXOR X12,X5
-	MOVOA X4,X0
-	PADDL X8,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X6
-	PSRLL $19,X12
-	PXOR X12,X6
-	MOVOA X10,X0
-	PADDL X5,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X1
-	PSRLL $14,X12
-	PXOR X12,X1
-	MOVOA 320(R12),X0
-	MOVOA X1,320(R12)
-	MOVOA X4,X1
-	PADDL X0,X1
-	MOVOA X1,X12
-	PSLLL $7,X1
-	PXOR X1,X7
-	PSRLL $25,X12
-	PXOR X12,X7
-	MOVOA X8,X1
-	PADDL X6,X1
-	MOVOA X1,X12
-	PSLLL $18,X1
-	PXOR X1,X2
-	PSRLL $14,X12
-	PXOR X12,X2
-	MOVOA 336(R12),X12
-	MOVOA X2,336(R12)
-	MOVOA X14,X1
-	PADDL X12,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X5
-	PSRLL $25,X2
-	PXOR X2,X5
-	MOVOA X0,X1
-	PADDL X7,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X10
-	PSRLL $23,X2
-	PXOR X2,X10
-	MOVOA X12,X1
-	PADDL X5,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X8
-	PSRLL $23,X2
-	PXOR X2,X8
-	MOVOA X7,X1
-	PADDL X10,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X4
-	PSRLL $19,X2
-	PXOR X2,X4
-	MOVOA X5,X1
-	PADDL X8,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X14
-	PSRLL $19,X2
-	PXOR X2,X14
-	MOVOA X10,X1
-	PADDL X4,X1
-	MOVOA X1,X2
-	PSLLL $18,X1
-	PXOR X1,X0
-	PSRLL $14,X2
-	PXOR X2,X0
-	MOVOA 320(R12),X1
-	MOVOA X0,320(R12)
-	MOVOA X8,X0
-	PADDL X14,X0
-	MOVOA X0,X2
-	PSLLL $18,X0
-	PXOR X0,X12
-	PSRLL $14,X2
-	PXOR X2,X12
-	MOVOA X11,X0
-	PADDL X1,X0
-	MOVOA X0,X2
-	PSLLL $7,X0
-	PXOR X0,X6
-	PSRLL $25,X2
-	PXOR X2,X6
-	MOVOA 336(R12),X2
-	MOVOA X12,336(R12)
-	MOVOA X3,X0
-	PADDL X2,X0
-	MOVOA X0,X12
-	PSLLL $7,X0
-	PXOR X0,X13
-	PSRLL $25,X12
-	PXOR X12,X13
-	MOVOA X1,X0
-	PADDL X6,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X15
-	PSRLL $23,X12
-	PXOR X12,X15
-	MOVOA X2,X0
-	PADDL X13,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X9
-	PSRLL $23,X12
-	PXOR X12,X9
-	MOVOA X6,X0
-	PADDL X15,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X11
-	PSRLL $19,X12
-	PXOR X12,X11
-	MOVOA X13,X0
-	PADDL X9,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X3
-	PSRLL $19,X12
-	PXOR X12,X3
-	MOVOA X15,X0
-	PADDL X11,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X1
-	PSRLL $14,X12
-	PXOR X12,X1
-	MOVOA X9,X0
-	PADDL X3,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X2
-	PSRLL $14,X12
-	PXOR X12,X2
-	MOVOA 320(R12),X12
-	MOVOA 336(R12),X0
-	SUBQ $2,DX
-	JA MAINLOOP1
-	PADDL 112(R12),X12
-	PADDL 176(R12),X7
-	PADDL 224(R12),X10
-	PADDL 272(R12),X4
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 0(SI),DX
-	XORL 4(SI),CX
-	XORL 8(SI),R8
-	XORL 12(SI),R9
-	MOVL DX,0(DI)
-	MOVL CX,4(DI)
-	MOVL R8,8(DI)
-	MOVL R9,12(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 64(SI),DX
-	XORL 68(SI),CX
-	XORL 72(SI),R8
-	XORL 76(SI),R9
-	MOVL DX,64(DI)
-	MOVL CX,68(DI)
-	MOVL R8,72(DI)
-	MOVL R9,76(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 128(SI),DX
-	XORL 132(SI),CX
-	XORL 136(SI),R8
-	XORL 140(SI),R9
-	MOVL DX,128(DI)
-	MOVL CX,132(DI)
-	MOVL R8,136(DI)
-	MOVL R9,140(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	XORL 192(SI),DX
-	XORL 196(SI),CX
-	XORL 200(SI),R8
-	XORL 204(SI),R9
-	MOVL DX,192(DI)
-	MOVL CX,196(DI)
-	MOVL R8,200(DI)
-	MOVL R9,204(DI)
-	PADDL 240(R12),X14
-	PADDL 64(R12),X0
-	PADDL 128(R12),X5
-	PADDL 192(R12),X8
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 16(SI),DX
-	XORL 20(SI),CX
-	XORL 24(SI),R8
-	XORL 28(SI),R9
-	MOVL DX,16(DI)
-	MOVL CX,20(DI)
-	MOVL R8,24(DI)
-	MOVL R9,28(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 80(SI),DX
-	XORL 84(SI),CX
-	XORL 88(SI),R8
-	XORL 92(SI),R9
-	MOVL DX,80(DI)
-	MOVL CX,84(DI)
-	MOVL R8,88(DI)
-	MOVL R9,92(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 144(SI),DX
-	XORL 148(SI),CX
-	XORL 152(SI),R8
-	XORL 156(SI),R9
-	MOVL DX,144(DI)
-	MOVL CX,148(DI)
-	MOVL R8,152(DI)
-	MOVL R9,156(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	XORL 208(SI),DX
-	XORL 212(SI),CX
-	XORL 216(SI),R8
-	XORL 220(SI),R9
-	MOVL DX,208(DI)
-	MOVL CX,212(DI)
-	MOVL R8,216(DI)
-	MOVL R9,220(DI)
-	PADDL 288(R12),X15
-	PADDL 304(R12),X11
-	PADDL 80(R12),X1
-	PADDL 144(R12),X6
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 32(SI),DX
-	XORL 36(SI),CX
-	XORL 40(SI),R8
-	XORL 44(SI),R9
-	MOVL DX,32(DI)
-	MOVL CX,36(DI)
-	MOVL R8,40(DI)
-	MOVL R9,44(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 96(SI),DX
-	XORL 100(SI),CX
-	XORL 104(SI),R8
-	XORL 108(SI),R9
-	MOVL DX,96(DI)
-	MOVL CX,100(DI)
-	MOVL R8,104(DI)
-	MOVL R9,108(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 160(SI),DX
-	XORL 164(SI),CX
-	XORL 168(SI),R8
-	XORL 172(SI),R9
-	MOVL DX,160(DI)
-	MOVL CX,164(DI)
-	MOVL R8,168(DI)
-	MOVL R9,172(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	XORL 224(SI),DX
-	XORL 228(SI),CX
-	XORL 232(SI),R8
-	XORL 236(SI),R9
-	MOVL DX,224(DI)
-	MOVL CX,228(DI)
-	MOVL R8,232(DI)
-	MOVL R9,236(DI)
-	PADDL 160(R12),X13
-	PADDL 208(R12),X9
-	PADDL 256(R12),X3
-	PADDL 96(R12),X2
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 48(SI),DX
-	XORL 52(SI),CX
-	XORL 56(SI),R8
-	XORL 60(SI),R9
-	MOVL DX,48(DI)
-	MOVL CX,52(DI)
-	MOVL R8,56(DI)
-	MOVL R9,60(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 112(SI),DX
-	XORL 116(SI),CX
-	XORL 120(SI),R8
-	XORL 124(SI),R9
-	MOVL DX,112(DI)
-	MOVL CX,116(DI)
-	MOVL R8,120(DI)
-	MOVL R9,124(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 176(SI),DX
-	XORL 180(SI),CX
-	XORL 184(SI),R8
-	XORL 188(SI),R9
-	MOVL DX,176(DI)
-	MOVL CX,180(DI)
-	MOVL R8,184(DI)
-	MOVL R9,188(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	XORL 240(SI),DX
-	XORL 244(SI),CX
-	XORL 248(SI),R8
-	XORL 252(SI),R9
-	MOVL DX,240(DI)
-	MOVL CX,244(DI)
-	MOVL R8,248(DI)
-	MOVL R9,252(DI)
-	MOVQ 352(R12),R9
-	SUBQ $256,R9
-	ADDQ $256,SI
-	ADDQ $256,DI
-	CMPQ R9,$256
-	JAE BYTESATLEAST256
-	CMPQ R9,$0
-	JBE DONE
-	BYTESBETWEEN1AND255:
-	CMPQ R9,$64
-	JAE NOCOPY
-	MOVQ DI,DX
-	LEAQ 360(R12),DI
-	MOVQ R9,CX
+BYTESBETWEEN1AND255:
+	CMPQ R9, $0x40
+	JAE  NOCOPY
+	MOVQ DI, DX
+	LEAQ 360(R12), DI
+	MOVQ R9, CX
 	REP; MOVSB
-	LEAQ 360(R12),DI
-	LEAQ 360(R12),SI
-	NOCOPY:
-	MOVQ R9,352(R12)
-	MOVOA 48(R12),X0
-	MOVOA 0(R12),X1
-	MOVOA 16(R12),X2
-	MOVOA 32(R12),X3
-	MOVOA X1,X4
-	MOVQ $20,CX
-	MAINLOOP2:
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X3
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X3,X3
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X1
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X1,X1
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X1
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X1,X1
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X3
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X3,X3
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X3
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X3,X3
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X1
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X1,X1
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X1
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X1,X1
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X3
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X3
-	SUBQ $4,CX
-	PADDL X3,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PXOR X7,X7
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X3,X3
-	PXOR X6,X0
-	JA MAINLOOP2
-	PADDL 48(R12),X0
-	PADDL 0(R12),X1
-	PADDL 16(R12),X2
-	PADDL 32(R12),X3
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 0(SI),CX
-	XORL 48(SI),R8
-	XORL 32(SI),R9
-	XORL 16(SI),AX
-	MOVL CX,0(DI)
-	MOVL R8,48(DI)
-	MOVL R9,32(DI)
-	MOVL AX,16(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 20(SI),CX
-	XORL 4(SI),R8
-	XORL 52(SI),R9
-	XORL 36(SI),AX
-	MOVL CX,20(DI)
-	MOVL R8,4(DI)
-	MOVL R9,52(DI)
-	MOVL AX,36(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 40(SI),CX
-	XORL 24(SI),R8
-	XORL 8(SI),R9
-	XORL 56(SI),AX
-	MOVL CX,40(DI)
-	MOVL R8,24(DI)
-	MOVL R9,8(DI)
-	MOVL AX,56(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	XORL 60(SI),CX
-	XORL 44(SI),R8
-	XORL 28(SI),R9
-	XORL 12(SI),AX
-	MOVL CX,60(DI)
-	MOVL R8,44(DI)
-	MOVL R9,28(DI)
-	MOVL AX,12(DI)
-	MOVQ 352(R12),R9
-	MOVL 16(R12),CX
-	MOVL  36 (R12),R8
-	ADDQ $1,CX
-	SHLQ $32,R8
-	ADDQ R8,CX
-	MOVQ CX,R8
-	SHRQ $32,R8
-	MOVL CX,16(R12)
-	MOVL R8, 36 (R12)
-	CMPQ R9,$64
-	JA BYTESATLEAST65
-	JAE BYTESATLEAST64
-	MOVQ DI,SI
-	MOVQ DX,DI
-	MOVQ R9,CX
+	LEAQ 360(R12), DI
+	LEAQ 360(R12), SI
+
+NOCOPY:
+	MOVQ  R9, 352(R12)
+	MOVOA 48(R12), X0
+	MOVOA (R12), X1
+	MOVOA 16(R12), X2
+	MOVOA 32(R12), X3
+	MOVOA X1, X4
+	MOVQ  $0x00000014, CX
+
+MAINLOOP2:
+	PADDL  X0, X4
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X3
+	PXOR   X6, X3
+	PADDL  X3, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X3, X3
+	PXOR   X6, X2
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X1
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X1
+	PADDL  X1, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X1, X1
+	PXOR   X6, X0
+	PADDL  X0, X4
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X1
+	PXOR   X6, X1
+	PADDL  X1, X5
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X1, X1
+	PXOR   X6, X2
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X3
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X3
+	PADDL  X3, X5
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X3, X3
+	PXOR   X6, X0
+	PADDL  X0, X4
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X3
+	PXOR   X6, X3
+	PADDL  X3, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X3, X3
+	PXOR   X6, X2
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X1
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X1
+	PADDL  X1, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X1, X1
+	PXOR   X6, X0
+	PADDL  X0, X4
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X1
+	PXOR   X6, X1
+	PADDL  X1, X5
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X1, X1
+	PXOR   X6, X2
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X3
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X3
+	SUBQ   $0x04, CX
+	PADDL  X3, X5
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PXOR   X7, X7
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X3, X3
+	PXOR   X6, X0
+	JA     MAINLOOP2
+	PADDL  48(R12), X0
+	PADDL  (R12), X1
+	PADDL  16(R12), X2
+	PADDL  32(R12), X3
+	MOVD   X0, CX
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X2, X2
+	PSHUFL $0x39, X3, X3
+	XORL   (SI), CX
+	XORL   48(SI), R8
+	XORL   32(SI), R9
+	XORL   16(SI), AX
+	MOVL   CX, (DI)
+	MOVL   R8, 48(DI)
+	MOVL   R9, 32(DI)
+	MOVL   AX, 16(DI)
+	MOVD   X0, CX
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X2, X2
+	PSHUFL $0x39, X3, X3
+	XORL   20(SI), CX
+	XORL   4(SI), R8
+	XORL   52(SI), R9
+	XORL   36(SI), AX
+	MOVL   CX, 20(DI)
+	MOVL   R8, 4(DI)
+	MOVL   R9, 52(DI)
+	MOVL   AX, 36(DI)
+	MOVD   X0, CX
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X2, X2
+	PSHUFL $0x39, X3, X3
+	XORL   40(SI), CX
+	XORL   24(SI), R8
+	XORL   8(SI), R9
+	XORL   56(SI), AX
+	MOVL   CX, 40(DI)
+	MOVL   R8, 24(DI)
+	MOVL   R9, 8(DI)
+	MOVL   AX, 56(DI)
+	MOVD   X0, CX
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	XORL   60(SI), CX
+	XORL   44(SI), R8
+	XORL   28(SI), R9
+	XORL   12(SI), AX
+	MOVL   CX, 60(DI)
+	MOVL   R8, 44(DI)
+	MOVL   R9, 28(DI)
+	MOVL   AX, 12(DI)
+	MOVQ   352(R12), R9
+	MOVL   16(R12), CX
+	MOVL   36(R12), R8
+	ADDQ   $0x01, CX
+	SHLQ   $0x20, R8
+	ADDQ   R8, CX
+	MOVQ   CX, R8
+	SHRQ   $0x20, R8
+	MOVL   CX, 16(R12)
+	MOVL   R8, 36(R12)
+	CMPQ   R9, $0x40
+	JA     BYTESATLEAST65
+	JAE    BYTESATLEAST64
+	MOVQ   DI, SI
+	MOVQ   DX, DI
+	MOVQ   R9, CX
 	REP; MOVSB
-	BYTESATLEAST64:
-	DONE:
+
+BYTESATLEAST64:
+DONE:
 	RET
-	BYTESATLEAST65:
-	SUBQ $64,R9
-	ADDQ $64,DI
-	ADDQ $64,SI
-	JMP BYTESBETWEEN1AND255
+
+BYTESATLEAST65:
+	SUBQ $0x40, R9
+	ADDQ $0x40, DI
+	ADDQ $0x40, SI
+	JMP  BYTESBETWEEN1AND255
diff --git a/vendor/golang.org/x/crypto/sha3/doc.go b/vendor/golang.org/x/crypto/sha3/doc.go
index 7e023090707b8a5f6e7c9c6cf7db398889cf7099..bbf391fe6e5909ea10b516d9821c09811fb9dad9 100644
--- a/vendor/golang.org/x/crypto/sha3/doc.go
+++ b/vendor/golang.org/x/crypto/sha3/doc.go
@@ -5,6 +5,10 @@
 // Package sha3 implements the SHA-3 fixed-output-length hash functions and
 // the SHAKE variable-output-length hash functions defined by FIPS-202.
 //
+// All types in this package also implement [encoding.BinaryMarshaler],
+// [encoding.BinaryAppender] and [encoding.BinaryUnmarshaler] to marshal and
+// unmarshal the internal state of the hash.
+//
 // Both types of hash function use the "sponge" construction and the Keccak
 // permutation. For a detailed specification see http://keccak.noekeon.org/
 //
diff --git a/vendor/golang.org/x/crypto/sha3/hashes.go b/vendor/golang.org/x/crypto/sha3/hashes.go
index c544b29e5f2cce1711b876f80cf61969a4e35df0..31fffbe04408af272ff1c800215e51acead39eaf 100644
--- a/vendor/golang.org/x/crypto/sha3/hashes.go
+++ b/vendor/golang.org/x/crypto/sha3/hashes.go
@@ -48,33 +48,52 @@ func init() {
 	crypto.RegisterHash(crypto.SHA3_512, New512)
 }
 
+const (
+	dsbyteSHA3   = 0b00000110
+	dsbyteKeccak = 0b00000001
+	dsbyteShake  = 0b00011111
+	dsbyteCShake = 0b00000100
+
+	// rateK[c] is the rate in bytes for Keccak[c] where c is the capacity in
+	// bits. Given the sponge size is 1600 bits, the rate is 1600 - c bits.
+	rateK256  = (1600 - 256) / 8
+	rateK448  = (1600 - 448) / 8
+	rateK512  = (1600 - 512) / 8
+	rateK768  = (1600 - 768) / 8
+	rateK1024 = (1600 - 1024) / 8
+)
+
 func new224Generic() *state {
-	return &state{rate: 144, outputLen: 28, dsbyte: 0x06}
+	return &state{rate: rateK448, outputLen: 28, dsbyte: dsbyteSHA3}
 }
 
 func new256Generic() *state {
-	return &state{rate: 136, outputLen: 32, dsbyte: 0x06}
+	return &state{rate: rateK512, outputLen: 32, dsbyte: dsbyteSHA3}
 }
 
 func new384Generic() *state {
-	return &state{rate: 104, outputLen: 48, dsbyte: 0x06}
+	return &state{rate: rateK768, outputLen: 48, dsbyte: dsbyteSHA3}
 }
 
 func new512Generic() *state {
-	return &state{rate: 72, outputLen: 64, dsbyte: 0x06}
+	return &state{rate: rateK1024, outputLen: 64, dsbyte: dsbyteSHA3}
 }
 
 // NewLegacyKeccak256 creates a new Keccak-256 hash.
 //
 // Only use this function if you require compatibility with an existing cryptosystem
 // that uses non-standard padding. All other users should use New256 instead.
-func NewLegacyKeccak256() hash.Hash { return &state{rate: 136, outputLen: 32, dsbyte: 0x01} }
+func NewLegacyKeccak256() hash.Hash {
+	return &state{rate: rateK512, outputLen: 32, dsbyte: dsbyteKeccak}
+}
 
 // NewLegacyKeccak512 creates a new Keccak-512 hash.
 //
 // Only use this function if you require compatibility with an existing cryptosystem
 // that uses non-standard padding. All other users should use New512 instead.
-func NewLegacyKeccak512() hash.Hash { return &state{rate: 72, outputLen: 64, dsbyte: 0x01} }
+func NewLegacyKeccak512() hash.Hash {
+	return &state{rate: rateK1024, outputLen: 64, dsbyte: dsbyteKeccak}
+}
 
 // Sum224 returns the SHA3-224 digest of the data.
 func Sum224(data []byte) (digest [28]byte) {
diff --git a/vendor/golang.org/x/crypto/sha3/sha3.go b/vendor/golang.org/x/crypto/sha3/sha3.go
index afedde5abf1fd4be3225a4512db53406d9914fc6..6658c44479b6da200da4727d0af0fc38dd5664a0 100644
--- a/vendor/golang.org/x/crypto/sha3/sha3.go
+++ b/vendor/golang.org/x/crypto/sha3/sha3.go
@@ -4,6 +4,15 @@
 
 package sha3
 
+import (
+	"crypto/subtle"
+	"encoding/binary"
+	"errors"
+	"unsafe"
+
+	"golang.org/x/sys/cpu"
+)
+
 // spongeDirection indicates the direction bytes are flowing through the sponge.
 type spongeDirection int
 
@@ -14,16 +23,13 @@ const (
 	spongeSqueezing
 )
 
-const (
-	// maxRate is the maximum size of the internal buffer. SHAKE-256
-	// currently needs the largest buffer.
-	maxRate = 168
-)
-
 type state struct {
-	// Generic sponge components.
-	a    [25]uint64 // main state of the hash
-	rate int        // the number of bytes of state to use
+	a [1600 / 8]byte // main state of the hash
+
+	// a[n:rate] is the buffer. If absorbing, it's the remaining space to XOR
+	// into before running the permutation. If squeezing, it's the remaining
+	// output to produce before running the permutation.
+	n, rate int
 
 	// dsbyte contains the "domain separation" bits and the first bit of
 	// the padding. Sections 6.1 and 6.2 of [1] separate the outputs of the
@@ -39,10 +45,6 @@ type state struct {
 	//      Extendable-Output Functions (May 2014)"
 	dsbyte byte
 
-	i, n    int // storage[i:n] is the buffer, i is only used while squeezing
-	storage [maxRate]byte
-
-	// Specific to SHA-3 and SHAKE.
 	outputLen int             // the default output size in bytes
 	state     spongeDirection // whether the sponge is absorbing or squeezing
 }
@@ -61,7 +63,7 @@ func (d *state) Reset() {
 		d.a[i] = 0
 	}
 	d.state = spongeAbsorbing
-	d.i, d.n = 0, 0
+	d.n = 0
 }
 
 func (d *state) clone() *state {
@@ -69,22 +71,25 @@ func (d *state) clone() *state {
 	return &ret
 }
 
-// permute applies the KeccakF-1600 permutation. It handles
-// any input-output buffering.
+// permute applies the KeccakF-1600 permutation.
 func (d *state) permute() {
-	switch d.state {
-	case spongeAbsorbing:
-		// If we're absorbing, we need to xor the input into the state
-		// before applying the permutation.
-		xorIn(d, d.storage[:d.rate])
-		d.n = 0
-		keccakF1600(&d.a)
-	case spongeSqueezing:
-		// If we're squeezing, we need to apply the permutation before
-		// copying more output.
-		keccakF1600(&d.a)
-		d.i = 0
-		copyOut(d, d.storage[:d.rate])
+	var a *[25]uint64
+	if cpu.IsBigEndian {
+		a = new([25]uint64)
+		for i := range a {
+			a[i] = binary.LittleEndian.Uint64(d.a[i*8:])
+		}
+	} else {
+		a = (*[25]uint64)(unsafe.Pointer(&d.a))
+	}
+
+	keccakF1600(a)
+	d.n = 0
+
+	if cpu.IsBigEndian {
+		for i := range a {
+			binary.LittleEndian.PutUint64(d.a[i*8:], a[i])
+		}
 	}
 }
 
@@ -92,53 +97,36 @@ func (d *state) permute() {
 // the multi-bitrate 10..1 padding rule, and permutes the state.
 func (d *state) padAndPermute() {
 	// Pad with this instance's domain-separator bits. We know that there's
-	// at least one byte of space in d.buf because, if it were full,
+	// at least one byte of space in the sponge because, if it were full,
 	// permute would have been called to empty it. dsbyte also contains the
 	// first one bit for the padding. See the comment in the state struct.
-	d.storage[d.n] = d.dsbyte
-	d.n++
-	for d.n < d.rate {
-		d.storage[d.n] = 0
-		d.n++
-	}
+	d.a[d.n] ^= d.dsbyte
 	// This adds the final one bit for the padding. Because of the way that
 	// bits are numbered from the LSB upwards, the final bit is the MSB of
 	// the last byte.
-	d.storage[d.rate-1] ^= 0x80
+	d.a[d.rate-1] ^= 0x80
 	// Apply the permutation
 	d.permute()
 	d.state = spongeSqueezing
-	d.n = d.rate
-	copyOut(d, d.storage[:d.rate])
 }
 
 // Write absorbs more data into the hash's state. It panics if any
 // output has already been read.
-func (d *state) Write(p []byte) (written int, err error) {
+func (d *state) Write(p []byte) (n int, err error) {
 	if d.state != spongeAbsorbing {
 		panic("sha3: Write after Read")
 	}
-	written = len(p)
+
+	n = len(p)
 
 	for len(p) > 0 {
-		if d.n == 0 && len(p) >= d.rate {
-			// The fast path; absorb a full "rate" bytes of input and apply the permutation.
-			xorIn(d, p[:d.rate])
-			p = p[d.rate:]
-			keccakF1600(&d.a)
-		} else {
-			// The slow path; buffer the input until we can fill the sponge, and then xor it in.
-			todo := d.rate - d.n
-			if todo > len(p) {
-				todo = len(p)
-			}
-			d.n += copy(d.storage[d.n:], p[:todo])
-			p = p[todo:]
-
-			// If the sponge is full, apply the permutation.
-			if d.n == d.rate {
-				d.permute()
-			}
+		x := subtle.XORBytes(d.a[d.n:d.rate], d.a[d.n:d.rate], p)
+		d.n += x
+		p = p[x:]
+
+		// If the sponge is full, apply the permutation.
+		if d.n == d.rate {
+			d.permute()
 		}
 	}
 
@@ -156,14 +144,14 @@ func (d *state) Read(out []byte) (n int, err error) {
 
 	// Now, do the squeezing.
 	for len(out) > 0 {
-		n := copy(out, d.storage[d.i:d.n])
-		d.i += n
-		out = out[n:]
-
 		// Apply the permutation if we've squeezed the sponge dry.
-		if d.i == d.rate {
+		if d.n == d.rate {
 			d.permute()
 		}
+
+		x := copy(out, d.a[d.n:d.rate])
+		d.n += x
+		out = out[x:]
 	}
 
 	return
@@ -183,3 +171,74 @@ func (d *state) Sum(in []byte) []byte {
 	dup.Read(hash)
 	return append(in, hash...)
 }
+
+const (
+	magicSHA3   = "sha\x08"
+	magicShake  = "sha\x09"
+	magicCShake = "sha\x0a"
+	magicKeccak = "sha\x0b"
+	// magic || rate || main state || n || sponge direction
+	marshaledSize = len(magicSHA3) + 1 + 200 + 1 + 1
+)
+
+func (d *state) MarshalBinary() ([]byte, error) {
+	return d.AppendBinary(make([]byte, 0, marshaledSize))
+}
+
+func (d *state) AppendBinary(b []byte) ([]byte, error) {
+	switch d.dsbyte {
+	case dsbyteSHA3:
+		b = append(b, magicSHA3...)
+	case dsbyteShake:
+		b = append(b, magicShake...)
+	case dsbyteCShake:
+		b = append(b, magicCShake...)
+	case dsbyteKeccak:
+		b = append(b, magicKeccak...)
+	default:
+		panic("unknown dsbyte")
+	}
+	// rate is at most 168, and n is at most rate.
+	b = append(b, byte(d.rate))
+	b = append(b, d.a[:]...)
+	b = append(b, byte(d.n), byte(d.state))
+	return b, nil
+}
+
+func (d *state) UnmarshalBinary(b []byte) error {
+	if len(b) != marshaledSize {
+		return errors.New("sha3: invalid hash state")
+	}
+
+	magic := string(b[:len(magicSHA3)])
+	b = b[len(magicSHA3):]
+	switch {
+	case magic == magicSHA3 && d.dsbyte == dsbyteSHA3:
+	case magic == magicShake && d.dsbyte == dsbyteShake:
+	case magic == magicCShake && d.dsbyte == dsbyteCShake:
+	case magic == magicKeccak && d.dsbyte == dsbyteKeccak:
+	default:
+		return errors.New("sha3: invalid hash state identifier")
+	}
+
+	rate := int(b[0])
+	b = b[1:]
+	if rate != d.rate {
+		return errors.New("sha3: invalid hash state function")
+	}
+
+	copy(d.a[:], b)
+	b = b[len(d.a):]
+
+	n, state := int(b[0]), spongeDirection(b[1])
+	if n > d.rate {
+		return errors.New("sha3: invalid hash state")
+	}
+	d.n = n
+	if state != spongeAbsorbing && state != spongeSqueezing {
+		return errors.New("sha3: invalid hash state")
+	}
+	d.state = state
+
+	return nil
+}
diff --git a/vendor/golang.org/x/crypto/sha3/shake.go b/vendor/golang.org/x/crypto/sha3/shake.go
index 1ea9275b8b7ac7d165f0c28e10a2ad02fb3fa61c..a6b3a4281f5b57bfac6c882f2d960feff440cf31 100644
--- a/vendor/golang.org/x/crypto/sha3/shake.go
+++ b/vendor/golang.org/x/crypto/sha3/shake.go
@@ -16,9 +16,12 @@ package sha3
 // [2] https://doi.org/10.6028/NIST.SP.800-185
 
 import (
+	"bytes"
 	"encoding/binary"
+	"errors"
 	"hash"
 	"io"
+	"math/bits"
 )
 
 // ShakeHash defines the interface to hash functions that support
@@ -50,44 +53,36 @@ type cshakeState struct {
 	initBlock []byte
 }
 
-// Consts for configuring initial SHA-3 state
-const (
-	dsbyteShake  = 0x1f
-	dsbyteCShake = 0x04
-	rate128      = 168
-	rate256      = 136
-)
+func bytepad(data []byte, rate int) []byte {
+	out := make([]byte, 0, 9+len(data)+rate-1)
+	out = append(out, leftEncode(uint64(rate))...)
+	out = append(out, data...)
+	if padlen := rate - len(out)%rate; padlen < rate {
+		out = append(out, make([]byte, padlen)...)
+	}
+	return out
+}
 
-func bytepad(input []byte, w int) []byte {
-	// leftEncode always returns max 9 bytes
-	buf := make([]byte, 0, 9+len(input)+w)
-	buf = append(buf, leftEncode(uint64(w))...)
-	buf = append(buf, input...)
-	padlen := w - (len(buf) % w)
-	return append(buf, make([]byte, padlen)...)
-}
-
-func leftEncode(value uint64) []byte {
-	var b [9]byte
-	binary.BigEndian.PutUint64(b[1:], value)
-	// Trim all but last leading zero bytes
-	i := byte(1)
-	for i < 8 && b[i] == 0 {
-		i++
+func leftEncode(x uint64) []byte {
+	// Let n be the smallest positive integer for which 2^(8n) > x.
+	n := (bits.Len64(x) + 7) / 8
+	if n == 0 {
+		n = 1
 	}
-	// Prepend number of encoded bytes
-	b[i-1] = 9 - i
-	return b[i-1:]
+	// Return n || x with n as a byte and x an n bytes in big-endian order.
+	b := make([]byte, 9)
+	binary.BigEndian.PutUint64(b[1:], x)
+	b = b[9-n-1:]
+	b[0] = byte(n)
+	return b
 }
 
 func newCShake(N, S []byte, rate, outputLen int, dsbyte byte) ShakeHash {
 	c := cshakeState{state: &state{rate: rate, outputLen: outputLen, dsbyte: dsbyte}}
-
-	// leftEncode returns max 9 bytes
-	c.initBlock = make([]byte, 0, 9*2+len(N)+len(S))
-	c.initBlock = append(c.initBlock, leftEncode(uint64(len(N)*8))...)
+	c.initBlock = make([]byte, 0, 9+len(N)+9+len(S)) // leftEncode returns max 9 bytes
+	c.initBlock = append(c.initBlock, leftEncode(uint64(len(N))*8)...)
 	c.initBlock = append(c.initBlock, N...)
-	c.initBlock = append(c.initBlock, leftEncode(uint64(len(S)*8))...)
+	c.initBlock = append(c.initBlock, leftEncode(uint64(len(S))*8)...)
 	c.initBlock = append(c.initBlock, S...)
 	c.Write(bytepad(c.initBlock, c.rate))
 	return &c
@@ -111,6 +106,30 @@ func (c *state) Clone() ShakeHash {
 	return c.clone()
 }
 
+func (c *cshakeState) MarshalBinary() ([]byte, error) {
+	return c.AppendBinary(make([]byte, 0, marshaledSize+len(c.initBlock)))
+}
+
+func (c *cshakeState) AppendBinary(b []byte) ([]byte, error) {
+	b, err := c.state.AppendBinary(b)
+	if err != nil {
+		return nil, err
+	}
+	b = append(b, c.initBlock...)
+	return b, nil
+}
+
+func (c *cshakeState) UnmarshalBinary(b []byte) error {
+	if len(b) <= marshaledSize {
+		return errors.New("sha3: invalid hash state")
+	}
+	if err := c.state.UnmarshalBinary(b[:marshaledSize]); err != nil {
+		return err
+	}
+	c.initBlock = bytes.Clone(b[marshaledSize:])
+	return nil
+}
+
 // NewShake128 creates a new SHAKE128 variable-output-length ShakeHash.
 // Its generic security strength is 128 bits against all attacks if at
 // least 32 bytes of its output are used.
@@ -126,11 +145,11 @@ func NewShake256() ShakeHash {
 }
 
 func newShake128Generic() *state {
-	return &state{rate: rate128, outputLen: 32, dsbyte: dsbyteShake}
+	return &state{rate: rateK256, outputLen: 32, dsbyte: dsbyteShake}
 }
 
 func newShake256Generic() *state {
-	return &state{rate: rate256, outputLen: 64, dsbyte: dsbyteShake}
+	return &state{rate: rateK512, outputLen: 64, dsbyte: dsbyteShake}
 }
 
 // NewCShake128 creates a new instance of cSHAKE128 variable-output-length ShakeHash,
@@ -143,7 +162,7 @@ func NewCShake128(N, S []byte) ShakeHash {
 	if len(N) == 0 && len(S) == 0 {
 		return NewShake128()
 	}
-	return newCShake(N, S, rate128, 32, dsbyteCShake)
+	return newCShake(N, S, rateK256, 32, dsbyteCShake)
 }
 
 // NewCShake256 creates a new instance of cSHAKE256 variable-output-length ShakeHash,
@@ -156,7 +175,7 @@ func NewCShake256(N, S []byte) ShakeHash {
 	if len(N) == 0 && len(S) == 0 {
 		return NewShake256()
 	}
-	return newCShake(N, S, rate256, 64, dsbyteCShake)
+	return newCShake(N, S, rateK512, 64, dsbyteCShake)
 }
 
 // ShakeSum128 writes an arbitrary-length digest of data into hash.
diff --git a/vendor/golang.org/x/crypto/sha3/xor.go b/vendor/golang.org/x/crypto/sha3/xor.go
deleted file mode 100644
index 6ada5c9574e2afa34e784dade9b3c030c68b3da7..0000000000000000000000000000000000000000
--- a/vendor/golang.org/x/crypto/sha3/xor.go
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package sha3
-
-import (
-	"crypto/subtle"
-	"encoding/binary"
-	"unsafe"
-
-	"golang.org/x/sys/cpu"
-)
-
-// xorIn xors the bytes in buf into the state.
-func xorIn(d *state, buf []byte) {
-	if cpu.IsBigEndian {
-		for i := 0; len(buf) >= 8; i++ {
-			a := binary.LittleEndian.Uint64(buf)
-			d.a[i] ^= a
-			buf = buf[8:]
-		}
-	} else {
-		ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a))
-		subtle.XORBytes(ab[:], ab[:], buf)
-	}
-}
-
-// copyOut copies uint64s to a byte buffer.
-func copyOut(d *state, b []byte) {
-	if cpu.IsBigEndian {
-		for i := 0; len(b) >= 8; i++ {
-			binary.LittleEndian.PutUint64(b, d.a[i])
-			b = b[8:]
-		}
-	} else {
-		ab := (*[25 * 64 / 8]byte)(unsafe.Pointer(&d.a))
-		copy(b, ab[:])
-	}
-}
diff --git a/vendor/golang.org/x/net/http2/client_conn_pool.go b/vendor/golang.org/x/net/http2/client_conn_pool.go
index 780968d6c19b8442a9f0e8fe3370f82b7f5a5f26..e81b73e6a7a0ffd2ffd91bf275a89931e1e73839 100644
--- a/vendor/golang.org/x/net/http2/client_conn_pool.go
+++ b/vendor/golang.org/x/net/http2/client_conn_pool.go
@@ -8,8 +8,8 @@ package http2
 
 import (
 	"context"
-	"crypto/tls"
 	"errors"
+	"net"
 	"net/http"
 	"sync"
 )
@@ -158,7 +158,7 @@ func (c *dialCall) dial(ctx context.Context, addr string) {
 // This code decides which ones live or die.
 // The return value used is whether c was used.
 // c is never closed.
-func (p *clientConnPool) addConnIfNeeded(key string, t *Transport, c *tls.Conn) (used bool, err error) {
+func (p *clientConnPool) addConnIfNeeded(key string, t *Transport, c net.Conn) (used bool, err error) {
 	p.mu.Lock()
 	for _, cc := range p.conns[key] {
 		if cc.CanTakeNewRequest() {
@@ -194,8 +194,8 @@ type addConnCall struct {
 	err  error
 }
 
-func (c *addConnCall) run(t *Transport, key string, tc *tls.Conn) {
-	cc, err := t.NewClientConn(tc)
+func (c *addConnCall) run(t *Transport, key string, nc net.Conn) {
+	cc, err := t.NewClientConn(nc)
 
 	p := c.p
 	p.mu.Lock()
diff --git a/vendor/golang.org/x/net/http2/config.go b/vendor/golang.org/x/net/http2/config.go
new file mode 100644
index 0000000000000000000000000000000000000000..de58dfb8dc4927e359548cc1d833ceea88189e74
--- /dev/null
+++ b/vendor/golang.org/x/net/http2/config.go
@@ -0,0 +1,122 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package http2
+
+import (
+	"math"
+	"net/http"
+	"time"
+)
+
+// http2Config is a package-internal version of net/http.HTTP2Config.
+//
+// http.HTTP2Config was added in Go 1.24.
+// When running with a version of net/http that includes HTTP2Config,
+// we merge the configuration with the fields in Transport or Server
+// to produce an http2Config.
+//
+// Zero valued fields in http2Config are interpreted as in the
+// net/http.HTTPConfig documentation.
+//
+// Precedence order for reconciling configurations is:
+//
+//   - Use the net/http.{Server,Transport}.HTTP2Config value, when non-zero.
+//   - Otherwise use the http2.{Server.Transport} value.
+//   - If the resulting value is zero or out of range, use a default.
+type http2Config struct {
+	MaxConcurrentStreams         uint32
+	MaxDecoderHeaderTableSize    uint32
+	MaxEncoderHeaderTableSize    uint32
+	MaxReadFrameSize             uint32
+	MaxUploadBufferPerConnection int32
+	MaxUploadBufferPerStream     int32
+	SendPingTimeout              time.Duration
+	PingTimeout                  time.Duration
+	WriteByteTimeout             time.Duration
+	PermitProhibitedCipherSuites bool
+	CountError                   func(errType string)
+}
+
+// configFromServer merges configuration settings from
+// net/http.Server.HTTP2Config and http2.Server.
+func configFromServer(h1 *http.Server, h2 *Server) http2Config {
+	conf := http2Config{
+		MaxConcurrentStreams:         h2.MaxConcurrentStreams,
+		MaxEncoderHeaderTableSize:    h2.MaxEncoderHeaderTableSize,
+		MaxDecoderHeaderTableSize:    h2.MaxDecoderHeaderTableSize,
+		MaxReadFrameSize:             h2.MaxReadFrameSize,
+		MaxUploadBufferPerConnection: h2.MaxUploadBufferPerConnection,
+		MaxUploadBufferPerStream:     h2.MaxUploadBufferPerStream,
+		SendPingTimeout:              h2.ReadIdleTimeout,
+		PingTimeout:                  h2.PingTimeout,
+		WriteByteTimeout:             h2.WriteByteTimeout,
+		PermitProhibitedCipherSuites: h2.PermitProhibitedCipherSuites,
+		CountError:                   h2.CountError,
+	}
+	fillNetHTTPServerConfig(&conf, h1)
+	setConfigDefaults(&conf, true)
+	return conf
+}
+
+// configFromServer merges configuration settings from h2 and h2.t1.HTTP2
+// (the net/http Transport).
+func configFromTransport(h2 *Transport) http2Config {
+	conf := http2Config{
+		MaxEncoderHeaderTableSize: h2.MaxEncoderHeaderTableSize,
+		MaxDecoderHeaderTableSize: h2.MaxDecoderHeaderTableSize,
+		MaxReadFrameSize:          h2.MaxReadFrameSize,
+		SendPingTimeout:           h2.ReadIdleTimeout,
+		PingTimeout:               h2.PingTimeout,
+		WriteByteTimeout:          h2.WriteByteTimeout,
+	}
+
+	// Unlike most config fields, where out-of-range values revert to the default,
+	// Transport.MaxReadFrameSize clips.
+	if conf.MaxReadFrameSize < minMaxFrameSize {
+		conf.MaxReadFrameSize = minMaxFrameSize
+	} else if conf.MaxReadFrameSize > maxFrameSize {
+		conf.MaxReadFrameSize = maxFrameSize
+	}
+
+	if h2.t1 != nil {
+		fillNetHTTPTransportConfig(&conf, h2.t1)
+	}
+	setConfigDefaults(&conf, false)
+	return conf
+}
+
+func setDefault[T ~int | ~int32 | ~uint32 | ~int64](v *T, minval, maxval, defval T) {
+	if *v < minval || *v > maxval {
+		*v = defval
+	}
+}
+
+func setConfigDefaults(conf *http2Config, server bool) {
+	setDefault(&conf.MaxConcurrentStreams, 1, math.MaxUint32, defaultMaxStreams)
+	setDefault(&conf.MaxEncoderHeaderTableSize, 1, math.MaxUint32, initialHeaderTableSize)
+	setDefault(&conf.MaxDecoderHeaderTableSize, 1, math.MaxUint32, initialHeaderTableSize)
+	if server {
+		setDefault(&conf.MaxUploadBufferPerConnection, initialWindowSize, math.MaxInt32, 1<<20)
+	} else {
+		setDefault(&conf.MaxUploadBufferPerConnection, initialWindowSize, math.MaxInt32, transportDefaultConnFlow)
+	}
+	if server {
+		setDefault(&conf.MaxUploadBufferPerStream, 1, math.MaxInt32, 1<<20)
+	} else {
+		setDefault(&conf.MaxUploadBufferPerStream, 1, math.MaxInt32, transportDefaultStreamFlow)
+	}
+	setDefault(&conf.MaxReadFrameSize, minMaxFrameSize, maxFrameSize, defaultMaxReadFrameSize)
+	setDefault(&conf.PingTimeout, 1, math.MaxInt64, 15*time.Second)
+}
+
+// adjustHTTP1MaxHeaderSize converts a limit in bytes on the size of an HTTP/1 header
+// to an HTTP/2 MAX_HEADER_LIST_SIZE value.
+func adjustHTTP1MaxHeaderSize(n int64) int64 {
+	// http2's count is in a slightly different unit and includes 32 bytes per pair.
+	// So, take the net/http.Server value and pad it up a bit, assuming 10 headers.
+	const perFieldOverhead = 32 // per http2 spec
+	const typicalHeaders = 10   // conservative
+	return n + typicalHeaders*perFieldOverhead
+}
diff --git a/vendor/golang.org/x/net/http2/config_go124.go b/vendor/golang.org/x/net/http2/config_go124.go
new file mode 100644
index 0000000000000000000000000000000000000000..e3784123c81a66df451404c0197b584bff87a9c8
--- /dev/null
+++ b/vendor/golang.org/x/net/http2/config_go124.go
@@ -0,0 +1,61 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build go1.24
+
+package http2
+
+import "net/http"
+
+// fillNetHTTPServerConfig sets fields in conf from srv.HTTP2.
+func fillNetHTTPServerConfig(conf *http2Config, srv *http.Server) {
+	fillNetHTTPConfig(conf, srv.HTTP2)
+}
+
+// fillNetHTTPServerConfig sets fields in conf from tr.HTTP2.
+func fillNetHTTPTransportConfig(conf *http2Config, tr *http.Transport) {
+	fillNetHTTPConfig(conf, tr.HTTP2)
+}
+
+func fillNetHTTPConfig(conf *http2Config, h2 *http.HTTP2Config) {
+	if h2 == nil {
+		return
+	}
+	if h2.MaxConcurrentStreams != 0 {
+		conf.MaxConcurrentStreams = uint32(h2.MaxConcurrentStreams)
+	}
+	if h2.MaxEncoderHeaderTableSize != 0 {
+		conf.MaxEncoderHeaderTableSize = uint32(h2.MaxEncoderHeaderTableSize)
+	}
+	if h2.MaxDecoderHeaderTableSize != 0 {
+		conf.MaxDecoderHeaderTableSize = uint32(h2.MaxDecoderHeaderTableSize)
+	}
+	if h2.MaxConcurrentStreams != 0 {
+		conf.MaxConcurrentStreams = uint32(h2.MaxConcurrentStreams)
+	}
+	if h2.MaxReadFrameSize != 0 {
+		conf.MaxReadFrameSize = uint32(h2.MaxReadFrameSize)
+	}
+	if h2.MaxReceiveBufferPerConnection != 0 {
+		conf.MaxUploadBufferPerConnection = int32(h2.MaxReceiveBufferPerConnection)
+	}
+	if h2.MaxReceiveBufferPerStream != 0 {
+		conf.MaxUploadBufferPerStream = int32(h2.MaxReceiveBufferPerStream)
+	}
+	if h2.SendPingTimeout != 0 {
+		conf.SendPingTimeout = h2.SendPingTimeout
+	}
+	if h2.PingTimeout != 0 {
+		conf.PingTimeout = h2.PingTimeout
+	}
+	if h2.WriteByteTimeout != 0 {
+		conf.WriteByteTimeout = h2.WriteByteTimeout
+	}
+	if h2.PermitProhibitedCipherSuites {
+		conf.PermitProhibitedCipherSuites = true
+	}
+	if h2.CountError != nil {
+		conf.CountError = h2.CountError
+	}
+}
diff --git a/vendor/golang.org/x/net/http2/config_pre_go124.go b/vendor/golang.org/x/net/http2/config_pre_go124.go
new file mode 100644
index 0000000000000000000000000000000000000000..060fd6c64c6ca7f786f44cfd11ba601cbda9fc9a
--- /dev/null
+++ b/vendor/golang.org/x/net/http2/config_pre_go124.go
@@ -0,0 +1,16 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !go1.24
+
+package http2
+
+import "net/http"
+
+// Pre-Go 1.24 fallback.
+// The Server.HTTP2 and Transport.HTTP2 config fields were added in Go 1.24.
+
+func fillNetHTTPServerConfig(conf *http2Config, srv *http.Server) {}
+
+func fillNetHTTPTransportConfig(conf *http2Config, tr *http.Transport) {}
diff --git a/vendor/golang.org/x/net/http2/frame.go b/vendor/golang.org/x/net/http2/frame.go
index 105c3b279c0f41ae1a525a84fe011ceac7bc805f..81faec7e75d60f1854c9046cfcfd94cad3895664 100644
--- a/vendor/golang.org/x/net/http2/frame.go
+++ b/vendor/golang.org/x/net/http2/frame.go
@@ -1490,7 +1490,7 @@ func (mh *MetaHeadersFrame) checkPseudos() error {
 	pf := mh.PseudoFields()
 	for i, hf := range pf {
 		switch hf.Name {
-		case ":method", ":path", ":scheme", ":authority":
+		case ":method", ":path", ":scheme", ":authority", ":protocol":
 			isRequest = true
 		case ":status":
 			isResponse = true
@@ -1498,7 +1498,7 @@ func (mh *MetaHeadersFrame) checkPseudos() error {
 			return pseudoHeaderError(hf.Name)
 		}
 		// Check for duplicates.
-		// This would be a bad algorithm, but N is 4.
+		// This would be a bad algorithm, but N is 5.
 		// And this doesn't allocate.
 		for _, hf2 := range pf[:i] {
 			if hf.Name == hf2.Name {
diff --git a/vendor/golang.org/x/net/http2/http2.go b/vendor/golang.org/x/net/http2/http2.go
index 003e649f30c6cc1b39641e963719ea441b11963e..c7601c909ffb95d9a1987422099433b6d8a0bd5b 100644
--- a/vendor/golang.org/x/net/http2/http2.go
+++ b/vendor/golang.org/x/net/http2/http2.go
@@ -19,8 +19,9 @@ import (
 	"bufio"
 	"context"
 	"crypto/tls"
+	"errors"
 	"fmt"
-	"io"
+	"net"
 	"net/http"
 	"os"
 	"sort"
@@ -33,10 +34,11 @@ import (
 )
 
 var (
-	VerboseLogs    bool
-	logFrameWrites bool
-	logFrameReads  bool
-	inTests        bool
+	VerboseLogs                    bool
+	logFrameWrites                 bool
+	logFrameReads                  bool
+	inTests                        bool
+	disableExtendedConnectProtocol bool
 )
 
 func init() {
@@ -49,6 +51,9 @@ func init() {
 		logFrameWrites = true
 		logFrameReads = true
 	}
+	if strings.Contains(e, "http2xconnect=0") {
+		disableExtendedConnectProtocol = true
+	}
 }
 
 const (
@@ -140,6 +145,10 @@ func (s Setting) Valid() error {
 		if s.Val < 16384 || s.Val > 1<<24-1 {
 			return ConnectionError(ErrCodeProtocol)
 		}
+	case SettingEnableConnectProtocol:
+		if s.Val != 1 && s.Val != 0 {
+			return ConnectionError(ErrCodeProtocol)
+		}
 	}
 	return nil
 }
@@ -149,21 +158,23 @@ func (s Setting) Valid() error {
 type SettingID uint16
 
 const (
-	SettingHeaderTableSize      SettingID = 0x1
-	SettingEnablePush           SettingID = 0x2
-	SettingMaxConcurrentStreams SettingID = 0x3
-	SettingInitialWindowSize    SettingID = 0x4
-	SettingMaxFrameSize         SettingID = 0x5
-	SettingMaxHeaderListSize    SettingID = 0x6
+	SettingHeaderTableSize       SettingID = 0x1
+	SettingEnablePush            SettingID = 0x2
+	SettingMaxConcurrentStreams  SettingID = 0x3
+	SettingInitialWindowSize     SettingID = 0x4
+	SettingMaxFrameSize          SettingID = 0x5
+	SettingMaxHeaderListSize     SettingID = 0x6
+	SettingEnableConnectProtocol SettingID = 0x8
 )
 
 var settingName = map[SettingID]string{
-	SettingHeaderTableSize:      "HEADER_TABLE_SIZE",
-	SettingEnablePush:           "ENABLE_PUSH",
-	SettingMaxConcurrentStreams: "MAX_CONCURRENT_STREAMS",
-	SettingInitialWindowSize:    "INITIAL_WINDOW_SIZE",
-	SettingMaxFrameSize:         "MAX_FRAME_SIZE",
-	SettingMaxHeaderListSize:    "MAX_HEADER_LIST_SIZE",
+	SettingHeaderTableSize:       "HEADER_TABLE_SIZE",
+	SettingEnablePush:            "ENABLE_PUSH",
+	SettingMaxConcurrentStreams:  "MAX_CONCURRENT_STREAMS",
+	SettingInitialWindowSize:     "INITIAL_WINDOW_SIZE",
+	SettingMaxFrameSize:          "MAX_FRAME_SIZE",
+	SettingMaxHeaderListSize:     "MAX_HEADER_LIST_SIZE",
+	SettingEnableConnectProtocol: "ENABLE_CONNECT_PROTOCOL",
 }
 
 func (s SettingID) String() string {
@@ -237,13 +248,19 @@ func (cw closeWaiter) Wait() {
 // Its buffered writer is lazily allocated as needed, to minimize
 // idle memory usage with many connections.
 type bufferedWriter struct {
-	_  incomparable
-	w  io.Writer     // immutable
-	bw *bufio.Writer // non-nil when data is buffered
+	_           incomparable
+	group       synctestGroupInterface // immutable
+	conn        net.Conn               // immutable
+	bw          *bufio.Writer          // non-nil when data is buffered
+	byteTimeout time.Duration          // immutable, WriteByteTimeout
 }
 
-func newBufferedWriter(w io.Writer) *bufferedWriter {
-	return &bufferedWriter{w: w}
+func newBufferedWriter(group synctestGroupInterface, conn net.Conn, timeout time.Duration) *bufferedWriter {
+	return &bufferedWriter{
+		group:       group,
+		conn:        conn,
+		byteTimeout: timeout,
+	}
 }
 
 // bufWriterPoolBufferSize is the size of bufio.Writer's
@@ -270,7 +287,7 @@ func (w *bufferedWriter) Available() int {
 func (w *bufferedWriter) Write(p []byte) (n int, err error) {
 	if w.bw == nil {
 		bw := bufWriterPool.Get().(*bufio.Writer)
-		bw.Reset(w.w)
+		bw.Reset((*bufferedWriterTimeoutWriter)(w))
 		w.bw = bw
 	}
 	return w.bw.Write(p)
@@ -288,6 +305,38 @@ func (w *bufferedWriter) Flush() error {
 	return err
 }
 
+type bufferedWriterTimeoutWriter bufferedWriter
+
+func (w *bufferedWriterTimeoutWriter) Write(p []byte) (n int, err error) {
+	return writeWithByteTimeout(w.group, w.conn, w.byteTimeout, p)
+}
+
+// writeWithByteTimeout writes to conn.
+// If more than timeout passes without any bytes being written to the connection,
+// the write fails.
+func writeWithByteTimeout(group synctestGroupInterface, conn net.Conn, timeout time.Duration, p []byte) (n int, err error) {
+	if timeout <= 0 {
+		return conn.Write(p)
+	}
+	for {
+		var now time.Time
+		if group == nil {
+			now = time.Now()
+		} else {
+			now = group.Now()
+		}
+		conn.SetWriteDeadline(now.Add(timeout))
+		nn, err := conn.Write(p[n:])
+		n += nn
+		if n == len(p) || nn == 0 || !errors.Is(err, os.ErrDeadlineExceeded) {
+			// Either we finished the write, made no progress, or hit the deadline.
+			// Whichever it is, we're done now.
+			conn.SetWriteDeadline(time.Time{})
+			return n, err
+		}
+	}
+}
+
 func mustUint31(v int32) uint32 {
 	if v < 0 || v > 2147483647 {
 		panic("out of range")
diff --git a/vendor/golang.org/x/net/http2/server.go b/vendor/golang.org/x/net/http2/server.go
index 6c349f3ec647381bb63890955968351620fd9b12..b55547aec64032318cd21825392250f6b6bb673c 100644
--- a/vendor/golang.org/x/net/http2/server.go
+++ b/vendor/golang.org/x/net/http2/server.go
@@ -29,6 +29,7 @@ import (
 	"bufio"
 	"bytes"
 	"context"
+	"crypto/rand"
 	"crypto/tls"
 	"errors"
 	"fmt"
@@ -52,10 +53,14 @@ import (
 )
 
 const (
-	prefaceTimeout         = 10 * time.Second
-	firstSettingsTimeout   = 2 * time.Second // should be in-flight with preface anyway
-	handlerChunkWriteSize  = 4 << 10
-	defaultMaxStreams      = 250 // TODO: make this 100 as the GFE seems to?
+	prefaceTimeout        = 10 * time.Second
+	firstSettingsTimeout  = 2 * time.Second // should be in-flight with preface anyway
+	handlerChunkWriteSize = 4 << 10
+	defaultMaxStreams     = 250 // TODO: make this 100 as the GFE seems to?
+
+	// maxQueuedControlFrames is the maximum number of control frames like
+	// SETTINGS, PING and RST_STREAM that will be queued for writing before
+	// the connection is closed to prevent memory exhaustion attacks.
 	maxQueuedControlFrames = 10000
 )
 
@@ -127,6 +132,22 @@ type Server struct {
 	// If zero or negative, there is no timeout.
 	IdleTimeout time.Duration
 
+	// ReadIdleTimeout is the timeout after which a health check using a ping
+	// frame will be carried out if no frame is received on the connection.
+	// If zero, no health check is performed.
+	ReadIdleTimeout time.Duration
+
+	// PingTimeout is the timeout after which the connection will be closed
+	// if a response to a ping is not received.
+	// If zero, a default of 15 seconds is used.
+	PingTimeout time.Duration
+
+	// WriteByteTimeout is the timeout after which a connection will be
+	// closed if no data can be written to it. The timeout begins when data is
+	// available to write, and is extended whenever any bytes are written.
+	// If zero or negative, there is no timeout.
+	WriteByteTimeout time.Duration
+
 	// MaxUploadBufferPerConnection is the size of the initial flow
 	// control window for each connections. The HTTP/2 spec does not
 	// allow this to be smaller than 65535 or larger than 2^32-1.
@@ -189,57 +210,6 @@ func (s *Server) afterFunc(d time.Duration, f func()) timer {
 	return timeTimer{time.AfterFunc(d, f)}
 }
 
-func (s *Server) initialConnRecvWindowSize() int32 {
-	if s.MaxUploadBufferPerConnection >= initialWindowSize {
-		return s.MaxUploadBufferPerConnection
-	}
-	return 1 << 20
-}
-
-func (s *Server) initialStreamRecvWindowSize() int32 {
-	if s.MaxUploadBufferPerStream > 0 {
-		return s.MaxUploadBufferPerStream
-	}
-	return 1 << 20
-}
-
-func (s *Server) maxReadFrameSize() uint32 {
-	if v := s.MaxReadFrameSize; v >= minMaxFrameSize && v <= maxFrameSize {
-		return v
-	}
-	return defaultMaxReadFrameSize
-}
-
-func (s *Server) maxConcurrentStreams() uint32 {
-	if v := s.MaxConcurrentStreams; v > 0 {
-		return v
-	}
-	return defaultMaxStreams
-}
-
-func (s *Server) maxDecoderHeaderTableSize() uint32 {
-	if v := s.MaxDecoderHeaderTableSize; v > 0 {
-		return v
-	}
-	return initialHeaderTableSize
-}
-
-func (s *Server) maxEncoderHeaderTableSize() uint32 {
-	if v := s.MaxEncoderHeaderTableSize; v > 0 {
-		return v
-	}
-	return initialHeaderTableSize
-}
-
-// maxQueuedControlFrames is the maximum number of control frames like
-// SETTINGS, PING and RST_STREAM that will be queued for writing before
-// the connection is closed to prevent memory exhaustion attacks.
-func (s *Server) maxQueuedControlFrames() int {
-	// TODO: if anybody asks, add a Server field, and remember to define the
-	// behavior of negative values.
-	return maxQueuedControlFrames
-}
-
 type serverInternalState struct {
 	mu          sync.Mutex
 	activeConns map[*serverConn]struct{}
@@ -336,7 +306,7 @@ func ConfigureServer(s *http.Server, conf *Server) error {
 	if s.TLSNextProto == nil {
 		s.TLSNextProto = map[string]func(*http.Server, *tls.Conn, http.Handler){}
 	}
-	protoHandler := func(hs *http.Server, c *tls.Conn, h http.Handler) {
+	protoHandler := func(hs *http.Server, c net.Conn, h http.Handler, sawClientPreface bool) {
 		if testHookOnConn != nil {
 			testHookOnConn()
 		}
@@ -353,12 +323,31 @@ func ConfigureServer(s *http.Server, conf *Server) error {
 			ctx = bc.BaseContext()
 		}
 		conf.ServeConn(c, &ServeConnOpts{
-			Context:    ctx,
-			Handler:    h,
-			BaseConfig: hs,
+			Context:          ctx,
+			Handler:          h,
+			BaseConfig:       hs,
+			SawClientPreface: sawClientPreface,
 		})
 	}
-	s.TLSNextProto[NextProtoTLS] = protoHandler
+	s.TLSNextProto[NextProtoTLS] = func(hs *http.Server, c *tls.Conn, h http.Handler) {
+		protoHandler(hs, c, h, false)
+	}
+	// The "unencrypted_http2" TLSNextProto key is used to pass off non-TLS HTTP/2 conns.
+	//
+	// A connection passed in this method has already had the HTTP/2 preface read from it.
+	s.TLSNextProto[nextProtoUnencryptedHTTP2] = func(hs *http.Server, c *tls.Conn, h http.Handler) {
+		nc, err := unencryptedNetConnFromTLSConn(c)
+		if err != nil {
+			if lg := hs.ErrorLog; lg != nil {
+				lg.Print(err)
+			} else {
+				log.Print(err)
+			}
+			go c.Close()
+			return
+		}
+		protoHandler(hs, nc, h, true)
+	}
 	return nil
 }
 
@@ -440,13 +429,15 @@ func (s *Server) serveConn(c net.Conn, opts *ServeConnOpts, newf func(*serverCon
 	baseCtx, cancel := serverConnBaseContext(c, opts)
 	defer cancel()
 
+	http1srv := opts.baseConfig()
+	conf := configFromServer(http1srv, s)
 	sc := &serverConn{
 		srv:                         s,
-		hs:                          opts.baseConfig(),
+		hs:                          http1srv,
 		conn:                        c,
 		baseCtx:                     baseCtx,
 		remoteAddrStr:               c.RemoteAddr().String(),
-		bw:                          newBufferedWriter(c),
+		bw:                          newBufferedWriter(s.group, c, conf.WriteByteTimeout),
 		handler:                     opts.handler(),
 		streams:                     make(map[uint32]*stream),
 		readFrameCh:                 make(chan readFrameResult),
@@ -456,9 +447,12 @@ func (s *Server) serveConn(c net.Conn, opts *ServeConnOpts, newf func(*serverCon
 		bodyReadCh:                  make(chan bodyReadMsg),         // buffering doesn't matter either way
 		doneServing:                 make(chan struct{}),
 		clientMaxStreams:            math.MaxUint32, // Section 6.5.2: "Initially, there is no limit to this value"
-		advMaxStreams:               s.maxConcurrentStreams(),
+		advMaxStreams:               conf.MaxConcurrentStreams,
 		initialStreamSendWindowSize: initialWindowSize,
+		initialStreamRecvWindowSize: conf.MaxUploadBufferPerStream,
 		maxFrameSize:                initialMaxFrameSize,
+		pingTimeout:                 conf.PingTimeout,
+		countErrorFunc:              conf.CountError,
 		serveG:                      newGoroutineLock(),
 		pushEnabled:                 true,
 		sawClientPreface:            opts.SawClientPreface,
@@ -491,15 +485,15 @@ func (s *Server) serveConn(c net.Conn, opts *ServeConnOpts, newf func(*serverCon
 	sc.flow.add(initialWindowSize)
 	sc.inflow.init(initialWindowSize)
 	sc.hpackEncoder = hpack.NewEncoder(&sc.headerWriteBuf)
-	sc.hpackEncoder.SetMaxDynamicTableSizeLimit(s.maxEncoderHeaderTableSize())
+	sc.hpackEncoder.SetMaxDynamicTableSizeLimit(conf.MaxEncoderHeaderTableSize)
 
 	fr := NewFramer(sc.bw, c)
-	if s.CountError != nil {
-		fr.countError = s.CountError
+	if conf.CountError != nil {
+		fr.countError = conf.CountError
 	}
-	fr.ReadMetaHeaders = hpack.NewDecoder(s.maxDecoderHeaderTableSize(), nil)
+	fr.ReadMetaHeaders = hpack.NewDecoder(conf.MaxDecoderHeaderTableSize, nil)
 	fr.MaxHeaderListSize = sc.maxHeaderListSize()
-	fr.SetMaxReadFrameSize(s.maxReadFrameSize())
+	fr.SetMaxReadFrameSize(conf.MaxReadFrameSize)
 	sc.framer = fr
 
 	if tc, ok := c.(connectionStater); ok {
@@ -532,7 +526,7 @@ func (s *Server) serveConn(c net.Conn, opts *ServeConnOpts, newf func(*serverCon
 			// So for now, do nothing here again.
 		}
 
-		if !s.PermitProhibitedCipherSuites && isBadCipher(sc.tlsState.CipherSuite) {
+		if !conf.PermitProhibitedCipherSuites && isBadCipher(sc.tlsState.CipherSuite) {
 			// "Endpoints MAY choose to generate a connection error
 			// (Section 5.4.1) of type INADEQUATE_SECURITY if one of
 			// the prohibited cipher suites are negotiated."
@@ -569,7 +563,7 @@ func (s *Server) serveConn(c net.Conn, opts *ServeConnOpts, newf func(*serverCon
 		opts.UpgradeRequest = nil
 	}
 
-	sc.serve()
+	sc.serve(conf)
 }
 
 func serverConnBaseContext(c net.Conn, opts *ServeConnOpts) (ctx context.Context, cancel func()) {
@@ -609,6 +603,7 @@ type serverConn struct {
 	tlsState         *tls.ConnectionState   // shared by all handlers, like net/http
 	remoteAddrStr    string
 	writeSched       WriteScheduler
+	countErrorFunc   func(errType string)
 
 	// Everything following is owned by the serve loop; use serveG.check():
 	serveG                      goroutineLock // used to verify funcs are on serve()
@@ -628,6 +623,7 @@ type serverConn struct {
 	streams                     map[uint32]*stream
 	unstartedHandlers           []unstartedHandler
 	initialStreamSendWindowSize int32
+	initialStreamRecvWindowSize int32
 	maxFrameSize                int32
 	peerMaxHeaderListSize       uint32            // zero means unknown (default)
 	canonHeader                 map[string]string // http2-lower-case -> Go-Canonical-Case
@@ -638,9 +634,14 @@ type serverConn struct {
 	inGoAway                    bool              // we've started to or sent GOAWAY
 	inFrameScheduleLoop         bool              // whether we're in the scheduleFrameWrite loop
 	needToSendGoAway            bool              // we need to schedule a GOAWAY frame write
+	pingSent                    bool
+	sentPingData                [8]byte
 	goAwayCode                  ErrCode
 	shutdownTimer               timer // nil until used
 	idleTimer                   timer // nil if unused
+	readIdleTimeout             time.Duration
+	pingTimeout                 time.Duration
+	readIdleTimer               timer // nil if unused
 
 	// Owned by the writeFrameAsync goroutine:
 	headerWriteBuf bytes.Buffer
@@ -655,11 +656,7 @@ func (sc *serverConn) maxHeaderListSize() uint32 {
 	if n <= 0 {
 		n = http.DefaultMaxHeaderBytes
 	}
-	// http2's count is in a slightly different unit and includes 32 bytes per pair.
-	// So, take the net/http.Server value and pad it up a bit, assuming 10 headers.
-	const perFieldOverhead = 32 // per http2 spec
-	const typicalHeaders = 10   // conservative
-	return uint32(n + typicalHeaders*perFieldOverhead)
+	return uint32(adjustHTTP1MaxHeaderSize(int64(n)))
 }
 
 func (sc *serverConn) curOpenStreams() uint32 {
@@ -923,7 +920,7 @@ func (sc *serverConn) notePanic() {
 	}
 }
 
-func (sc *serverConn) serve() {
+func (sc *serverConn) serve(conf http2Config) {
 	sc.serveG.check()
 	defer sc.notePanic()
 	defer sc.conn.Close()
@@ -935,20 +932,24 @@ func (sc *serverConn) serve() {
 		sc.vlogf("http2: server connection from %v on %p", sc.conn.RemoteAddr(), sc.hs)
 	}
 
+	settings := writeSettings{
+		{SettingMaxFrameSize, conf.MaxReadFrameSize},
+		{SettingMaxConcurrentStreams, sc.advMaxStreams},
+		{SettingMaxHeaderListSize, sc.maxHeaderListSize()},
+		{SettingHeaderTableSize, conf.MaxDecoderHeaderTableSize},
+		{SettingInitialWindowSize, uint32(sc.initialStreamRecvWindowSize)},
+	}
+	if !disableExtendedConnectProtocol {
+		settings = append(settings, Setting{SettingEnableConnectProtocol, 1})
+	}
 	sc.writeFrame(FrameWriteRequest{
-		write: writeSettings{
-			{SettingMaxFrameSize, sc.srv.maxReadFrameSize()},
-			{SettingMaxConcurrentStreams, sc.advMaxStreams},
-			{SettingMaxHeaderListSize, sc.maxHeaderListSize()},
-			{SettingHeaderTableSize, sc.srv.maxDecoderHeaderTableSize()},
-			{SettingInitialWindowSize, uint32(sc.srv.initialStreamRecvWindowSize())},
-		},
+		write: settings,
 	})
 	sc.unackedSettings++
 
 	// Each connection starts with initialWindowSize inflow tokens.
 	// If a higher value is configured, we add more tokens.
-	if diff := sc.srv.initialConnRecvWindowSize() - initialWindowSize; diff > 0 {
+	if diff := conf.MaxUploadBufferPerConnection - initialWindowSize; diff > 0 {
 		sc.sendWindowUpdate(nil, int(diff))
 	}
 
@@ -968,11 +969,18 @@ func (sc *serverConn) serve() {
 		defer sc.idleTimer.Stop()
 	}
 
+	if conf.SendPingTimeout > 0 {
+		sc.readIdleTimeout = conf.SendPingTimeout
+		sc.readIdleTimer = sc.srv.afterFunc(conf.SendPingTimeout, sc.onReadIdleTimer)
+		defer sc.readIdleTimer.Stop()
+	}
+
 	go sc.readFrames() // closed by defer sc.conn.Close above
 
 	settingsTimer := sc.srv.afterFunc(firstSettingsTimeout, sc.onSettingsTimer)
 	defer settingsTimer.Stop()
 
+	lastFrameTime := sc.srv.now()
 	loopNum := 0
 	for {
 		loopNum++
@@ -986,6 +994,7 @@ func (sc *serverConn) serve() {
 		case res := <-sc.wroteFrameCh:
 			sc.wroteFrame(res)
 		case res := <-sc.readFrameCh:
+			lastFrameTime = sc.srv.now()
 			// Process any written frames before reading new frames from the client since a
 			// written frame could have triggered a new stream to be started.
 			if sc.writingFrameAsync {
@@ -1017,6 +1026,8 @@ func (sc *serverConn) serve() {
 				case idleTimerMsg:
 					sc.vlogf("connection is idle")
 					sc.goAway(ErrCodeNo)
+				case readIdleTimerMsg:
+					sc.handlePingTimer(lastFrameTime)
 				case shutdownTimerMsg:
 					sc.vlogf("GOAWAY close timer fired; closing conn from %v", sc.conn.RemoteAddr())
 					return
@@ -1039,7 +1050,7 @@ func (sc *serverConn) serve() {
 		// If the peer is causing us to generate a lot of control frames,
 		// but not reading them from us, assume they are trying to make us
 		// run out of memory.
-		if sc.queuedControlFrames > sc.srv.maxQueuedControlFrames() {
+		if sc.queuedControlFrames > maxQueuedControlFrames {
 			sc.vlogf("http2: too many control frames in send queue, closing connection")
 			return
 		}
@@ -1055,12 +1066,39 @@ func (sc *serverConn) serve() {
 	}
 }
 
+func (sc *serverConn) handlePingTimer(lastFrameReadTime time.Time) {
+	if sc.pingSent {
+		sc.vlogf("timeout waiting for PING response")
+		sc.conn.Close()
+		return
+	}
+
+	pingAt := lastFrameReadTime.Add(sc.readIdleTimeout)
+	now := sc.srv.now()
+	if pingAt.After(now) {
+		// We received frames since arming the ping timer.
+		// Reset it for the next possible timeout.
+		sc.readIdleTimer.Reset(pingAt.Sub(now))
+		return
+	}
+
+	sc.pingSent = true
+	// Ignore crypto/rand.Read errors: It generally can't fail, and worse case if it does
+	// is we send a PING frame containing 0s.
+	_, _ = rand.Read(sc.sentPingData[:])
+	sc.writeFrame(FrameWriteRequest{
+		write: &writePing{data: sc.sentPingData},
+	})
+	sc.readIdleTimer.Reset(sc.pingTimeout)
+}
+
 type serverMessage int
 
 // Message values sent to serveMsgCh.
 var (
 	settingsTimerMsg    = new(serverMessage)
 	idleTimerMsg        = new(serverMessage)
+	readIdleTimerMsg    = new(serverMessage)
 	shutdownTimerMsg    = new(serverMessage)
 	gracefulShutdownMsg = new(serverMessage)
 	handlerDoneMsg      = new(serverMessage)
@@ -1068,6 +1106,7 @@ var (
 
 func (sc *serverConn) onSettingsTimer() { sc.sendServeMsg(settingsTimerMsg) }
 func (sc *serverConn) onIdleTimer()     { sc.sendServeMsg(idleTimerMsg) }
+func (sc *serverConn) onReadIdleTimer() { sc.sendServeMsg(readIdleTimerMsg) }
 func (sc *serverConn) onShutdownTimer() { sc.sendServeMsg(shutdownTimerMsg) }
 
 func (sc *serverConn) sendServeMsg(msg interface{}) {
@@ -1320,6 +1359,10 @@ func (sc *serverConn) wroteFrame(res frameWriteResult) {
 	sc.writingFrame = false
 	sc.writingFrameAsync = false
 
+	if res.err != nil {
+		sc.conn.Close()
+	}
+
 	wr := res.wr
 
 	if writeEndsStream(wr.write) {
@@ -1594,6 +1637,11 @@ func (sc *serverConn) processFrame(f Frame) error {
 func (sc *serverConn) processPing(f *PingFrame) error {
 	sc.serveG.check()
 	if f.IsAck() {
+		if sc.pingSent && sc.sentPingData == f.Data {
+			// This is a response to a PING we sent.
+			sc.pingSent = false
+			sc.readIdleTimer.Reset(sc.readIdleTimeout)
+		}
 		// 6.7 PING: " An endpoint MUST NOT respond to PING frames
 		// containing this flag."
 		return nil
@@ -1757,6 +1805,9 @@ func (sc *serverConn) processSetting(s Setting) error {
 		sc.maxFrameSize = int32(s.Val) // the maximum valid s.Val is < 2^31
 	case SettingMaxHeaderListSize:
 		sc.peerMaxHeaderListSize = s.Val
+	case SettingEnableConnectProtocol:
+		// Receipt of this parameter by a server does not
+		// have any impact
 	default:
 		// Unknown setting: "An endpoint that receives a SETTINGS
 		// frame with any unknown or unsupported identifier MUST
@@ -2160,7 +2211,7 @@ func (sc *serverConn) newStream(id, pusherID uint32, state streamState) *stream
 	st.cw.Init()
 	st.flow.conn = &sc.flow // link to conn-level counter
 	st.flow.add(sc.initialStreamSendWindowSize)
-	st.inflow.init(sc.srv.initialStreamRecvWindowSize())
+	st.inflow.init(sc.initialStreamRecvWindowSize)
 	if sc.hs.WriteTimeout > 0 {
 		st.writeDeadline = sc.srv.afterFunc(sc.hs.WriteTimeout, st.onWriteTimeout)
 	}
@@ -2187,11 +2238,17 @@ func (sc *serverConn) newWriterAndRequest(st *stream, f *MetaHeadersFrame) (*res
 		scheme:    f.PseudoValue("scheme"),
 		authority: f.PseudoValue("authority"),
 		path:      f.PseudoValue("path"),
+		protocol:  f.PseudoValue("protocol"),
+	}
+
+	// extended connect is disabled, so we should not see :protocol
+	if disableExtendedConnectProtocol && rp.protocol != "" {
+		return nil, nil, sc.countError("bad_connect", streamError(f.StreamID, ErrCodeProtocol))
 	}
 
 	isConnect := rp.method == "CONNECT"
 	if isConnect {
-		if rp.path != "" || rp.scheme != "" || rp.authority == "" {
+		if rp.protocol == "" && (rp.path != "" || rp.scheme != "" || rp.authority == "") {
 			return nil, nil, sc.countError("bad_connect", streamError(f.StreamID, ErrCodeProtocol))
 		}
 	} else if rp.method == "" || rp.path == "" || (rp.scheme != "https" && rp.scheme != "http") {
@@ -2215,6 +2272,9 @@ func (sc *serverConn) newWriterAndRequest(st *stream, f *MetaHeadersFrame) (*res
 	if rp.authority == "" {
 		rp.authority = rp.header.Get("Host")
 	}
+	if rp.protocol != "" {
+		rp.header.Set(":protocol", rp.protocol)
+	}
 
 	rw, req, err := sc.newWriterAndRequestNoBody(st, rp)
 	if err != nil {
@@ -2241,6 +2301,7 @@ func (sc *serverConn) newWriterAndRequest(st *stream, f *MetaHeadersFrame) (*res
 type requestParam struct {
 	method                  string
 	scheme, authority, path string
+	protocol                string
 	header                  http.Header
 }
 
@@ -2282,7 +2343,7 @@ func (sc *serverConn) newWriterAndRequestNoBody(st *stream, rp requestParam) (*r
 
 	var url_ *url.URL
 	var requestURI string
-	if rp.method == "CONNECT" {
+	if rp.method == "CONNECT" && rp.protocol == "" {
 		url_ = &url.URL{Host: rp.authority}
 		requestURI = rp.authority // mimic HTTP/1 server behavior
 	} else {
@@ -2855,6 +2916,11 @@ func (w *responseWriter) SetWriteDeadline(deadline time.Time) error {
 	return nil
 }
 
+func (w *responseWriter) EnableFullDuplex() error {
+	// We always support full duplex responses, so this is a no-op.
+	return nil
+}
+
 func (w *responseWriter) Flush() {
 	w.FlushError()
 }
@@ -3301,7 +3367,7 @@ func (sc *serverConn) countError(name string, err error) error {
 	if sc == nil || sc.srv == nil {
 		return err
 	}
-	f := sc.srv.CountError
+	f := sc.countErrorFunc
 	if f == nil {
 		return err
 	}
diff --git a/vendor/golang.org/x/net/http2/transport.go b/vendor/golang.org/x/net/http2/transport.go
index 61f511f97aa44034f629eaef554baeaa880d75f5..090d0e1bdb5de6ee67f1a519c2c5dd23f5c21020 100644
--- a/vendor/golang.org/x/net/http2/transport.go
+++ b/vendor/golang.org/x/net/http2/transport.go
@@ -25,7 +25,6 @@ import (
 	"net/http"
 	"net/http/httptrace"
 	"net/textproto"
-	"os"
 	"sort"
 	"strconv"
 	"strings"
@@ -203,6 +202,20 @@ func (t *Transport) markNewGoroutine() {
 	}
 }
 
+func (t *Transport) now() time.Time {
+	if t != nil && t.transportTestHooks != nil {
+		return t.transportTestHooks.group.Now()
+	}
+	return time.Now()
+}
+
+func (t *Transport) timeSince(when time.Time) time.Duration {
+	if t != nil && t.transportTestHooks != nil {
+		return t.now().Sub(when)
+	}
+	return time.Since(when)
+}
+
 // newTimer creates a new time.Timer, or a synthetic timer in tests.
 func (t *Transport) newTimer(d time.Duration) timer {
 	if t.transportTestHooks != nil {
@@ -227,40 +240,26 @@ func (t *Transport) contextWithTimeout(ctx context.Context, d time.Duration) (co
 }
 
 func (t *Transport) maxHeaderListSize() uint32 {
-	if t.MaxHeaderListSize == 0 {
+	n := int64(t.MaxHeaderListSize)
+	if t.t1 != nil && t.t1.MaxResponseHeaderBytes != 0 {
+		n = t.t1.MaxResponseHeaderBytes
+		if n > 0 {
+			n = adjustHTTP1MaxHeaderSize(n)
+		}
+	}
+	if n <= 0 {
 		return 10 << 20
 	}
-	if t.MaxHeaderListSize == 0xffffffff {
+	if n >= 0xffffffff {
 		return 0
 	}
-	return t.MaxHeaderListSize
-}
-
-func (t *Transport) maxFrameReadSize() uint32 {
-	if t.MaxReadFrameSize == 0 {
-		return 0 // use the default provided by the peer
-	}
-	if t.MaxReadFrameSize < minMaxFrameSize {
-		return minMaxFrameSize
-	}
-	if t.MaxReadFrameSize > maxFrameSize {
-		return maxFrameSize
-	}
-	return t.MaxReadFrameSize
+	return uint32(n)
 }
 
 func (t *Transport) disableCompression() bool {
 	return t.DisableCompression || (t.t1 != nil && t.t1.DisableCompression)
 }
 
-func (t *Transport) pingTimeout() time.Duration {
-	if t.PingTimeout == 0 {
-		return 15 * time.Second
-	}
-	return t.PingTimeout
-
-}
-
 // ConfigureTransport configures a net/http HTTP/1 Transport to use HTTP/2.
 // It returns an error if t1 has already been HTTP/2-enabled.
 //
@@ -296,8 +295,8 @@ func configureTransports(t1 *http.Transport) (*Transport, error) {
 	if !strSliceContains(t1.TLSClientConfig.NextProtos, "http/1.1") {
 		t1.TLSClientConfig.NextProtos = append(t1.TLSClientConfig.NextProtos, "http/1.1")
 	}
-	upgradeFn := func(authority string, c *tls.Conn) http.RoundTripper {
-		addr := authorityAddr("https", authority)
+	upgradeFn := func(scheme, authority string, c net.Conn) http.RoundTripper {
+		addr := authorityAddr(scheme, authority)
 		if used, err := connPool.addConnIfNeeded(addr, t2, c); err != nil {
 			go c.Close()
 			return erringRoundTripper{err}
@@ -308,18 +307,37 @@ func configureTransports(t1 *http.Transport) (*Transport, error) {
 			// was unknown)
 			go c.Close()
 		}
+		if scheme == "http" {
+			return (*unencryptedTransport)(t2)
+		}
 		return t2
 	}
-	if m := t1.TLSNextProto; len(m) == 0 {
-		t1.TLSNextProto = map[string]func(string, *tls.Conn) http.RoundTripper{
-			"h2": upgradeFn,
+	if t1.TLSNextProto == nil {
+		t1.TLSNextProto = make(map[string]func(string, *tls.Conn) http.RoundTripper)
+	}
+	t1.TLSNextProto[NextProtoTLS] = func(authority string, c *tls.Conn) http.RoundTripper {
+		return upgradeFn("https", authority, c)
+	}
+	// The "unencrypted_http2" TLSNextProto key is used to pass off non-TLS HTTP/2 conns.
+	t1.TLSNextProto[nextProtoUnencryptedHTTP2] = func(authority string, c *tls.Conn) http.RoundTripper {
+		nc, err := unencryptedNetConnFromTLSConn(c)
+		if err != nil {
+			go c.Close()
+			return erringRoundTripper{err}
 		}
-	} else {
-		m["h2"] = upgradeFn
+		return upgradeFn("http", authority, nc)
 	}
 	return t2, nil
 }
 
+// unencryptedTransport is a Transport with a RoundTrip method that
+// always permits http:// URLs.
+type unencryptedTransport Transport
+
+func (t *unencryptedTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+	return (*Transport)(t).RoundTripOpt(req, RoundTripOpt{allowHTTP: true})
+}
+
 func (t *Transport) connPool() ClientConnPool {
 	t.connPoolOnce.Do(t.initConnPool)
 	return t.connPoolOrDef
@@ -339,7 +357,7 @@ type ClientConn struct {
 	t             *Transport
 	tconn         net.Conn             // usually *tls.Conn, except specialized impls
 	tlsState      *tls.ConnectionState // nil only for specialized impls
-	reused        uint32               // whether conn is being reused; atomic
+	atomicReused  uint32               // whether conn is being reused; atomic
 	singleUse     bool                 // whether being used for a single http.Request
 	getConnCalled bool                 // used by clientConnPool
 
@@ -350,31 +368,54 @@ type ClientConn struct {
 	idleTimeout time.Duration // or 0 for never
 	idleTimer   timer
 
-	mu              sync.Mutex // guards following
-	cond            *sync.Cond // hold mu; broadcast on flow/closed changes
-	flow            outflow    // our conn-level flow control quota (cs.outflow is per stream)
-	inflow          inflow     // peer's conn-level flow control
-	doNotReuse      bool       // whether conn is marked to not be reused for any future requests
-	closing         bool
-	closed          bool
-	seenSettings    bool                     // true if we've seen a settings frame, false otherwise
-	wantSettingsAck bool                     // we sent a SETTINGS frame and haven't heard back
-	goAway          *GoAwayFrame             // if non-nil, the GoAwayFrame we received
-	goAwayDebug     string                   // goAway frame's debug data, retained as a string
-	streams         map[uint32]*clientStream // client-initiated
-	streamsReserved int                      // incr by ReserveNewRequest; decr on RoundTrip
-	nextStreamID    uint32
-	pendingRequests int                       // requests blocked and waiting to be sent because len(streams) == maxConcurrentStreams
-	pings           map[[8]byte]chan struct{} // in flight ping data to notification channel
-	br              *bufio.Reader
-	lastActive      time.Time
-	lastIdle        time.Time // time last idle
+	mu               sync.Mutex // guards following
+	cond             *sync.Cond // hold mu; broadcast on flow/closed changes
+	flow             outflow    // our conn-level flow control quota (cs.outflow is per stream)
+	inflow           inflow     // peer's conn-level flow control
+	doNotReuse       bool       // whether conn is marked to not be reused for any future requests
+	closing          bool
+	closed           bool
+	seenSettings     bool                     // true if we've seen a settings frame, false otherwise
+	seenSettingsChan chan struct{}            // closed when seenSettings is true or frame reading fails
+	wantSettingsAck  bool                     // we sent a SETTINGS frame and haven't heard back
+	goAway           *GoAwayFrame             // if non-nil, the GoAwayFrame we received
+	goAwayDebug      string                   // goAway frame's debug data, retained as a string
+	streams          map[uint32]*clientStream // client-initiated
+	streamsReserved  int                      // incr by ReserveNewRequest; decr on RoundTrip
+	nextStreamID     uint32
+	pendingRequests  int                       // requests blocked and waiting to be sent because len(streams) == maxConcurrentStreams
+	pings            map[[8]byte]chan struct{} // in flight ping data to notification channel
+	br               *bufio.Reader
+	lastActive       time.Time
+	lastIdle         time.Time // time last idle
 	// Settings from peer: (also guarded by wmu)
-	maxFrameSize           uint32
-	maxConcurrentStreams   uint32
-	peerMaxHeaderListSize  uint64
-	peerMaxHeaderTableSize uint32
-	initialWindowSize      uint32
+	maxFrameSize                uint32
+	maxConcurrentStreams        uint32
+	peerMaxHeaderListSize       uint64
+	peerMaxHeaderTableSize      uint32
+	initialWindowSize           uint32
+	initialStreamRecvWindowSize int32
+	readIdleTimeout             time.Duration
+	pingTimeout                 time.Duration
+	extendedConnectAllowed      bool
+
+	// rstStreamPingsBlocked works around an unfortunate gRPC behavior.
+	// gRPC strictly limits the number of PING frames that it will receive.
+	// The default is two pings per two hours, but the limit resets every time
+	// the gRPC endpoint sends a HEADERS or DATA frame. See golang/go#70575.
+	//
+	// rstStreamPingsBlocked is set after receiving a response to a PING frame
+	// bundled with an RST_STREAM (see pendingResets below), and cleared after
+	// receiving a HEADERS or DATA frame.
+	rstStreamPingsBlocked bool
+
+	// pendingResets is the number of RST_STREAM frames we have sent to the peer,
+	// without confirming that the peer has received them. When we send a RST_STREAM,
+	// we bundle it with a PING frame, unless a PING is already in flight. We count
+	// the reset stream against the connection's concurrency limit until we get
+	// a PING response. This limits the number of requests we'll try to send to a
+	// completely unresponsive connection.
+	pendingResets int
 
 	// reqHeaderMu is a 1-element semaphore channel controlling access to sending new requests.
 	// Write to reqHeaderMu to lock it, read from it to unlock.
@@ -432,12 +473,12 @@ type clientStream struct {
 	sentHeaders   bool
 
 	// owned by clientConnReadLoop:
-	firstByte    bool  // got the first response byte
-	pastHeaders  bool  // got first MetaHeadersFrame (actual headers)
-	pastTrailers bool  // got optional second MetaHeadersFrame (trailers)
-	num1xx       uint8 // number of 1xx responses seen
-	readClosed   bool  // peer sent an END_STREAM flag
-	readAborted  bool  // read loop reset the stream
+	firstByte       bool  // got the first response byte
+	pastHeaders     bool  // got first MetaHeadersFrame (actual headers)
+	pastTrailers    bool  // got optional second MetaHeadersFrame (trailers)
+	readClosed      bool  // peer sent an END_STREAM flag
+	readAborted     bool  // read loop reset the stream
+	totalHeaderSize int64 // total size of 1xx headers seen
 
 	trailer    http.Header  // accumulated trailers
 	resTrailer *http.Header // client's Response.Trailer
@@ -499,6 +540,7 @@ func (cs *clientStream) closeReqBodyLocked() {
 }
 
 type stickyErrWriter struct {
+	group   synctestGroupInterface
 	conn    net.Conn
 	timeout time.Duration
 	err     *error
@@ -508,22 +550,9 @@ func (sew stickyErrWriter) Write(p []byte) (n int, err error) {
 	if *sew.err != nil {
 		return 0, *sew.err
 	}
-	for {
-		if sew.timeout != 0 {
-			sew.conn.SetWriteDeadline(time.Now().Add(sew.timeout))
-		}
-		nn, err := sew.conn.Write(p[n:])
-		n += nn
-		if n < len(p) && nn > 0 && errors.Is(err, os.ErrDeadlineExceeded) {
-			// Keep extending the deadline so long as we're making progress.
-			continue
-		}
-		if sew.timeout != 0 {
-			sew.conn.SetWriteDeadline(time.Time{})
-		}
-		*sew.err = err
-		return n, err
-	}
+	n, err = writeWithByteTimeout(sew.group, sew.conn, sew.timeout, p)
+	*sew.err = err
+	return n, err
 }
 
 // noCachedConnError is the concrete type of ErrNoCachedConn, which
@@ -554,6 +583,8 @@ type RoundTripOpt struct {
 	// no cached connection is available, RoundTripOpt
 	// will return ErrNoCachedConn.
 	OnlyCachedConn bool
+
+	allowHTTP bool // allow http:// URLs
 }
 
 func (t *Transport) RoundTrip(req *http.Request) (*http.Response, error) {
@@ -586,7 +617,14 @@ func authorityAddr(scheme string, authority string) (addr string) {
 
 // RoundTripOpt is like RoundTrip, but takes options.
 func (t *Transport) RoundTripOpt(req *http.Request, opt RoundTripOpt) (*http.Response, error) {
-	if !(req.URL.Scheme == "https" || (req.URL.Scheme == "http" && t.AllowHTTP)) {
+	switch req.URL.Scheme {
+	case "https":
+		// Always okay.
+	case "http":
+		if !t.AllowHTTP && !opt.allowHTTP {
+			return nil, errors.New("http2: unencrypted HTTP/2 not enabled")
+		}
+	default:
 		return nil, errors.New("http2: unsupported scheme")
 	}
 
@@ -597,7 +635,7 @@ func (t *Transport) RoundTripOpt(req *http.Request, opt RoundTripOpt) (*http.Res
 			t.vlogf("http2: Transport failed to get client conn for %s: %v", addr, err)
 			return nil, err
 		}
-		reused := !atomic.CompareAndSwapUint32(&cc.reused, 0, 1)
+		reused := !atomic.CompareAndSwapUint32(&cc.atomicReused, 0, 1)
 		traceGotConn(req, cc, reused)
 		res, err := cc.RoundTrip(req)
 		if err != nil && retry <= 6 {
@@ -622,6 +660,22 @@ func (t *Transport) RoundTripOpt(req *http.Request, opt RoundTripOpt) (*http.Res
 				}
 			}
 		}
+		if err == errClientConnNotEstablished {
+			// This ClientConn was created recently,
+			// this is the first request to use it,
+			// and the connection is closed and not usable.
+			//
+			// In this state, cc.idleTimer will remove the conn from the pool
+			// when it fires. Stop the timer and remove it here so future requests
+			// won't try to use this connection.
+			//
+			// If the timer has already fired and we're racing it, the redundant
+			// call to MarkDead is harmless.
+			if cc.idleTimer != nil {
+				cc.idleTimer.Stop()
+			}
+			t.connPool().MarkDead(cc)
+		}
 		if err != nil {
 			t.vlogf("RoundTrip failure: %v", err)
 			return nil, err
@@ -640,9 +694,10 @@ func (t *Transport) CloseIdleConnections() {
 }
 
 var (
-	errClientConnClosed    = errors.New("http2: client conn is closed")
-	errClientConnUnusable  = errors.New("http2: client conn not usable")
-	errClientConnGotGoAway = errors.New("http2: Transport received Server's graceful shutdown GOAWAY")
+	errClientConnClosed         = errors.New("http2: client conn is closed")
+	errClientConnUnusable       = errors.New("http2: client conn not usable")
+	errClientConnNotEstablished = errors.New("http2: client conn could not be established")
+	errClientConnGotGoAway      = errors.New("http2: Transport received Server's graceful shutdown GOAWAY")
 )
 
 // shouldRetryRequest is called by RoundTrip when a request fails to get
@@ -758,44 +813,38 @@ func (t *Transport) expectContinueTimeout() time.Duration {
 	return t.t1.ExpectContinueTimeout
 }
 
-func (t *Transport) maxDecoderHeaderTableSize() uint32 {
-	if v := t.MaxDecoderHeaderTableSize; v > 0 {
-		return v
-	}
-	return initialHeaderTableSize
-}
-
-func (t *Transport) maxEncoderHeaderTableSize() uint32 {
-	if v := t.MaxEncoderHeaderTableSize; v > 0 {
-		return v
-	}
-	return initialHeaderTableSize
-}
-
 func (t *Transport) NewClientConn(c net.Conn) (*ClientConn, error) {
 	return t.newClientConn(c, t.disableKeepAlives())
 }
 
 func (t *Transport) newClientConn(c net.Conn, singleUse bool) (*ClientConn, error) {
+	conf := configFromTransport(t)
 	cc := &ClientConn{
-		t:                     t,
-		tconn:                 c,
-		readerDone:            make(chan struct{}),
-		nextStreamID:          1,
-		maxFrameSize:          16 << 10,                    // spec default
-		initialWindowSize:     65535,                       // spec default
-		maxConcurrentStreams:  initialMaxConcurrentStreams, // "infinite", per spec. Use a smaller value until we have received server settings.
-		peerMaxHeaderListSize: 0xffffffffffffffff,          // "infinite", per spec. Use 2^64-1 instead.
-		streams:               make(map[uint32]*clientStream),
-		singleUse:             singleUse,
-		wantSettingsAck:       true,
-		pings:                 make(map[[8]byte]chan struct{}),
-		reqHeaderMu:           make(chan struct{}, 1),
-	}
+		t:                           t,
+		tconn:                       c,
+		readerDone:                  make(chan struct{}),
+		nextStreamID:                1,
+		maxFrameSize:                16 << 10, // spec default
+		initialWindowSize:           65535,    // spec default
+		initialStreamRecvWindowSize: conf.MaxUploadBufferPerStream,
+		maxConcurrentStreams:        initialMaxConcurrentStreams, // "infinite", per spec. Use a smaller value until we have received server settings.
+		peerMaxHeaderListSize:       0xffffffffffffffff,          // "infinite", per spec. Use 2^64-1 instead.
+		streams:                     make(map[uint32]*clientStream),
+		singleUse:                   singleUse,
+		seenSettingsChan:            make(chan struct{}),
+		wantSettingsAck:             true,
+		readIdleTimeout:             conf.SendPingTimeout,
+		pingTimeout:                 conf.PingTimeout,
+		pings:                       make(map[[8]byte]chan struct{}),
+		reqHeaderMu:                 make(chan struct{}, 1),
+		lastActive:                  t.now(),
+	}
+	var group synctestGroupInterface
 	if t.transportTestHooks != nil {
 		t.markNewGoroutine()
 		t.transportTestHooks.newclientconn(cc)
 		c = cc.tconn
+		group = t.group
 	}
 	if VerboseLogs {
 		t.vlogf("http2: Transport creating client conn %p to %v", cc, c.RemoteAddr())
@@ -807,24 +856,23 @@ func (t *Transport) newClientConn(c net.Conn, singleUse bool) (*ClientConn, erro
 	// TODO: adjust this writer size to account for frame size +
 	// MTU + crypto/tls record padding.
 	cc.bw = bufio.NewWriter(stickyErrWriter{
+		group:   group,
 		conn:    c,
-		timeout: t.WriteByteTimeout,
+		timeout: conf.WriteByteTimeout,
 		err:     &cc.werr,
 	})
 	cc.br = bufio.NewReader(c)
 	cc.fr = NewFramer(cc.bw, cc.br)
-	if t.maxFrameReadSize() != 0 {
-		cc.fr.SetMaxReadFrameSize(t.maxFrameReadSize())
-	}
+	cc.fr.SetMaxReadFrameSize(conf.MaxReadFrameSize)
 	if t.CountError != nil {
 		cc.fr.countError = t.CountError
 	}
-	maxHeaderTableSize := t.maxDecoderHeaderTableSize()
+	maxHeaderTableSize := conf.MaxDecoderHeaderTableSize
 	cc.fr.ReadMetaHeaders = hpack.NewDecoder(maxHeaderTableSize, nil)
 	cc.fr.MaxHeaderListSize = t.maxHeaderListSize()
 
 	cc.henc = hpack.NewEncoder(&cc.hbuf)
-	cc.henc.SetMaxDynamicTableSizeLimit(t.maxEncoderHeaderTableSize())
+	cc.henc.SetMaxDynamicTableSizeLimit(conf.MaxEncoderHeaderTableSize)
 	cc.peerMaxHeaderTableSize = initialHeaderTableSize
 
 	if cs, ok := c.(connectionStater); ok {
@@ -834,11 +882,9 @@ func (t *Transport) newClientConn(c net.Conn, singleUse bool) (*ClientConn, erro
 
 	initialSettings := []Setting{
 		{ID: SettingEnablePush, Val: 0},
-		{ID: SettingInitialWindowSize, Val: transportDefaultStreamFlow},
-	}
-	if max := t.maxFrameReadSize(); max != 0 {
-		initialSettings = append(initialSettings, Setting{ID: SettingMaxFrameSize, Val: max})
+		{ID: SettingInitialWindowSize, Val: uint32(cc.initialStreamRecvWindowSize)},
 	}
+	initialSettings = append(initialSettings, Setting{ID: SettingMaxFrameSize, Val: conf.MaxReadFrameSize})
 	if max := t.maxHeaderListSize(); max != 0 {
 		initialSettings = append(initialSettings, Setting{ID: SettingMaxHeaderListSize, Val: max})
 	}
@@ -848,8 +894,8 @@ func (t *Transport) newClientConn(c net.Conn, singleUse bool) (*ClientConn, erro
 
 	cc.bw.Write(clientPreface)
 	cc.fr.WriteSettings(initialSettings...)
-	cc.fr.WriteWindowUpdate(0, transportDefaultConnFlow)
-	cc.inflow.init(transportDefaultConnFlow + initialWindowSize)
+	cc.fr.WriteWindowUpdate(0, uint32(conf.MaxUploadBufferPerConnection))
+	cc.inflow.init(conf.MaxUploadBufferPerConnection + initialWindowSize)
 	cc.bw.Flush()
 	if cc.werr != nil {
 		cc.Close()
@@ -867,7 +913,7 @@ func (t *Transport) newClientConn(c net.Conn, singleUse bool) (*ClientConn, erro
 }
 
 func (cc *ClientConn) healthCheck() {
-	pingTimeout := cc.t.pingTimeout()
+	pingTimeout := cc.pingTimeout
 	// We don't need to periodically ping in the health check, because the readLoop of ClientConn will
 	// trigger the healthCheck again if there is no frame received.
 	ctx, cancel := cc.t.contextWithTimeout(context.Background(), pingTimeout)
@@ -995,7 +1041,7 @@ func (cc *ClientConn) State() ClientConnState {
 	return ClientConnState{
 		Closed:               cc.closed,
 		Closing:              cc.closing || cc.singleUse || cc.doNotReuse || cc.goAway != nil,
-		StreamsActive:        len(cc.streams),
+		StreamsActive:        len(cc.streams) + cc.pendingResets,
 		StreamsReserved:      cc.streamsReserved,
 		StreamsPending:       cc.pendingRequests,
 		LastIdle:             cc.lastIdle,
@@ -1027,16 +1073,38 @@ func (cc *ClientConn) idleStateLocked() (st clientConnIdleState) {
 		// writing it.
 		maxConcurrentOkay = true
 	} else {
-		maxConcurrentOkay = int64(len(cc.streams)+cc.streamsReserved+1) <= int64(cc.maxConcurrentStreams)
+		// We can take a new request if the total of
+		//   - active streams;
+		//   - reservation slots for new streams; and
+		//   - streams for which we have sent a RST_STREAM and a PING,
+		//     but received no subsequent frame
+		// is less than the concurrency limit.
+		maxConcurrentOkay = cc.currentRequestCountLocked() < int(cc.maxConcurrentStreams)
 	}
 
 	st.canTakeNewRequest = cc.goAway == nil && !cc.closed && !cc.closing && maxConcurrentOkay &&
 		!cc.doNotReuse &&
 		int64(cc.nextStreamID)+2*int64(cc.pendingRequests) < math.MaxInt32 &&
 		!cc.tooIdleLocked()
+
+	// If this connection has never been used for a request and is closed,
+	// then let it take a request (which will fail).
+	//
+	// This avoids a situation where an error early in a connection's lifetime
+	// goes unreported.
+	if cc.nextStreamID == 1 && cc.streamsReserved == 0 && cc.closed {
+		st.canTakeNewRequest = true
+	}
+
 	return
 }
 
+// currentRequestCountLocked reports the number of concurrency slots currently in use,
+// including active streams, reserved slots, and reset streams waiting for acknowledgement.
+func (cc *ClientConn) currentRequestCountLocked() int {
+	return len(cc.streams) + cc.streamsReserved + cc.pendingResets
+}
+
 func (cc *ClientConn) canTakeNewRequestLocked() bool {
 	st := cc.idleStateLocked()
 	return st.canTakeNewRequest
@@ -1049,7 +1117,7 @@ func (cc *ClientConn) tooIdleLocked() bool {
 	// times are compared based on their wall time. We don't want
 	// to reuse a connection that's been sitting idle during
 	// VM/laptop suspend if monotonic time was also frozen.
-	return cc.idleTimeout != 0 && !cc.lastIdle.IsZero() && time.Since(cc.lastIdle.Round(0)) > cc.idleTimeout
+	return cc.idleTimeout != 0 && !cc.lastIdle.IsZero() && cc.t.timeSince(cc.lastIdle.Round(0)) > cc.idleTimeout
 }
 
 // onIdleTimeout is called from a time.AfterFunc goroutine. It will
@@ -1411,6 +1479,8 @@ func (cs *clientStream) doRequest(req *http.Request, streamf func(*clientStream)
 	cs.cleanupWriteRequest(err)
 }
 
+var errExtendedConnectNotSupported = errors.New("net/http: extended connect not supported by peer")
+
 // writeRequest sends a request.
 //
 // It returns nil after the request is written, the response read,
@@ -1426,12 +1496,31 @@ func (cs *clientStream) writeRequest(req *http.Request, streamf func(*clientStre
 		return err
 	}
 
+	// wait for setting frames to be received, a server can change this value later,
+	// but we just wait for the first settings frame
+	var isExtendedConnect bool
+	if req.Method == "CONNECT" && req.Header.Get(":protocol") != "" {
+		isExtendedConnect = true
+	}
+
 	// Acquire the new-request lock by writing to reqHeaderMu.
 	// This lock guards the critical section covering allocating a new stream ID
 	// (requires mu) and creating the stream (requires wmu).
 	if cc.reqHeaderMu == nil {
 		panic("RoundTrip on uninitialized ClientConn") // for tests
 	}
+	if isExtendedConnect {
+		select {
+		case <-cs.reqCancel:
+			return errRequestCanceled
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-cc.seenSettingsChan:
+			if !cc.extendedConnectAllowed {
+				return errExtendedConnectNotSupported
+			}
+		}
+	}
 	select {
 	case cc.reqHeaderMu <- struct{}{}:
 	case <-cs.reqCancel:
@@ -1613,6 +1702,7 @@ func (cs *clientStream) cleanupWriteRequest(err error) {
 		cs.reqBodyClosed = make(chan struct{})
 	}
 	bodyClosed := cs.reqBodyClosed
+	closeOnIdle := cc.singleUse || cc.doNotReuse || cc.t.disableKeepAlives() || cc.goAway != nil
 	cc.mu.Unlock()
 	if mustCloseBody {
 		cs.reqBody.Close()
@@ -1637,16 +1727,44 @@ func (cs *clientStream) cleanupWriteRequest(err error) {
 		if cs.sentHeaders {
 			if se, ok := err.(StreamError); ok {
 				if se.Cause != errFromPeer {
-					cc.writeStreamReset(cs.ID, se.Code, err)
+					cc.writeStreamReset(cs.ID, se.Code, false, err)
 				}
 			} else {
-				cc.writeStreamReset(cs.ID, ErrCodeCancel, err)
+				// We're cancelling an in-flight request.
+				//
+				// This could be due to the server becoming unresponsive.
+				// To avoid sending too many requests on a dead connection,
+				// we let the request continue to consume a concurrency slot
+				// until we can confirm the server is still responding.
+				// We do this by sending a PING frame along with the RST_STREAM
+				// (unless a ping is already in flight).
+				//
+				// For simplicity, we don't bother tracking the PING payload:
+				// We reset cc.pendingResets any time we receive a PING ACK.
+				//
+				// We skip this if the conn is going to be closed on idle,
+				// because it's short lived and will probably be closed before
+				// we get the ping response.
+				ping := false
+				if !closeOnIdle {
+					cc.mu.Lock()
+					// rstStreamPingsBlocked works around a gRPC behavior:
+					// see comment on the field for details.
+					if !cc.rstStreamPingsBlocked {
+						if cc.pendingResets == 0 {
+							ping = true
+						}
+						cc.pendingResets++
+					}
+					cc.mu.Unlock()
+				}
+				cc.writeStreamReset(cs.ID, ErrCodeCancel, ping, err)
 			}
 		}
 		cs.bufPipe.CloseWithError(err) // no-op if already closed
 	} else {
 		if cs.sentHeaders && !cs.sentEndStream {
-			cc.writeStreamReset(cs.ID, ErrCodeNo, nil)
+			cc.writeStreamReset(cs.ID, ErrCodeNo, false, nil)
 		}
 		cs.bufPipe.CloseWithError(errRequestCanceled)
 	}
@@ -1668,12 +1786,17 @@ func (cs *clientStream) cleanupWriteRequest(err error) {
 // Must hold cc.mu.
 func (cc *ClientConn) awaitOpenSlotForStreamLocked(cs *clientStream) error {
 	for {
-		cc.lastActive = time.Now()
+		if cc.closed && cc.nextStreamID == 1 && cc.streamsReserved == 0 {
+			// This is the very first request sent to this connection.
+			// Return a fatal error which aborts the retry loop.
+			return errClientConnNotEstablished
+		}
+		cc.lastActive = cc.t.now()
 		if cc.closed || !cc.canTakeNewRequestLocked() {
 			return errClientConnUnusable
 		}
 		cc.lastIdle = time.Time{}
-		if int64(len(cc.streams)) < int64(cc.maxConcurrentStreams) {
+		if cc.currentRequestCountLocked() < int(cc.maxConcurrentStreams) {
 			return nil
 		}
 		cc.pendingRequests++
@@ -1945,7 +2068,7 @@ func (cs *clientStream) awaitFlowControl(maxBytes int) (taken int32, err error)
 
 func validateHeaders(hdrs http.Header) string {
 	for k, vv := range hdrs {
-		if !httpguts.ValidHeaderFieldName(k) {
+		if !httpguts.ValidHeaderFieldName(k) && k != ":protocol" {
 			return fmt.Sprintf("name %q", k)
 		}
 		for _, v := range vv {
@@ -1961,6 +2084,10 @@ func validateHeaders(hdrs http.Header) string {
 
 var errNilRequestURL = errors.New("http2: Request.URI is nil")
 
+func isNormalConnect(req *http.Request) bool {
+	return req.Method == "CONNECT" && req.Header.Get(":protocol") == ""
+}
+
 // requires cc.wmu be held.
 func (cc *ClientConn) encodeHeaders(req *http.Request, addGzipHeader bool, trailers string, contentLength int64) ([]byte, error) {
 	cc.hbuf.Reset()
@@ -1981,7 +2108,7 @@ func (cc *ClientConn) encodeHeaders(req *http.Request, addGzipHeader bool, trail
 	}
 
 	var path string
-	if req.Method != "CONNECT" {
+	if !isNormalConnect(req) {
 		path = req.URL.RequestURI()
 		if !validPseudoPath(path) {
 			orig := path
@@ -2018,7 +2145,7 @@ func (cc *ClientConn) encodeHeaders(req *http.Request, addGzipHeader bool, trail
 			m = http.MethodGet
 		}
 		f(":method", m)
-		if req.Method != "CONNECT" {
+		if !isNormalConnect(req) {
 			f(":path", path)
 			f(":scheme", req.URL.Scheme)
 		}
@@ -2199,7 +2326,7 @@ type resAndError struct {
 func (cc *ClientConn) addStreamLocked(cs *clientStream) {
 	cs.flow.add(int32(cc.initialWindowSize))
 	cs.flow.setConnFlow(&cc.flow)
-	cs.inflow.init(transportDefaultStreamFlow)
+	cs.inflow.init(cc.initialStreamRecvWindowSize)
 	cs.ID = cc.nextStreamID
 	cc.nextStreamID += 2
 	cc.streams[cs.ID] = cs
@@ -2215,10 +2342,10 @@ func (cc *ClientConn) forgetStreamID(id uint32) {
 	if len(cc.streams) != slen-1 {
 		panic("forgetting unknown stream id")
 	}
-	cc.lastActive = time.Now()
+	cc.lastActive = cc.t.now()
 	if len(cc.streams) == 0 && cc.idleTimer != nil {
 		cc.idleTimer.Reset(cc.idleTimeout)
-		cc.lastIdle = time.Now()
+		cc.lastIdle = cc.t.now()
 	}
 	// Wake up writeRequestBody via clientStream.awaitFlowControl and
 	// wake up RoundTrip if there is a pending request.
@@ -2278,7 +2405,6 @@ func isEOFOrNetReadError(err error) bool {
 
 func (rl *clientConnReadLoop) cleanup() {
 	cc := rl.cc
-	cc.t.connPool().MarkDead(cc)
 	defer cc.closeConn()
 	defer close(cc.readerDone)
 
@@ -2302,6 +2428,24 @@ func (rl *clientConnReadLoop) cleanup() {
 	}
 	cc.closed = true
 
+	// If the connection has never been used, and has been open for only a short time,
+	// leave it in the connection pool for a little while.
+	//
+	// This avoids a situation where new connections are constantly created,
+	// added to the pool, fail, and are removed from the pool, without any error
+	// being surfaced to the user.
+	const unusedWaitTime = 5 * time.Second
+	idleTime := cc.t.now().Sub(cc.lastActive)
+	if atomic.LoadUint32(&cc.atomicReused) == 0 && idleTime < unusedWaitTime {
+		cc.idleTimer = cc.t.afterFunc(unusedWaitTime-idleTime, func() {
+			cc.t.connPool().MarkDead(cc)
+		})
+	} else {
+		cc.mu.Unlock() // avoid any deadlocks in MarkDead
+		cc.t.connPool().MarkDead(cc)
+		cc.mu.Lock()
+	}
+
 	for _, cs := range cc.streams {
 		select {
 		case <-cs.peerClosed:
@@ -2345,7 +2489,7 @@ func (cc *ClientConn) countReadFrameError(err error) {
 func (rl *clientConnReadLoop) run() error {
 	cc := rl.cc
 	gotSettings := false
-	readIdleTimeout := cc.t.ReadIdleTimeout
+	readIdleTimeout := cc.readIdleTimeout
 	var t timer
 	if readIdleTimeout != 0 {
 		t = cc.t.afterFunc(readIdleTimeout, cc.healthCheck)
@@ -2359,7 +2503,7 @@ func (rl *clientConnReadLoop) run() error {
 			cc.vlogf("http2: Transport readFrame error on conn %p: (%T) %v", cc, err, err)
 		}
 		if se, ok := err.(StreamError); ok {
-			if cs := rl.streamByID(se.StreamID); cs != nil {
+			if cs := rl.streamByID(se.StreamID, notHeaderOrDataFrame); cs != nil {
 				if se.Cause == nil {
 					se.Cause = cc.fr.errDetail
 				}
@@ -2405,13 +2549,16 @@ func (rl *clientConnReadLoop) run() error {
 			if VerboseLogs {
 				cc.vlogf("http2: Transport conn %p received error from processing frame %v: %v", cc, summarizeFrame(f), err)
 			}
+			if !cc.seenSettings {
+				close(cc.seenSettingsChan)
+			}
 			return err
 		}
 	}
 }
 
 func (rl *clientConnReadLoop) processHeaders(f *MetaHeadersFrame) error {
-	cs := rl.streamByID(f.StreamID)
+	cs := rl.streamByID(f.StreamID, headerOrDataFrame)
 	if cs == nil {
 		// We'd get here if we canceled a request while the
 		// server had its response still in flight. So if this
@@ -2529,15 +2676,34 @@ func (rl *clientConnReadLoop) handleResponse(cs *clientStream, f *MetaHeadersFra
 		if f.StreamEnded() {
 			return nil, errors.New("1xx informational response with END_STREAM flag")
 		}
-		cs.num1xx++
-		const max1xxResponses = 5 // arbitrary bound on number of informational responses, same as net/http
-		if cs.num1xx > max1xxResponses {
-			return nil, errors.New("http2: too many 1xx informational responses")
-		}
 		if fn := cs.get1xxTraceFunc(); fn != nil {
+			// If the 1xx response is being delivered to the user,
+			// then they're responsible for limiting the number
+			// of responses.
 			if err := fn(statusCode, textproto.MIMEHeader(header)); err != nil {
 				return nil, err
 			}
+		} else {
+			// If the user didn't examine the 1xx response, then we
+			// limit the size of all 1xx headers.
+			//
+			// This differs a bit from the HTTP/1 implementation, which
+			// limits the size of all 1xx headers plus the final response.
+			// Use the larger limit of MaxHeaderListSize and
+			// net/http.Transport.MaxResponseHeaderBytes.
+			limit := int64(cs.cc.t.maxHeaderListSize())
+			if t1 := cs.cc.t.t1; t1 != nil && t1.MaxResponseHeaderBytes > limit {
+				limit = t1.MaxResponseHeaderBytes
+			}
+			for _, h := range f.Fields {
+				cs.totalHeaderSize += int64(h.Size())
+			}
+			if cs.totalHeaderSize > limit {
+				if VerboseLogs {
+					log.Printf("http2: 1xx informational responses too large")
+				}
+				return nil, errors.New("header list too large")
+			}
 		}
 		if statusCode == 100 {
 			traceGot100Continue(cs.trace)
@@ -2721,7 +2887,7 @@ func (b transportResponseBody) Close() error {
 
 func (rl *clientConnReadLoop) processData(f *DataFrame) error {
 	cc := rl.cc
-	cs := rl.streamByID(f.StreamID)
+	cs := rl.streamByID(f.StreamID, headerOrDataFrame)
 	data := f.Data()
 	if cs == nil {
 		cc.mu.Lock()
@@ -2856,9 +3022,22 @@ func (rl *clientConnReadLoop) endStreamError(cs *clientStream, err error) {
 	cs.abortStream(err)
 }
 
-func (rl *clientConnReadLoop) streamByID(id uint32) *clientStream {
+// Constants passed to streamByID for documentation purposes.
+const (
+	headerOrDataFrame    = true
+	notHeaderOrDataFrame = false
+)
+
+// streamByID returns the stream with the given id, or nil if no stream has that id.
+// If headerOrData is true, it clears rst.StreamPingsBlocked.
+func (rl *clientConnReadLoop) streamByID(id uint32, headerOrData bool) *clientStream {
 	rl.cc.mu.Lock()
 	defer rl.cc.mu.Unlock()
+	if headerOrData {
+		// Work around an unfortunate gRPC behavior.
+		// See comment on ClientConn.rstStreamPingsBlocked for details.
+		rl.cc.rstStreamPingsBlocked = false
+	}
 	cs := rl.cc.streams[id]
 	if cs != nil && !cs.readAborted {
 		return cs
@@ -2952,6 +3131,21 @@ func (rl *clientConnReadLoop) processSettingsNoWrite(f *SettingsFrame) error {
 		case SettingHeaderTableSize:
 			cc.henc.SetMaxDynamicTableSize(s.Val)
 			cc.peerMaxHeaderTableSize = s.Val
+		case SettingEnableConnectProtocol:
+			if err := s.Valid(); err != nil {
+				return err
+			}
+			// If the peer wants to send us SETTINGS_ENABLE_CONNECT_PROTOCOL,
+			// we require that it do so in the first SETTINGS frame.
+			//
+			// When we attempt to use extended CONNECT, we wait for the first
+			// SETTINGS frame to see if the server supports it. If we let the
+			// server enable the feature with a later SETTINGS frame, then
+			// users will see inconsistent results depending on whether we've
+			// seen that frame or not.
+			if !cc.seenSettings {
+				cc.extendedConnectAllowed = s.Val == 1
+			}
 		default:
 			cc.vlogf("Unhandled Setting: %v", s)
 		}
@@ -2969,6 +3163,7 @@ func (rl *clientConnReadLoop) processSettingsNoWrite(f *SettingsFrame) error {
 			// connection can establish to our default.
 			cc.maxConcurrentStreams = defaultMaxConcurrentStreams
 		}
+		close(cc.seenSettingsChan)
 		cc.seenSettings = true
 	}
 
@@ -2977,7 +3172,7 @@ func (rl *clientConnReadLoop) processSettingsNoWrite(f *SettingsFrame) error {
 
 func (rl *clientConnReadLoop) processWindowUpdate(f *WindowUpdateFrame) error {
 	cc := rl.cc
-	cs := rl.streamByID(f.StreamID)
+	cs := rl.streamByID(f.StreamID, notHeaderOrDataFrame)
 	if f.StreamID != 0 && cs == nil {
 		return nil
 	}
@@ -3006,7 +3201,7 @@ func (rl *clientConnReadLoop) processWindowUpdate(f *WindowUpdateFrame) error {
 }
 
 func (rl *clientConnReadLoop) processResetStream(f *RSTStreamFrame) error {
-	cs := rl.streamByID(f.StreamID)
+	cs := rl.streamByID(f.StreamID, notHeaderOrDataFrame)
 	if cs == nil {
 		// TODO: return error if server tries to RST_STREAM an idle stream
 		return nil
@@ -3081,6 +3276,12 @@ func (rl *clientConnReadLoop) processPing(f *PingFrame) error {
 			close(c)
 			delete(cc.pings, f.Data)
 		}
+		if cc.pendingResets > 0 {
+			// See clientStream.cleanupWriteRequest.
+			cc.pendingResets = 0
+			cc.rstStreamPingsBlocked = true
+			cc.cond.Broadcast()
+		}
 		return nil
 	}
 	cc := rl.cc
@@ -3103,13 +3304,20 @@ func (rl *clientConnReadLoop) processPushPromise(f *PushPromiseFrame) error {
 	return ConnectionError(ErrCodeProtocol)
 }
 
-func (cc *ClientConn) writeStreamReset(streamID uint32, code ErrCode, err error) {
+// writeStreamReset sends a RST_STREAM frame.
+// When ping is true, it also sends a PING frame with a random payload.
+func (cc *ClientConn) writeStreamReset(streamID uint32, code ErrCode, ping bool, err error) {
 	// TODO: map err to more interesting error codes, once the
 	// HTTP community comes up with some. But currently for
 	// RST_STREAM there's no equivalent to GOAWAY frame's debug
 	// data, and the error codes are all pretty vague ("cancel").
 	cc.wmu.Lock()
 	cc.fr.WriteRSTStream(streamID, code)
+	if ping {
+		var payload [8]byte
+		rand.Read(payload[:])
+		cc.fr.WritePing(false, payload)
+	}
 	cc.bw.Flush()
 	cc.wmu.Unlock()
 }
@@ -3263,7 +3471,7 @@ func traceGotConn(req *http.Request, cc *ClientConn, reused bool) {
 	cc.mu.Lock()
 	ci.WasIdle = len(cc.streams) == 0 && reused
 	if ci.WasIdle && !cc.lastActive.IsZero() {
-		ci.IdleTime = time.Since(cc.lastActive)
+		ci.IdleTime = cc.t.timeSince(cc.lastActive)
 	}
 	cc.mu.Unlock()
 
diff --git a/vendor/golang.org/x/net/http2/unencrypted.go b/vendor/golang.org/x/net/http2/unencrypted.go
new file mode 100644
index 0000000000000000000000000000000000000000..b2de2116135c4e4d63323de5f5d24a8b2de84ffa
--- /dev/null
+++ b/vendor/golang.org/x/net/http2/unencrypted.go
@@ -0,0 +1,32 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package http2
+
+import (
+	"crypto/tls"
+	"errors"
+	"net"
+)
+
+const nextProtoUnencryptedHTTP2 = "unencrypted_http2"
+
+// unencryptedNetConnFromTLSConn retrieves a net.Conn wrapped in a *tls.Conn.
+//
+// TLSNextProto functions accept a *tls.Conn.
+//
+// When passing an unencrypted HTTP/2 connection to a TLSNextProto function,
+// we pass a *tls.Conn with an underlying net.Conn containing the unencrypted connection.
+// To be extra careful about mistakes (accidentally dropping TLS encryption in a place
+// where we want it), the tls.Conn contains a net.Conn with an UnencryptedNetConn method
+// that returns the actual connection we want to use.
+func unencryptedNetConnFromTLSConn(tc *tls.Conn) (net.Conn, error) {
+	conner, ok := tc.NetConn().(interface {
+		UnencryptedNetConn() net.Conn
+	})
+	if !ok {
+		return nil, errors.New("http2: TLS conn unexpectedly found in unencrypted handoff")
+	}
+	return conner.UnencryptedNetConn(), nil
+}
diff --git a/vendor/golang.org/x/net/http2/write.go b/vendor/golang.org/x/net/http2/write.go
index 33f61398a1236d7076dccc90342bcedeec1c111c..6ff6bee7e954926be32dd7ea7926a40fe511cf5e 100644
--- a/vendor/golang.org/x/net/http2/write.go
+++ b/vendor/golang.org/x/net/http2/write.go
@@ -131,6 +131,16 @@ func (se StreamError) writeFrame(ctx writeContext) error {
 
 func (se StreamError) staysWithinBuffer(max int) bool { return frameHeaderLen+4 <= max }
 
+type writePing struct {
+	data [8]byte
+}
+
+func (w writePing) writeFrame(ctx writeContext) error {
+	return ctx.Framer().WritePing(false, w.data)
+}
+
+func (w writePing) staysWithinBuffer(max int) bool { return frameHeaderLen+len(w.data) <= max }
+
 type writePingAck struct{ pf *PingFrame }
 
 func (w writePingAck) writeFrame(ctx writeContext) error {
diff --git a/vendor/golang.org/x/net/internal/socket/zsys_openbsd_ppc64.go b/vendor/golang.org/x/net/internal/socket/zsys_openbsd_ppc64.go
index cebde7634f3421336fc4f5d79f42bdf43c0f0bbb..3c9576e2d830f3829ccea0e79b6a7019b4de0867 100644
--- a/vendor/golang.org/x/net/internal/socket/zsys_openbsd_ppc64.go
+++ b/vendor/golang.org/x/net/internal/socket/zsys_openbsd_ppc64.go
@@ -4,27 +4,27 @@
 package socket
 
 type iovec struct {
-	Base	*byte
-	Len	uint64
+	Base *byte
+	Len  uint64
 }
 
 type msghdr struct {
-	Name		*byte
-	Namelen		uint32
-	Iov		*iovec
-	Iovlen		uint32
-	Control		*byte
-	Controllen	uint32
-	Flags		int32
+	Name       *byte
+	Namelen    uint32
+	Iov        *iovec
+	Iovlen     uint32
+	Control    *byte
+	Controllen uint32
+	Flags      int32
 }
 
 type cmsghdr struct {
-	Len	uint32
-	Level	int32
-	Type	int32
+	Len   uint32
+	Level int32
+	Type  int32
 }
 
 const (
-	sizeofIovec	= 0x10
-	sizeofMsghdr	= 0x30
+	sizeofIovec  = 0x10
+	sizeofMsghdr = 0x30
 )
diff --git a/vendor/golang.org/x/net/internal/socket/zsys_openbsd_riscv64.go b/vendor/golang.org/x/net/internal/socket/zsys_openbsd_riscv64.go
index cebde7634f3421336fc4f5d79f42bdf43c0f0bbb..3c9576e2d830f3829ccea0e79b6a7019b4de0867 100644
--- a/vendor/golang.org/x/net/internal/socket/zsys_openbsd_riscv64.go
+++ b/vendor/golang.org/x/net/internal/socket/zsys_openbsd_riscv64.go
@@ -4,27 +4,27 @@
 package socket
 
 type iovec struct {
-	Base	*byte
-	Len	uint64
+	Base *byte
+	Len  uint64
 }
 
 type msghdr struct {
-	Name		*byte
-	Namelen		uint32
-	Iov		*iovec
-	Iovlen		uint32
-	Control		*byte
-	Controllen	uint32
-	Flags		int32
+	Name       *byte
+	Namelen    uint32
+	Iov        *iovec
+	Iovlen     uint32
+	Control    *byte
+	Controllen uint32
+	Flags      int32
 }
 
 type cmsghdr struct {
-	Len	uint32
-	Level	int32
-	Type	int32
+	Len   uint32
+	Level int32
+	Type  int32
 }
 
 const (
-	sizeofIovec	= 0x10
-	sizeofMsghdr	= 0x30
+	sizeofIovec  = 0x10
+	sizeofMsghdr = 0x30
 )
diff --git a/vendor/golang.org/x/sys/cpu/asm_darwin_x86_gc.s b/vendor/golang.org/x/sys/cpu/asm_darwin_x86_gc.s
new file mode 100644
index 0000000000000000000000000000000000000000..ec2acfe540ea7b0e714e137a72f75db190cf8998
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/asm_darwin_x86_gc.s
@@ -0,0 +1,17 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build darwin && amd64 && gc
+
+#include "textflag.h"
+
+TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_sysctl(SB)
+GLOBL	·libc_sysctl_trampoline_addr(SB), RODATA, $8
+DATA	·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
+
+TEXT libc_sysctlbyname_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_sysctlbyname(SB)
+GLOBL	·libc_sysctlbyname_trampoline_addr(SB), RODATA, $8
+DATA	·libc_sysctlbyname_trampoline_addr(SB)/8, $libc_sysctlbyname_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go
index ec07aab05788c8732bee74fc3837f5b188ae4c84..02609d5b21d56a5b82ce169a3bd1f933f9d091f3 100644
--- a/vendor/golang.org/x/sys/cpu/cpu.go
+++ b/vendor/golang.org/x/sys/cpu/cpu.go
@@ -201,6 +201,25 @@ var S390X struct {
 	_         CacheLinePad
 }
 
+// RISCV64 contains the supported CPU features and performance characteristics for riscv64
+// platforms. The booleans in RISCV64, with the exception of HasFastMisaligned, indicate
+// the presence of RISC-V extensions.
+//
+// It is safe to assume that all the RV64G extensions are supported and so they are omitted from
+// this structure. As riscv64 Go programs require at least RV64G, the code that populates
+// this structure cannot run successfully if some of the RV64G extensions are missing.
+// The struct is padded to avoid false sharing.
+var RISCV64 struct {
+	_                 CacheLinePad
+	HasFastMisaligned bool // Fast misaligned accesses
+	HasC              bool // Compressed instruction-set extension
+	HasV              bool // Vector extension compatible with RVV 1.0
+	HasZba            bool // Address generation instructions extension
+	HasZbb            bool // Basic bit-manipulation extension
+	HasZbs            bool // Single-bit instructions extension
+	_                 CacheLinePad
+}
+
 func init() {
 	archInit()
 	initOptions()
diff --git a/vendor/golang.org/x/sys/cpu/cpu_darwin_x86.go b/vendor/golang.org/x/sys/cpu/cpu_darwin_x86.go
new file mode 100644
index 0000000000000000000000000000000000000000..b838cb9e956e61f625d7976f3a123f44072e529b
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_darwin_x86.go
@@ -0,0 +1,61 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build darwin && amd64 && gc
+
+package cpu
+
+// darwinSupportsAVX512 checks Darwin kernel for AVX512 support via sysctl
+// call (see issue 43089). It also restricts AVX512 support for Darwin to
+// kernel version 21.3.0 (MacOS 12.2.0) or later (see issue 49233).
+//
+// Background:
+// Darwin implements a special mechanism to economize on thread state when
+// AVX512 specific registers are not in use. This scheme minimizes state when
+// preempting threads that haven't yet used any AVX512 instructions, but adds
+// special requirements to check for AVX512 hardware support at runtime (e.g.
+// via sysctl call or commpage inspection). See issue 43089 and link below for
+// full background:
+// https://github.com/apple-oss-distributions/xnu/blob/xnu-11215.1.10/osfmk/i386/fpu.c#L214-L240
+//
+// Additionally, all versions of the Darwin kernel from 19.6.0 through 21.2.0
+// (corresponding to MacOS 10.15.6 - 12.1) have a bug that can cause corruption
+// of the AVX512 mask registers (K0-K7) upon signal return. For this reason
+// AVX512 is considered unsafe to use on Darwin for kernel versions prior to
+// 21.3.0, where a fix has been confirmed. See issue 49233 for full background.
+func darwinSupportsAVX512() bool {
+	return darwinSysctlEnabled([]byte("hw.optional.avx512f\x00")) && darwinKernelVersionCheck(21, 3, 0)
+}
+
+// Ensure Darwin kernel version is at least major.minor.patch, avoiding dependencies
+func darwinKernelVersionCheck(major, minor, patch int) bool {
+	var release [256]byte
+	err := darwinOSRelease(&release)
+	if err != nil {
+		return false
+	}
+
+	var mmp [3]int
+	c := 0
+Loop:
+	for _, b := range release[:] {
+		switch {
+		case b >= '0' && b <= '9':
+			mmp[c] = 10*mmp[c] + int(b-'0')
+		case b == '.':
+			c++
+			if c > 2 {
+				return false
+			}
+		case b == 0:
+			break Loop
+		default:
+			return false
+		}
+	}
+	if c != 2 {
+		return false
+	}
+	return mmp[0] > major || mmp[0] == major && (mmp[1] > minor || mmp[1] == minor && mmp[2] >= patch)
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
index 910728fb163f36695c04e99064c4e5adf09a160c..32a44514e245fe38568a04ce77817b1b8c01aa73 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
@@ -6,10 +6,10 @@
 
 package cpu
 
-// cpuid is implemented in cpu_x86.s for gc compiler
+// cpuid is implemented in cpu_gc_x86.s for gc compiler
 // and in cpu_gccgo.c for gccgo.
 func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
 
-// xgetbv with ecx = 0 is implemented in cpu_x86.s for gc compiler
+// xgetbv with ecx = 0 is implemented in cpu_gc_x86.s for gc compiler
 // and in cpu_gccgo.c for gccgo.
 func xgetbv() (eax, edx uint32)
diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.s b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.s
similarity index 94%
rename from vendor/golang.org/x/sys/cpu/cpu_x86.s
rename to vendor/golang.org/x/sys/cpu/cpu_gc_x86.s
index 7d7ba33efb85559b3dded691773f12342163fdd6..ce208ce6d6a3a64925076de5b9db96a4afedaf0d 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.s
+++ b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.s
@@ -18,7 +18,7 @@ TEXT ·cpuid(SB), NOSPLIT, $0-24
 	RET
 
 // func xgetbv() (eax, edx uint32)
-TEXT ·xgetbv(SB),NOSPLIT,$0-8
+TEXT ·xgetbv(SB), NOSPLIT, $0-8
 	MOVL $0, CX
 	XGETBV
 	MOVL AX, eax+0(FP)
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go
index 99c60fe9f9c657795eaaf6cf85863114f58bb191..170d21ddfda4163fb6d9d2094fb62ff9e027fb65 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go
@@ -23,9 +23,3 @@ func xgetbv() (eax, edx uint32) {
 	gccgoXgetbv(&a, &d)
 	return a, d
 }
-
-// gccgo doesn't build on Darwin, per:
-// https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/gcc.rb#L76
-func darwinSupportsAVX512() bool {
-	return false
-}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go
index 08f35ea17735fddf8125d17ffeecbcb5dc6debb2..f1caf0f78e24b65db57f1b599d8bead8e1d9d65f 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go
@@ -110,7 +110,6 @@ func doinit() {
 	ARM64.HasASIMDFHM = isSet(hwCap, hwcap_ASIMDFHM)
 	ARM64.HasDIT = isSet(hwCap, hwcap_DIT)
 
-
 	// HWCAP2 feature bits
 	ARM64.HasSVE2 = isSet(hwCap2, hwcap2_SVE2)
 	ARM64.HasI8MM = isSet(hwCap2, hwcap2_I8MM)
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
index cd63e73355734ca8a12f992c5db23c002a0b6166..7d902b6847bbf90526eb980e09039a79cebcceda 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x
+//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x && !riscv64
 
 package cpu
 
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go
new file mode 100644
index 0000000000000000000000000000000000000000..cb4a0c5728073c634d2cf69b3d76286e013a82ae
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go
@@ -0,0 +1,137 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpu
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// RISC-V extension discovery code for Linux. The approach here is to first try the riscv_hwprobe
+// syscall falling back to HWCAP to check for the C extension if riscv_hwprobe is not available.
+//
+// A note on detection of the Vector extension using HWCAP.
+//
+// Support for the Vector extension version 1.0 was added to the Linux kernel in release 6.5.
+// Support for the riscv_hwprobe syscall was added in 6.4. It follows that if the riscv_hwprobe
+// syscall is not available then neither is the Vector extension (which needs kernel support).
+// The riscv_hwprobe syscall should then be all we need to detect the Vector extension.
+// However, some RISC-V board manufacturers ship boards with an older kernel on top of which
+// they have back-ported various versions of the Vector extension patches but not the riscv_hwprobe
+// patches. These kernels advertise support for the Vector extension using HWCAP. Falling
+// back to HWCAP to detect the Vector extension, if riscv_hwprobe is not available, or simply not
+// bothering with riscv_hwprobe at all and just using HWCAP may then seem like an attractive option.
+//
+// Unfortunately, simply checking the 'V' bit in AT_HWCAP will not work as this bit is used by
+// RISC-V board and cloud instance providers to mean different things. The Lichee Pi 4A board
+// and the Scaleway RV1 cloud instances use the 'V' bit to advertise their support for the unratified
+// 0.7.1 version of the Vector Specification. The Banana Pi BPI-F3 and the CanMV-K230 board use
+// it to advertise support for 1.0 of the Vector extension. Versions 0.7.1 and 1.0 of the Vector
+// extension are binary incompatible. HWCAP can then not be used in isolation to populate the
+// HasV field as this field indicates that the underlying CPU is compatible with RVV 1.0.
+//
+// There is a way at runtime to distinguish between versions 0.7.1 and 1.0 of the Vector
+// specification by issuing a RVV 1.0 vsetvli instruction and checking the vill bit of the vtype
+// register. This check would allow us to safely detect version 1.0 of the Vector extension
+// with HWCAP, if riscv_hwprobe were not available. However, the check cannot
+// be added until the assembler supports the Vector instructions.
+//
+// Note the riscv_hwprobe syscall does not suffer from these ambiguities by design as all of the
+// extensions it advertises support for are explicitly versioned. It's also worth noting that
+// the riscv_hwprobe syscall is the only way to detect multi-letter RISC-V extensions, e.g., Zba.
+// These cannot be detected using HWCAP and so riscv_hwprobe must be used to detect the majority
+// of RISC-V extensions.
+//
+// Please see https://docs.kernel.org/arch/riscv/hwprobe.html for more information.
+
+// golang.org/x/sys/cpu is not allowed to depend on golang.org/x/sys/unix so we must
+// reproduce the constants, types and functions needed to make the riscv_hwprobe syscall
+// here.
+
+const (
+	// Copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go.
+	riscv_HWPROBE_KEY_IMA_EXT_0   = 0x4
+	riscv_HWPROBE_IMA_C           = 0x2
+	riscv_HWPROBE_IMA_V           = 0x4
+	riscv_HWPROBE_EXT_ZBA         = 0x8
+	riscv_HWPROBE_EXT_ZBB         = 0x10
+	riscv_HWPROBE_EXT_ZBS         = 0x20
+	riscv_HWPROBE_KEY_CPUPERF_0   = 0x5
+	riscv_HWPROBE_MISALIGNED_FAST = 0x3
+	riscv_HWPROBE_MISALIGNED_MASK = 0x7
+)
+
+const (
+	// sys_RISCV_HWPROBE is copied from golang.org/x/sys/unix/zsysnum_linux_riscv64.go.
+	sys_RISCV_HWPROBE = 258
+)
+
+// riscvHWProbePairs is copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go.
+type riscvHWProbePairs struct {
+	key   int64
+	value uint64
+}
+
+const (
+	// CPU features
+	hwcap_RISCV_ISA_C = 1 << ('C' - 'A')
+)
+
+func doinit() {
+	// A slice of key/value pair structures is passed to the RISCVHWProbe syscall. The key
+	// field should be initialised with one of the key constants defined above, e.g.,
+	// RISCV_HWPROBE_KEY_IMA_EXT_0. The syscall will set the value field to the appropriate value.
+	// If the kernel does not recognise a key it will set the key field to -1 and the value field to 0.
+
+	pairs := []riscvHWProbePairs{
+		{riscv_HWPROBE_KEY_IMA_EXT_0, 0},
+		{riscv_HWPROBE_KEY_CPUPERF_0, 0},
+	}
+
+	// This call only indicates that extensions are supported if they are implemented on all cores.
+	if riscvHWProbe(pairs, 0) {
+		if pairs[0].key != -1 {
+			v := uint(pairs[0].value)
+			RISCV64.HasC = isSet(v, riscv_HWPROBE_IMA_C)
+			RISCV64.HasV = isSet(v, riscv_HWPROBE_IMA_V)
+			RISCV64.HasZba = isSet(v, riscv_HWPROBE_EXT_ZBA)
+			RISCV64.HasZbb = isSet(v, riscv_HWPROBE_EXT_ZBB)
+			RISCV64.HasZbs = isSet(v, riscv_HWPROBE_EXT_ZBS)
+		}
+		if pairs[1].key != -1 {
+			v := pairs[1].value & riscv_HWPROBE_MISALIGNED_MASK
+			RISCV64.HasFastMisaligned = v == riscv_HWPROBE_MISALIGNED_FAST
+		}
+	}
+
+	// Let's double check with HWCAP if the C extension does not appear to be supported.
+	// This may happen if we're running on a kernel older than 6.4.
+
+	if !RISCV64.HasC {
+		RISCV64.HasC = isSet(hwCap, hwcap_RISCV_ISA_C)
+	}
+}
+
+func isSet(hwc uint, value uint) bool {
+	return hwc&value != 0
+}
+
+// riscvHWProbe is a simplified version of the generated wrapper function found in
+// golang.org/x/sys/unix/zsyscall_linux_riscv64.go. We simplify it by removing the
+// cpuCount and cpus parameters which we do not need. We always want to pass 0 for
+// these parameters here so the kernel only reports the extensions that are present
+// on all cores.
+func riscvHWProbe(pairs []riscvHWProbePairs, flags uint) bool {
+	var _zero uintptr
+	var p0 unsafe.Pointer
+	if len(pairs) > 0 {
+		p0 = unsafe.Pointer(&pairs[0])
+	} else {
+		p0 = unsafe.Pointer(&_zero)
+	}
+
+	_, _, e1 := syscall.Syscall6(sys_RISCV_HWPROBE, uintptr(p0), uintptr(len(pairs)), uintptr(0), uintptr(0), uintptr(flags), 0)
+	return e1 == 0
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_x86.go b/vendor/golang.org/x/sys/cpu/cpu_other_x86.go
new file mode 100644
index 0000000000000000000000000000000000000000..a0fd7e2f75d37fe6f894f799caae51d9b6fe006f
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_other_x86.go
@@ -0,0 +1,11 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64p32 || (amd64 && (!darwin || !gc))
+
+package cpu
+
+func darwinSupportsAVX512() bool {
+	panic("only implemented for gc && amd64 && darwin")
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
index 7f0c79c004b44a57b821bdf405363148da434314..aca3199c911690b3b2899e2ee373083e2ac66cdd 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
@@ -8,4 +8,13 @@ package cpu
 
 const cacheLineSize = 64
 
-func initOptions() {}
+func initOptions() {
+	options = []option{
+		{Name: "fastmisaligned", Feature: &RISCV64.HasFastMisaligned},
+		{Name: "c", Feature: &RISCV64.HasC},
+		{Name: "v", Feature: &RISCV64.HasV},
+		{Name: "zba", Feature: &RISCV64.HasZba},
+		{Name: "zbb", Feature: &RISCV64.HasZbb},
+		{Name: "zbs", Feature: &RISCV64.HasZbs},
+	}
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.go b/vendor/golang.org/x/sys/cpu/cpu_x86.go
index c29f5e4c5a6e4ef36f32964e99910cb710205f97..600a6807861e7fd225b672283e2bc5c984eb13d6 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_x86.go
@@ -92,10 +92,8 @@ func archInit() {
 		osSupportsAVX = isSet(1, eax) && isSet(2, eax)
 
 		if runtime.GOOS == "darwin" {
-			// Darwin doesn't save/restore AVX-512 mask registers correctly across signal handlers.
-			// Since users can't rely on mask register contents, let's not advertise AVX-512 support.
-			// See issue 49233.
-			osSupportsAVX512 = false
+			// Darwin requires special AVX512 checks, see cpu_darwin_x86.go
+			osSupportsAVX512 = osSupportsAVX && darwinSupportsAVX512()
 		} else {
 			// Check if OPMASK and ZMM registers have OS support.
 			osSupportsAVX512 = osSupportsAVX && isSet(5, eax) && isSet(6, eax) && isSet(7, eax)
diff --git a/vendor/golang.org/x/sys/cpu/syscall_darwin_x86_gc.go b/vendor/golang.org/x/sys/cpu/syscall_darwin_x86_gc.go
new file mode 100644
index 0000000000000000000000000000000000000000..4d0888b0c010f41faaf56692175b7d395bb7494a
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/syscall_darwin_x86_gc.go
@@ -0,0 +1,98 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Minimal copy of x/sys/unix so the cpu package can make a
+// system call on Darwin without depending on x/sys/unix.
+
+//go:build darwin && amd64 && gc
+
+package cpu
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+type _C_int int32
+
+// adapted from unix.Uname() at x/sys/unix/syscall_darwin.go L419
+func darwinOSRelease(release *[256]byte) error {
+	// from x/sys/unix/zerrors_openbsd_amd64.go
+	const (
+		CTL_KERN       = 0x1
+		KERN_OSRELEASE = 0x2
+	)
+
+	mib := []_C_int{CTL_KERN, KERN_OSRELEASE}
+	n := unsafe.Sizeof(*release)
+
+	return sysctl(mib, &release[0], &n, nil, 0)
+}
+
+type Errno = syscall.Errno
+
+var _zero uintptr // Single-word zero for use when we need a valid pointer to 0 bytes.
+
+// from x/sys/unix/zsyscall_darwin_amd64.go L791-807
+func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) error {
+	var _p0 unsafe.Pointer
+	if len(mib) > 0 {
+		_p0 = unsafe.Pointer(&mib[0])
+	} else {
+		_p0 = unsafe.Pointer(&_zero)
+	}
+	if _, _, err := syscall_syscall6(
+		libc_sysctl_trampoline_addr,
+		uintptr(_p0),
+		uintptr(len(mib)),
+		uintptr(unsafe.Pointer(old)),
+		uintptr(unsafe.Pointer(oldlen)),
+		uintptr(unsafe.Pointer(new)),
+		uintptr(newlen),
+	); err != 0 {
+		return err
+	}
+
+	return nil
+}
+
+var libc_sysctl_trampoline_addr uintptr
+
+// adapted from internal/cpu/cpu_arm64_darwin.go
+func darwinSysctlEnabled(name []byte) bool {
+	out := int32(0)
+	nout := unsafe.Sizeof(out)
+	if ret := sysctlbyname(&name[0], (*byte)(unsafe.Pointer(&out)), &nout, nil, 0); ret != nil {
+		return false
+	}
+	return out > 0
+}
+
+//go:cgo_import_dynamic libc_sysctl sysctl "/usr/lib/libSystem.B.dylib"
+
+var libc_sysctlbyname_trampoline_addr uintptr
+
+// adapted from runtime/sys_darwin.go in the pattern of sysctl() above, as defined in x/sys/unix
+func sysctlbyname(name *byte, old *byte, oldlen *uintptr, new *byte, newlen uintptr) error {
+	if _, _, err := syscall_syscall6(
+		libc_sysctlbyname_trampoline_addr,
+		uintptr(unsafe.Pointer(name)),
+		uintptr(unsafe.Pointer(old)),
+		uintptr(unsafe.Pointer(oldlen)),
+		uintptr(unsafe.Pointer(new)),
+		uintptr(newlen),
+		0,
+	); err != 0 {
+		return err
+	}
+
+	return nil
+}
+
+//go:cgo_import_dynamic libc_sysctlbyname sysctlbyname "/usr/lib/libSystem.B.dylib"
+
+// Implemented in the runtime package (runtime/sys_darwin.go)
+func syscall_syscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
+//go:linkname syscall_syscall6 syscall.syscall6
diff --git a/vendor/golang.org/x/sys/unix/README.md b/vendor/golang.org/x/sys/unix/README.md
index 7d3c060e12213c48b017dfd2846fb9b1cf413d31..6e08a76a716e9acd766c9cda4ea56d607b481470 100644
--- a/vendor/golang.org/x/sys/unix/README.md
+++ b/vendor/golang.org/x/sys/unix/README.md
@@ -156,7 +156,7 @@ from the generated architecture-specific files listed below, and merge these
 into a common file for each OS.
 
 The merge is performed in the following steps:
-1. Construct the set of common code that is idential in all architecture-specific files.
+1. Construct the set of common code that is identical in all architecture-specific files.
 2. Write this common code to the merged file.
 3. Remove the common code from all architecture-specific files.
 
diff --git a/vendor/golang.org/x/sys/unix/ioctl_linux.go b/vendor/golang.org/x/sys/unix/ioctl_linux.go
index dbe680eab88a9e281759b0b9db9cf1ff708b31d8..7ca4fa12aa673c6bbdaaac287ea68eb43408ff03 100644
--- a/vendor/golang.org/x/sys/unix/ioctl_linux.go
+++ b/vendor/golang.org/x/sys/unix/ioctl_linux.go
@@ -58,6 +58,102 @@ func IoctlGetEthtoolDrvinfo(fd int, ifname string) (*EthtoolDrvinfo, error) {
 	return &value, err
 }
 
+// IoctlGetEthtoolTsInfo fetches ethtool timestamping and PHC
+// association for the network device specified by ifname.
+func IoctlGetEthtoolTsInfo(fd int, ifname string) (*EthtoolTsInfo, error) {
+	ifr, err := NewIfreq(ifname)
+	if err != nil {
+		return nil, err
+	}
+
+	value := EthtoolTsInfo{Cmd: ETHTOOL_GET_TS_INFO}
+	ifrd := ifr.withData(unsafe.Pointer(&value))
+
+	err = ioctlIfreqData(fd, SIOCETHTOOL, &ifrd)
+	return &value, err
+}
+
+// IoctlGetHwTstamp retrieves the hardware timestamping configuration
+// for the network device specified by ifname.
+func IoctlGetHwTstamp(fd int, ifname string) (*HwTstampConfig, error) {
+	ifr, err := NewIfreq(ifname)
+	if err != nil {
+		return nil, err
+	}
+
+	value := HwTstampConfig{}
+	ifrd := ifr.withData(unsafe.Pointer(&value))
+
+	err = ioctlIfreqData(fd, SIOCGHWTSTAMP, &ifrd)
+	return &value, err
+}
+
+// IoctlSetHwTstamp updates the hardware timestamping configuration for
+// the network device specified by ifname.
+func IoctlSetHwTstamp(fd int, ifname string, cfg *HwTstampConfig) error {
+	ifr, err := NewIfreq(ifname)
+	if err != nil {
+		return err
+	}
+	ifrd := ifr.withData(unsafe.Pointer(cfg))
+	return ioctlIfreqData(fd, SIOCSHWTSTAMP, &ifrd)
+}
+
+// FdToClockID derives the clock ID from the file descriptor number
+// - see clock_gettime(3), FD_TO_CLOCKID macros. The resulting ID is
+// suitable for system calls like ClockGettime.
+func FdToClockID(fd int) int32 { return int32((int(^fd) << 3) | 3) }
+
+// IoctlPtpClockGetcaps returns the description of a given PTP device.
+func IoctlPtpClockGetcaps(fd int) (*PtpClockCaps, error) {
+	var value PtpClockCaps
+	err := ioctlPtr(fd, PTP_CLOCK_GETCAPS2, unsafe.Pointer(&value))
+	return &value, err
+}
+
+// IoctlPtpSysOffsetPrecise returns a description of the clock
+// offset compared to the system clock.
+func IoctlPtpSysOffsetPrecise(fd int) (*PtpSysOffsetPrecise, error) {
+	var value PtpSysOffsetPrecise
+	err := ioctlPtr(fd, PTP_SYS_OFFSET_PRECISE2, unsafe.Pointer(&value))
+	return &value, err
+}
+
+// IoctlPtpSysOffsetExtended returns an extended description of the
+// clock offset compared to the system clock. The samples parameter
+// specifies the desired number of measurements.
+func IoctlPtpSysOffsetExtended(fd int, samples uint) (*PtpSysOffsetExtended, error) {
+	value := PtpSysOffsetExtended{Samples: uint32(samples)}
+	err := ioctlPtr(fd, PTP_SYS_OFFSET_EXTENDED2, unsafe.Pointer(&value))
+	return &value, err
+}
+
+// IoctlPtpPinGetfunc returns the configuration of the specified
+// I/O pin on given PTP device.
+func IoctlPtpPinGetfunc(fd int, index uint) (*PtpPinDesc, error) {
+	value := PtpPinDesc{Index: uint32(index)}
+	err := ioctlPtr(fd, PTP_PIN_GETFUNC2, unsafe.Pointer(&value))
+	return &value, err
+}
+
+// IoctlPtpPinSetfunc updates configuration of the specified PTP
+// I/O pin.
+func IoctlPtpPinSetfunc(fd int, pd *PtpPinDesc) error {
+	return ioctlPtr(fd, PTP_PIN_SETFUNC2, unsafe.Pointer(pd))
+}
+
+// IoctlPtpPeroutRequest configures the periodic output mode of the
+// PTP I/O pins.
+func IoctlPtpPeroutRequest(fd int, r *PtpPeroutRequest) error {
+	return ioctlPtr(fd, PTP_PEROUT_REQUEST2, unsafe.Pointer(r))
+}
+
+// IoctlPtpExttsRequest configures the external timestamping mode
+// of the PTP I/O pins.
+func IoctlPtpExttsRequest(fd int, r *PtpExttsRequest) error {
+	return ioctlPtr(fd, PTP_EXTTS_REQUEST2, unsafe.Pointer(r))
+}
+
 // IoctlGetWatchdogInfo fetches information about a watchdog device from the
 // Linux watchdog API. For more information, see:
 // https://www.kernel.org/doc/html/latest/watchdog/watchdog-api.html.
diff --git a/vendor/golang.org/x/sys/unix/mkerrors.sh b/vendor/golang.org/x/sys/unix/mkerrors.sh
index d07dd09eb503c2d10c6b44da922b0f37566bd8ba..6ab02b6c3122afd5e2ea1a7cf7f31a43342746fb 100644
--- a/vendor/golang.org/x/sys/unix/mkerrors.sh
+++ b/vendor/golang.org/x/sys/unix/mkerrors.sh
@@ -158,6 +158,16 @@ includes_Linux='
 #endif
 #define _GNU_SOURCE
 
+// See the description in unix/linux/types.go
+#if defined(__ARM_EABI__) || \
+	(defined(__mips__) && (_MIPS_SIM == _ABIO32)) || \
+	(defined(__powerpc__) && (!defined(__powerpc64__)))
+# ifdef   _TIME_BITS
+#  undef  _TIME_BITS
+# endif
+# define  _TIME_BITS 32
+#endif
+
 // <sys/ioctl.h> is broken on powerpc64, as it fails to include definitions of
 // these structures. We just include them copied from <bits/termios.h>.
 #if defined(__powerpc__)
@@ -256,6 +266,7 @@ struct ltchars {
 #include <linux/nsfs.h>
 #include <linux/perf_event.h>
 #include <linux/pps.h>
+#include <linux/ptp_clock.h>
 #include <linux/ptrace.h>
 #include <linux/random.h>
 #include <linux/reboot.h>
@@ -527,6 +538,7 @@ ccflags="$@"
 		$2 ~ /^(AF|SOCK|SO|SOL|IPPROTO|IP|IPV6|TCP|MCAST|EVFILT|NOTE|SHUT|PROT|MAP|MREMAP|MFD|T?PACKET|MSG|SCM|MCL|DT|MADV|PR|LOCAL|TCPOPT|UDP)_/ ||
 		$2 ~ /^NFC_(GENL|PROTO|COMM|RF|SE|DIRECTION|LLCP|SOCKPROTO)_/ ||
 		$2 ~ /^NFC_.*_(MAX)?SIZE$/ ||
+		$2 ~ /^PTP_/ ||
 		$2 ~ /^RAW_PAYLOAD_/ ||
 		$2 ~ /^[US]F_/ ||
 		$2 ~ /^TP_STATUS_/ ||
@@ -552,6 +564,7 @@ ccflags="$@"
 		$2 !~ /^RTC_VL_(ACCURACY|BACKUP|DATA)/ &&
 		$2 ~ /^(NETLINK|NLM|NLMSG|NLA|IFA|IFAN|RT|RTC|RTCF|RTN|RTPROT|RTNH|ARPHRD|ETH_P|NETNSA)_/ ||
 		$2 ~ /^SOCK_|SK_DIAG_|SKNLGRP_$/ ||
+		$2 ~ /^(CONNECT|SAE)_/ ||
 		$2 ~ /^FIORDCHK$/ ||
 		$2 ~ /^SIOC/ ||
 		$2 ~ /^TIOC/ ||
@@ -655,7 +668,7 @@ errors=$(
 signals=$(
 	echo '#include <signal.h>' | $CC -x c - -E -dM $ccflags |
 	awk '$1=="#define" && $2 ~ /^SIG[A-Z0-9]+$/ { print $2 }' |
-	grep -v 'SIGSTKSIZE\|SIGSTKSZ\|SIGRT\|SIGMAX64' |
+	grep -E -v '(SIGSTKSIZE|SIGSTKSZ|SIGRT|SIGMAX64)' |
 	sort
 )
 
@@ -665,7 +678,7 @@ echo '#include <errno.h>' | $CC -x c - -E -dM $ccflags |
 	sort >_error.grep
 echo '#include <signal.h>' | $CC -x c - -E -dM $ccflags |
 	awk '$1=="#define" && $2 ~ /^SIG[A-Z0-9]+$/ { print "^\t" $2 "[ \t]*=" }' |
-	grep -v 'SIGSTKSIZE\|SIGSTKSZ\|SIGRT\|SIGMAX64' |
+	grep -E -v '(SIGSTKSIZE|SIGSTKSZ|SIGRT|SIGMAX64)' |
 	sort >_signal.grep
 
 echo '// mkerrors.sh' "$@"
diff --git a/vendor/golang.org/x/sys/unix/syscall_aix.go b/vendor/golang.org/x/sys/unix/syscall_aix.go
index 67ce6cef2d5c4990629b90eb43d888b90612d671..6f15ba1eaff6571878742b4d058e084f30afa6f2 100644
--- a/vendor/golang.org/x/sys/unix/syscall_aix.go
+++ b/vendor/golang.org/x/sys/unix/syscall_aix.go
@@ -360,7 +360,7 @@ func Wait4(pid int, wstatus *WaitStatus, options int, rusage *Rusage) (wpid int,
 	var status _C_int
 	var r Pid_t
 	err = ERESTART
-	// AIX wait4 may return with ERESTART errno, while the processus is still
+	// AIX wait4 may return with ERESTART errno, while the process is still
 	// active.
 	for err == ERESTART {
 		r, err = wait4(Pid_t(pid), &status, options, rusage)
diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.go b/vendor/golang.org/x/sys/unix/syscall_darwin.go
index 2d15200adb497cc460d26dc8803a21b6d1eef532..099867deedec638201fadb9eb6241893e8761dc3 100644
--- a/vendor/golang.org/x/sys/unix/syscall_darwin.go
+++ b/vendor/golang.org/x/sys/unix/syscall_darwin.go
@@ -566,6 +566,43 @@ func PthreadFchdir(fd int) (err error) {
 	return pthread_fchdir_np(fd)
 }
 
+// Connectx calls connectx(2) to initiate a connection on a socket.
+//
+// srcIf, srcAddr, and dstAddr are filled into a [SaEndpoints] struct and passed as the endpoints argument.
+//
+//   - srcIf is the optional source interface index. 0 means unspecified.
+//   - srcAddr is the optional source address. nil means unspecified.
+//   - dstAddr is the destination address.
+//
+// On success, Connectx returns the number of bytes enqueued for transmission.
+func Connectx(fd int, srcIf uint32, srcAddr, dstAddr Sockaddr, associd SaeAssocID, flags uint32, iov []Iovec, connid *SaeConnID) (n uintptr, err error) {
+	endpoints := SaEndpoints{
+		Srcif: srcIf,
+	}
+
+	if srcAddr != nil {
+		addrp, addrlen, err := srcAddr.sockaddr()
+		if err != nil {
+			return 0, err
+		}
+		endpoints.Srcaddr = (*RawSockaddr)(addrp)
+		endpoints.Srcaddrlen = uint32(addrlen)
+	}
+
+	if dstAddr != nil {
+		addrp, addrlen, err := dstAddr.sockaddr()
+		if err != nil {
+			return 0, err
+		}
+		endpoints.Dstaddr = (*RawSockaddr)(addrp)
+		endpoints.Dstaddrlen = uint32(addrlen)
+	}
+
+	err = connectx(fd, &endpoints, associd, flags, iov, &n, connid)
+	return
+}
+
+//sys	connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error)
 //sys	sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error)
 
 //sys	shmat(id int, addr uintptr, flag int) (ret uintptr, err error)
diff --git a/vendor/golang.org/x/sys/unix/syscall_hurd.go b/vendor/golang.org/x/sys/unix/syscall_hurd.go
index ba46651f8e38a4aaddbbfc5c3f1de1d006af109e..a6a2d2fc2b90c5c8f0efdb88bb72bc8997400ad9 100644
--- a/vendor/golang.org/x/sys/unix/syscall_hurd.go
+++ b/vendor/golang.org/x/sys/unix/syscall_hurd.go
@@ -11,6 +11,7 @@ package unix
 int ioctl(int, unsigned long int, uintptr_t);
 */
 import "C"
+import "unsafe"
 
 func ioctl(fd int, req uint, arg uintptr) (err error) {
 	r0, er := C.ioctl(C.int(fd), C.ulong(req), C.uintptr_t(arg))
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux.go b/vendor/golang.org/x/sys/unix/syscall_linux.go
index 3f1d3d4cb2560b171bd7414fb6cbe6abcac17df6..230a94549a7a278527848109714e2e3285888732 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux.go
@@ -1295,6 +1295,48 @@ func GetsockoptTCPInfo(fd, level, opt int) (*TCPInfo, error) {
 	return &value, err
 }
 
+// GetsockoptTCPCCVegasInfo returns algorithm specific congestion control information for a socket using the "vegas"
+// algorithm.
+//
+// The socket's congestion control algorighm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option:
+//
+//	algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION)
+func GetsockoptTCPCCVegasInfo(fd, level, opt int) (*TCPVegasInfo, error) {
+	var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment
+	vallen := _Socklen(SizeofTCPCCInfo)
+	err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen)
+	out := (*TCPVegasInfo)(unsafe.Pointer(&value[0]))
+	return out, err
+}
+
+// GetsockoptTCPCCDCTCPInfo returns algorithm specific congestion control information for a socket using the "dctp"
+// algorithm.
+//
+// The socket's congestion control algorighm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option:
+//
+//	algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION)
+func GetsockoptTCPCCDCTCPInfo(fd, level, opt int) (*TCPDCTCPInfo, error) {
+	var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment
+	vallen := _Socklen(SizeofTCPCCInfo)
+	err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen)
+	out := (*TCPDCTCPInfo)(unsafe.Pointer(&value[0]))
+	return out, err
+}
+
+// GetsockoptTCPCCBBRInfo returns algorithm specific congestion control information for a socket using the "bbr"
+// algorithm.
+//
+// The socket's congestion control algorighm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option:
+//
+//	algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION)
+func GetsockoptTCPCCBBRInfo(fd, level, opt int) (*TCPBBRInfo, error) {
+	var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment
+	vallen := _Socklen(SizeofTCPCCInfo)
+	err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen)
+	out := (*TCPBBRInfo)(unsafe.Pointer(&value[0]))
+	return out, err
+}
+
 // GetsockoptString returns the string value of the socket option opt for the
 // socket associated with fd at the given socket level.
 func GetsockoptString(fd, level, opt int) (string, error) {
@@ -1818,6 +1860,7 @@ func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err e
 //sys	ClockAdjtime(clockid int32, buf *Timex) (state int, err error)
 //sys	ClockGetres(clockid int32, res *Timespec) (err error)
 //sys	ClockGettime(clockid int32, time *Timespec) (err error)
+//sys	ClockSettime(clockid int32, time *Timespec) (err error)
 //sys	ClockNanosleep(clockid int32, flags int, request *Timespec, remain *Timespec) (err error)
 //sys	Close(fd int) (err error)
 //sys	CloseRange(first uint, last uint, flags uint) (err error)
@@ -1959,7 +2002,26 @@ func Getpgrp() (pid int) {
 //sysnb	Getpid() (pid int)
 //sysnb	Getppid() (ppid int)
 //sys	Getpriority(which int, who int) (prio int, err error)
-//sys	Getrandom(buf []byte, flags int) (n int, err error)
+
+func Getrandom(buf []byte, flags int) (n int, err error) {
+	vdsoRet, supported := vgetrandom(buf, uint32(flags))
+	if supported {
+		if vdsoRet < 0 {
+			return 0, errnoErr(syscall.Errno(-vdsoRet))
+		}
+		return vdsoRet, nil
+	}
+	var p *byte
+	if len(buf) > 0 {
+		p = &buf[0]
+	}
+	r, _, e := Syscall(SYS_GETRANDOM, uintptr(unsafe.Pointer(p)), uintptr(len(buf)), uintptr(flags))
+	if e != 0 {
+		return 0, errnoErr(e)
+	}
+	return int(r), nil
+}
+
 //sysnb	Getrusage(who int, rusage *Rusage) (err error)
 //sysnb	Getsid(pid int) (sid int, err error)
 //sysnb	Gettid() (tid int)
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go
index cf2ee6c75ef3d3905f8a6575b00500bc246d483d..745e5c7e6c0d5d6cde926549e394f8c541e7371b 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go
@@ -182,3 +182,5 @@ func KexecFileLoad(kernelFd int, initrdFd int, cmdline string, flags int) error
 	}
 	return kexecFileLoad(kernelFd, initrdFd, cmdlineLen, cmdline, flags)
 }
+
+const SYS_FSTATAT = SYS_NEWFSTATAT
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go
index 3d0e98451f8a78933ecf08740b1f6e744f0d627a..dd2262a40799a2cdeb02e40a4526aa8b3533c258 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go
@@ -214,3 +214,5 @@ func KexecFileLoad(kernelFd int, initrdFd int, cmdline string, flags int) error
 	}
 	return kexecFileLoad(kernelFd, initrdFd, cmdlineLen, cmdline, flags)
 }
+
+const SYS_FSTATAT = SYS_NEWFSTATAT
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go
index 6f5a288944dfe604dd0f33b5b32e5d91b827c3f1..8cf3670bda630e22ea33151e00c3895c72dda69c 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go
@@ -187,3 +187,5 @@ func RISCVHWProbe(pairs []RISCVHWProbePairs, set *CPUSet, flags uint) (err error
 	}
 	return riscvHWProbe(pairs, setSize, set, flags)
 }
+
+const SYS_FSTATAT = SYS_NEWFSTATAT
diff --git a/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go b/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go
index 312ae6ac1d21abb1b847d1036e21f20d877876b2..7bf5c04bb0ae0d539b6082d0b5d60f3f4202f74a 100644
--- a/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go
@@ -768,6 +768,15 @@ func Munmap(b []byte) (err error) {
 	return mapper.Munmap(b)
 }
 
+func MmapPtr(fd int, offset int64, addr unsafe.Pointer, length uintptr, prot int, flags int) (ret unsafe.Pointer, err error) {
+	xaddr, err := mapper.mmap(uintptr(addr), length, prot, flags, fd, offset)
+	return unsafe.Pointer(xaddr), err
+}
+
+func MunmapPtr(addr unsafe.Pointer, length uintptr) (err error) {
+	return mapper.munmap(uintptr(addr), length)
+}
+
 //sys   Gethostname(buf []byte) (err error) = SYS___GETHOSTNAME_A
 //sysnb	Getgid() (gid int)
 //sysnb	Getpid() (pid int)
@@ -816,10 +825,10 @@ func Lstat(path string, stat *Stat_t) (err error) {
 // for checking symlinks begins with $VERSION/ $SYSNAME/ $SYSSYMR/ $SYSSYMA/
 func isSpecialPath(path []byte) (v bool) {
 	var special = [4][8]byte{
-		[8]byte{'V', 'E', 'R', 'S', 'I', 'O', 'N', '/'},
-		[8]byte{'S', 'Y', 'S', 'N', 'A', 'M', 'E', '/'},
-		[8]byte{'S', 'Y', 'S', 'S', 'Y', 'M', 'R', '/'},
-		[8]byte{'S', 'Y', 'S', 'S', 'Y', 'M', 'A', '/'}}
+		{'V', 'E', 'R', 'S', 'I', 'O', 'N', '/'},
+		{'S', 'Y', 'S', 'N', 'A', 'M', 'E', '/'},
+		{'S', 'Y', 'S', 'S', 'Y', 'M', 'R', '/'},
+		{'S', 'Y', 'S', 'S', 'Y', 'M', 'A', '/'}}
 
 	var i, j int
 	for i = 0; i < len(special); i++ {
@@ -3115,3 +3124,90 @@ func legacy_Mkfifoat(dirfd int, path string, mode uint32) (err error) {
 //sys	Posix_openpt(oflag int) (fd int, err error) = SYS_POSIX_OPENPT
 //sys	Grantpt(fildes int) (rc int, err error) = SYS_GRANTPT
 //sys	Unlockpt(fildes int) (rc int, err error) = SYS_UNLOCKPT
+
+func fcntlAsIs(fd uintptr, cmd int, arg uintptr) (val int, err error) {
+	runtime.EnterSyscall()
+	r0, e2, e1 := CallLeFuncWithErr(GetZosLibVec()+SYS_FCNTL<<4, uintptr(fd), uintptr(cmd), arg)
+	runtime.ExitSyscall()
+	val = int(r0)
+	if int64(r0) == -1 {
+		err = errnoErr2(e1, e2)
+	}
+	return
+}
+
+func Fcntl(fd uintptr, cmd int, op interface{}) (ret int, err error) {
+	switch op.(type) {
+	case *Flock_t:
+		err = FcntlFlock(fd, cmd, op.(*Flock_t))
+		if err != nil {
+			ret = -1
+		}
+		return
+	case int:
+		return FcntlInt(fd, cmd, op.(int))
+	case *F_cnvrt:
+		return fcntlAsIs(fd, cmd, uintptr(unsafe.Pointer(op.(*F_cnvrt))))
+	case unsafe.Pointer:
+		return fcntlAsIs(fd, cmd, uintptr(op.(unsafe.Pointer)))
+	default:
+		return -1, EINVAL
+	}
+	return
+}
+
+func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) {
+	if raceenabled {
+		raceReleaseMerge(unsafe.Pointer(&ioSync))
+	}
+	return sendfile(outfd, infd, offset, count)
+}
+
+func sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) {
+	// TODO: use LE call instead if the call is implemented
+	originalOffset, err := Seek(infd, 0, SEEK_CUR)
+	if err != nil {
+		return -1, err
+	}
+	//start reading data from in_fd
+	if offset != nil {
+		_, err := Seek(infd, *offset, SEEK_SET)
+		if err != nil {
+			return -1, err
+		}
+	}
+
+	buf := make([]byte, count)
+	readBuf := make([]byte, 0)
+	var n int = 0
+	for i := 0; i < count; i += n {
+		n, err := Read(infd, buf)
+		if n == 0 {
+			if err != nil {
+				return -1, err
+			} else { // EOF
+				break
+			}
+		}
+		readBuf = append(readBuf, buf...)
+		buf = buf[0:0]
+	}
+
+	n2, err := Write(outfd, readBuf)
+	if err != nil {
+		return -1, err
+	}
+
+	//When sendfile() returns, this variable will be set to the
+	// offset of the byte following the last byte that was read.
+	if offset != nil {
+		*offset = *offset + int64(n)
+		// If offset is not NULL, then sendfile() does not modify the file
+		// offset of in_fd
+		_, err := Seek(infd, originalOffset, SEEK_SET)
+		if err != nil {
+			return -1, err
+		}
+	}
+	return n2, nil
+}
diff --git a/vendor/golang.org/x/sys/unix/vgetrandom_linux.go b/vendor/golang.org/x/sys/unix/vgetrandom_linux.go
new file mode 100644
index 0000000000000000000000000000000000000000..07ac8e09d1b702b1abd56252d7c6d1135580f559
--- /dev/null
+++ b/vendor/golang.org/x/sys/unix/vgetrandom_linux.go
@@ -0,0 +1,13 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build linux && go1.24
+
+package unix
+
+import _ "unsafe"
+
+//go:linkname vgetrandom runtime.vgetrandom
+//go:noescape
+func vgetrandom(p []byte, flags uint32) (ret int, supported bool)
diff --git a/vendor/golang.org/x/sys/unix/vgetrandom_unsupported.go b/vendor/golang.org/x/sys/unix/vgetrandom_unsupported.go
new file mode 100644
index 0000000000000000000000000000000000000000..297e97bce92a6ebc7eeebc067685a2587b5f7e63
--- /dev/null
+++ b/vendor/golang.org/x/sys/unix/vgetrandom_unsupported.go
@@ -0,0 +1,11 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !linux || !go1.24
+
+package unix
+
+func vgetrandom(p []byte, flags uint32) (ret int, supported bool) {
+	return -1, false
+}
diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
index 4308ac1772b9fa149be0e641158e67cc34ab55e3..d73c4652e6c549ba5214cbc249eaa72274f6ae4a 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
@@ -237,6 +237,9 @@ const (
 	CLOCK_UPTIME_RAW_APPROX                 = 0x9
 	CLONE_NOFOLLOW                          = 0x1
 	CLONE_NOOWNERCOPY                       = 0x2
+	CONNECT_DATA_AUTHENTICATED              = 0x4
+	CONNECT_DATA_IDEMPOTENT                 = 0x2
+	CONNECT_RESUME_ON_READ_WRITE            = 0x1
 	CR0                                     = 0x0
 	CR1                                     = 0x1000
 	CR2                                     = 0x2000
@@ -1265,6 +1268,10 @@ const (
 	RTV_SSTHRESH                            = 0x20
 	RUSAGE_CHILDREN                         = -0x1
 	RUSAGE_SELF                             = 0x0
+	SAE_ASSOCID_ALL                         = 0xffffffff
+	SAE_ASSOCID_ANY                         = 0x0
+	SAE_CONNID_ALL                          = 0xffffffff
+	SAE_CONNID_ANY                          = 0x0
 	SCM_CREDS                               = 0x3
 	SCM_RIGHTS                              = 0x1
 	SCM_TIMESTAMP                           = 0x2
diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
index c8068a7a16900487f318022c268b7e2484770ee2..4a55a400588946f6f5aa4d3c472f146c9fa938a5 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
@@ -237,6 +237,9 @@ const (
 	CLOCK_UPTIME_RAW_APPROX                 = 0x9
 	CLONE_NOFOLLOW                          = 0x1
 	CLONE_NOOWNERCOPY                       = 0x2
+	CONNECT_DATA_AUTHENTICATED              = 0x4
+	CONNECT_DATA_IDEMPOTENT                 = 0x2
+	CONNECT_RESUME_ON_READ_WRITE            = 0x1
 	CR0                                     = 0x0
 	CR1                                     = 0x1000
 	CR2                                     = 0x2000
@@ -1265,6 +1268,10 @@ const (
 	RTV_SSTHRESH                            = 0x20
 	RUSAGE_CHILDREN                         = -0x1
 	RUSAGE_SELF                             = 0x0
+	SAE_ASSOCID_ALL                         = 0xffffffff
+	SAE_ASSOCID_ANY                         = 0x0
+	SAE_CONNID_ALL                          = 0xffffffff
+	SAE_CONNID_ANY                          = 0x0
 	SCM_CREDS                               = 0x3
 	SCM_RIGHTS                              = 0x1
 	SCM_TIMESTAMP                           = 0x2
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux.go b/vendor/golang.org/x/sys/unix/zerrors_linux.go
index 01a70b24638e60cde45588ff39ee6bfd1d915a21..6ebc48b3fecd71f891b0aeff1f2f9c8110b9e88d 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux.go
@@ -321,6 +321,9 @@ const (
 	AUDIT_INTEGRITY_STATUS                      = 0x70a
 	AUDIT_IPC                                   = 0x517
 	AUDIT_IPC_SET_PERM                          = 0x51f
+	AUDIT_IPE_ACCESS                            = 0x58c
+	AUDIT_IPE_CONFIG_CHANGE                     = 0x58d
+	AUDIT_IPE_POLICY_LOAD                       = 0x58e
 	AUDIT_KERNEL                                = 0x7d0
 	AUDIT_KERNEL_OTHER                          = 0x524
 	AUDIT_KERN_MODULE                           = 0x532
@@ -489,12 +492,14 @@ const (
 	BPF_F_ID                                    = 0x20
 	BPF_F_NETFILTER_IP_DEFRAG                   = 0x1
 	BPF_F_QUERY_EFFECTIVE                       = 0x1
+	BPF_F_REDIRECT_FLAGS                        = 0x19
 	BPF_F_REPLACE                               = 0x4
 	BPF_F_SLEEPABLE                             = 0x10
 	BPF_F_STRICT_ALIGNMENT                      = 0x1
 	BPF_F_TEST_REG_INVARIANTS                   = 0x80
 	BPF_F_TEST_RND_HI32                         = 0x4
 	BPF_F_TEST_RUN_ON_CPU                       = 0x1
+	BPF_F_TEST_SKB_CHECKSUM_COMPLETE            = 0x4
 	BPF_F_TEST_STATE_FREQ                       = 0x8
 	BPF_F_TEST_XDP_LIVE_FRAMES                  = 0x2
 	BPF_F_XDP_DEV_BOUND_ONLY                    = 0x40
@@ -1165,6 +1170,7 @@ const (
 	EXTA                                        = 0xe
 	EXTB                                        = 0xf
 	F2FS_SUPER_MAGIC                            = 0xf2f52010
+	FALLOC_FL_ALLOCATE_RANGE                    = 0x0
 	FALLOC_FL_COLLAPSE_RANGE                    = 0x8
 	FALLOC_FL_INSERT_RANGE                      = 0x20
 	FALLOC_FL_KEEP_SIZE                         = 0x1
@@ -1798,6 +1804,8 @@ const (
 	LANDLOCK_ACCESS_NET_BIND_TCP                = 0x1
 	LANDLOCK_ACCESS_NET_CONNECT_TCP             = 0x2
 	LANDLOCK_CREATE_RULESET_VERSION             = 0x1
+	LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET         = 0x1
+	LANDLOCK_SCOPE_SIGNAL                       = 0x2
 	LINUX_REBOOT_CMD_CAD_OFF                    = 0x0
 	LINUX_REBOOT_CMD_CAD_ON                     = 0x89abcdef
 	LINUX_REBOOT_CMD_HALT                       = 0xcdef0123
@@ -1922,6 +1930,8 @@ const (
 	MNT_EXPIRE                                  = 0x4
 	MNT_FORCE                                   = 0x1
 	MNT_ID_REQ_SIZE_VER0                        = 0x18
+	MNT_ID_REQ_SIZE_VER1                        = 0x20
+	MNT_NS_INFO_SIZE_VER0                       = 0x10
 	MODULE_INIT_COMPRESSED_FILE                 = 0x4
 	MODULE_INIT_IGNORE_MODVERSIONS              = 0x1
 	MODULE_INIT_IGNORE_VERMAGIC                 = 0x2
@@ -2187,7 +2197,7 @@ const (
 	NFT_REG_SIZE                                = 0x10
 	NFT_REJECT_ICMPX_MAX                        = 0x3
 	NFT_RT_MAX                                  = 0x4
-	NFT_SECMARK_CTX_MAXLEN                      = 0x100
+	NFT_SECMARK_CTX_MAXLEN                      = 0x1000
 	NFT_SET_MAXNAMELEN                          = 0x100
 	NFT_SOCKET_MAX                              = 0x3
 	NFT_TABLE_F_MASK                            = 0x7
@@ -2356,9 +2366,11 @@ const (
 	PERF_MEM_LVLNUM_IO                          = 0xa
 	PERF_MEM_LVLNUM_L1                          = 0x1
 	PERF_MEM_LVLNUM_L2                          = 0x2
+	PERF_MEM_LVLNUM_L2_MHB                      = 0x5
 	PERF_MEM_LVLNUM_L3                          = 0x3
 	PERF_MEM_LVLNUM_L4                          = 0x4
 	PERF_MEM_LVLNUM_LFB                         = 0xc
+	PERF_MEM_LVLNUM_MSC                         = 0x6
 	PERF_MEM_LVLNUM_NA                          = 0xf
 	PERF_MEM_LVLNUM_PMEM                        = 0xe
 	PERF_MEM_LVLNUM_RAM                         = 0xd
@@ -2431,6 +2443,7 @@ const (
 	PRIO_PGRP                                   = 0x1
 	PRIO_PROCESS                                = 0x0
 	PRIO_USER                                   = 0x2
+	PROCFS_IOCTL_MAGIC                          = 'f'
 	PROC_SUPER_MAGIC                            = 0x9fa0
 	PROT_EXEC                                   = 0x4
 	PROT_GROWSDOWN                              = 0x1000000
@@ -2620,6 +2633,28 @@ const (
 	PR_UNALIGN_NOPRINT                          = 0x1
 	PR_UNALIGN_SIGBUS                           = 0x2
 	PSTOREFS_MAGIC                              = 0x6165676c
+	PTP_CLK_MAGIC                               = '='
+	PTP_ENABLE_FEATURE                          = 0x1
+	PTP_EXTTS_EDGES                             = 0x6
+	PTP_EXTTS_EVENT_VALID                       = 0x1
+	PTP_EXTTS_V1_VALID_FLAGS                    = 0x7
+	PTP_EXTTS_VALID_FLAGS                       = 0x1f
+	PTP_EXT_OFFSET                              = 0x10
+	PTP_FALLING_EDGE                            = 0x4
+	PTP_MAX_SAMPLES                             = 0x19
+	PTP_PEROUT_DUTY_CYCLE                       = 0x2
+	PTP_PEROUT_ONE_SHOT                         = 0x1
+	PTP_PEROUT_PHASE                            = 0x4
+	PTP_PEROUT_V1_VALID_FLAGS                   = 0x0
+	PTP_PEROUT_VALID_FLAGS                      = 0x7
+	PTP_PIN_GETFUNC                             = 0xc0603d06
+	PTP_PIN_GETFUNC2                            = 0xc0603d0f
+	PTP_RISING_EDGE                             = 0x2
+	PTP_STRICT_FLAGS                            = 0x8
+	PTP_SYS_OFFSET_EXTENDED                     = 0xc4c03d09
+	PTP_SYS_OFFSET_EXTENDED2                    = 0xc4c03d12
+	PTP_SYS_OFFSET_PRECISE                      = 0xc0403d08
+	PTP_SYS_OFFSET_PRECISE2                     = 0xc0403d11
 	PTRACE_ATTACH                               = 0x10
 	PTRACE_CONT                                 = 0x7
 	PTRACE_DETACH                               = 0x11
@@ -2933,15 +2968,17 @@ const (
 	RUSAGE_SELF                                 = 0x0
 	RUSAGE_THREAD                               = 0x1
 	RWF_APPEND                                  = 0x10
+	RWF_ATOMIC                                  = 0x40
 	RWF_DSYNC                                   = 0x2
 	RWF_HIPRI                                   = 0x1
 	RWF_NOAPPEND                                = 0x20
 	RWF_NOWAIT                                  = 0x8
-	RWF_SUPPORTED                               = 0x3f
+	RWF_SUPPORTED                               = 0x7f
 	RWF_SYNC                                    = 0x4
 	RWF_WRITE_LIFE_NOT_SET                      = 0x0
 	SCHED_BATCH                                 = 0x3
 	SCHED_DEADLINE                              = 0x6
+	SCHED_EXT                                   = 0x7
 	SCHED_FIFO                                  = 0x1
 	SCHED_FLAG_ALL                              = 0x7f
 	SCHED_FLAG_DL_OVERRUN                       = 0x4
@@ -3210,6 +3247,7 @@ const (
 	STATX_ATTR_MOUNT_ROOT                       = 0x2000
 	STATX_ATTR_NODUMP                           = 0x40
 	STATX_ATTR_VERITY                           = 0x100000
+	STATX_ATTR_WRITE_ATOMIC                     = 0x400000
 	STATX_BASIC_STATS                           = 0x7ff
 	STATX_BLOCKS                                = 0x400
 	STATX_BTIME                                 = 0x800
@@ -3226,6 +3264,7 @@ const (
 	STATX_SUBVOL                                = 0x8000
 	STATX_TYPE                                  = 0x1
 	STATX_UID                                   = 0x8
+	STATX_WRITE_ATOMIC                          = 0x10000
 	STATX__RESERVED                             = 0x80000000
 	SYNC_FILE_RANGE_WAIT_AFTER                  = 0x4
 	SYNC_FILE_RANGE_WAIT_BEFORE                 = 0x1
@@ -3624,6 +3663,7 @@ const (
 	XDP_UMEM_PGOFF_COMPLETION_RING              = 0x180000000
 	XDP_UMEM_PGOFF_FILL_RING                    = 0x100000000
 	XDP_UMEM_REG                                = 0x4
+	XDP_UMEM_TX_METADATA_LEN                    = 0x4
 	XDP_UMEM_TX_SW_CSUM                         = 0x2
 	XDP_UMEM_UNALIGNED_CHUNK_FLAG               = 0x1
 	XDP_USE_NEED_WAKEUP                         = 0x8
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go
index 684a5168dac4e3a9cbc4bb6c0bc3dc8259371f06..c0d45e320505ff6424711bbec395223c8d4ec450 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go
@@ -109,6 +109,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -153,9 +154,14 @@ const (
 	NFDBITS                          = 0x20
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -232,6 +238,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETFPXREGS                = 0x12
 	PTRACE_GET_THREAD_AREA           = 0x19
@@ -278,6 +298,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -316,6 +338,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
index 61d74b592d68647f5ec4daffe64daca8d8ff0870..c731d24f02529136a59cf292fcfc27673e1c1396 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
@@ -109,6 +109,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -153,9 +154,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -232,6 +238,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_ARCH_PRCTL                = 0x1e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETFPXREGS                = 0x12
@@ -279,6 +299,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -317,6 +339,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go
index a28c9e3e893adb63c98bad863878601f8d2ae217..680018a4a7c9f0f67be48f7ebd0a31a7b6438d38 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -150,9 +151,14 @@ const (
 	NFDBITS                          = 0x20
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -229,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_GETCRUNCHREGS             = 0x19
 	PTRACE_GETFDPIC                  = 0x1f
 	PTRACE_GETFDPIC_EXEC             = 0x0
@@ -284,6 +304,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -322,6 +344,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
index ab5d1fe8ead78b400d50ada17cc0a8c203e166fe..a63909f308d6d7d45db1b23ba71826ef51bd4c72 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
@@ -112,6 +112,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -154,9 +155,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -200,6 +206,7 @@ const (
 	PERF_EVENT_IOC_SET_BPF           = 0x40042408
 	PERF_EVENT_IOC_SET_FILTER        = 0x40082406
 	PERF_EVENT_IOC_SET_OUTPUT        = 0x2405
+	POE_MAGIC                        = 0x504f4530
 	PPPIOCATTACH                     = 0x4004743d
 	PPPIOCATTCHAN                    = 0x40047438
 	PPPIOCBRIDGECHAN                 = 0x40047435
@@ -235,6 +242,20 @@ const (
 	PROT_BTI                         = 0x10
 	PROT_MTE                         = 0x20
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_PEEKMTETAGS               = 0x21
 	PTRACE_POKEMTETAGS               = 0x22
 	PTRACE_SYSEMU                    = 0x1f
@@ -275,6 +296,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -313,6 +336,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go
index c523090e7c17e0ada43acb248c94934f4f29a150..9b0a2573fe3fb37337d0c4b87cf268bf95deeb87 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go
@@ -109,6 +109,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -154,9 +155,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -233,6 +239,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_SYSEMU                    = 0x1f
 	PTRACE_SYSEMU_SINGLESTEP         = 0x20
 	RLIMIT_AS                        = 0x9
@@ -271,6 +291,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -309,6 +331,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go
index 01e6ea7804b12a83847f59ac0464a0ecfa2f211a..958e6e0645acddc8287e1bdae4f672a49a3fb06b 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x100
@@ -150,9 +151,14 @@ const (
 	NFDBITS                          = 0x20
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -229,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GET_THREAD_AREA           = 0x19
 	PTRACE_GET_THREAD_AREA_3264      = 0xc4
@@ -277,6 +297,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -315,6 +337,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go
index 7aa610b1e717baf92c32d098beeb1df9cd1223fe..50c7f25bd16c6b4bb48ba4f594c2c8064796075a 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x100
@@ -150,9 +151,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -229,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GET_THREAD_AREA           = 0x19
 	PTRACE_GET_THREAD_AREA_3264      = 0xc4
@@ -277,6 +297,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -315,6 +337,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go
index 92af771b44a35026dee5c90b4f28aeeacb69762a..ced21d66d955aa1471413648b780ac6a79c29dda 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x100
@@ -150,9 +151,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -229,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GET_THREAD_AREA           = 0x19
 	PTRACE_GET_THREAD_AREA_3264      = 0xc4
@@ -277,6 +297,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -315,6 +337,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go
index b27ef5e6f11952ba929e6659970c6e55b9ee7d34..226c0441902358d4fe95110e043721360edb84f8 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x100
@@ -150,9 +151,14 @@ const (
 	NFDBITS                          = 0x20
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -229,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GET_THREAD_AREA           = 0x19
 	PTRACE_GET_THREAD_AREA_3264      = 0xc4
@@ -277,6 +297,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -315,6 +337,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go
index 237a2cefb3e5a27bfb7dc72a560c94495270cbaa..3122737cd464f0b9033911d2e928a48994c7b8fd 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x4000
 	ICANON                           = 0x100
 	IEXTEN                           = 0x400
@@ -152,9 +153,14 @@ const (
 	NL3                              = 0x300
 	NLDLY                            = 0x300
 	NOFLSH                           = 0x80000000
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x4
 	ONLCR                            = 0x2
@@ -232,6 +238,20 @@ const (
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PROT_SAO                         = 0x10
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETEVRREGS                = 0x14
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETREGS64                 = 0x16
@@ -332,6 +352,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -370,6 +392,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go
index 4a5c555a36e2bf0383f0094e84e7ced989597391..eb5d3467edf0c1a6fcda2fd26ca8685a89207a1f 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x4000
 	ICANON                           = 0x100
 	IEXTEN                           = 0x400
@@ -152,9 +153,14 @@ const (
 	NL3                              = 0x300
 	NLDLY                            = 0x300
 	NOFLSH                           = 0x80000000
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x4
 	ONLCR                            = 0x2
@@ -232,6 +238,20 @@ const (
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PROT_SAO                         = 0x10
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETEVRREGS                = 0x14
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETREGS64                 = 0x16
@@ -336,6 +356,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -374,6 +396,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go
index a02fb49a5f8adb3a400876aa44fde5e597205050..e921ebc60b7142bf22eb748914eaf6557ce644b9 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x4000
 	ICANON                           = 0x100
 	IEXTEN                           = 0x400
@@ -152,9 +153,14 @@ const (
 	NL3                              = 0x300
 	NLDLY                            = 0x300
 	NOFLSH                           = 0x80000000
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x4
 	ONLCR                            = 0x2
@@ -232,6 +238,20 @@ const (
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PROT_SAO                         = 0x10
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETEVRREGS                = 0x14
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETREGS64                 = 0x16
@@ -336,6 +356,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -374,6 +396,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go
index e26a7c61b2b6fd8edfaf4feb989e013760c7965d..38ba81c55c1fd3f8fee36578e2a5b67e1a5b1eb1 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -150,9 +151,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -229,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_GETFDPIC                  = 0x21
 	PTRACE_GETFDPIC_EXEC             = 0x0
 	PTRACE_GETFDPIC_INTERP           = 0x1
@@ -268,6 +288,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -306,6 +328,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go
index c48f7c2103b81350e8af69d8b83cc6fa25d1a7a1..71f0400977b3672a79401900b85701a23d8b0f99 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go
@@ -108,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -150,9 +151,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -229,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_DISABLE_TE                = 0x5010
 	PTRACE_ENABLE_TE                 = 0x5009
 	PTRACE_GET_LAST_BREAK            = 0x5006
@@ -340,6 +360,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -378,6 +400,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go
index ad4b9aace7bb6f7eb581f41398aa987f9436ef3a..c44a313322c54c61e7b1952290d2cc5f214331c3 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go
@@ -112,6 +112,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -155,9 +156,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -234,6 +240,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPAREGS                = 0x14
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETFPREGS64               = 0x19
@@ -331,6 +351,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x58
+	SCM_DEVMEM_LINEAR                = 0x57
 	SCM_TIMESTAMPING                 = 0x23
 	SCM_TIMESTAMPING_OPT_STATS       = 0x38
 	SCM_TIMESTAMPING_PKTINFO         = 0x3c
@@ -417,6 +439,9 @@ const (
 	SO_CNX_ADVICE                    = 0x37
 	SO_COOKIE                        = 0x3b
 	SO_DETACH_REUSEPORT_BPF          = 0x47
+	SO_DEVMEM_DMABUF                 = 0x58
+	SO_DEVMEM_DONTNEED               = 0x59
+	SO_DEVMEM_LINEAR                 = 0x57
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
index da08b2ab3d9390d54c8d8ccdb56eb0b458d7989f..1ec2b1407b1268a1c4d8a5f3bc4c2bdd377f418d 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
@@ -581,6 +581,8 @@ const (
 	AT_EMPTY_PATH                   = 0x1000
 	AT_REMOVEDIR                    = 0x200
 	RENAME_NOREPLACE                = 1 << 0
+	ST_RDONLY                       = 1
+	ST_NOSUID                       = 2
 )
 
 const (
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
index b622533ef2c2b7c62409ff7931076774767f74de..24b346e1a35159624c2f367cc53798dd2e947b57 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
@@ -841,6 +841,26 @@ var libc_pthread_fchdir_np_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) {
+	var _p0 unsafe.Pointer
+	if len(iov) > 0 {
+		_p0 = unsafe.Pointer(&iov[0])
+	} else {
+		_p0 = unsafe.Pointer(&_zero)
+	}
+	_, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_connectx_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) {
 	_, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags))
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
index cfe6646baf230fe4e48c94576b47a755f5e599ce..ebd213100b352196714a7e8dc8af96dd9f0b9997 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
@@ -248,6 +248,11 @@ TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8
 DATA	·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB)
 
+TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_connectx(SB)
+GLOBL	·libc_connectx_trampoline_addr(SB), RODATA, $8
+DATA	·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB)
+
 TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_sendfile(SB)
 GLOBL	·libc_sendfile_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
index 13f624f69f19d446672db282bab5a12c7f94fae0..824b9c2d5e0e011fe18c26975fda89704fc875b7 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
@@ -841,6 +841,26 @@ var libc_pthread_fchdir_np_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) {
+	var _p0 unsafe.Pointer
+	if len(iov) > 0 {
+		_p0 = unsafe.Pointer(&iov[0])
+	} else {
+		_p0 = unsafe.Pointer(&_zero)
+	}
+	_, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_connectx_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) {
 	_, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags))
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
index fe222b75df04f7d52b53c4f3d32f6863170784c0..4f178a229345e3f062c1371b3c36eadbc8e04274 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
@@ -248,6 +248,11 @@ TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8
 DATA	·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB)
 
+TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_connectx(SB)
+GLOBL	·libc_connectx_trampoline_addr(SB), RODATA, $8
+DATA	·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB)
+
 TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_sendfile(SB)
 GLOBL	·libc_sendfile_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux.go b/vendor/golang.org/x/sys/unix/zsyscall_linux.go
index 1bc1a5adb25fde8aae979c74220db4ae26aea523..5cc1e8eb2f35e097af432fd2273a2a73c1d35b36 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux.go
@@ -592,6 +592,16 @@ func ClockGettime(clockid int32, time *Timespec) (err error) {
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func ClockSettime(clockid int32, time *Timespec) (err error) {
+	_, _, e1 := Syscall(SYS_CLOCK_SETTIME, uintptr(clockid), uintptr(unsafe.Pointer(time)), 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func ClockNanosleep(clockid int32, flags int, request *Timespec, remain *Timespec) (err error) {
 	_, _, e1 := Syscall6(SYS_CLOCK_NANOSLEEP, uintptr(clockid), uintptr(flags), uintptr(unsafe.Pointer(request)), uintptr(unsafe.Pointer(remain)), 0, 0)
 	if e1 != 0 {
@@ -971,23 +981,6 @@ func Getpriority(which int, who int) (prio int, err error) {
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
-func Getrandom(buf []byte, flags int) (n int, err error) {
-	var _p0 unsafe.Pointer
-	if len(buf) > 0 {
-		_p0 = unsafe.Pointer(&buf[0])
-	} else {
-		_p0 = unsafe.Pointer(&_zero)
-	}
-	r0, _, e1 := Syscall(SYS_GETRANDOM, uintptr(_p0), uintptr(len(buf)), uintptr(flags))
-	n = int(r0)
-	if e1 != 0 {
-		err = errnoErr(e1)
-	}
-	return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
 func Getrusage(who int, rusage *Rusage) (err error) {
 	_, _, e1 := RawSyscall(SYS_GETRUSAGE, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0)
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
index d3e38f681ab0345eac069c79580f7f9cba3d4b7a..f485dbf4565671fd01bc685d7de256cbbac0dee2 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
@@ -341,6 +341,7 @@ const (
 	SYS_STATX                   = 332
 	SYS_IO_PGETEVENTS           = 333
 	SYS_RSEQ                    = 334
+	SYS_URETPROBE               = 335
 	SYS_PIDFD_SEND_SIGNAL       = 424
 	SYS_IO_URING_SETUP          = 425
 	SYS_IO_URING_ENTER          = 426
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
index 6c778c23278f923fc4424265cacc92280877460f..1893e2fe884044dc9724126bf8565ca9fca04748 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
@@ -85,7 +85,7 @@ const (
 	SYS_SPLICE                  = 76
 	SYS_TEE                     = 77
 	SYS_READLINKAT              = 78
-	SYS_FSTATAT                 = 79
+	SYS_NEWFSTATAT              = 79
 	SYS_FSTAT                   = 80
 	SYS_SYNC                    = 81
 	SYS_FSYNC                   = 82
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go
index 37281cf51a80bbcae0342a567bad5da4e089d2d6..16a4017da0ab2fbb8868a34cf7e90df9b052577e 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go
@@ -84,6 +84,8 @@ const (
 	SYS_SPLICE                  = 76
 	SYS_TEE                     = 77
 	SYS_READLINKAT              = 78
+	SYS_NEWFSTATAT              = 79
+	SYS_FSTAT                   = 80
 	SYS_SYNC                    = 81
 	SYS_FSYNC                   = 82
 	SYS_FDATASYNC               = 83
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go
index 9889f6a5591b62a375a69a718b692bcb4328852b..a5459e766f59dbfbd2751e1ecc5f7f1c4561bfb4 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go
@@ -84,7 +84,7 @@ const (
 	SYS_SPLICE                  = 76
 	SYS_TEE                     = 77
 	SYS_READLINKAT              = 78
-	SYS_FSTATAT                 = 79
+	SYS_NEWFSTATAT              = 79
 	SYS_FSTAT                   = 80
 	SYS_SYNC                    = 81
 	SYS_FSYNC                   = 82
diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
index 091d107f3a5c98255250449074b23e36885b256d..17c53bd9b3315aaec65571964e73bb93e6c85175 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
@@ -306,6 +306,19 @@ type XVSockPgen struct {
 
 type _Socklen uint32
 
+type SaeAssocID uint32
+
+type SaeConnID uint32
+
+type SaEndpoints struct {
+	Srcif      uint32
+	Srcaddr    *RawSockaddr
+	Srcaddrlen uint32
+	Dstaddr    *RawSockaddr
+	Dstaddrlen uint32
+	_          [4]byte
+}
+
 type Xucred struct {
 	Version uint32
 	Uid     uint32
@@ -449,11 +462,14 @@ type FdSet struct {
 
 const (
 	SizeofIfMsghdr    = 0x70
+	SizeofIfMsghdr2   = 0xa0
 	SizeofIfData      = 0x60
+	SizeofIfData64    = 0x80
 	SizeofIfaMsghdr   = 0x14
 	SizeofIfmaMsghdr  = 0x10
 	SizeofIfmaMsghdr2 = 0x14
 	SizeofRtMsghdr    = 0x5c
+	SizeofRtMsghdr2   = 0x5c
 	SizeofRtMetrics   = 0x38
 )
 
@@ -467,6 +483,20 @@ type IfMsghdr struct {
 	Data    IfData
 }
 
+type IfMsghdr2 struct {
+	Msglen     uint16
+	Version    uint8
+	Type       uint8
+	Addrs      int32
+	Flags      int32
+	Index      uint16
+	Snd_len    int32
+	Snd_maxlen int32
+	Snd_drops  int32
+	Timer      int32
+	Data       IfData64
+}
+
 type IfData struct {
 	Type       uint8
 	Typelen    uint8
@@ -499,6 +529,34 @@ type IfData struct {
 	Reserved2  uint32
 }
 
+type IfData64 struct {
+	Type       uint8
+	Typelen    uint8
+	Physical   uint8
+	Addrlen    uint8
+	Hdrlen     uint8
+	Recvquota  uint8
+	Xmitquota  uint8
+	Unused1    uint8
+	Mtu        uint32
+	Metric     uint32
+	Baudrate   uint64
+	Ipackets   uint64
+	Ierrors    uint64
+	Opackets   uint64
+	Oerrors    uint64
+	Collisions uint64
+	Ibytes     uint64
+	Obytes     uint64
+	Imcasts    uint64
+	Omcasts    uint64
+	Iqdrops    uint64
+	Noproto    uint64
+	Recvtiming uint32
+	Xmittiming uint32
+	Lastchange Timeval32
+}
+
 type IfaMsghdr struct {
 	Msglen  uint16
 	Version uint8
@@ -544,6 +602,21 @@ type RtMsghdr struct {
 	Rmx     RtMetrics
 }
 
+type RtMsghdr2 struct {
+	Msglen      uint16
+	Version     uint8
+	Type        uint8
+	Index       uint16
+	Flags       int32
+	Addrs       int32
+	Refcnt      int32
+	Parentflags int32
+	Reserved    int32
+	Use         int32
+	Inits       uint32
+	Rmx         RtMetrics
+}
+
 type RtMetrics struct {
 	Locks    uint32
 	Mtu      uint32
diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
index 28ff4ef74d0d3e2769e75b79c0b757fc3837a13e..2392226a743eb3e19c581773191ddbe86eecad53 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
@@ -306,6 +306,19 @@ type XVSockPgen struct {
 
 type _Socklen uint32
 
+type SaeAssocID uint32
+
+type SaeConnID uint32
+
+type SaEndpoints struct {
+	Srcif      uint32
+	Srcaddr    *RawSockaddr
+	Srcaddrlen uint32
+	Dstaddr    *RawSockaddr
+	Dstaddrlen uint32
+	_          [4]byte
+}
+
 type Xucred struct {
 	Version uint32
 	Uid     uint32
@@ -449,11 +462,14 @@ type FdSet struct {
 
 const (
 	SizeofIfMsghdr    = 0x70
+	SizeofIfMsghdr2   = 0xa0
 	SizeofIfData      = 0x60
+	SizeofIfData64    = 0x80
 	SizeofIfaMsghdr   = 0x14
 	SizeofIfmaMsghdr  = 0x10
 	SizeofIfmaMsghdr2 = 0x14
 	SizeofRtMsghdr    = 0x5c
+	SizeofRtMsghdr2   = 0x5c
 	SizeofRtMetrics   = 0x38
 )
 
@@ -467,6 +483,20 @@ type IfMsghdr struct {
 	Data    IfData
 }
 
+type IfMsghdr2 struct {
+	Msglen     uint16
+	Version    uint8
+	Type       uint8
+	Addrs      int32
+	Flags      int32
+	Index      uint16
+	Snd_len    int32
+	Snd_maxlen int32
+	Snd_drops  int32
+	Timer      int32
+	Data       IfData64
+}
+
 type IfData struct {
 	Type       uint8
 	Typelen    uint8
@@ -499,6 +529,34 @@ type IfData struct {
 	Reserved2  uint32
 }
 
+type IfData64 struct {
+	Type       uint8
+	Typelen    uint8
+	Physical   uint8
+	Addrlen    uint8
+	Hdrlen     uint8
+	Recvquota  uint8
+	Xmitquota  uint8
+	Unused1    uint8
+	Mtu        uint32
+	Metric     uint32
+	Baudrate   uint64
+	Ipackets   uint64
+	Ierrors    uint64
+	Opackets   uint64
+	Oerrors    uint64
+	Collisions uint64
+	Ibytes     uint64
+	Obytes     uint64
+	Imcasts    uint64
+	Omcasts    uint64
+	Iqdrops    uint64
+	Noproto    uint64
+	Recvtiming uint32
+	Xmittiming uint32
+	Lastchange Timeval32
+}
+
 type IfaMsghdr struct {
 	Msglen  uint16
 	Version uint8
@@ -544,6 +602,21 @@ type RtMsghdr struct {
 	Rmx     RtMetrics
 }
 
+type RtMsghdr2 struct {
+	Msglen      uint16
+	Version     uint8
+	Type        uint8
+	Index       uint16
+	Flags       int32
+	Addrs       int32
+	Refcnt      int32
+	Parentflags int32
+	Reserved    int32
+	Use         int32
+	Inits       uint32
+	Rmx         RtMetrics
+}
+
 type RtMetrics struct {
 	Locks    uint32
 	Mtu      uint32
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
index 6cbd094a3aa19eaf6dfdbddac0d63276bd7ffa79..51e13eb055fc5b6b11c1c6ec83bce47610faffa3 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
@@ -625,6 +625,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
index 7c03b6ee77fa0839d25c0d67debf11a4555b703c..d002d8ef3cc2d79cef3f0afee8d25c22632d994f 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
@@ -630,6 +630,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
index 422107ee8b1331d85b7ab4b18a1b8846d1dcc793..3f863d898dd8520433d3201e92b4b318e0a0337a 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
@@ -616,6 +616,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
index 505a12acfd9d2951064f4d98572f7f3313109ee9..61c7293106616f197b93c1f86f3fb9d599a25f14 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
@@ -610,6 +610,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
index cc986c7900660055f809096adc8112e208acca37..b5d17414f039f90f084edaf771bfb3b3c96c15b0 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
@@ -612,6 +612,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go
index b102b95a0a19db8ad7f47ef4c4f7a0d9d8a69100..5537148dcbb3dee0a8d5c8c85f5566895decc8db 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go
@@ -87,31 +87,35 @@ type StatxTimestamp struct {
 }
 
 type Statx_t struct {
-	Mask             uint32
-	Blksize          uint32
-	Attributes       uint64
-	Nlink            uint32
-	Uid              uint32
-	Gid              uint32
-	Mode             uint16
-	_                [1]uint16
-	Ino              uint64
-	Size             uint64
-	Blocks           uint64
-	Attributes_mask  uint64
-	Atime            StatxTimestamp
-	Btime            StatxTimestamp
-	Ctime            StatxTimestamp
-	Mtime            StatxTimestamp
-	Rdev_major       uint32
-	Rdev_minor       uint32
-	Dev_major        uint32
-	Dev_minor        uint32
-	Mnt_id           uint64
-	Dio_mem_align    uint32
-	Dio_offset_align uint32
-	Subvol           uint64
-	_                [11]uint64
+	Mask                      uint32
+	Blksize                   uint32
+	Attributes                uint64
+	Nlink                     uint32
+	Uid                       uint32
+	Gid                       uint32
+	Mode                      uint16
+	_                         [1]uint16
+	Ino                       uint64
+	Size                      uint64
+	Blocks                    uint64
+	Attributes_mask           uint64
+	Atime                     StatxTimestamp
+	Btime                     StatxTimestamp
+	Ctime                     StatxTimestamp
+	Mtime                     StatxTimestamp
+	Rdev_major                uint32
+	Rdev_minor                uint32
+	Dev_major                 uint32
+	Dev_minor                 uint32
+	Mnt_id                    uint64
+	Dio_mem_align             uint32
+	Dio_offset_align          uint32
+	Subvol                    uint64
+	Atomic_write_unit_min     uint32
+	Atomic_write_unit_max     uint32
+	Atomic_write_segments_max uint32
+	_                         [1]uint32
+	_                         [9]uint64
 }
 
 type Fsid struct {
@@ -516,6 +520,29 @@ type TCPInfo struct {
 	Total_rto_time       uint32
 }
 
+type TCPVegasInfo struct {
+	Enabled uint32
+	Rttcnt  uint32
+	Rtt     uint32
+	Minrtt  uint32
+}
+
+type TCPDCTCPInfo struct {
+	Enabled  uint16
+	Ce_state uint16
+	Alpha    uint32
+	Ab_ecn   uint32
+	Ab_tot   uint32
+}
+
+type TCPBBRInfo struct {
+	Bw_lo       uint32
+	Bw_hi       uint32
+	Min_rtt     uint32
+	Pacing_gain uint32
+	Cwnd_gain   uint32
+}
+
 type CanFilter struct {
 	Id   uint32
 	Mask uint32
@@ -557,6 +584,7 @@ const (
 	SizeofICMPv6Filter      = 0x20
 	SizeofUcred             = 0xc
 	SizeofTCPInfo           = 0xf8
+	SizeofTCPCCInfo         = 0x14
 	SizeofCanFilter         = 0x8
 	SizeofTCPRepairOpt      = 0x8
 )
@@ -1724,12 +1752,6 @@ const (
 	IFLA_IPVLAN_UNSPEC                         = 0x0
 	IFLA_IPVLAN_MODE                           = 0x1
 	IFLA_IPVLAN_FLAGS                          = 0x2
-	NETKIT_NEXT                                = -0x1
-	NETKIT_PASS                                = 0x0
-	NETKIT_DROP                                = 0x2
-	NETKIT_REDIRECT                            = 0x7
-	NETKIT_L2                                  = 0x0
-	NETKIT_L3                                  = 0x1
 	IFLA_NETKIT_UNSPEC                         = 0x0
 	IFLA_NETKIT_PEER_INFO                      = 0x1
 	IFLA_NETKIT_PRIMARY                        = 0x2
@@ -1768,6 +1790,7 @@ const (
 	IFLA_VXLAN_DF                              = 0x1d
 	IFLA_VXLAN_VNIFILTER                       = 0x1e
 	IFLA_VXLAN_LOCALBYPASS                     = 0x1f
+	IFLA_VXLAN_LABEL_POLICY                    = 0x20
 	IFLA_GENEVE_UNSPEC                         = 0x0
 	IFLA_GENEVE_ID                             = 0x1
 	IFLA_GENEVE_REMOTE                         = 0x2
@@ -1797,6 +1820,8 @@ const (
 	IFLA_GTP_ROLE                              = 0x4
 	IFLA_GTP_CREATE_SOCKETS                    = 0x5
 	IFLA_GTP_RESTART_COUNT                     = 0x6
+	IFLA_GTP_LOCAL                             = 0x7
+	IFLA_GTP_LOCAL6                            = 0x8
 	IFLA_BOND_UNSPEC                           = 0x0
 	IFLA_BOND_MODE                             = 0x1
 	IFLA_BOND_ACTIVE_SLAVE                     = 0x2
@@ -1829,6 +1854,7 @@ const (
 	IFLA_BOND_AD_LACP_ACTIVE                   = 0x1d
 	IFLA_BOND_MISSED_MAX                       = 0x1e
 	IFLA_BOND_NS_IP6_TARGET                    = 0x1f
+	IFLA_BOND_COUPLED_CONTROL                  = 0x20
 	IFLA_BOND_AD_INFO_UNSPEC                   = 0x0
 	IFLA_BOND_AD_INFO_AGGREGATOR               = 0x1
 	IFLA_BOND_AD_INFO_NUM_PORTS                = 0x2
@@ -1897,6 +1923,7 @@ const (
 	IFLA_HSR_SEQ_NR                            = 0x5
 	IFLA_HSR_VERSION                           = 0x6
 	IFLA_HSR_PROTOCOL                          = 0x7
+	IFLA_HSR_INTERLINK                         = 0x8
 	IFLA_STATS_UNSPEC                          = 0x0
 	IFLA_STATS_LINK_64                         = 0x1
 	IFLA_STATS_LINK_XSTATS                     = 0x2
@@ -1949,6 +1976,15 @@ const (
 	IFLA_DSA_MASTER                            = 0x1
 )
 
+const (
+	NETKIT_NEXT     = -0x1
+	NETKIT_PASS     = 0x0
+	NETKIT_DROP     = 0x2
+	NETKIT_REDIRECT = 0x7
+	NETKIT_L2       = 0x0
+	NETKIT_L3       = 0x1
+)
+
 const (
 	NF_INET_PRE_ROUTING  = 0x0
 	NF_INET_LOCAL_IN     = 0x1
@@ -2486,7 +2522,7 @@ type XDPMmapOffsets struct {
 type XDPUmemReg struct {
 	Addr            uint64
 	Len             uint64
-	Chunk_size      uint32
+	Size            uint32
 	Headroom        uint32
 	Flags           uint32
 	Tx_metadata_len uint32
@@ -2558,8 +2594,8 @@ const (
 	SOF_TIMESTAMPING_BIND_PHC     = 0x8000
 	SOF_TIMESTAMPING_OPT_ID_TCP   = 0x10000
 
-	SOF_TIMESTAMPING_LAST = 0x10000
-	SOF_TIMESTAMPING_MASK = 0x1ffff
+	SOF_TIMESTAMPING_LAST = 0x20000
+	SOF_TIMESTAMPING_MASK = 0x3ffff
 
 	SCM_TSTAMP_SND   = 0x0
 	SCM_TSTAMP_SCHED = 0x1
@@ -3505,7 +3541,7 @@ type Nhmsg struct {
 type NexthopGrp struct {
 	Id     uint32
 	Weight uint8
-	Resvd1 uint8
+	High   uint8
 	Resvd2 uint16
 }
 
@@ -3766,7 +3802,7 @@ const (
 	ETHTOOL_MSG_PSE_GET                       = 0x24
 	ETHTOOL_MSG_PSE_SET                       = 0x25
 	ETHTOOL_MSG_RSS_GET                       = 0x26
-	ETHTOOL_MSG_USER_MAX                      = 0x2b
+	ETHTOOL_MSG_USER_MAX                      = 0x2d
 	ETHTOOL_MSG_KERNEL_NONE                   = 0x0
 	ETHTOOL_MSG_STRSET_GET_REPLY              = 0x1
 	ETHTOOL_MSG_LINKINFO_GET_REPLY            = 0x2
@@ -3806,12 +3842,15 @@ const (
 	ETHTOOL_MSG_MODULE_NTF                    = 0x24
 	ETHTOOL_MSG_PSE_GET_REPLY                 = 0x25
 	ETHTOOL_MSG_RSS_GET_REPLY                 = 0x26
-	ETHTOOL_MSG_KERNEL_MAX                    = 0x2b
+	ETHTOOL_MSG_KERNEL_MAX                    = 0x2e
+	ETHTOOL_FLAG_COMPACT_BITSETS              = 0x1
+	ETHTOOL_FLAG_OMIT_REPLY                   = 0x2
+	ETHTOOL_FLAG_STATS                        = 0x4
 	ETHTOOL_A_HEADER_UNSPEC                   = 0x0
 	ETHTOOL_A_HEADER_DEV_INDEX                = 0x1
 	ETHTOOL_A_HEADER_DEV_NAME                 = 0x2
 	ETHTOOL_A_HEADER_FLAGS                    = 0x3
-	ETHTOOL_A_HEADER_MAX                      = 0x3
+	ETHTOOL_A_HEADER_MAX                      = 0x4
 	ETHTOOL_A_BITSET_BIT_UNSPEC               = 0x0
 	ETHTOOL_A_BITSET_BIT_INDEX                = 0x1
 	ETHTOOL_A_BITSET_BIT_NAME                 = 0x2
@@ -3948,7 +3987,7 @@ const (
 	ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL   = 0x17
 	ETHTOOL_A_COALESCE_USE_CQE_MODE_TX        = 0x18
 	ETHTOOL_A_COALESCE_USE_CQE_MODE_RX        = 0x19
-	ETHTOOL_A_COALESCE_MAX                    = 0x1c
+	ETHTOOL_A_COALESCE_MAX                    = 0x1e
 	ETHTOOL_A_PAUSE_UNSPEC                    = 0x0
 	ETHTOOL_A_PAUSE_HEADER                    = 0x1
 	ETHTOOL_A_PAUSE_AUTONEG                   = 0x2
@@ -3992,11 +4031,11 @@ const (
 	ETHTOOL_A_CABLE_RESULT_UNSPEC             = 0x0
 	ETHTOOL_A_CABLE_RESULT_PAIR               = 0x1
 	ETHTOOL_A_CABLE_RESULT_CODE               = 0x2
-	ETHTOOL_A_CABLE_RESULT_MAX                = 0x2
+	ETHTOOL_A_CABLE_RESULT_MAX                = 0x3
 	ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC       = 0x0
 	ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR         = 0x1
 	ETHTOOL_A_CABLE_FAULT_LENGTH_CM           = 0x2
-	ETHTOOL_A_CABLE_FAULT_LENGTH_MAX          = 0x2
+	ETHTOOL_A_CABLE_FAULT_LENGTH_MAX          = 0x3
 	ETHTOOL_A_CABLE_TEST_NTF_STATUS_UNSPEC    = 0x0
 	ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED   = 0x1
 	ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED = 0x2
@@ -4079,6 +4118,107 @@ type EthtoolDrvinfo struct {
 	Regdump_len  uint32
 }
 
+type EthtoolTsInfo struct {
+	Cmd             uint32
+	So_timestamping uint32
+	Phc_index       int32
+	Tx_types        uint32
+	Tx_reserved     [3]uint32
+	Rx_filters      uint32
+	Rx_reserved     [3]uint32
+}
+
+type HwTstampConfig struct {
+	Flags     int32
+	Tx_type   int32
+	Rx_filter int32
+}
+
+const (
+	HWTSTAMP_FILTER_NONE            = 0x0
+	HWTSTAMP_FILTER_ALL             = 0x1
+	HWTSTAMP_FILTER_SOME            = 0x2
+	HWTSTAMP_FILTER_PTP_V1_L4_EVENT = 0x3
+	HWTSTAMP_FILTER_PTP_V2_L4_EVENT = 0x6
+	HWTSTAMP_FILTER_PTP_V2_L2_EVENT = 0x9
+	HWTSTAMP_FILTER_PTP_V2_EVENT    = 0xc
+)
+
+const (
+	HWTSTAMP_TX_OFF          = 0x0
+	HWTSTAMP_TX_ON           = 0x1
+	HWTSTAMP_TX_ONESTEP_SYNC = 0x2
+)
+
+type (
+	PtpClockCaps struct {
+		Max_adj            int32
+		N_alarm            int32
+		N_ext_ts           int32
+		N_per_out          int32
+		Pps                int32
+		N_pins             int32
+		Cross_timestamping int32
+		Adjust_phase       int32
+		Max_phase_adj      int32
+		Rsv                [11]int32
+	}
+	PtpClockTime struct {
+		Sec      int64
+		Nsec     uint32
+		Reserved uint32
+	}
+	PtpExttsEvent struct {
+		T     PtpClockTime
+		Index uint32
+		Flags uint32
+		Rsv   [2]uint32
+	}
+	PtpExttsRequest struct {
+		Index uint32
+		Flags uint32
+		Rsv   [2]uint32
+	}
+	PtpPeroutRequest struct {
+		StartOrPhase PtpClockTime
+		Period       PtpClockTime
+		Index        uint32
+		Flags        uint32
+		On           PtpClockTime
+	}
+	PtpPinDesc struct {
+		Name  [64]byte
+		Index uint32
+		Func  uint32
+		Chan  uint32
+		Rsv   [5]uint32
+	}
+	PtpSysOffset struct {
+		Samples uint32
+		Rsv     [3]uint32
+		Ts      [51]PtpClockTime
+	}
+	PtpSysOffsetExtended struct {
+		Samples uint32
+		Clockid int32
+		Rsv     [2]uint32
+		Ts      [25][3]PtpClockTime
+	}
+	PtpSysOffsetPrecise struct {
+		Device   PtpClockTime
+		Realtime PtpClockTime
+		Monoraw  PtpClockTime
+		Rsv      [4]uint32
+	}
+)
+
+const (
+	PTP_PF_NONE    = 0x0
+	PTP_PF_EXTTS   = 0x1
+	PTP_PF_PEROUT  = 0x2
+	PTP_PF_PHYSYNC = 0x3
+)
+
 type (
 	HIDRawReportDescriptor struct {
 		Size  uint32
@@ -4260,6 +4400,7 @@ const (
 type LandlockRulesetAttr struct {
 	Access_fs  uint64
 	Access_net uint64
+	Scoped     uint64
 }
 
 type LandlockPathBeneathAttr struct {
@@ -4606,7 +4747,7 @@ const (
 	NL80211_ATTR_MAC_HINT                                   = 0xc8
 	NL80211_ATTR_MAC_MASK                                   = 0xd7
 	NL80211_ATTR_MAX_AP_ASSOC_STA                           = 0xca
-	NL80211_ATTR_MAX                                        = 0x14a
+	NL80211_ATTR_MAX                                        = 0x14c
 	NL80211_ATTR_MAX_CRIT_PROT_DURATION                     = 0xb4
 	NL80211_ATTR_MAX_CSA_COUNTERS                           = 0xce
 	NL80211_ATTR_MAX_MATCH_SETS                             = 0x85
@@ -5210,7 +5351,7 @@ const (
 	NL80211_FREQUENCY_ATTR_GO_CONCURRENT                    = 0xf
 	NL80211_FREQUENCY_ATTR_INDOOR_ONLY                      = 0xe
 	NL80211_FREQUENCY_ATTR_IR_CONCURRENT                    = 0xf
-	NL80211_FREQUENCY_ATTR_MAX                              = 0x20
+	NL80211_FREQUENCY_ATTR_MAX                              = 0x21
 	NL80211_FREQUENCY_ATTR_MAX_TX_POWER                     = 0x6
 	NL80211_FREQUENCY_ATTR_NO_10MHZ                         = 0x11
 	NL80211_FREQUENCY_ATTR_NO_160MHZ                        = 0xc
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
index 15adc04142f2b5a8cd2248839c4732b07626f53e..ad05b51a60364994ecb52341160202a800f98ddc 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
@@ -727,6 +727,37 @@ const (
 	RISCV_HWPROBE_EXT_ZBA                = 0x8
 	RISCV_HWPROBE_EXT_ZBB                = 0x10
 	RISCV_HWPROBE_EXT_ZBS                = 0x20
+	RISCV_HWPROBE_EXT_ZICBOZ             = 0x40
+	RISCV_HWPROBE_EXT_ZBC                = 0x80
+	RISCV_HWPROBE_EXT_ZBKB               = 0x100
+	RISCV_HWPROBE_EXT_ZBKC               = 0x200
+	RISCV_HWPROBE_EXT_ZBKX               = 0x400
+	RISCV_HWPROBE_EXT_ZKND               = 0x800
+	RISCV_HWPROBE_EXT_ZKNE               = 0x1000
+	RISCV_HWPROBE_EXT_ZKNH               = 0x2000
+	RISCV_HWPROBE_EXT_ZKSED              = 0x4000
+	RISCV_HWPROBE_EXT_ZKSH               = 0x8000
+	RISCV_HWPROBE_EXT_ZKT                = 0x10000
+	RISCV_HWPROBE_EXT_ZVBB               = 0x20000
+	RISCV_HWPROBE_EXT_ZVBC               = 0x40000
+	RISCV_HWPROBE_EXT_ZVKB               = 0x80000
+	RISCV_HWPROBE_EXT_ZVKG               = 0x100000
+	RISCV_HWPROBE_EXT_ZVKNED             = 0x200000
+	RISCV_HWPROBE_EXT_ZVKNHA             = 0x400000
+	RISCV_HWPROBE_EXT_ZVKNHB             = 0x800000
+	RISCV_HWPROBE_EXT_ZVKSED             = 0x1000000
+	RISCV_HWPROBE_EXT_ZVKSH              = 0x2000000
+	RISCV_HWPROBE_EXT_ZVKT               = 0x4000000
+	RISCV_HWPROBE_EXT_ZFH                = 0x8000000
+	RISCV_HWPROBE_EXT_ZFHMIN             = 0x10000000
+	RISCV_HWPROBE_EXT_ZIHINTNTL          = 0x20000000
+	RISCV_HWPROBE_EXT_ZVFH               = 0x40000000
+	RISCV_HWPROBE_EXT_ZVFHMIN            = 0x80000000
+	RISCV_HWPROBE_EXT_ZFA                = 0x100000000
+	RISCV_HWPROBE_EXT_ZTSO               = 0x200000000
+	RISCV_HWPROBE_EXT_ZACAS              = 0x400000000
+	RISCV_HWPROBE_EXT_ZICOND             = 0x800000000
+	RISCV_HWPROBE_EXT_ZIHINTPAUSE        = 0x1000000000
 	RISCV_HWPROBE_KEY_CPUPERF_0          = 0x5
 	RISCV_HWPROBE_MISALIGNED_UNKNOWN     = 0x0
 	RISCV_HWPROBE_MISALIGNED_EMULATED    = 0x1
@@ -734,4 +765,6 @@ const (
 	RISCV_HWPROBE_MISALIGNED_FAST        = 0x3
 	RISCV_HWPROBE_MISALIGNED_UNSUPPORTED = 0x4
 	RISCV_HWPROBE_MISALIGNED_MASK        = 0x7
+	RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE  = 0x6
+	RISCV_HWPROBE_WHICH_CPUS             = 0x1
 )
diff --git a/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go b/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go
index d9a13af4684b0c370ba506917d95659e782194dc..2e5d5a44357a2e2ae04c8a406d8b89f0042968a9 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go
@@ -377,6 +377,12 @@ type Flock_t struct {
 	Pid    int32
 }
 
+type F_cnvrt struct {
+	Cvtcmd int32
+	Pccsid int16
+	Fccsid int16
+}
+
 type Termios struct {
 	Cflag uint32
 	Iflag uint32
diff --git a/vendor/golang.org/x/sys/windows/dll_windows.go b/vendor/golang.org/x/sys/windows/dll_windows.go
index 115341fba66dab28536543805dbbdfd3a88500a1..4e613cf6335ceaaaf3075e177938ef11688461a4 100644
--- a/vendor/golang.org/x/sys/windows/dll_windows.go
+++ b/vendor/golang.org/x/sys/windows/dll_windows.go
@@ -65,7 +65,7 @@ func LoadDLL(name string) (dll *DLL, err error) {
 	return d, nil
 }
 
-// MustLoadDLL is like LoadDLL but panics if load operation failes.
+// MustLoadDLL is like LoadDLL but panics if load operation fails.
 func MustLoadDLL(name string) *DLL {
 	d, e := LoadDLL(name)
 	if e != nil {
diff --git a/vendor/golang.org/x/sys/windows/svc/mgr/config.go b/vendor/golang.org/x/sys/windows/svc/mgr/config.go
index a6d3e8a88a3f2ac8557d1048ee890f8ae705c580..3c7ba08f58de3a2afd98466b97e3547476e3f57e 100644
--- a/vendor/golang.org/x/sys/windows/svc/mgr/config.go
+++ b/vendor/golang.org/x/sys/windows/svc/mgr/config.go
@@ -63,7 +63,7 @@ func toStringSlice(ps *uint16) []string {
 	return r
 }
 
-// Config retrieves service s configuration paramteres.
+// Config retrieves service s configuration parameters.
 func (s *Service) Config() (Config, error) {
 	var p *windows.QUERY_SERVICE_CONFIG
 	n := uint32(1024)
diff --git a/vendor/golang.org/x/sys/windows/svc/mgr/recovery.go b/vendor/golang.org/x/sys/windows/svc/mgr/recovery.go
index cdf880e13aca71fbae0ec9534e7166257ffac6e3..ef2a6878405c7434735c85675d9df787dea04d5c 100644
--- a/vendor/golang.org/x/sys/windows/svc/mgr/recovery.go
+++ b/vendor/golang.org/x/sys/windows/svc/mgr/recovery.go
@@ -137,7 +137,7 @@ func (s *Service) RecoveryCommand() (string, error) {
 // SetRecoveryActionsOnNonCrashFailures sets the failure actions flag. If the
 // flag is set to false, recovery actions will only be performed if the service
 // terminates without reporting a status of SERVICE_STOPPED. If the flag is set
-// to true, recovery actions are also perfomed if the service stops with a
+// to true, recovery actions are also performed if the service stops with a
 // nonzero exit code.
 func (s *Service) SetRecoveryActionsOnNonCrashFailures(flag bool) error {
 	var setting windows.SERVICE_FAILURE_ACTIONS_FLAG
@@ -151,7 +151,7 @@ func (s *Service) SetRecoveryActionsOnNonCrashFailures(flag bool) error {
 // actions flag. If the flag is set to false, recovery actions will only be
 // performed if the service terminates without reporting a status of
 // SERVICE_STOPPED. If the flag is set to true, recovery actions are also
-// perfomed if the service stops with a nonzero exit code.
+// performed if the service stops with a nonzero exit code.
 func (s *Service) RecoveryActionsOnNonCrashFailures() (bool, error) {
 	b, err := s.queryServiceConfig2(windows.SERVICE_CONFIG_FAILURE_ACTIONS_FLAG)
 	if err != nil {
diff --git a/vendor/golang.org/x/sys/windows/syscall_windows.go b/vendor/golang.org/x/sys/windows/syscall_windows.go
index 1fa34fd17c5cd9ff9fe76b0023f957c642caae32..4a32543868500f5403404d54de8f9f5f827e6e18 100644
--- a/vendor/golang.org/x/sys/windows/syscall_windows.go
+++ b/vendor/golang.org/x/sys/windows/syscall_windows.go
@@ -168,6 +168,8 @@ func NewCallbackCDecl(fn interface{}) uintptr {
 //sys	CreateNamedPipe(name *uint16, flags uint32, pipeMode uint32, maxInstances uint32, outSize uint32, inSize uint32, defaultTimeout uint32, sa *SecurityAttributes) (handle Handle, err error)  [failretval==InvalidHandle] = CreateNamedPipeW
 //sys	ConnectNamedPipe(pipe Handle, overlapped *Overlapped) (err error)
 //sys	DisconnectNamedPipe(pipe Handle) (err error)
+//sys   GetNamedPipeClientProcessId(pipe Handle, clientProcessID *uint32) (err error)
+//sys   GetNamedPipeServerProcessId(pipe Handle, serverProcessID *uint32) (err error)
 //sys	GetNamedPipeInfo(pipe Handle, flags *uint32, outSize *uint32, inSize *uint32, maxInstances *uint32) (err error)
 //sys	GetNamedPipeHandleState(pipe Handle, state *uint32, curInstances *uint32, maxCollectionCount *uint32, collectDataTimeout *uint32, userName *uint16, maxUserNameSize uint32) (err error) = GetNamedPipeHandleStateW
 //sys	SetNamedPipeHandleState(pipe Handle, state *uint32, maxCollectionCount *uint32, collectDataTimeout *uint32) (err error) = SetNamedPipeHandleState
@@ -313,6 +315,10 @@ func NewCallbackCDecl(fn interface{}) uintptr {
 //sys	SetConsoleMode(console Handle, mode uint32) (err error) = kernel32.SetConsoleMode
 //sys	GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) = kernel32.GetConsoleScreenBufferInfo
 //sys	setConsoleCursorPosition(console Handle, position uint32) (err error) = kernel32.SetConsoleCursorPosition
+//sys	GetConsoleCP() (cp uint32, err error) = kernel32.GetConsoleCP
+//sys	GetConsoleOutputCP() (cp uint32, err error) = kernel32.GetConsoleOutputCP
+//sys	SetConsoleCP(cp uint32) (err error) = kernel32.SetConsoleCP
+//sys	SetConsoleOutputCP(cp uint32) (err error) = kernel32.SetConsoleOutputCP
 //sys	WriteConsole(console Handle, buf *uint16, towrite uint32, written *uint32, reserved *byte) (err error) = kernel32.WriteConsoleW
 //sys	ReadConsole(console Handle, buf *uint16, toread uint32, read *uint32, inputControl *byte) (err error) = kernel32.ReadConsoleW
 //sys	resizePseudoConsole(pconsole Handle, size uint32) (hr error) = kernel32.ResizePseudoConsole
@@ -721,20 +727,12 @@ func DurationSinceBoot() time.Duration {
 }
 
 func Ftruncate(fd Handle, length int64) (err error) {
-	curoffset, e := Seek(fd, 0, 1)
-	if e != nil {
-		return e
-	}
-	defer Seek(fd, curoffset, 0)
-	_, e = Seek(fd, length, 0)
-	if e != nil {
-		return e
+	type _FILE_END_OF_FILE_INFO struct {
+		EndOfFile int64
 	}
-	e = SetEndOfFile(fd)
-	if e != nil {
-		return e
-	}
-	return nil
+	var info _FILE_END_OF_FILE_INFO
+	info.EndOfFile = length
+	return SetFileInformationByHandle(fd, FileEndOfFileInfo, (*byte)(unsafe.Pointer(&info)), uint32(unsafe.Sizeof(info)))
 }
 
 func Gettimeofday(tv *Timeval) (err error) {
@@ -890,6 +888,11 @@ const socket_error = uintptr(^uint32(0))
 //sys	GetACP() (acp uint32) = kernel32.GetACP
 //sys	MultiByteToWideChar(codePage uint32, dwFlags uint32, str *byte, nstr int32, wchar *uint16, nwchar int32) (nwrite int32, err error) = kernel32.MultiByteToWideChar
 //sys	getBestInterfaceEx(sockaddr unsafe.Pointer, pdwBestIfIndex *uint32) (errcode error) = iphlpapi.GetBestInterfaceEx
+//sys   GetIfEntry2Ex(level uint32, row *MibIfRow2) (errcode error) = iphlpapi.GetIfEntry2Ex
+//sys   GetUnicastIpAddressEntry(row *MibUnicastIpAddressRow) (errcode error) = iphlpapi.GetUnicastIpAddressEntry
+//sys   NotifyIpInterfaceChange(family uint16, callback uintptr, callerContext unsafe.Pointer, initialNotification bool, notificationHandle *Handle) (errcode error) = iphlpapi.NotifyIpInterfaceChange
+//sys   NotifyUnicastIpAddressChange(family uint16, callback uintptr, callerContext unsafe.Pointer, initialNotification bool, notificationHandle *Handle) (errcode error) = iphlpapi.NotifyUnicastIpAddressChange
+//sys   CancelMibChangeNotify2(notificationHandle Handle) (errcode error) = iphlpapi.CancelMibChangeNotify2
 
 // For testing: clients can set this flag to force
 // creation of IPv6 sockets to return EAFNOSUPPORT.
@@ -1681,13 +1684,16 @@ func (s NTStatus) Error() string {
 // do not use NTUnicodeString, and instead UTF16PtrFromString should be used for
 // the more common *uint16 string type.
 func NewNTUnicodeString(s string) (*NTUnicodeString, error) {
-	var u NTUnicodeString
-	s16, err := UTF16PtrFromString(s)
+	s16, err := UTF16FromString(s)
 	if err != nil {
 		return nil, err
 	}
-	RtlInitUnicodeString(&u, s16)
-	return &u, nil
+	n := uint16(len(s16) * 2)
+	return &NTUnicodeString{
+		Length:        n - 2, // subtract 2 bytes for the NULL terminator
+		MaximumLength: n,
+		Buffer:        &s16[0],
+	}, nil
 }
 
 // Slice returns a uint16 slice that aliases the data in the NTUnicodeString.
diff --git a/vendor/golang.org/x/sys/windows/types_windows.go b/vendor/golang.org/x/sys/windows/types_windows.go
index 4d0c15745f89f22eb4b115b068003ee123d5f19c..9d138de5fed63ccdc4435228ec96a68c0037bd8c 100644
--- a/vendor/golang.org/x/sys/windows/types_windows.go
+++ b/vendor/golang.org/x/sys/windows/types_windows.go
@@ -176,6 +176,7 @@ const (
 	WAIT_FAILED    = 0xFFFFFFFF
 
 	// Access rights for process.
+	PROCESS_ALL_ACCESS                = 0xFFFF
 	PROCESS_CREATE_PROCESS            = 0x0080
 	PROCESS_CREATE_THREAD             = 0x0002
 	PROCESS_DUP_HANDLE                = 0x0040
@@ -1060,6 +1061,7 @@ const (
 	SIO_GET_EXTENSION_FUNCTION_POINTER = IOC_INOUT | IOC_WS2 | 6
 	SIO_KEEPALIVE_VALS                 = IOC_IN | IOC_VENDOR | 4
 	SIO_UDP_CONNRESET                  = IOC_IN | IOC_VENDOR | 12
+	SIO_UDP_NETRESET                   = IOC_IN | IOC_VENDOR | 15
 
 	// cf. http://support.microsoft.com/default.aspx?scid=kb;en-us;257460
 
@@ -2031,6 +2033,50 @@ const (
 	IF_TYPE_IEEE1394           = 144
 )
 
+// Enum NL_PREFIX_ORIGIN for [IpAdapterUnicastAddress], see
+// https://learn.microsoft.com/en-us/windows/win32/api/nldef/ne-nldef-nl_prefix_origin
+const (
+	IpPrefixOriginOther               = 0
+	IpPrefixOriginManual              = 1
+	IpPrefixOriginWellKnown           = 2
+	IpPrefixOriginDhcp                = 3
+	IpPrefixOriginRouterAdvertisement = 4
+	IpPrefixOriginUnchanged           = 1 << 4
+)
+
+// Enum NL_SUFFIX_ORIGIN for [IpAdapterUnicastAddress], see
+// https://learn.microsoft.com/en-us/windows/win32/api/nldef/ne-nldef-nl_suffix_origin
+const (
+	NlsoOther                      = 0
+	NlsoManual                     = 1
+	NlsoWellKnown                  = 2
+	NlsoDhcp                       = 3
+	NlsoLinkLayerAddress           = 4
+	NlsoRandom                     = 5
+	IpSuffixOriginOther            = 0
+	IpSuffixOriginManual           = 1
+	IpSuffixOriginWellKnown        = 2
+	IpSuffixOriginDhcp             = 3
+	IpSuffixOriginLinkLayerAddress = 4
+	IpSuffixOriginRandom           = 5
+	IpSuffixOriginUnchanged        = 1 << 4
+)
+
+// Enum NL_DAD_STATE for [IpAdapterUnicastAddress], see
+// https://learn.microsoft.com/en-us/windows/win32/api/nldef/ne-nldef-nl_dad_state
+const (
+	NldsInvalid          = 0
+	NldsTentative        = 1
+	NldsDuplicate        = 2
+	NldsDeprecated       = 3
+	NldsPreferred        = 4
+	IpDadStateInvalid    = 0
+	IpDadStateTentative  = 1
+	IpDadStateDuplicate  = 2
+	IpDadStateDeprecated = 3
+	IpDadStatePreferred  = 4
+)
+
 type SocketAddress struct {
 	Sockaddr       *syscall.RawSockaddrAny
 	SockaddrLength int32
@@ -2158,6 +2204,132 @@ const (
 	IfOperStatusLowerLayerDown = 7
 )
 
+const (
+	IF_MAX_PHYS_ADDRESS_LENGTH = 32
+	IF_MAX_STRING_SIZE         = 256
+)
+
+// MIB_IF_ENTRY_LEVEL enumeration from netioapi.h or
+// https://learn.microsoft.com/en-us/windows/win32/api/netioapi/nf-netioapi-getifentry2ex.
+const (
+	MibIfEntryNormal                  = 0
+	MibIfEntryNormalWithoutStatistics = 2
+)
+
+// MIB_NOTIFICATION_TYPE enumeration from netioapi.h or
+// https://learn.microsoft.com/en-us/windows/win32/api/netioapi/ne-netioapi-mib_notification_type.
+const (
+	MibParameterNotification = 0
+	MibAddInstance           = 1
+	MibDeleteInstance        = 2
+	MibInitialNotification   = 3
+)
+
+// MibIfRow2 stores information about a particular interface. See
+// https://learn.microsoft.com/en-us/windows/win32/api/netioapi/ns-netioapi-mib_if_row2.
+type MibIfRow2 struct {
+	InterfaceLuid               uint64
+	InterfaceIndex              uint32
+	InterfaceGuid               GUID
+	Alias                       [IF_MAX_STRING_SIZE + 1]uint16
+	Description                 [IF_MAX_STRING_SIZE + 1]uint16
+	PhysicalAddressLength       uint32
+	PhysicalAddress             [IF_MAX_PHYS_ADDRESS_LENGTH]uint8
+	PermanentPhysicalAddress    [IF_MAX_PHYS_ADDRESS_LENGTH]uint8
+	Mtu                         uint32
+	Type                        uint32
+	TunnelType                  uint32
+	MediaType                   uint32
+	PhysicalMediumType          uint32
+	AccessType                  uint32
+	DirectionType               uint32
+	InterfaceAndOperStatusFlags uint8
+	OperStatus                  uint32
+	AdminStatus                 uint32
+	MediaConnectState           uint32
+	NetworkGuid                 GUID
+	ConnectionType              uint32
+	TransmitLinkSpeed           uint64
+	ReceiveLinkSpeed            uint64
+	InOctets                    uint64
+	InUcastPkts                 uint64
+	InNUcastPkts                uint64
+	InDiscards                  uint64
+	InErrors                    uint64
+	InUnknownProtos             uint64
+	InUcastOctets               uint64
+	InMulticastOctets           uint64
+	InBroadcastOctets           uint64
+	OutOctets                   uint64
+	OutUcastPkts                uint64
+	OutNUcastPkts               uint64
+	OutDiscards                 uint64
+	OutErrors                   uint64
+	OutUcastOctets              uint64
+	OutMulticastOctets          uint64
+	OutBroadcastOctets          uint64
+	OutQLen                     uint64
+}
+
+// MIB_UNICASTIPADDRESS_ROW stores information about a unicast IP address. See
+// https://learn.microsoft.com/en-us/windows/win32/api/netioapi/ns-netioapi-mib_unicastipaddress_row.
+type MibUnicastIpAddressRow struct {
+	Address            RawSockaddrInet6 // SOCKADDR_INET union
+	InterfaceLuid      uint64
+	InterfaceIndex     uint32
+	PrefixOrigin       uint32
+	SuffixOrigin       uint32
+	ValidLifetime      uint32
+	PreferredLifetime  uint32
+	OnLinkPrefixLength uint8
+	SkipAsSource       uint8
+	DadState           uint32
+	ScopeId            uint32
+	CreationTimeStamp  Filetime
+}
+
+const ScopeLevelCount = 16
+
+// MIB_IPINTERFACE_ROW stores interface management information for a particular IP address family on a network interface.
+// See https://learn.microsoft.com/en-us/windows/win32/api/netioapi/ns-netioapi-mib_ipinterface_row.
+type MibIpInterfaceRow struct {
+	Family                               uint16
+	InterfaceLuid                        uint64
+	InterfaceIndex                       uint32
+	MaxReassemblySize                    uint32
+	InterfaceIdentifier                  uint64
+	MinRouterAdvertisementInterval       uint32
+	MaxRouterAdvertisementInterval       uint32
+	AdvertisingEnabled                   uint8
+	ForwardingEnabled                    uint8
+	WeakHostSend                         uint8
+	WeakHostReceive                      uint8
+	UseAutomaticMetric                   uint8
+	UseNeighborUnreachabilityDetection   uint8
+	ManagedAddressConfigurationSupported uint8
+	OtherStatefulConfigurationSupported  uint8
+	AdvertiseDefaultRoute                uint8
+	RouterDiscoveryBehavior              uint32
+	DadTransmits                         uint32
+	BaseReachableTime                    uint32
+	RetransmitTime                       uint32
+	PathMtuDiscoveryTimeout              uint32
+	LinkLocalAddressBehavior             uint32
+	LinkLocalAddressTimeout              uint32
+	ZoneIndices                          [ScopeLevelCount]uint32
+	SitePrefixLength                     uint32
+	Metric                               uint32
+	NlMtu                                uint32
+	Connected                            uint8
+	SupportsWakeUpPatterns               uint8
+	SupportsNeighborDiscovery            uint8
+	SupportsRouterDiscovery              uint8
+	ReachableTime                        uint32
+	TransmitOffload                      uint32
+	ReceiveOffload                       uint32
+	DisableDefaultRoutes                 uint8
+}
+
 // Console related constants used for the mode parameter to SetConsoleMode. See
 // https://docs.microsoft.com/en-us/windows/console/setconsolemode for details.
 
diff --git a/vendor/golang.org/x/sys/windows/zsyscall_windows.go b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
index 9bb979a3e47267f14e699cc5cd5b9d8376b7aaa6..01c0716c2c4e839cc1a0466642204f4e73b0236f 100644
--- a/vendor/golang.org/x/sys/windows/zsyscall_windows.go
+++ b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
@@ -181,10 +181,15 @@ var (
 	procDnsRecordListFree                                    = moddnsapi.NewProc("DnsRecordListFree")
 	procDwmGetWindowAttribute                                = moddwmapi.NewProc("DwmGetWindowAttribute")
 	procDwmSetWindowAttribute                                = moddwmapi.NewProc("DwmSetWindowAttribute")
+	procCancelMibChangeNotify2                               = modiphlpapi.NewProc("CancelMibChangeNotify2")
 	procGetAdaptersAddresses                                 = modiphlpapi.NewProc("GetAdaptersAddresses")
 	procGetAdaptersInfo                                      = modiphlpapi.NewProc("GetAdaptersInfo")
 	procGetBestInterfaceEx                                   = modiphlpapi.NewProc("GetBestInterfaceEx")
 	procGetIfEntry                                           = modiphlpapi.NewProc("GetIfEntry")
+	procGetIfEntry2Ex                                        = modiphlpapi.NewProc("GetIfEntry2Ex")
+	procGetUnicastIpAddressEntry                             = modiphlpapi.NewProc("GetUnicastIpAddressEntry")
+	procNotifyIpInterfaceChange                              = modiphlpapi.NewProc("NotifyIpInterfaceChange")
+	procNotifyUnicastIpAddressChange                         = modiphlpapi.NewProc("NotifyUnicastIpAddressChange")
 	procAddDllDirectory                                      = modkernel32.NewProc("AddDllDirectory")
 	procAssignProcessToJobObject                             = modkernel32.NewProc("AssignProcessToJobObject")
 	procCancelIo                                             = modkernel32.NewProc("CancelIo")
@@ -247,7 +252,9 @@ var (
 	procGetCommandLineW                                      = modkernel32.NewProc("GetCommandLineW")
 	procGetComputerNameExW                                   = modkernel32.NewProc("GetComputerNameExW")
 	procGetComputerNameW                                     = modkernel32.NewProc("GetComputerNameW")
+	procGetConsoleCP                                         = modkernel32.NewProc("GetConsoleCP")
 	procGetConsoleMode                                       = modkernel32.NewProc("GetConsoleMode")
+	procGetConsoleOutputCP                                   = modkernel32.NewProc("GetConsoleOutputCP")
 	procGetConsoleScreenBufferInfo                           = modkernel32.NewProc("GetConsoleScreenBufferInfo")
 	procGetCurrentDirectoryW                                 = modkernel32.NewProc("GetCurrentDirectoryW")
 	procGetCurrentProcessId                                  = modkernel32.NewProc("GetCurrentProcessId")
@@ -273,8 +280,10 @@ var (
 	procGetMaximumProcessorCount                             = modkernel32.NewProc("GetMaximumProcessorCount")
 	procGetModuleFileNameW                                   = modkernel32.NewProc("GetModuleFileNameW")
 	procGetModuleHandleExW                                   = modkernel32.NewProc("GetModuleHandleExW")
+	procGetNamedPipeClientProcessId                          = modkernel32.NewProc("GetNamedPipeClientProcessId")
 	procGetNamedPipeHandleStateW                             = modkernel32.NewProc("GetNamedPipeHandleStateW")
 	procGetNamedPipeInfo                                     = modkernel32.NewProc("GetNamedPipeInfo")
+	procGetNamedPipeServerProcessId                          = modkernel32.NewProc("GetNamedPipeServerProcessId")
 	procGetOverlappedResult                                  = modkernel32.NewProc("GetOverlappedResult")
 	procGetPriorityClass                                     = modkernel32.NewProc("GetPriorityClass")
 	procGetProcAddress                                       = modkernel32.NewProc("GetProcAddress")
@@ -347,8 +356,10 @@ var (
 	procSetCommMask                                          = modkernel32.NewProc("SetCommMask")
 	procSetCommState                                         = modkernel32.NewProc("SetCommState")
 	procSetCommTimeouts                                      = modkernel32.NewProc("SetCommTimeouts")
+	procSetConsoleCP                                         = modkernel32.NewProc("SetConsoleCP")
 	procSetConsoleCursorPosition                             = modkernel32.NewProc("SetConsoleCursorPosition")
 	procSetConsoleMode                                       = modkernel32.NewProc("SetConsoleMode")
+	procSetConsoleOutputCP                                   = modkernel32.NewProc("SetConsoleOutputCP")
 	procSetCurrentDirectoryW                                 = modkernel32.NewProc("SetCurrentDirectoryW")
 	procSetDefaultDllDirectories                             = modkernel32.NewProc("SetDefaultDllDirectories")
 	procSetDllDirectoryW                                     = modkernel32.NewProc("SetDllDirectoryW")
@@ -1602,6 +1613,14 @@ func DwmSetWindowAttribute(hwnd HWND, attribute uint32, value unsafe.Pointer, si
 	return
 }
 
+func CancelMibChangeNotify2(notificationHandle Handle) (errcode error) {
+	r0, _, _ := syscall.Syscall(procCancelMibChangeNotify2.Addr(), 1, uintptr(notificationHandle), 0, 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
 func GetAdaptersAddresses(family uint32, flags uint32, reserved uintptr, adapterAddresses *IpAdapterAddresses, sizePointer *uint32) (errcode error) {
 	r0, _, _ := syscall.Syscall6(procGetAdaptersAddresses.Addr(), 5, uintptr(family), uintptr(flags), uintptr(reserved), uintptr(unsafe.Pointer(adapterAddresses)), uintptr(unsafe.Pointer(sizePointer)), 0)
 	if r0 != 0 {
@@ -1634,6 +1653,46 @@ func GetIfEntry(pIfRow *MibIfRow) (errcode error) {
 	return
 }
 
+func GetIfEntry2Ex(level uint32, row *MibIfRow2) (errcode error) {
+	r0, _, _ := syscall.Syscall(procGetIfEntry2Ex.Addr(), 2, uintptr(level), uintptr(unsafe.Pointer(row)), 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
+func GetUnicastIpAddressEntry(row *MibUnicastIpAddressRow) (errcode error) {
+	r0, _, _ := syscall.Syscall(procGetUnicastIpAddressEntry.Addr(), 1, uintptr(unsafe.Pointer(row)), 0, 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
+func NotifyIpInterfaceChange(family uint16, callback uintptr, callerContext unsafe.Pointer, initialNotification bool, notificationHandle *Handle) (errcode error) {
+	var _p0 uint32
+	if initialNotification {
+		_p0 = 1
+	}
+	r0, _, _ := syscall.Syscall6(procNotifyIpInterfaceChange.Addr(), 5, uintptr(family), uintptr(callback), uintptr(callerContext), uintptr(_p0), uintptr(unsafe.Pointer(notificationHandle)), 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
+func NotifyUnicastIpAddressChange(family uint16, callback uintptr, callerContext unsafe.Pointer, initialNotification bool, notificationHandle *Handle) (errcode error) {
+	var _p0 uint32
+	if initialNotification {
+		_p0 = 1
+	}
+	r0, _, _ := syscall.Syscall6(procNotifyUnicastIpAddressChange.Addr(), 5, uintptr(family), uintptr(callback), uintptr(callerContext), uintptr(_p0), uintptr(unsafe.Pointer(notificationHandle)), 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
 func AddDllDirectory(path *uint16) (cookie uintptr, err error) {
 	r0, _, e1 := syscall.Syscall(procAddDllDirectory.Addr(), 1, uintptr(unsafe.Pointer(path)), 0, 0)
 	cookie = uintptr(r0)
@@ -2162,6 +2221,15 @@ func GetComputerName(buf *uint16, n *uint32) (err error) {
 	return
 }
 
+func GetConsoleCP() (cp uint32, err error) {
+	r0, _, e1 := syscall.Syscall(procGetConsoleCP.Addr(), 0, 0, 0, 0)
+	cp = uint32(r0)
+	if cp == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func GetConsoleMode(console Handle, mode *uint32) (err error) {
 	r1, _, e1 := syscall.Syscall(procGetConsoleMode.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(mode)), 0)
 	if r1 == 0 {
@@ -2170,6 +2238,15 @@ func GetConsoleMode(console Handle, mode *uint32) (err error) {
 	return
 }
 
+func GetConsoleOutputCP() (cp uint32, err error) {
+	r0, _, e1 := syscall.Syscall(procGetConsoleOutputCP.Addr(), 0, 0, 0, 0)
+	cp = uint32(r0)
+	if cp == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) {
 	r1, _, e1 := syscall.Syscall(procGetConsoleScreenBufferInfo.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(info)), 0)
 	if r1 == 0 {
@@ -2371,6 +2448,14 @@ func GetModuleHandleEx(flags uint32, moduleName *uint16, module *Handle) (err er
 	return
 }
 
+func GetNamedPipeClientProcessId(pipe Handle, clientProcessID *uint32) (err error) {
+	r1, _, e1 := syscall.Syscall(procGetNamedPipeClientProcessId.Addr(), 2, uintptr(pipe), uintptr(unsafe.Pointer(clientProcessID)), 0)
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func GetNamedPipeHandleState(pipe Handle, state *uint32, curInstances *uint32, maxCollectionCount *uint32, collectDataTimeout *uint32, userName *uint16, maxUserNameSize uint32) (err error) {
 	r1, _, e1 := syscall.Syscall9(procGetNamedPipeHandleStateW.Addr(), 7, uintptr(pipe), uintptr(unsafe.Pointer(state)), uintptr(unsafe.Pointer(curInstances)), uintptr(unsafe.Pointer(maxCollectionCount)), uintptr(unsafe.Pointer(collectDataTimeout)), uintptr(unsafe.Pointer(userName)), uintptr(maxUserNameSize), 0, 0)
 	if r1 == 0 {
@@ -2387,6 +2472,14 @@ func GetNamedPipeInfo(pipe Handle, flags *uint32, outSize *uint32, inSize *uint3
 	return
 }
 
+func GetNamedPipeServerProcessId(pipe Handle, serverProcessID *uint32) (err error) {
+	r1, _, e1 := syscall.Syscall(procGetNamedPipeServerProcessId.Addr(), 2, uintptr(pipe), uintptr(unsafe.Pointer(serverProcessID)), 0)
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func GetOverlappedResult(handle Handle, overlapped *Overlapped, done *uint32, wait bool) (err error) {
 	var _p0 uint32
 	if wait {
@@ -3038,6 +3131,14 @@ func SetCommTimeouts(handle Handle, timeouts *CommTimeouts) (err error) {
 	return
 }
 
+func SetConsoleCP(cp uint32) (err error) {
+	r1, _, e1 := syscall.Syscall(procSetConsoleCP.Addr(), 1, uintptr(cp), 0, 0)
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func setConsoleCursorPosition(console Handle, position uint32) (err error) {
 	r1, _, e1 := syscall.Syscall(procSetConsoleCursorPosition.Addr(), 2, uintptr(console), uintptr(position), 0)
 	if r1 == 0 {
@@ -3054,6 +3155,14 @@ func SetConsoleMode(console Handle, mode uint32) (err error) {
 	return
 }
 
+func SetConsoleOutputCP(cp uint32) (err error) {
+	r1, _, e1 := syscall.Syscall(procSetConsoleOutputCP.Addr(), 1, uintptr(cp), 0, 0)
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func SetCurrentDirectory(path *uint16) (err error) {
 	r1, _, e1 := syscall.Syscall(procSetCurrentDirectoryW.Addr(), 1, uintptr(unsafe.Pointer(path)), 0, 0)
 	if r1 == 0 {
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 50a2c5525a5aedf5e4888055a532b96a31226fb6..24117848e4fab6f4f4e44d7162d301b1e3d993b1 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -1,12 +1,15 @@
-# 0xacab.org/leap/bitmask-core v0.0.0-20241015185856-0812b9aadf98
+# 0xacab.org/leap/bitmask-core v0.0.0-20241213164419-d3c23078a4e0
 ## explicit; go 1.22.2
-0xacab.org/leap/bitmask-core/models
 0xacab.org/leap/bitmask-core/pkg/bootstrap
-0xacab.org/leap/bitmask-core/pkg/client
-0xacab.org/leap/bitmask-core/pkg/client/provisioning
+0xacab.org/leap/bitmask-core/pkg/bridge
 0xacab.org/leap/bitmask-core/pkg/introducer
-0xacab.org/leap/bitmask-core/pkg/models
 0xacab.org/leap/bitmask-core/pkg/storage
+# 0xacab.org/leap/menshen v0.0.0-20241122184833-0524d7e093d5
+## explicit; go 1.22
+0xacab.org/leap/menshen/client
+0xacab.org/leap/menshen/client/agent
+0xacab.org/leap/menshen/client/provisioning
+0xacab.org/leap/menshen/models
 # 0xacab.org/leap/obfsvpn v1.3.1-0.20241121155258-e6b06efc4456
 ## explicit; go 1.22
 0xacab.org/leap/obfsvpn/client
@@ -468,7 +471,7 @@ go.opentelemetry.io/otel/trace/embedded
 ## explicit; go 1.20
 go.uber.org/mock/mockgen
 go.uber.org/mock/mockgen/model
-# golang.org/x/crypto v0.26.0
+# golang.org/x/crypto v0.30.0
 ## explicit; go 1.20
 golang.org/x/crypto/blake2b
 golang.org/x/crypto/blake2s
@@ -500,7 +503,7 @@ golang.org/x/mod/internal/lazyregexp
 golang.org/x/mod/modfile
 golang.org/x/mod/module
 golang.org/x/mod/semver
-# golang.org/x/net v0.28.0
+# golang.org/x/net v0.32.0
 ## explicit; go 1.18
 golang.org/x/net/bpf
 golang.org/x/net/dns/dnsmessage
@@ -515,10 +518,10 @@ golang.org/x/net/internal/socks
 golang.org/x/net/ipv4
 golang.org/x/net/ipv6
 golang.org/x/net/proxy
-# golang.org/x/sync v0.8.0
+# golang.org/x/sync v0.10.0
 ## explicit; go 1.18
 golang.org/x/sync/errgroup
-# golang.org/x/sys v0.23.0
+# golang.org/x/sys v0.28.0
 ## explicit; go 1.18
 golang.org/x/sys/cpu
 golang.org/x/sys/unix
@@ -528,7 +531,7 @@ golang.org/x/sys/windows/svc
 golang.org/x/sys/windows/svc/debug
 golang.org/x/sys/windows/svc/eventlog
 golang.org/x/sys/windows/svc/mgr
-# golang.org/x/text v0.17.0
+# golang.org/x/text v0.21.0
 ## explicit; go 1.18
 golang.org/x/text/secure/bidirule
 golang.org/x/text/transform