// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
/* Copyright (C) 2024 Nokia
*
* Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
* Author: Olga Albisser <olga@albisser.org>
* Author: Henrik Steen <henrist@henrist.net>
* Author: Olivier Tilmans <olivier.tilmans@nokia.com>
* Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
*
* DualPI Improved with a Square (dualpi2):
* - Supports congestion controls that comply with the Prague requirements
* in RFC9331 (e.g. TCP-Prague)
* - Supports coupled dual-queue with PI2 as defined in RFC9332
* - Supports ECN L4S-identifier (IP.ECN==0b*1)
*
* note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
* they do not meet the 'Prague L4S Requirements' listed in RFC 9331
* Section 4, so they can only be used with DualPI2 in a datacenter
* context.
*
* References:
* - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332
* - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and
* scalable TCP." in proc. ACM CoNEXT'16, 2016.
*/
#include <linux/errno.h>
#include <linux/hrtimer.h>
#include <linux/if_vlan.h>
#include <linux/kernel.h>
#include <linux/limits.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/gso.h>
#include <net/inet_ecn.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
/* Probabilities are stored on 32b, which enables support for flows with
 * windows up to ~8.6 * 1e9 packets, i.e., twice the maximal snd_cwnd.
 * MAX_PROB must be consistent with the RNG in dualpi2_roll().
 */
#define MAX_PROB U32_MAX
/* alpha/beta values exchanged over netlink are in units of 256ns */
#define ALPHA_BETA_SHIFT 8
/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
 * computations. Consequently (see dualpi2_scale_alpha_beta()), their
 * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1
 * (~4MHz) as those are given in 1/256th. This enables tuning alpha/beta to
 * control flows whose maximal RTTs range from microseconds up to a few
 * seconds.
 * NOTE(review): 2^23 is ~8.4e6, so the "~4MHz" figure above looks off --
 * confirm against dualpi2_scale_alpha_beta().
 */
#define ALPHA_BETA_MAX ((1U << 31) - 1)
/* Internal alpha/beta are in units of 64ns.
 * This allows every alpha/beta value in the allowed range to be used without
 * loss of precision due to rounding when scaling them internally, e.g.,
 * scale_alpha_beta(1) will not round down to 0.
 */
#define ALPHA_BETA_GRANULARITY 6
/* Number of bits separating the netlink unit (256ns) from the internal
 * unit (64ns).
 */
#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
#define MAX_WC 100
/**
 * struct dualpi2_sched_data - DualPI2 scheduler state
 * @l_queue:		L4S low-latency queue (L-queue)
 * @sch:		Classic queue (C-queue)
 * @tcf_filters:	Registered tc filters
 * @tcf_block:		Block of the registered tc filters
 * @pi2_target:		Target delay, in nanoseconds
 * @pi2_tupdate:	Timer frequency, in nanoseconds
 * @pi2_prob:		Base PI probability
 * @pi2_alpha:		Gain factor for the integral rate response
 * @pi2_beta:		Gain factor for the proportional response
 * @pi2_timer:		Probability update timer
 * @step_thresh:	Step threshold
 * @step_in_packets:	Step threshold is in packets (1) or in time (0)
 * @c_protection_credit: Credit; its sign indicates which queue
 * @c_protection_init:	Reset value of the credit
 * @c_protection_wc:	C-queue weight (between 0 and MAX_WC)
 * @c_protection_wl:	L-queue weight (MAX_WC - wc)
 * @memory_limit:	Memory limit of both queues
 * @coupling_factor:	Coupling factor (k) between both queues
 * @ecn_mask:		Mask to match packets into the L-queue
 * @min_qlen_step:	Minimum queue length to apply the step threshold
 * @drop_early:		Drop at enqueue (1) instead of at dequeue (0)
 * @drop_overload:	Drop (1) on overload, or overflow (0)
 * @split_gso:		Split aggregated skb (1) or leave as is (0)
 * @c_head_ts:		Enqueue timestamp of the C-queue head
 * @l_head_ts:		Enqueue timestamp of the L-queue head
 * @last_qdelay:	Queue delay value at the last probability update
 * @packets_in_c:	Enqueue packet counter of the C-queue
 * @packets_in_l:	Enqueue packet counter of the L-queue
 * @maxq:		Maximum queue size of the C-queue
 * @ecn_mark:		Count of packets ECN-marked due to the PI probability
 * @step_marks:		Count of packets ECN-marked due to the step AQM
 * @memory_used:	Memory used by both queues
 * @max_memory_used:	Maximum used memory
 * @deferred_drops_cnt:	Deferred drops, in packets
 * @deferred_drops_len:	Deferred drops, in bytes
 *
 * Groups the two queues, the registered tc filters, the PI2 and step AQM
 * parameters, the C-queue starvation protection state, the general dual-queue
 * settings and the statistics counters.
 */
struct dualpi2_sched_data {
	/* Queues */
	struct Qdisc *l_queue;
	struct Qdisc *sch;

	/* Registered tc filters */
	struct tcf_proto __rcu *tcf_filters;
	struct tcf_block *tcf_block;

	/* PI2 parameters */
	u64 pi2_target;
	u32 pi2_tupdate;
	u32 pi2_prob;
	u32 pi2_alpha;
	u32 pi2_beta;
	struct hrtimer pi2_timer;

	/* Step AQM (L-queue only) parameters */
	u32 step_thresh;
	bool step_in_packets;

	/* C-queue starvation protection */
	s32 c_protection_credit;
	s32 c_protection_init;
	u8 c_protection_wc;
	u8 c_protection_wl;

	/* General dualQ parameters */
	u32 memory_limit;
	u8 coupling_factor;
	u8 ecn_mask;
	u32 min_qlen_step;
	bool drop_early;
	bool drop_overload;
	bool split_gso;

	/* Statistics */
	u64 c_head_ts;
	u64 l_head_ts;
	u64 last_qdelay;
	u32 packets_in_c;
	u32 packets_in_l;
	u32 maxq;
	u32 ecn_mark;
	u32 step_marks;
	u32 memory_used;
	u32 max_memory_used;

	/* Deferred drop statistics */
	u32 deferred_drops_cnt;
	u32 deferred_drops_len;
};
/**
 * struct dualpi2_skb_cb - per-packet DualPI2 state
 * @ts:		Timestamp at enqueue
 * @apply_step:	Whether the step threshold can be applied (1 bit)
 * @classified:	Packet classification result (2 bits)
 * @ect:	Packet ECT codepoint (2 bits)
 */
struct dualpi2_skb_cb {
	u64 ts;
	u8 apply_step:1;
	u8 classified:2;
	u8 ect:2;
};
enum dualpi2_classification_results {
DUALPI2_C_CLASSIC = 0, /* C-queue */
DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */