// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux INET6 implementation
* FIB front-end.
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*/
/* Changes:
*
* YOSHIFUJI Hideaki @USAGI
* reworked default router selection.
* - respect outgoing interface
* - select from (probably) reachable routers (i.e.
* routers in REACHABLE, STALE, DELAY or PROBE states).
* - always select the same router if it is (probably)
* reachable. otherwise, round-robin the list.
* Ville Nuorvala
* Fixed routing subtrees.
*/
#define pr_fmt(fmt) "IPv6: " fmt
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
static int ip6_rt_type_to_error(u8 fib6_type);
#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
RT6_NUD_FAIL_HARD = -3,
RT6_NUD_FAIL_PROBE = -2,
RT6_NUD_FAIL_DO_RR = -1,
RT6_NUD_SUCCEED = 1
};
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);
static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
int strict);
static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct fib6_info *rt, struct dst_entry *dst,
struct in6_addr *dest, struct in6_addr *src,
int iif, int type, u32 portid, u32 seq,
unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
const struct in6_addr *daddr,
const struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
const struct in6_addr *gwaddr,
struct net_device *dev,
unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
const struct in6_addr *gwaddr,
struct net_device *dev);
#endif
struct uncached_list {
spinlock_t lock;
struct list_head head;
};
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
rt->rt6i_uncached_list = ul;
spin_lock_bh(&ul->lock);
list_add_tail(&rt->rt6i_uncached, &ul->head);
spin_unlock_bh(&ul->lock);
}
void rt6_uncached_list_del(struct rt6_info *rt)
{
if (!list_empty(&rt->rt6i_uncached)) {
struct uncached_list *ul = rt->rt6i_uncached_list;
struct net *net = dev_net(rt->dst.dev);
spin_lock_bh(&ul->lock);
list_del(&rt->rt6i_uncached);
atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
spin_unlock_bh(&ul->lock);
}
}
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
struct net_device *loopback_dev = net->loopback_dev;
int cpu;
if (dev == loopback_dev)
return;
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
struct rt6_info *rt;
spin_lock_bh(&ul->lock);
list_for_each_entry(rt, &ul->head, rt6i_uncached) {
struct inet6_dev *rt_idev = rt->rt6i_idev;
struct net_device *rt_dev = rt->dst.dev;
if (rt_idev->dev == dev) {
rt->rt6i_idev = in6_dev_get(loopback_dev);
in6_dev_put(rt_idev);
}
if (rt_dev == dev) {
rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(rt_dev);
}
}
spin_unlock_bh(&ul->lock);
}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
struct sk_buff *skb,
const void *daddr)
{
if (!ipv6_addr_any(p))
return (const void *) p;
else if (skb)
return &ipv6_hdr(skb)->daddr;
return daddr;
}
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
struct net_device *dev,
struct sk_buff *skb,
const void *daddr)
{
struct neighbour *n;
daddr = choose_neigh_daddr(gw, skb, daddr);
n = __ipv6_neigh_lookup(dev, daddr);
if (n)
return n;
n = neigh_create(&nd_tbl, daddr, dev);
return IS_ERR(n) ? NULL : n;
}
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr)
{
const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
dst->dev, skb, daddr);
}
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
struct net_device *dev = dst->dev;
struct rt6_info *rt = (struct rt6_info *)dst;
daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
if (!daddr)
return;
if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
return;
if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
return;
__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
.family = AF_INET6,
.gc = ip6_dst_gc,
.gc_thresh = 1024,
.check = ip6_dst_check,
.default_advmss = ip6_default_advmss,
.mtu = ip6_mtu,
.cow_metrics = dst_cow_metrics_generic,
.destroy = ip6_dst_destroy,
.ifdown = ip6_dst_ifdown,
.negative_advice = ip6_negative_advice,
.link_failure = ip6_link_failure,
.update_pmtu = ip6_rt_update_pmtu,
.redirect = rt6_do_redirect,
.local_out = __ip6_local_out,
.neigh_lookup = ip6_dst_neigh_lookup,
.confirm_neigh = ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
return mtu ? : dst->dev->mtu;
}
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu)
{
}
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb)
{
}
static struct dst_ops ip6_dst_blackhole_ops = {
.family = AF_INET6,
.destroy = ip6_dst_destroy,
.check = ip6_dst_check,
.mtu = ip6_blackhole_mtu,
.default_advmss = ip6_default_advmss,
.update_pmtu = ip6_rt_blackhole_update_pmtu,
.redirect = ip6_rt_blackhole_redirect,
.cow_metrics = dst_cow_metrics_generic,
.neigh_lookup = ip6_dst_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
[RTAX_HOPLIMIT - 1] = 0,
};
static const struct fib6_info fib6_null_entry_template = {
.fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
.fib6_protocol = RTPROT_KERNEL,
.fib6_metric = ~(u32)0,
.fib6_ref = REFCOUNT_INIT(1),
.fib6_type = RTN_UNREACHABLE,
.fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
};
static const struct rt6_info ip6_null_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -ENETUNREACH,
.input = ip6_pkt_discard,
.output = ip6_pkt_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
};
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
static const struct rt6_info ip6_prohibit_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EACCES,
.input = ip6_pkt_prohibit,
.output = ip6_pkt_prohibit_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
};
static const struct rt6_info ip6_blk_hole_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EINVAL,
.input = dst_discard,
.output = dst_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
};
#endif
static void rt6_info_init(struct rt6_info *rt)
{
struct dst_entry *dst = &rt->dst;
memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
INIT_LIST_HEAD(&rt->rt6i_uncached);
}
/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
int flags)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
1, DST_OBSOLETE_FORCE_CHK, flags);
if (rt) {
rt6_info_init(rt);
atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
}
return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
struct rt6_info *rt = (struct rt6_info *)dst;
struct fib6_info *from;
struct inet6_dev *idev;
ip_dst_metrics_put(dst);
rt6_uncached_list_del(rt);
idev = rt->rt6i_idev;
if (idev) {
rt->rt6i_idev = NULL;
in6_dev_put(idev);
}
from = xchg((__force struct fib6_info **)&rt->from, NULL);
fib6_info_release(from);
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
int how)
{
struct rt6_info *rt = (struct rt6_info *)dst;
struct inet6_dev *idev = rt->rt6i_idev;
struct net_device *loopback_dev =
dev_net(dev)->loopback_dev;
if (idev && idev->dev != loopback_dev) {
struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
if (loopback_idev) {
rt->rt6i_idev = loopback_idev;
in6_dev_put(idev);
}
}
}
static bool __rt6_check_expired(const struct rt6_info *rt)
{
if (rt->rt6i_flags & RTF_EXPIRES)
return time_after(jiffies, rt->dst.expires);
else
return false;
}
static bool rt6_check_expired(const struct rt6_info *rt)
{
struct fib6_info *from;
from = rcu_dereference(rt->from);
if (rt->rt6i_flags & RTF_EXPIRES) {
if (time_after(jiffies, rt->dst.expires))
return true;
} else if (from) {
return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
fib6_check_expired(from);
}
return false;
}
void fib6_select_path(const struct net *net, struct fib6_result *res,
struct flowi6 *fl6, int oif, bool have_oif_match,
const struct sk_buff *skb, int strict)
{
struct fib6_info *sibling, *next_sibling;
struct fib6_info *match = res->f6i;
if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
goto out;
/* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
*/
if (!fl6->mp_hash &&
(!match->nh || nexthop_is_multipath(match->nh)))
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
if (unlikely(match->nh)) {
nexthop_path_fib6_result(res, fl6->mp_hash);
return;
}
if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
goto out;
list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
fib6_siblings) {
const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
if (fl6->mp_hash > nh_upper_bound)
continue;
if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
break;
match = sibling;
break;
}
out:
res->f6i = match;
res->nh = match->fib6_nh;
}
/*
* Route lookup. rcu_read_lock() should be held.
*/
static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
const struct in6_addr *saddr, int oif, int flags)
{
const struct net_device *dev;
if (nh->fib_nh_flags & RTNH_F_DEAD)
return false;
dev = nh->fib_nh_dev;
if (oif) {
if (dev->ifindex == oif)
return true;
} else {
if (ipv6_chk_addr(net, saddr, dev,
flags & RT6_LOOKUP_F_IFACE))
return true;
}
return false;
}
struct fib6_nh_dm_arg {
struct net *net;
const struct in6_addr *saddr;
int oif;
int flags;
struct fib6_nh *nh;
};
static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
{
struct fib6_nh_dm_arg *arg = _arg;
arg->nh = nh;
return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
arg->flags);
}
/* returns fib6_nh from nexthop or NULL */
static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
struct fib6_result *res,
const struct in6_addr *saddr,
int oif, int flags)
{
struct fib6_nh_dm_arg arg = {
.net = net,
.saddr = saddr,
.oif = oif,
.flags = flags,
};
if (nexthop_is_blackhole(nh))
return NULL;
if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
return arg.nh;
return NULL;
}
static void rt6_device_match(struct net *net, struct fib6_result *res,
const struct in6_addr *saddr, int oif, int flags)
{
struct fib6_info *f6i = res->f6i;
struct fib6_info *spf6i;
struct fib6_nh *nh;
if (!oif && ipv6_addr_any(saddr)) {
if (unlikely(f6i->nh)) {
nh = nexthop_fib6_nh(f6i->nh);
if (nexthop_is_blackhole(f6i->nh))
goto out_blackhole;
} else {
nh = f6i->fib6_nh;
}
if (!(nh->fib_nh_flags & RTNH_F_DEAD))
goto out;
}
for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
bool matched = false;
if (unlikely(spf6i->nh)) {
nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
oif, flags);
if (nh)
matched = true;
} else {
nh = spf6i->fib6_nh;
if (__rt6_device_match(net, nh, saddr, oif, flags))
matched = true;
}
if (matched) {
res->f6i = spf6i;
goto out;
}
}
if (oif && flags & RT6_LOOKUP_F_IFACE) {
res->f6i = net->ipv6.fib6_null_entry;
nh = res->f6i->fib6_nh;
goto out;
}
if (unlikely(f6i->nh)) {
nh = nexthop_fib6_nh(f6i->nh);
if (nexthop_is_blackhole(f6i->nh))
goto out_blackhole;
} else {
nh = f6i->fib6_nh;
}
if (nh->fib_nh_flags & RTNH_F_DEAD) {
res->f6i = net->ipv6.fib6_null_entry;
nh = res->f6i->fib6_nh;
}
out:
res->nh = nh;
res->fib6_type = res->f6i->fib6_type;
res->fib6_flags = res->f6i->fib6_flags;
return;
out_blackhole:
res->fib6_flags |= RTF_REJECT;
res->fib6_type = RTN_BLACKHOLE;
res->nh = nh;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
struct work_struct work;
struct in6_addr target;
struct net_device *dev;
};
static void rt6_probe_deferred(struct work_struct *w)
{
struct in6_addr mcaddr;
struct __rt6_probe_work *work =
container_of(w, struct __rt6_probe_work, work);
addrconf_addr_solict_mult(&work->target, &mcaddr);
ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
dev_put(work->dev);
kfree(work);
}
static void rt6_probe(struct fib6_nh *fib6_nh)
{
struct __rt6_probe_work *work = NULL;
const struct in6_addr *nh_gw;
unsigned long last_probe;
struct neighbour *neigh;
struct net_device *dev;
struct inet6_dev *idev;
/*
* Okay, this does not seem to be appropriate
* for now, however, we need to check if it
* is really so; aka Router Reachability Probing.
*
* Router Reachability Probe MUST be rate-limited
* to no more than one per minute.
*/
if (!fib6_nh->fib_nh_gw_family)
return;
nh_gw = &fib6_nh->fib_nh_gw6;
dev = fib6_nh->fib_nh_dev;
rcu_read_lock_bh();
last_probe = READ_ONCE(fib6_nh->last_probe);
idev = __in6_dev_get(dev);
neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
if (neigh) {
if (neigh->nud_state & NUD_VALID)
goto out;
write_lock(&neigh->lock);
if (!(neigh->nud_state & NUD_VALID) &&
time_after(jiffies,
neigh->updated + idev->cnf.rtr_probe_interval)) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work)
__neigh_set_probe_once(neigh);
}
write_unlock(&neigh->lock);
} else if (time_after(jiffies, last_probe +
idev->cnf.rtr_probe_interval)) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
}
if (!work || cmpxchg(&fib6_nh->last_probe,
last_probe, jiffies) != last_probe) {
kfree(work);
} else {
INIT_WORK(&work->work, rt6_probe_deferred);
work->target = *nh_gw;
dev_hold(dev);
work->dev = dev;
schedule_work(&work->work);
}
out:
rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif
/*
* Default Router Selection (RFC 2461 6.3.6)
*/
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
struct neighbour *neigh;
rcu_read_lock_bh();
neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
&fib6_nh->fib_nh_gw6);
if (neigh) {
read_lock(&neigh->lock);
if (neigh->nud_state & NUD_VALID)
ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
else if (!(neigh->nud_state & NUD_FAILED))
ret = RT6_NUD_SUCCEED;
else
ret = RT6_NUD_FAIL_PROBE;
#endif
read_unlock(&neigh->lock);
} else {
ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
}
rcu_read_unlock_bh();
return ret;
}
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
int strict)
{
int m = 0;
if (!oif || nh->fib_nh_dev->ifindex == oif)
m = 2;
if (!m && (strict & RT6_LOOKUP_F_IFACE))
return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
if ((strict & RT6_LOOKUP_F_REACHABLE) &&
!(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
int n = rt6_check_neigh(nh);
if (n < 0)
return n;
}
return m;
}
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
int oif, int strict, int *mpri, bool *do_rr)
{
bool match_do_rr = false;
bool rc = false;
int m;
if (nh->fib_nh_flags & RTNH_F_DEAD)
goto out;
if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
nh->fib_nh_flags & RTNH_F_LINKDOWN &&
!(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
goto out;
m = rt6_score_route(nh, fib6_flags, oif, strict);
if (m == RT6_NUD_FAIL_DO_RR) {
match_do_rr = true;
m = 0; /* lowest valid score */
} else if (m == RT6_NUD_FAIL_HARD) {
goto out;
}
if (strict & RT6_LOOKUP_F_REACHABLE)
rt6_probe(nh);
/* note that m can be RT6_NUD_FAIL_PROBE at this point */
if (m > *mpri) {
*do_rr = match_do_rr;
*mpri = m;
rc = true;
}
out:
return rc;
}
struct fib6_nh_frl_arg {
u32 flags;
int oif;
int strict;
int *mpri;
bool *do_rr;
struct fib6_nh *nh;
};
static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
struct fib6_nh_frl_arg *arg = _arg;
arg->nh = nh;
return find_match(nh, arg->flags, arg->oif, arg->strict,
arg->mpri, arg->do_rr);
}
static void __find_rr_leaf(struct fib6_info *f6i_start,
struct fib6_info *nomatch, u32 metric,
struct fib6_result *res, struct fib6_info **cont,
int oif, int strict, bool *do_rr, int *mpri)
{
struct fib6_info *f6i;
for (f6i = f6i_start;
f6i && f6i != nomatch;
f6i = rcu_dereference(f6i->fib6_next)) {
bool matched = false;
struct fib6_nh *nh;
if (cont && f6i->fib6_metric != metric) {
*cont = f6i;
return;
}
if (fib6_check_expired(f6i))
continue;
if (unlikely(f6i->nh)) {
struct fib6_nh_frl_arg arg = {
.flags = f6i->fib6_flags,
.oif = oif,
.strict = strict,
.mpri = mpri,
.do_rr = do_rr
};
if (nexthop_is_blackhole(f6i->nh)) {
res->fib6_flags = RTF_REJECT;
res->fib6_type = RTN_BLACKHOLE;
res->f6i = f6i;
res->nh = nexthop_fib6_nh(f6i->nh);
return;
}
if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
&arg)) {
matched = true;
nh = arg.nh;
}
} else {
nh = f6i->fib6_nh;
if (find_match(nh, f6i->fib6_flags, oif, strict,
mpri, do_rr))
matched = true;
}
if (matched) {
res->f6i = f6i;
res->nh = nh;
res->fib6_flags = f6i->fib6_flags;
res->fib6_type = f6i->fib6_type;
}
}
}
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
struct fib6_info *rr_head, int oif, int strict,
bool *do_rr, struct fib6_result *res)
{
u32 metric = rr_head->fib6_metric;
struct fib6_info *cont = NULL;
int mpri = -1;
__find_rr_leaf(rr_head, NULL, metric, res, &cont,
oif, strict, do_rr, &mpri);
__find_rr_leaf(leaf, rr_head, metric, res, &cont,
oif, strict, do_rr, &mpri);
if (res->f6i || !cont)
return;
__find_rr_leaf(cont, NULL, metric, res, NULL,
oif, strict, do_rr, &mpri);
}
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
struct fib6_result *res, int strict)
{
struct fib6_info *leaf = rcu_dereference(fn->leaf);
struct fib6_info *rt0;
bool do_rr = false;
int key_plen;
/* make sure this function or its helpers sets f6i */
res->f6i = NULL;
if (!leaf || leaf == net->ipv6.fib6_null_entry)
goto out;
rt0 = rcu_dereference(fn->rr_ptr);
if (!rt0)
rt0 = leaf;
/* Double check to make sure fn is not an intermediate node
* and fn->leaf does not points to its child's leaf
* (This might happen if all routes under fn are deleted from
* the tree and fib6_repair_tree() is called on the node.)
*/
key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
if (rt0->fib6_src.plen)
key_plen = rt0->fib6_src.plen;
#endif
if (fn->fn_bit != key_plen)
goto out;
find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
if (do_rr) {
struct fib6_info *next = rcu_dereference(rt0->fib6_next);
/* no entries matched; do round-robin */
if (!next || next->fib6_metric != rt0->fib6_metric)
next = leaf;
if (next != rt0) {
spin_lock_bh(&leaf->fib6_table->tb6_lock);
/* make sure next is not being deleted from the tree */
if (next->fib6_node)
rcu_assign_pointer(fn->rr_ptr, next);
spin_unlock_bh(&leaf->fib6_table->tb6_lock);
}
}
out:
if (!res->f6i) {
res->f6i = net->ipv6.fib6_null_entry;
res->nh = res->f6i->fib6_nh;
res->fib6_flags = res->f6i->fib6_flags;
res->fib6_type = res->f6i->fib6_type;
}
}
static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
res->nh->fib_nh_gw_family;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
const struct in6_addr *gwaddr)
{
struct net *net = dev_net(dev);
struct route_info *rinfo = (struct route_info *) opt;
struct in6_addr prefix_buf, *prefix;
unsigned int pref;
unsigned long lifetime;
struct fib6_info *rt;
if (len < sizeof(struct route_info)) {
return -EINVAL;
}
/* Sanity check for prefix_len and length */
if (rinfo->length > 3) {
return -EINVAL;
} else if (rinfo->prefix_len > 128) {
return -EINVAL;
} else if (rinfo->prefix_len > 64) {
if (rinfo->length < 2) {
return -EINVAL;
}
} else if (rinfo->prefix_len > 0) {
if (rinfo->length < 1) {
return -EINVAL;
}
}
pref = rinfo->route_pref;
if (pref == ICMPV6_ROUTER_PREF_INVALID)
return -EINVAL;
lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
if (rinfo->length == 3)
prefix = (struct in6_addr *)rinfo->prefix;
else {
/* this function is safe */
ipv6_addr_prefix(&prefix_buf,
(struct in6_addr *)rinfo->prefix,
rinfo->prefix_len);
prefix = &prefix_buf;
}
if (rinfo->prefix_len == 0)
rt = rt6_get_dflt_router(net, gwaddr, dev);
else
rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
gwaddr, dev);
if (rt && !lifetime) {
ip6_del_rt(net, rt);
rt = NULL;
}
if (!rt && lifetime)
rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
dev, pref);
else if (rt)
rt->fib6_flags = RTF_ROUTEINFO |
(rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
if (rt) {
if (!addrconf_finite_timeout(lifetime))
fib6_clean_expires(rt);
else
fib6_set_expires(rt, jiffies + HZ * lifetime);
fib6_info_release(rt);
}
return 0;
}
#endif
/*
* Misc support functions
*/
/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
struct net_device *dev = res->nh->fib_nh_dev;
if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
/* for copies of local routes, dst->dev needs to be the
* device if it is a master device, the master device if
* device is enslaved, and the loopback as the default
*/
if (netif_is_l3_slave(dev) &&
!rt6_need_strict(&res->f6i->fib6_dst.addr))
dev = l3mdev_master_dev_rcu(dev);
else if (!netif_is_l3_master(dev))
dev = dev_net(dev)->loopback_dev;
/* last case is netif_is_l3_master(dev) is true in which
* case we want dev returned to be dev
*/
}
return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
[RTN_UNSPEC] = 0,
[RTN_UNICAST] = 0,
[RTN_LOCAL] = 0,
[RTN_BROADCAST] = 0,
[RTN_ANYCAST] = 0,
[RTN_MULTICAST] = 0,
[RTN_BLACKHOLE] = -EINVAL,
[RTN_UNREACHABLE] = -EHOSTUNREACH,
[RTN_PROHIBIT] = -EACCES,
[RTN_THROW] = -EAGAIN,
[RTN_NAT] = -EINVAL,
[RTN_XRESOLVE] = -EINVAL,
};
static int ip6_rt_type_to_error(u8 fib6_type)
{
return fib6_prop[fib6_type];
}
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
unsigned short flags = 0;
if (rt->dst_nocount)
flags |= DST_NOCOUNT;
if (rt->dst_nopolicy)
flags |= DST_NOPOLICY;
if (rt->dst_host)
flags |= DST_HOST;
return flags;
}
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
rt->dst.error = ip6_rt_type_to_error(fib6_type);
switch (fib6_type) {
case RTN_BLACKHOLE:
rt->dst.output = dst_discard_out;
rt->dst.input = dst_discard;
break;
case RTN_PROHIBIT:
rt->dst.output = ip6_pkt_prohibit_out;
rt->dst.input = ip6_pkt_prohibit;
break;
case RTN_THROW:
case RTN_UNREACHABLE:
default:
rt->dst.output = ip6_pkt_discard_out;
rt->dst.input = ip6_pkt_discard;
break;
}
}
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
struct fib6_info *f6i = res->f6i;
if (res->fib6_flags & RTF_REJECT) {
ip6_rt_init_dst_reject(rt, res->fib6_type);
return;
}
rt->dst.error = 0;
rt->dst.output = ip6_output;
if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
rt->dst.input = ip6_input;
} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
rt->dst.input = ip6_mc_input;
} else {
rt->dst.input = ip6_forward;
}
if (res->nh->fib_nh_lws) {
rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
lwtunnel_set_redirect(&rt->dst);
}
rt->dst.lastuse = jiffies;
}
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
rt->rt6i_flags &= ~RTF_EXPIRES;
rcu_assign_pointer(rt->from, from);
ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
const struct fib6_nh *nh = res->nh;
const struct net_device *dev = nh->fib_nh_dev;
struct fib6_info *f6i = res->f6i;
ip6_rt_init_dst(rt, res);
rt->rt6i_dst = f6i->fib6_dst;
rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
rt->rt6i_flags = res->fib6_flags;
if (nh->fib_nh_gw_family) {
rt->rt6i_gateway = nh->fib_nh_gw6;
rt->rt6i_flags |= RTF_GATEWAY;
}
rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
rt->rt6i_src = f6i->fib6_src;
#endif
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
struct in6_addr *saddr)
{
struct fib6_node *pn, *sn;
while (1) {
if (fn->fn_flags & RTN_TL_ROOT)
return NULL;
pn = rcu_dereference(fn->parent);
sn = FIB6_SUBTREE(pn);
if (sn && sn != fn)
fn = fib6_node_lookup(sn, NULL, saddr);
else
fn = pn;
if (fn->fn_flags & RTN_RTINFO)
return fn;
}
}
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
struct rt6_info *rt = *prt;
if (dst_hold_safe(&rt->dst))
return true;
if (net) {
rt = net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
} else {
rt = NULL;
}
*prt = rt;
return false;
}
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
struct net_device *dev = res->nh->fib_nh_dev;
struct fib6_info *f6i = res->f6i;
unsigned short flags;
struct rt6_info *nrt;
if (!fib6_info_hold_safe(f6i))
goto fallback;
flags = fib6_info_dst_flags(f6i);
nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
if (!nrt) {
fib6_info_release(f6i);
goto fallback;
}
ip6_rt_copy_init(nrt, res);
return nrt;
fallback:
nrt = dev_net(dev)->ipv6.ip6_null_entry;
dst_hold(&nrt->dst);
return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
int flags)
{
struct fib6_result res = {};
struct fib6_node *fn;
struct rt6_info *rt;
if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
flags &= ~RT6_LOOKUP_F_IFACE;
rcu_read_lock();
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
res.f6i = rcu_dereference(fn->leaf);
if (!res.f6i)
res.f6i = net->ipv6.fib6_null_entry;
else
rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
flags);
if (res.f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto restart;
rt = net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
goto out;
} else if (res.fib6_flags & RTF_REJECT) {
goto do_create;
}
fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
fl6->flowi6_oif != 0, skb, flags);
/* Search through exception table */
rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
if (rt) {
if (ip6_hold_safe(net, &rt))
dst_use_noref(&rt->dst, jiffies);
} else {
do_create:
rt = ip6_create_rt_rcu(&res);
}
out:
trace_fib6_table_lookup(net, &res, table, fl6);
rcu_read_unlock();
return rt;
}
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb, int flags)
{
return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
const struct in6_addr *saddr, int oif,
const struct sk_buff *skb, int strict)
{
struct flowi6 fl6 = {
.flowi6_oif = oif,
.daddr = *daddr,
};
struct dst_entry *dst;
int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
if (saddr) {
memcpy(&fl6.saddr, saddr, sizeof(*saddr));
flags |= RT6_LOOKUP_F_HAS_SADDR;
}
dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
if (dst->error == 0)
return (struct rt6_info *) dst;
dst_release(dst);
return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
* It takes new route entry, the addition fails by any reason the
* route is released.
* Caller must hold dst before calling it.
*/
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
struct netlink_ext_ack *extack)
{
int err;
struct fib6_table *table;
table = rt->fib6_table;
spin_lock_bh(&table->tb6_lock);
err = fib6_add(&table->tb6_root, rt, info, extack);
spin_unlock_bh(&table->tb6_lock);
return err;
}
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
struct nl_info info = { .nl_net = net, };
return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
const struct in6_addr *daddr,
const struct in6_addr *saddr)
{
struct fib6_info *f6i = res->f6i;
struct net_device *dev;
struct rt6_info *rt;
/*
* Clone the route.
*/
if (!fib6_info_hold_safe(f6i))
return NULL;
dev = ip6_rt_get_dev_rcu(res);
rt = ip6_dst_alloc(dev_net(dev), dev, 0);
if (!rt) {
fib6_info_release(f6i);
return NULL;
}
ip6_rt_copy_init(rt, res);
rt->rt6i_flags |= RTF_CACHE;
rt->dst.flags |= DST_HOST;
rt->rt6i_dst.addr = *daddr;
rt->rt6i_dst.plen = 128;
if (!rt6_is_gw_or_nonexthop(res)) {
if (f6i->fib6_dst.plen != 128 &&
ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
if (rt->rt6i_src.plen && saddr) {
rt->rt6i_src.addr = *saddr;
rt->rt6i_src.plen = 128;
}
#endif
}
return rt;
}
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
struct fib6_info *f6i = res->f6i;
unsigned short flags = fib6_info_dst_flags(f6i);
struct net_device *dev;
struct rt6_info *pcpu_rt;
if (!fib6_info_hold_safe(f6i))
return NULL;
rcu_read_lock();
dev = ip6_rt_get_dev_rcu(res);
pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
rcu_read_unlock();
if (!pcpu_rt) {
fib6_info_release(f6i);
return NULL;
}
ip6_rt_copy_init(pcpu_rt, res);
pcpu_rt->rt6i_flags |= RTF_PCPU;
return pcpu_rt;
}
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
struct rt6_info *pcpu_rt;
pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
return pcpu_rt;
}
static struct rt6_info *rt6_make_pcpu_route(struct net *net