From: Arjan van de Ven Subject: [PATCH] RDMA/rxe: fix double-release race on per-namespace UDP socket This patch fixes a BUG reported at https://lore.kernel.org/all/69e87e0e.a00a0220.9259.001c.GAE@google.com/ When an RXE (Soft-RoCE) device is deleted while its network namespace is concurrently being torn down, two independent code paths can both attempt to release the same per-namespace UDP tunnel socket. rxe_ns_exit() runs as part of pernet cleanup and reads the socket pointer under an RCU read lock. rxe_net_del() runs via the RDMA netlink delete path and reads the same pointer through rxe_ns_pernet_sk4()/rxe_ns_pernet_sk6(). Neither path holds any lock that would prevent the other from reading a non-NULL pointer at the same time, so both can proceed to call udp_tunnel_sock_release() on the same socket. The second udp_tunnel_sock_release() -> sock_release() -> iput() call crashes with VFS_BUG_ON_INODE() in fs/inode.c because the sockfs inode is already in state I_FREEING|I_CLEAR (0x300) with i_count == 0 by the time the second release reaches it. The race window is widened by rxe_sock_put() clearing the namespace pointer only after releasing the socket, meaning a concurrent reader can still observe the non-NULL pointer during the release itself. Fix this by adding a mutex to struct rxe_ns_sock that serializes the "claim" operation: atomically read the socket pointer and replace it with NULL under the lock. Only the path that gets a non-NULL pointer back proceeds to release the socket; the other path gets NULL and does nothing. The actual udp_tunnel_sock_release() call runs outside the lock (it may sleep), but the critical read-and-null step is atomic. This guarantees exactly one release per socket regardless of which path wins the race. 
Link: https://lore.kernel.org/all/69e87e0e.a00a0220.9259.001c.GAE@google.com/ Oops-Analysis: http://oops.fenrus.org/reports/email/69e87e0e.a00a0220.9259.001c.GAE_google.com/report.html Fixes: f1327abd6abe ("RDMA/rxe: Support RDMA link creation and destruction per net namespace") Assisted-by: GitHub Copilot:claude-sonnet-4.6 linux-kernel-oops-x86. Signed-off-by: Arjan van de Ven Cc: linux-rdma@vger.kernel.org Cc: Zhu Yanjun Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: linux-kernel@vger.kernel.org --- drivers/infiniband/sw/rxe/rxe_net.c | 4 +-- drivers/infiniband/sw/rxe/rxe_ns.c | 53 ++++++++++++++++++++++++++++--------- drivers/infiniband/sw/rxe/rxe_ns.h | 7 +++++ 3 files changed, 49 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 50a2cb5405e22..a49aff7f443d1 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -655,11 +655,11 @@ void rxe_net_del(struct ib_device *dev) net = dev_net(ndev); - sk = rxe_ns_pernet_sk4(net); + sk = rxe_ns_pernet_take_sk4(net); if (sk) rxe_sock_put(sk, rxe_ns_pernet_set_sk4, net); - sk = rxe_ns_pernet_sk6(net); + sk = rxe_ns_pernet_take_sk6(net); if (sk) rxe_sock_put(sk, rxe_ns_pernet_set_sk6, net); diff --git a/drivers/infiniband/sw/rxe/rxe_ns.c b/drivers/infiniband/sw/rxe/rxe_ns.c index 8b9d734229b24..68d5266dc1326 100644 --- a/drivers/infiniband/sw/rxe/rxe_ns.c +++ b/drivers/infiniband/sw/rxe/rxe_ns.c @@ -16,6 +16,7 @@ struct rxe_ns_sock { struct sock __rcu *rxe_sk4; struct sock __rcu *rxe_sk6; + struct mutex lock; /* serializes socket claim/release */ }; /* @@ -28,13 +29,33 @@ static unsigned int rxe_pernet_id; */ static int rxe_ns_init(struct net *net) { + struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id); + /* defer socket create in the namespace to the first * device create. 
*/ - + mutex_init(&ns_sk->lock); return 0; } +/* + * Atomically claim the socket pointer from the namespace: read it and set it + * to NULL under the lock, then return the pointer (or NULL if already gone). + * The caller is responsible for releasing the socket after the lock is dropped. + */ +static struct sock *rxe_ns_claim_sk(struct rxe_ns_sock *ns_sk, + struct sock __rcu **sk_rcu) +{ + struct sock *sk; + + mutex_lock(&ns_sk->lock); + sk = rcu_dereference_protected(*sk_rcu, lockdep_is_held(&ns_sk->lock)); + if (sk) + rcu_assign_pointer(*sk_rcu, NULL); + mutex_unlock(&ns_sk->lock); + return sk; +} + static void rxe_ns_exit(struct net *net) { /* called when the network namespace is removed @@ -42,22 +63,14 @@ static void rxe_ns_exit(struct net *net) struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id); struct sock *sk; - rcu_read_lock(); - sk = rcu_dereference(ns_sk->rxe_sk4); - rcu_read_unlock(); - if (sk) { - rcu_assign_pointer(ns_sk->rxe_sk4, NULL); + sk = rxe_ns_claim_sk(ns_sk, &ns_sk->rxe_sk4); + if (sk) udp_tunnel_sock_release(sk->sk_socket); - } #if IS_ENABLED(CONFIG_IPV6) - rcu_read_lock(); - sk = rcu_dereference(ns_sk->rxe_sk6); - rcu_read_unlock(); - if (sk) { - rcu_assign_pointer(ns_sk->rxe_sk6, NULL); + sk = rxe_ns_claim_sk(ns_sk, &ns_sk->rxe_sk6); + if (sk) udp_tunnel_sock_release(sk->sk_socket); - } #endif } @@ -91,6 +104,13 @@ void rxe_ns_pernet_set_sk4(struct net *net, struct sock *sk) synchronize_rcu(); } +struct sock *rxe_ns_pernet_take_sk4(struct net *net) +{ + struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id); + + return rxe_ns_claim_sk(ns_sk, &ns_sk->rxe_sk4); +} + #if IS_ENABLED(CONFIG_IPV6) struct sock *rxe_ns_pernet_sk6(struct net *net) { @@ -111,6 +131,13 @@ void rxe_ns_pernet_set_sk6(struct net *net, struct sock *sk) rcu_assign_pointer(ns_sk->rxe_sk6, sk); synchronize_rcu(); } + +struct sock *rxe_ns_pernet_take_sk6(struct net *net) +{ + struct rxe_ns_sock *ns_sk = net_generic(net, rxe_pernet_id); + + return 
rxe_ns_claim_sk(ns_sk, &ns_sk->rxe_sk6); +} #endif /* IPV6 */ int rxe_namespace_init(void) diff --git a/drivers/infiniband/sw/rxe/rxe_ns.h b/drivers/infiniband/sw/rxe/rxe_ns.h index 4da2709e6b714..fa5b6bc2dc78e 100644 --- a/drivers/infiniband/sw/rxe/rxe_ns.h +++ b/drivers/infiniband/sw/rxe/rxe_ns.h @@ -5,10 +5,12 @@ struct sock *rxe_ns_pernet_sk4(struct net *net); void rxe_ns_pernet_set_sk4(struct net *net, struct sock *sk); +struct sock *rxe_ns_pernet_take_sk4(struct net *net); #if IS_ENABLED(CONFIG_IPV6) void rxe_ns_pernet_set_sk6(struct net *net, struct sock *sk); struct sock *rxe_ns_pernet_sk6(struct net *net); +struct sock *rxe_ns_pernet_take_sk6(struct net *net); #else /* IPv6 */ static inline struct sock *rxe_ns_pernet_sk6(struct net *net) { @@ -18,6 +20,11 @@ static inline struct sock *rxe_ns_pernet_sk6(struct net *net) static inline void rxe_ns_pernet_set_sk6(struct net *net, struct sock *sk) { } + +static inline struct sock *rxe_ns_pernet_take_sk6(struct net *net) +{ + return NULL; +} #endif /* IPv6 */ int rxe_namespace_init(void);