From: Arjan van de Ven <arjan@linux.intel.com>
Subject: Re: [syzbot] [net?] WARNING: ODEBUG bug in lane_ioctl (3)

This email was generated by automation to help kernel developers deal
with a large volume of bug reports by decoding oopses into more
actionable information.


Decoded Backtrace

1. __debug_object_init -- crash site (lib/debugobjects.c:632)

The WARN fires inside debug_print_object (inlined into
__debug_object_init). The object at 0xffff888077a60f78 (a timer_list) is
in the ACTIVE state when debug_object_init is called on it a second time.

  611 static void debug_print_object(struct debug_obj *obj, char *msg)
  612 {
  613     const struct debug_obj_descr *descr = obj->descr;
  614     static int limit;
  622     if (!debug_objects_enabled)
  623         return;
  625     if (limit < 5 && descr != descr_test) {
  626         void *hint = descr->debug_hint ?
  627             descr->debug_hint(obj->object) : NULL;
  628         limit++;
-> 629         WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) "
  630                 "object: %p object type: %s hint: %pS\n",
  631                 msg, obj_states[obj->state], obj->astate,
  632                 obj->object, descr->name, hint);
  633     }
  634     debug_objects_warnings++;
  635 }

  747 static void
  748 __debug_object_init(void *addr, const struct debug_obj_descr *descr,
  749                     int onstack)
  750 {
  751     struct debug_obj *obj, o;
  752     struct debug_bucket *db;
  753     unsigned long flags;
  755     debug_objects_fill_pool();
  757     db = get_bucket((unsigned long) addr);
  759     raw_spin_lock_irqsave(&db->lock, flags);
  761     obj = lookup_object_or_alloc(addr, db, descr, onstack, false);
  762     if (unlikely(!obj)) {
  763         raw_spin_unlock_irqrestore(&db->lock, flags);
  764         debug_objects_oom();
  765         return;
  766     }
  768     switch (obj->state) {
  769     case ODEBUG_STATE_NONE:
  770     case ODEBUG_STATE_INIT:
  771     case ODEBUG_STATE_INACTIVE:
  772         obj->state = ODEBUG_STATE_INIT;
  773         raw_spin_unlock_irqrestore(&db->lock, flags);
  774         return;
  775     default:
  776         break;
  777     }
  779     o = *obj;
  780     raw_spin_unlock_irqrestore(&db->lock, flags);
-> 781     debug_print_object(&o, "init");
  783     if (o.state == ODEBUG_STATE_ACTIVE)
  784         debug_object_fixup(descr->fixup_init, addr, o.state);
  785 }


2. timer_init_key -- kernel/time/timer.c:880

  786 static inline void debug_timer_init(struct timer_list *timer)
  787 {
-> 788     debug_object_init(timer, &timer_debug_descr);
  789 }

  834 static inline void debug_init(struct timer_list *timer)
  835 {
-> 836     debug_timer_init(timer);
  837     trace_timer_init(timer);
  838 }

  876 void timer_init_key(struct timer_list *timer,
  877         void (*func)(struct timer_list *), unsigned int flags,
  878         const char *name, struct lock_class_key *key)
  879 {
-> 880     debug_init(timer);
  881     do_init_timer(timer, func, flags, name, key);
  882 }


3. lane_ioctl / lecd_attach / lec_arp_init (net/atm/lec.c:1037)

  1264 static void lec_arp_init(struct lec_priv *priv)
  1265 {
  1266     unsigned short i;
  1268     for (i = 0; i < LEC_ARP_TABLE_SIZE; i++)
  1269         INIT_HLIST_HEAD(&priv->lec_arp_tables[i]);
  1270     INIT_HLIST_HEAD(&priv->lec_arp_empty_ones);
  1271     INIT_HLIST_HEAD(&priv->lec_no_forward);
  1272     INIT_HLIST_HEAD(&priv->mcast_fwds);
  1273     spin_lock_init(&priv->lec_arp_lock);
->1274     INIT_DELAYED_WORK(&priv->lec_arp_work, lec_arp_check_expire);
  1275     schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL);
  1276 }

  748 static int lecd_attach(struct atm_vcc *vcc, int arg)
  749 {
  750     int i;
  751     struct lec_priv *priv;
  753     lockdep_assert_held(&lec_mutex);
  754     if (arg < 0)
  755         arg = 0;
  756     if (arg >= MAX_LEC_ITF)
  757         return -EINVAL;
  758     i = array_index_nospec(arg, MAX_LEC_ITF);
  759     if (!dev_lec[i]) {
  763         dev_lec[i] = alloc_etherdev(size);
  775         priv = netdev_priv(dev_lec[i]);
  776     } else {
  777         priv = netdev_priv(dev_lec[i]);
  778         if (rcu_access_pointer(priv->lecd))
  779             return -EADDRINUSE;
  780     }
->781     lec_arp_init(priv);   // called unconditionally for both new and
                                // existing priv -- no work cancellation

  1018 static int lane_ioctl(struct socket *sock, unsigned int cmd,
  1019                       unsigned long arg)
  1020 {
  1034     mutex_lock(&lec_mutex);
  1035     switch (cmd) {
  1036     case ATMLEC_CTRL:
->1037         err = lecd_attach(vcc, (int)arg);
  1038         if (err >= 0)
  1039             sock->state = SS_CONNECTED;
  1040         break;
  1049     mutex_unlock(&lec_mutex);
  1050     return err;
  1051 }


4. do_vcc_ioctl (net/atm/ioctl.c:159)

  153     error = -ENOIOCTLCMD;
  155     mutex_lock(&ioctl_mutex);
  156     list_for_each(pos, &ioctl_list) {
  157         struct atm_ioctl *ic = list_entry(pos, struct atm_ioctl, list);
  158         if (try_module_get(ic->owner)) {
->159             error = ic->ioctl(sock, cmd, arg);  // dispatches to lane_ioctl
  160             module_put(ic->owner);
  161             if (error != -ENOIOCTLCMD)
  162                 break;
  163         }
  164     }
  165     mutex_unlock(&ioctl_mutex);


Tentative Analysis

The ODEBUG WARNING fires when INIT_DELAYED_WORK() is called on a
delayed_work (lec_priv.lec_arp_work) whose embedded timer_list is
already in the ACTIVE state.

lec_arp_init() always calls INIT_DELAYED_WORK(&priv->lec_arp_work, ...)
followed by schedule_delayed_work(). lecd_attach() calls lec_arp_init()
unconditionally -- both for a brand-new device and for an existing one
in the else-branch. The only guard for the existing-device path is that
priv->lecd is NULL (no daemon currently attached).

The race is opened by lec_atm_close(), the ATM VCC close handler:

    Thread A (lec_atm_close):        Thread B (lecd_attach via lane_ioctl):
      rcu_assign_pointer(lecd, NULL)
      synchronize_rcu()                mutex_lock(&lec_mutex)
      [window open]                    sees priv->lecd == NULL -- passes guard
                                       lec_arp_init(priv)
                                         INIT_DELAYED_WORK on active timer
                                         --> ODEBUG WARN
      lec_arp_destroy(priv)            [too late: work already re-initialized]

lec_atm_close() clears priv->lecd to NULL *before* calling
lec_arp_destroy() (which contains cancel_delayed_work_sync). Because
lec_atm_close() does not hold lec_mutex, Thread B can observe
priv->lecd == NULL while lec_arp_work is still active, pass the guard
in lecd_attach(), and call lec_arp_init() on a live timer.

The lec_mutex protecting dev_lec[] was introduced by commit d13a3824bfd2
("net: atm: add lec_mutex"), which serialised lecd_attach() and friends
but did not update lec_atm_close() to also acquire the mutex. The
unconditional lec_arp_init() call for existing devices predates that
commit and has always been present.


Potential Solution

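Add cancel_delayed_work_sync(&priv->lec_arp_work) in the else-branch of
lecd_attach(), immediately before the call to lec_arp_init(). This
ensures any in-flight work is drained before the timer is re-initialized,
regardless of whether lec_atm_close() has already cancelled it.
cancel_delayed_work_sync() is safe to call from a lec_mutex-held context
because lec_arp_check_expire() only acquires priv->lec_arp_lock (a
spinlock) and never tries to take lec_mutex.

Note that this addresses only the double initialization itself. In the
interleaving shown above, lec_arp_destroy() in Thread A can still run
after Thread B has re-initialized priv, so lec_atm_close() may
additionally need to be serialized against lecd_attach() (for example
by taking lec_mutex); this has not been verified.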

    } else {
        priv = netdev_priv(dev_lec[i]);
        if (rcu_access_pointer(priv->lecd))
            return -EADDRINUSE;
+       cancel_delayed_work_sync(&priv->lec_arp_work);
    }
    lec_arp_init(priv);


More information

Oops-Analysis: http://oops.fenrus.org/reports/lkml/69f16c26.170a0220.34e5b8.0013.GAE@google.com/
Assisted-by: GitHub-Copilot linux-kernel-oops.