Subject: Re: [syzbot] [bluetooth?] WARNING in hci_send_cmd (4) This email is created by automation to help kernel developers deal with a large volume of AI generated bug reports by decoding oopses into more actionable information. Decoded Backtrace 1. __queue_work -- crash site (kernel/workqueue.c:2297) 2275 static void __queue_work(int cpu, struct workqueue_struct *wq, 2276 struct work_struct *work) 2277 { 2278 struct pool_workqueue *pwq; 2279 struct worker_pool *last_pool, *pool; 2280 unsigned int work_flags; 2281 unsigned int req_cpu = cpu; 2289 lockdep_assert_irqs_disabled(); 2291 /* 2292 * For a draining wq, only works from the same workqueue are 2293 * allowed. The __WQ_DESTROYING helps to spot the issue that 2294 * queues a new work item to a wq after destroy_workqueue(wq). 2295 */ 2296 if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && -> 2297 WARN_ONCE(!is_chained_work(wq), 2298 "workqueue: cannot queue %ps on wq %s\n", 2299 work->func, wq->name))) { 2300 return; 2301 } 2. queue_work_on (kernel/workqueue.c:2432) 2422 bool queue_work_on(int cpu, struct workqueue_struct *wq, 2423 struct work_struct *work) 2424 { 2425 bool ret = false; 2426 unsigned long irq_flags; 2428 local_irq_save(irq_flags); 2430 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) && 2431 !clear_pending_if_disabled(work)) { -> 2432 __queue_work(cpu, wq, work); 2433 ret = true; 2434 } 2436 local_irq_restore(irq_flags); 2437 return ret; 2438 } 3. hci_send_cmd (net/bluetooth/hci_core.c:3111) 3092 int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, 3093 const void *param) 3094 { 3095 struct sk_buff *skb; 3097 BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); 3099 skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL); 3100 if (!skb) { 3101 bt_dev_err(hdev, "no memory for command"); 3102 return -ENOMEM; 3103 } 3105 /* Stand-alone HCI commands must be flagged as 3106 * single-command requests. 
3107 */ 3108 bt_cb(skb)->hci.req_flags |= HCI_REQ_START; 3110 skb_queue_tail(&hdev->cmd_q, skb); -> 3111 queue_work(hdev->workqueue, &hdev->cmd_work); // hdev->workqueue is __WQ_DRAINING 3113 return 0; 3114 } Inlined caller: hci_conn_auth (net/bluetooth/hci_conn.c:2459) 2438 static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, 2439 __u8 auth_type) 2440 { 2454 if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { 2455 struct hci_cp_auth_requested cp; 2456 cp.handle = cpu_to_le16(conn->handle); -> 2459 hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, 2460 sizeof(cp), &cp); 2461 } 2464 return 0; 2465 } 4. hci_conn_security (net/bluetooth/hci_conn.c:2551) 2487 int hci_conn_security(struct hci_conn *conn, __u8 sec_level, 2488 __u8 auth_type, bool initiator) 2489 { 2544 auth: 2548 if (initiator) 2549 set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags); -> 2551 if (!hci_conn_auth(conn, sec_level, auth_type)) 2552 return 0; 2563 } 5. l2cap_info_timeout (net/bluetooth/l2cap_core.c:1685) -- running on the 'events' workqueue, NOT on hdev->workqueue 1675 static void l2cap_info_timeout(struct work_struct *work) 1676 { 1677 struct l2cap_conn *conn = container_of(work, struct l2cap_conn, 1678 info_timer.work); 1680 conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; 1681 conn->info_ident = 0; 1683 1684 mutex_lock(&conn->lock); -> 1685 l2cap_conn_start(conn); 1686 mutex_unlock(&conn->lock); 1687 } Tentative Analysis The WARNING fires in __queue_work() at kernel/workqueue.c:2297 when hci_send_cmd() attempts to queue hci_cmd_work onto hdev->workqueue after that workqueue has entered the draining or destroying state. The call chain is: l2cap_info_timeout (events wq) -> l2cap_conn_start -> hci_conn_security -> hci_conn_auth [inlined] -> hci_send_cmd -> queue_work(hdev->workqueue, &hdev->cmd_work) // <- WARNING fires here. The race arises in hci_dev_close_sync() (net/bluetooth/hci_sync.c): 1. HCI_UP is cleared with test_and_clear_bit(HCI_UP, ...). 2. 
drain_workqueue(hdev->workqueue) is called -- sets __WQ_DRAINING. 3. hci_conn_hash_flush() -> l2cap_conn_del() -> disable_delayed_work_sync(&conn->info_timer) -- cancels the timer. The l2cap_info_timeout callback runs on the 'events' workqueue, which is entirely separate from hdev->workqueue. Draining hdev->workqueue has no effect on work items pending on the 'events' workqueue. If the l2cap_info_timeout work item was already queued on the 'events' workqueue before step 3 cancels it, the timeout can fire in the window between steps 2 and 3, reaching hci_send_cmd() with hdev->workqueue already in the __WQ_DRAINING state. hci_send_cmd() does not check HCI_UP before calling queue_work(), unlike hci_recv_frame() and the hci_dev_ioctl() handler, which both guard with test_bit(HCI_UP, &hdev->flags). Potential Solution Add the same HCI_UP guard to hci_send_cmd() that already exists in hci_recv_frame() and hci_dev_ioctl(): if (!test_bit(HCI_UP, &hdev->flags)) return -ENETDOWN; Place this check at the top of hci_send_cmd(), before the skb allocation. Since HCI_UP is cleared before drain_workqueue() in hci_dev_close_sync(), this guard catches the race without introducing any resource leak, lock imbalance, or side effects on callers. More information Oops-Analysis: http://oops.fenrus.org/reports/lkml/69ed492c.050a0220.e51af.0005.GAE_google.com/ Assisted-by: GitHub Copilot linux-kernel-oops-x86.