source: trunk/src/os2ahci/ctxhook.c

Last change on this file was 211, checked in by David Azarewicz, 2 years ago

Added workaround to help with VirtualBox issues.
Improved diagnostic messages.
Changed how timeouts are reset and how ctx hooks are triggered.
Added quirk for devices with issues executing some standard commands.
Changed to make /N the default.

File size: 19.5 KB
RevLine 
/**
 * ctxhook.c - context hooks (kernel thread functions) for os2ahci
 *
 * Copyright (c) 2011 thi.guten Software Development
 * Copyright (c) 2011 Mensys B.V.
 * Copyright (c) 2013-2023 David Azarewicz <david@88watts.net>
 *
 * Authors: Christian Mueller, Markus Thielen
 *
 * Parts copied from/inspired by the Linux AHCI driver;
 * those parts are (c) Linux AHCI/ATA maintainers
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
27
28#include "os2ahci.h"
29#include "ata.h"
30#include "atapi.h"
31
32/* port restart context hook and input data */
[211]33ULONG RestartCtxHook_h;
[63]34volatile u32 ports_to_restart[MAX_AD];
35
36/* port reset context hook and input data */
[211]37ULONG ResetCtxHook_h;
38ULONG th_watchdog;
[63]39volatile u32 ports_to_reset[MAX_AD];
40IORB_QUEUE abort_queue;
41
42/* trigger engine context hook and input data */
43ULONG engine_ctxhook_h;
44
[211]45#define QUEUEDEPTH 8
46static struct _ctxq_
47{
48 ULONG ulHandle;
49 ULONG ulArg;
50} CtxQueue[QUEUEDEPTH] = {0};
51static ULONG ulCtxStatusFlag = 0;
52
53void SafeArmCtxHook(ULONG ulHandle, ULONG armData)
54{
55 USHORT i;
56
57 i = LockInc(&ulCtxStatusFlag);
58
59 if (i)
60 {
61 i--;
62 if (i < QUEUEDEPTH)
63 {
64 CtxQueue[i].ulHandle = ulHandle;
65 CtxQueue[i].ulArg = armData;
66 }
67 }
68 else
69 {
70 KernArmHook(ulHandle, armData, 0);
71 }
72}
73
74void ClearThreadStatus(ULONG ulHandle)
75{
76 USHORT i;
77
78 i = LockDec(&ulCtxStatusFlag);
79
80 if (i)
81 {
82 i--;
83 if (i < QUEUEDEPTH)
84 {
85 KernArmHook(CtxQueue[i].ulHandle, CtxQueue[i].ulArg, 0);
86 }
87 else
88 {
89 KernArmHook(ulHandle, 0, 0);
90 }
91 }
92}
93
[63]94/******************************************************************************
95 * Port restart context hook. This context hook is executed at task time and
[87]96 * will handle ports which are stopped due to a device error condition.
[63]97 *
98 * The following conditions may exist:
99 *
100 * - Only a single non-NCQ command is executed by the AHCI adapter at any
101 * given time (even if more are outstanding). This is the case for single
102 * devices or port multipliers without FIS-based command switching. Error
[75]103 * recovery is simple because we know which command has failed and that
104 * all other commands have not yet started executing. Thus, we can requeue
105 * all of them, replacing the failing command with a "request sense"
106 * command to get error details.
[63]107 *
108 * - Multiple non-NCQ commands are executed on different devices behind a
109 * port multiplier which supports FIS-based command switching. This is
110 * more difficult to recover from but currently not an issue because we
111 * don't yet support FIS-based command switching (the FIS receive areas
112 * would become too large for the current data model).
113 *
114 * - One or more NCQ commands were active at the time of the error, with or
115 * without FIS-based command switching. We would have to interrogate the
116 * corresponding devices to find out which command has failed but if this
117 * is combined with FIS-based command switching, even the AHCI spec
118 * recommends to reset the port. This leads to a much simpler approach:
119 * requeue all NCQ commands (they are idempotent per definition, otherwise
120 * they couldn't be reordered by the device) with the 'no_ncq' flag set
121 * in the IORB and reset the port. Then those comands will be executed as
[82]122 * regular commands. The error, if it reoccurs, can then be handled by
[63]123 * one of the above cases.
124 *
125 * The upstream code will guarantee that we will never have a mix of NCQ and
126 * non-NCQ commands active at the same time in order to reduce complexity
127 * in the interrupt and error handlers.
128 */
[211]129void _Syscall RestartCtxHook(ULONG parm)
[63]130{
131 IORB_QUEUE done_queue;
132 AD_INFO *ai;
[178]133 IORBH FAR16DATA *vProblemIorb;
134 IORBH FAR16DATA *vIorb;
[184]135 IORBH FAR16DATA *vNext;
[211]136 IORBH *pIorb;
137 ADD_WORKSPACE *aws;
[178]138 u8 *port_mmio;
[184]139 int rearm_ctx_hook;
[63]140 int need_reset;
141 int ccs;
142 int a;
143 int p;
144
[181]145 D32ThunkStackTo32();
146
[191]147 vNext = FAR16NULL;
[184]148 rearm_ctx_hook = 0;
149
[211]150 AhciStats.ulSoftErrorCount++;
151 DPRINTF(0, DBG_PREFIX": BEG\n");
[63]152 memset(&done_queue, 0x00, sizeof(done_queue));
153
154 spin_lock(drv_lock);
155
[211]156 if (th_watchdog != 0)
157 {
158 /* watchdog timer still active -- just reset it */
159 Timer_CancelTimer(th_watchdog);
160 th_watchdog = 0;
161 }
162
[178]163 for (a = 0; a < ad_info_cnt; a++)
164 {
[63]165 ai = ad_infos + a;
166
[178]167 if (ai->busy)
168 {
[63]169 /* this adapter is busy; leave it alone for now */
170 rearm_ctx_hook = 1;
171 continue;
172 }
173
[178]174 for (p = 0; p <= ai->port_max; p++)
175 {
176 if (ports_to_restart[a] & (1UL << p))
177 {
[63]178 ports_to_restart[a] &= ~(1UL << p);
179
180 /* restart this port */
181 port_mmio = port_base(ai, p);
[191]182 vProblemIorb = FAR16NULL;
[63]183 need_reset = 0;
184
[211]185 DPRINTF(DBG_DETAILED, DBG_PREFIX": port=%d TF_DATA=0x%x\n", p, readl(port_mmio + PORT_TFDATA));
[66]186
[63]187 /* get "current command slot"; only valid if there are no NCQ cmds */
[125]188 ccs = (int) ((readl(port_mmio + PORT_CMD) >> 8) & 0x1f);
[211]189 DPRINTF(DBG_DETAILED, DBG_PREFIX": PORT_CMD=0x%x\n", ccs);
[63]190
[191]191 for (vIorb = ai->ports[p].iorb_queue.vRoot; vIorb != FAR16NULL; vIorb = vNext)
[178]192 {
[211]193 pIorb = Far16ToFlat(vIorb);
194 aws = add_workspace(pIorb);
[204]195 vNext = pIorb->f16NxtIORB;
[63]196
[178]197 if (aws->queued_hw)
198 {
[211]199 if (aws->timer != 0)
200 {
201 Timer_CancelTimer(aws->timer);
202 aws->timer = 0;
203 }
204
[178]205 if (ai->ports[p].ncq_cmds & (1UL << aws->cmd_slot))
206 {
[75]207 /* NCQ command; force non-NCQ mode and trigger port reset */
[63]208 ai->ports[p].ncq_cmds &= ~(1UL << aws->cmd_slot);
209 aws->no_ncq = 1;
210 need_reset = 1;
[211]211 DPRINTF(0, DBG_PREFIX": failing IORB: %x NCQ slot=%x\n", vIorb, aws->cmd_slot);
212 #ifdef DEBUG
213 DumpIorb(pIorb, vIorb);
214 #endif
[178]215 }
216 else
217 {
[75]218 /* regular command; clear cmd bit and identify problem IORB */
[63]219 ai->ports[p].reg_cmds &= ~(1UL << aws->cmd_slot);
[178]220 if (aws->cmd_slot == ccs)
221 {
[121]222 /* this is the non-NCQ command that failed */
[211]223 DPRINTF(0, DBG_PREFIX": failing IORB: %x slot=%x\n", vIorb, aws->cmd_slot);
224 #ifdef DEBUG
225 DumpIorb(pIorb, vIorb);
226 #endif
[178]227 vProblemIorb = vIorb;
[75]228 }
[63]229 }
[75]230 /* we can requeue all IORBs unconditionally (see function comment) */
[178]231 if (aws->retries++ < MAX_RETRIES)
232 {
233 iorb_requeue(pIorb);
234 }
235 else
236 {
[121]237 /* retry count exceeded; consider IORB aborted */
[178]238 iorb_seterr(pIorb, IOERR_CMD_ABORTED);
239 iorb_queue_del(&ai->ports[p].iorb_queue, vIorb);
240 iorb_queue_add(&done_queue, vIorb, pIorb);
241 if (vIorb == vProblemIorb)
242 {
[121]243 /* no further analysis -- we're done with this one */
[191]244 vProblemIorb = FAR16NULL;
[121]245 }
246 }
[63]247 }
248 }
249
250 /* sanity check: issued command bitmaps should be 0 now */
[178]251 if (ai->ports[p].ncq_cmds != 0 || ai->ports[p].reg_cmds != 0)
252 {
[211]253 DPRINTF(0, DBG_PREFIX": warning: commands issued not 0 (%08x/%08x); resetting...\n",
[63]254 ai->ports[p].ncq_cmds, ai->ports[p].reg_cmds);
[74]255 need_reset = 1;
[63]256 }
257
[178]258 if (!need_reset)
259 {
260 if ((readl(port_mmio + PORT_TFDATA) & 0x88) != 0)
261 {
[63]262 /* device is not in an idle state */
263 need_reset = 1;
264 }
265 }
266
267 /* restart/reset port */
268 ai->busy = 1;
269 spin_unlock(drv_lock);
[178]270 if (need_reset)
271 {
[63]272 ahci_reset_port(ai, p, 1);
[178]273 }
274 else
275 {
[63]276 ahci_stop_port(ai, p);
277 ahci_start_port(ai, p, 1);
278 }
279 spin_lock(drv_lock);
280 ai->busy = 0;
281
282 /* reset internal port status */
283 ai->ports[p].ncq_cmds = 0;
284 ai->ports[p].reg_cmds = 0;
285 ai->ports[p].cmd_slot = 0;
286
[191]287 if (vProblemIorb != FAR16NULL)
[178]288 {
289 IORBH *pProblemIorb = Far16ToFlat(vProblemIorb);
[63]290 /* get details about the error that caused this IORB to fail */
[178]291 if (need_reset)
292 {
[63]293 /* no way to retrieve error details after a reset */
[178]294 iorb_seterr(pProblemIorb, IOERR_DEVICE_NONSPECIFIC);
295 iorb_queue_del(&ai->ports[p].iorb_queue, vProblemIorb);
296 iorb_queue_add(&done_queue, vProblemIorb, pProblemIorb);
[63]297
[178]298 }
299 else
300 {
[63]301 /* get sense information */
[178]302 ADD_WORKSPACE *aws = add_workspace(pProblemIorb);
303 int d = iorb_unit_device(pProblemIorb);
304 int (*req_sense)(IORBH FAR16DATA *, IORBH *, int) = (ai->ports[p].devs[d].atapi) ?
[75]305 atapi_req_sense : ata_req_sense;
[63]306
307 aws->processing = 1;
308 aws->queued_hw = 1;
309
[178]310 if (req_sense(vProblemIorb, pProblemIorb, 0) == 0)
311 {
[63]312 /* execute request sense on slot #0 before anything else comes along */
[178]313 Timer_StartTimerMS(&aws->timer, 5000, timeout_callback, CastFar16ToULONG(vProblemIorb));
[69]314 aws->cmd_slot = 0;
[63]315 ai->ports[p].reg_cmds = 1;
316 writel(port_mmio + PORT_CMD_ISSUE, 1);
317 readl(port_mmio); /* flush */
318
[178]319 }
320 else
321 {
[63]322 /* IORB is expected to contain the error code; just move to done queue */
[178]323 iorb_queue_del(&ai->ports[p].iorb_queue, vProblemIorb);
324 iorb_queue_add(&done_queue, vProblemIorb, pProblemIorb);
[63]325 }
326 }
327 }
328 }
329 }
330 }
331
332 spin_unlock(drv_lock);
333
[211]334 DPRINTF(0, DBG_PREFIX": Resuming\n");
[63]335 /* call notification routine on all IORBs which have completed */
[191]336 for (vIorb = done_queue.vRoot; vIorb != FAR16NULL; vIorb = vNext)
[178]337 {
[211]338 pIorb = Far16ToFlat(vIorb);
[204]339 vNext = pIorb->f16NxtIORB;
[75]340
341 spin_lock(drv_lock);
[178]342 aws_free(add_workspace(pIorb));
[75]343 spin_unlock(drv_lock);
344
[178]345 iorb_complete(vIorb, pIorb);
[63]346 }
347
348 /* restart engine to resume IORB processing */
349 spin_lock(drv_lock);
350 trigger_engine();
351 spin_unlock(drv_lock);
352
[211]353 DPRINTF(0, DBG_PREFIX": END Rearm=%x\n", rearm_ctx_hook);
[63]354
355 /* Check whether we have to rearm ourselves because some adapters were busy
356 * when we wanted to restart ports on them.
357 */
[178]358 if (rearm_ctx_hook)
359 {
[211]360 /* we cannot rearm ourself because we will execute immediately leaving
361 * no time to process and clear the reason we need to rearm. Therefore
362 * we set the timer again.
363 */
364 Timer_StartTimerMS(&th_watchdog, 250, WatchdogTimer, RestartCtxHook_h);
[63]365 }
[211]366
367 ClearThreadStatus(RestartCtxHook_h);
[181]368 KernThunkStackTo16();
[63]369}
370
371/******************************************************************************
372 * Reset and abort context hook. This function runs at task time and takes
373 * care of port resets and their side effects. Input to this function are:
374 *
375 * ports_to_reset[] - array of port bitmaps, each bit indicating which port
376 * should be reset unconditionally. This is primarily
377 * used by the error interrupt handler.
378 *
379 * abort_queue - queue with IORBs to be arborted (timed-out, ...) If
380 * any of these commands have reached the hardware, the
381 * corresponding port is reset to interrupt command
382 * execution. This is primarily used for timeout
383 * handling and when IORBs are requested to be aborted.
[87]384 *
[63]385 * After resetting the requested ports, all remaining active IORBs on those
386 * ports have to be retried or aborted. Whether a retry is attempted depends
387 * on the kind of IORB -- those which are idempotent are retried, all others
388 * are aborted. This is different from the port restart hook because the
389 * restart hook can assume it is called with the port in error state, thus
390 * the controller will have stopped executing commands. The reset handler can
391 * be called at any time and we can't tell what's going on in the controller.
392 *
393 * The IORBs in the global abort_queue are expected to have their error code
394 * set (aborted, timeout, ...) but must not be marked as 'done'; otherwise,
395 * the upstream code might reuse the IORBs before we're done with them.
396 */
[211]397void _Syscall ResetCtxHook(ULONG ulArg)
[63]398{
399 IORB_QUEUE done_queue;
400 AD_INFO *ai;
[178]401 IORBH FAR16DATA *vIorb;
[184]402 IORBH FAR16DATA *vNext;
[211]403 IORBH *pIorb;
404 ADD_WORKSPACE *aws;
[184]405 int rearm_ctx_hook;
[63]406 int a;
407 int p;
408
[181]409 D32ThunkStackTo32();
410
[191]411 vNext = FAR16NULL;
[184]412 rearm_ctx_hook = 0;
413
[211]414 AhciStats.ulHardErrorCount++;
415 DPRINTF(0, DBG_PREFIX": BEG Arg=%x\n", ulArg);
[63]416 memset(&done_queue, 0x00, sizeof(done_queue));
417
[211]418 if (th_watchdog != 0)
419 {
420 /* watchdog timer still active -- just reset it */
421 Timer_CancelTimer(th_watchdog);
422 th_watchdog = 0;
423 }
424
[63]425 spin_lock(drv_lock);
426
[211]427 if (ulArg)
[178]428 {
[211]429 /* Move the timed-out IORB to the abort queue. Since it's possible that the
430 * IORB has completed after the timeout has expired but before we got to
431 * this line of code, we'll check the return code of iorb_queue_del(): If it
432 * returns an error, the IORB must have completed a few microseconds ago and
433 * there is no timeout.
434 */
435 vIorb = (IORBH FAR16DATA *)CastULONGToFar16(ulArg);
436 pIorb = Far16ToFlat(vIorb);
437 a = iorb_unit_adapter(pIorb);
438 p = iorb_unit_port(pIorb);
439 if (iorb_queue_del(&ad_infos[a].ports[p].iorb_queue, vIorb) == 0)
440 {
441 pIorb = Far16ToFlat(vIorb);
442 iorb_queue_add(&abort_queue, vIorb, pIorb);
443 pIorb->ErrorCode = IOERR_ADAPTER_TIMEOUT;
444 }
[80]445 }
446
[63]447 /* add ports of active IORBs from the abort queue to ports_to_reset[] */
[191]448 for (vIorb = abort_queue.vRoot; vIorb != FAR16NULL; vIorb = vNext)
[178]449 {
[211]450 pIorb = Far16ToFlat(vIorb);
[204]451 vNext = pIorb->f16NxtIORB;
[178]452 a = iorb_unit_adapter(pIorb);
453 p = iorb_unit_port(pIorb);
[63]454 ai = ad_infos + a;
[211]455 aws = add_workspace(pIorb);
[63]456
[178]457 if (ai->busy)
458 {
[63]459 /* this adapter is busy; leave it alone for now */
460 rearm_ctx_hook = 1;
461 continue;
462 }
463
464 /* move IORB to the local 'done' queue */
[178]465 iorb_queue_del(&abort_queue, vIorb);
466 iorb_queue_add(&done_queue, vIorb, pIorb);
[63]467
468 /* reset port if the IORB has already been queued to hardware */
[211]469 if (aws->queued_hw)
[178]470 {
[211]471 if (aws->timer != 0)
472 {
473 Timer_CancelTimer(aws->timer);
474 aws->timer = 0;
475 }
476
[63]477 /* prepare port reset */
478 ports_to_reset[a] |= (1UL << p);
479 }
480 }
481
482 /* reset all ports in 'ports_to_reset[]' */
[178]483 for (a = 0; a < ad_info_cnt; a++)
484 {
[63]485 ai = ad_infos + a;
486
[178]487 if (ai->busy)
488 {
[63]489 /* this adapter is busy; leave it alone for now */
490 rearm_ctx_hook = 1;
491 continue;
492 }
493
[178]494 for (p = 0; p <= ai->port_max; p++)
495 {
496 if (ports_to_reset[a] & (1UL << p))
497 {
[63]498 ports_to_reset[a] &= ~(1UL << p);
[202]499 ai->ports[p].ulResetCount++;
[63]500
501 /* Reset this port. Since this is a rather slow operation, we'll
502 * release the spinlock while doing so. The adapter is marked as
503 * 'busy' to prevent similar routines (e.g. an ahci port scan) from
[87]504 * interfering.
[63]505 */
506 ai->busy = 1;
507 spin_unlock(drv_lock);
508 ahci_reset_port(ai, p, 1);
509 spin_lock(drv_lock);
510 ai->busy = 0;
511
512 /* reset port status */
513 ai->ports[p].ncq_cmds = 0;
514 ai->ports[p].reg_cmds = 0;
515 ai->ports[p].cmd_slot = 0;
516
517 /* retry or abort all remaining active commands on this port */
[191]518 for (vIorb = ai->ports[p].iorb_queue.vRoot; vIorb != FAR16NULL; vIorb = vNext)
[178]519 {
520 IORBH *pIorb = Far16ToFlat(vIorb);
[204]521 vNext = pIorb->f16NxtIORB;
[121]522
[211]523 aws = add_workspace(pIorb);
524
[178]525 if (aws->queued_hw)
526 {
[63]527 /* this IORB had already been queued to HW when we reset the port */
[178]528 if (aws->idempotent && aws->retries++ < MAX_RETRIES)
529 {
[121]530 /* we can retry this IORB */
[178]531 iorb_requeue(pIorb);
532 }
533 else
534 {
[63]535 /* we cannot retry this IORB; consider it aborted */
[178]536 pIorb->ErrorCode = IOERR_CMD_ABORTED;
537 iorb_queue_del(&ai->ports[p].iorb_queue, vIorb);
538 iorb_queue_add(&done_queue, vIorb, pIorb);
[63]539 }
540 }
541 }
542 }
543 }
544 }
545
546 spin_unlock(drv_lock);
547
[211]548 DPRINTF(0, DBG_PREFIX": Resuming\n");
549
[75]550 /* complete all aborted IORBs */
[191]551 for (vIorb = done_queue.vRoot; vIorb != FAR16NULL; vIorb = vNext)
[178]552 {
[211]553 pIorb = Far16ToFlat(vIorb);
[204]554 vNext = pIorb->f16NxtIORB;
[75]555
556 spin_lock(drv_lock);
[178]557 aws_free(add_workspace(pIorb));
[75]558 spin_unlock(drv_lock);
559
[178]560 pIorb->Status |= IORB_ERROR;
561 iorb_complete(vIorb, pIorb);
[63]562 }
563
564 /* restart engine to resume IORB processing */
565 spin_lock(drv_lock);
566 trigger_engine();
567 spin_unlock(drv_lock);
568
[211]569 DPRINTF(0, DBG_PREFIX": END Rearm=%x\n", rearm_ctx_hook);
[63]570
571 /* Check whether we have to rearm ourselves because some adapters were busy
572 * when we wanted to reset ports on them.
573 */
[178]574 if (rearm_ctx_hook)
575 {
[201]576 /* we cannot rearm ourself because we will execute immediately leaving
577 * no time to process and clear the reason we need to rearm. Therefore
578 * we set the timer again.
579 */
[211]580 Timer_StartTimerMS(&th_watchdog, 250, WatchdogTimer, ResetCtxHook_h);
[63]581 }
[181]582
[211]583 ClearThreadStatus(ResetCtxHook_h);
[181]584 KernThunkStackTo16();
[63]585}
586
587/******************************************************************************
588 * IORB Engine context hook. This hook is executed if trigger_engine() came
589 * to the conclusion that some of the IORBs keep bouncing, most likely due to
590 * some condition on the adapter such as being busy. It could also be a very
591 * busy system. Either way, this requires some task-time help.
592 */
[178]593void _Syscall engine_ctxhook(ULONG parm)
[63]594{
595 int iorbs_sent;
596 int i;
597
[181]598 D32ThunkStackTo32();
599
[209]600 DPRINTF(DBG_FUNCBEG, DBG_PREFIX": engine_ctxhook() started\n");
[178]601 if (resume_sleep_flag)
602 {
[161]603 msleep(resume_sleep_flag);
604 resume_sleep_flag = 0;
605 }
[63]606
607 spin_lock(drv_lock);
[178]608 for (i = 0; i < 10; i++)
609 {
610 if ((iorbs_sent = trigger_engine_1()) == 0) break;
[63]611 }
612 spin_unlock(drv_lock);
613
[209]614 DPRINTF(DBG_FUNCEND, DBG_PREFIX": engine_ctxhook() completed\n");
[63]615
[178]616 if (iorbs_sent != 0)
617 {
[63]618 /* need to rearm ourselves for another run */
619 msleep(250);
[178]620 KernArmHook(engine_ctxhook_h, 0, 0);
[63]621 }
[181]622
623 KernThunkStackTo16();
[63]624}
[75]625
Note: See TracBrowser for help on using the repository browser.