很长时间以来,Linux块设备使用了一种称为“蓄流/泄流”(plugging/unplugging)的技术来改进吞吐率。简单而言,这种工作方式类似浴盆排水系统的塞子。当IO被提交时,它先被储存在一个队列中,稍后的某个时间,我们才允许IO从队列派发出去。之所以这么做,是为了尽可能对IO进行合并和排序。
/*
 * scsi_request_fn - feed requests from a SCSI device's queue to the
 *                   low-level driver.
 * @q: request queue; q->queuedata is read as the owning struct scsi_device.
 *
 * Takes a reference on the device with get_device() and returns at once if
 * that fails.  Otherwise it loops: peek the next request, bump
 * sdev->device_busy, call scsi_target_queue_ready() and
 * scsi_host_queue_ready(), and finally pass the command to
 * scsi_dispatch_cmd().  The loop ends when blk_peek_request() returns NULL,
 * when scsi_dev_queue_ready() fails, or when a dispatch returns non-zero.
 * A request that fails one of the checks made under host_lock is requeued
 * on the not_ready path.
 *
 * Locking: q->queue_lock is dropped inside the loop and re-taken before
 * every exit below, so the function presumably expects to be entered with
 * queue_lock held -- NOTE(review): confirm against the caller.
 */
static void scsi_request_fn(struct request_queue *q)
{
	struct scsi_device *sdev = q->queuedata;
	struct Scsi_Host *shost;
	struct scsi_cmnd *cmd;
	struct request *req;

	if(!get_device(&sdev->sdev_gendev))
		/* We must be tearing the block queue down already */
		return;

	/*
	 * To start with, we keep looping until the queue is empty, or until
	 * the host is no longer able to accept any more requests.
	 */
	shost = sdev->host;
	for (;;) {
		int rtn;
		/*
		 * get next queueable request.  We do this early to make sure
		 * that the request is fully prepared even if we cannot
		 * accept it.
		 */
		req = blk_peek_request(q); /* leave the loop if there is no request, or if the SCSI device cannot be sent one right now */
		if (!req || !scsi_dev_queue_ready(q, sdev))
			break;

		/*
		 * If the device is offline, log an error, hand the request
		 * to scsi_kill_request() and go on to the next request,
		 * which is then treated the same way.
		 */
		if (unlikely(!scsi_device_online(sdev))) {
			sdev_printk(KERN_ERR, sdev,
				    "rejecting I/O to offline device\n");
			scsi_kill_request(req, q);
			continue;
		}

		/*
		 * Remove the request from the request list.
		 *
		 * blk_start_request() is called unless the queue is tagged
		 * and blk_queue_start_tag() returned 0.  Per the original
		 * annotation, blk_start_request() takes the request off the
		 * queue and arms its timeout timer.
		 */
		if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
			blk_start_request(req);
		sdev->device_busy++;

		spin_unlock(q->queue_lock);
		/*
		 * Fetch the SCSI command descriptor from the request's
		 * ->special field.  Per the original annotation it was set
		 * up by the queue's prep_rq_fn callback, which
		 * blk_peek_request() invoked earlier.
		 */
		cmd = req->special;
		if (unlikely(cmd == NULL)) {
			printk(KERN_CRIT "impossible request in %s.\n"
					 "please mail a stack trace to "
					 "linux-scsi@vger.kernel.org\n",
					 __func__);
			blk_dump_rq_flags(req, "foo");
			BUG();
		}
		spin_lock(shost->host_lock);

		/*
		 * We hit this when the driver is using a host wide
		 * tag map. For device level tag maps the queue_depth check
		 * in the device ready fn would prevent us from trying
		 * to allocate a tag. Since the map is a shared host resource
		 * we add the dev to the starved list so it eventually gets
		 * a run when a tag is freed.
		 */
		if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
			if (list_empty(&sdev->starved_entry))
				list_add_tail(&sdev->starved_entry,
					      &shost->starved_list);
			goto not_ready;
		}

		if (!scsi_target_queue_ready(shost, sdev))
			goto not_ready;

		if (!scsi_host_queue_ready(q, shost, sdev))
			goto not_ready;

		scsi_target(sdev)->target_busy++;
		shost->host_busy++;

		/*
		 * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
		 *		take the lock again.
		 */
		spin_unlock_irq(shost->host_lock);

		/*
		 * Finally, initialize any error handling parameters, and set up
		 * the timers for timeouts.
		 */
		scsi_init_cmd_errh(cmd);

		/*
		 * Dispatch the command to the low-level driver.
		 */
		rtn = scsi_dispatch_cmd(cmd);
		spin_lock_irq(q->queue_lock);
		if (rtn)
			goto out_delay;
	}

	goto out;

 not_ready:
	spin_unlock_irq(shost->host_lock);

	/*
	 * lock q, handle tag, requeue req, and decrement device_busy. We
	 * must return with queue_lock held.
	 *
	 * Decrementing device_busy without checking it is OK, as all such
	 * cases (host limits or settings) should run the queue at some
	 * later time.
	 */
	spin_lock_irq(q->queue_lock);
	blk_requeue_request(q, req);
	sdev->device_busy--;
out_delay:
	/* nothing outstanding on this device: ask for a delayed queue run */
	if (sdev->device_busy == 0)
		blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:
	/* must be careful here...if we trigger the ->remove() function
	 * we cannot be holding the q lock */
	spin_unlock_irq(q->queue_lock);
	put_device(&sdev->sdev_gendev);
	spin_lock_irq(q->queue_lock);
}
blk_peek_request从请求队列“顶部”取得下一个请求。函数的实现就是一个大循环,每次调用__elv_next_request从电梯队列中取出一个请求进行处理:
/**
 * blk_peek_request - peek at the top of a request queue
 * @q: request queue to peek at
 *
 * Description:
 *     Return the request at the top of @q.  The returned request
 *     should be started using blk_start_request() before LLD starts
 *     processing it.
 *
 *     Each candidate from __elv_next_request() is run through the
 *     queue's prep_rq_fn callback (when one is set and the request is
 *     not flagged REQ_DONTPREP) before it is returned.
 *
 * Return:
 *     Pointer to the request at the top of @q if available.  Null
 *     otherwise.
 *
 * Context:
 *     queue_lock must be held.
 */
struct request *blk_peek_request(struct request_queue *q)
{
	struct request *rq;
	int ret;

	while ((rq = __elv_next_request(q)) != NULL) {

		rq = blk_pm_peek_request(q, rq);
		if (!rq)
			break;

		/*
		 * A request is either brand new or was put back on the queue
		 * because it could not be handled yet; per the original
		 * annotation the latter always carries REQ_STARTED.  Without
		 * that flag this is the first time the request is seen here;
		 * if it is also marked REQ_SORTED, elv_activate_rq() is
		 * called for it.
		 */
		if (!(rq->cmd_flags & REQ_STARTED)) {
			/*
			 * This is the first time the device driver
			 * sees this request (possibly after
			 * requeueing).  Notify IO scheduler.
			 */
			if (rq->cmd_flags & REQ_SORTED)
				elv_activate_rq(q, rq);

			/*
			 * just mark as started even if we don't start
			 * it, a request that has been delayed should
			 * not be passed by new incoming requests
			 */
			rq->cmd_flags |= REQ_STARTED;
			trace_block_rq_issue(q, rq);
		}

		/*
		 * Bookkeeping for the I/O scheduler: record where this
		 * request ends and clear the boundary request.
		 */
		if (!q->boundary_rq || q->boundary_rq == rq) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = NULL;
		}

		/*
		 * REQ_DONTPREP on the request means no preparation is
		 * needed: leave the loop and return this request.
		 */
		if (rq->cmd_flags & REQ_DONTPREP)
			break;

		/*
		 * A non-zero dma_drain_size on the queue (the original
		 * annotation calls this the "excess DMA" problem) means one
		 * extra segment has to be accounted for, so that a drain
		 * buffer can later be appended to the scatter-gather list.
		 */
		if (q->dma_drain_size && blk_rq_bytes(rq)) {
			/*
			 * make sure space for the drain appears we
			 * know we can do this because max_hw_segments
			 * has been adjusted to be one fewer than the
			 * device can handle
			 */
			rq->nr_phys_segments++;
		}

		/*
		 * Without a prep_rq_fn callback the request is returned as
		 * is.  Otherwise the callback prepares the request, and
		 * three return values are handled:
		 *   BLKPREP_OK:    preparation succeeded, return the request.
		 *   BLKPREP_DEFER: cannot proceed for now; NULL is returned.
		 *   BLKPREP_KILL:  the request cannot be processed; it is
		 *                  ended with -EIO and the loop moves on to
		 *                  the next request instead of exiting.
		 */
		if (!q->prep_rq_fn)
			break;

		ret = q->prep_rq_fn(q, rq);
		if (ret == BLKPREP_OK) {
			break;
		} else if (ret == BLKPREP_DEFER) {
			/*
			 * the request may have been (partially) prepped.
			 * we need to keep this request in the front to
			 * avoid resource deadlock.  REQ_STARTED will
			 * prevent other fs requests from passing this one.
			 */
			if (q->dma_drain_size && blk_rq_bytes(rq) &&
			    !(rq->cmd_flags & REQ_DONTPREP)) {
				/*
				 * remove the space for the drain we added
				 * so that we don't add it again
				 */
				--rq->nr_phys_segments;
			}

			rq = NULL;
			break;
		} else if (ret == BLKPREP_KILL) {
			rq->cmd_flags |= REQ_QUIET;
			/*
			 * Mark this request as started so we don't trigger
			 * any debug logic in the end I/O path.
			 */
			blk_start_request(rq);
			__blk_end_request_all(rq, -EIO);
		} else {
			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
			break;
		}
	}

	return rq;
}
请求队列中的prep_rq_fn回调函数实现了从请求构造SCSI命令的方法。prep_rq_fn回调函数主要有两个任务:
- 构造命令描述块
- 如果需要的话为数据传输准备聚散列表
命令描述块和聚散列表都被封装到SCSI命令描述符中。我们知道,请求至少有两个来源:
- 来自上层bio
- 来自SCSI公共服务层
在刚找到SCSI设备、为其初始化请求队列时,这个回调函数被设置为scsi_prep_fn:
/*
 * scsi_alloc_queue - allocate the request queue for a SCSI device.
 * @sdev: device the queue is created for; sdev->host is passed to the
 *        allocator.
 *
 * Allocates the queue with scsi_request_fn as its request function, then
 * installs the SCSI callbacks: scsi_prep_fn (prepare), scsi_softirq_done
 * (softirq completion), scsi_times_out (request timeout) and scsi_lld_busy
 * (lld busy).
 *
 * Returns the new queue, or NULL if __scsi_alloc_queue() fails.
 */
struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
{
	struct request_queue *q;

	q = __scsi_alloc_queue(sdev->host, scsi_request_fn);
	if (!q)
		return NULL;

	blk_queue_prep_rq(q, scsi_prep_fn);
	blk_queue_softirq_done(q, scsi_softirq_done);
	blk_queue_rq_timed_out(q, scsi_times_out);
	blk_queue_lld_busy(q, scsi_lld_busy);
	return q;
}

/**
 * blk_queue_prep_rq - set a prepare_request function for queue
 * @q:		queue
 * @pfn:	prepare_request function
 *
 * It's possible for a queue to register a prepare_request callback which
 * is invoked before the request is handed to the request_fn. The goal of
 * the function is to prepare a request for I/O, it can be used to build a
 * cdb from the request data for instance.
 *
 */
void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
{
	/* plain store; any previously installed callback is overwritten */
	q->prep_rq_fn = pfn;
}
如果SCSI设备被高层驱动绑定,这个回调函数会被修改。例如,在sd驱动的探测流程中,sd_probe_async会将它设置成sd_prep_fn:
/*
 * sd_probe_async - finish setting up a SCSI disk and register it.
 * @data:   the struct scsi_disk being set up (passed as void *)
 * @cookie: not used in this function body
 *
 * Judging by the signature this runs as an async callback -- NOTE(review):
 * confirm against the code that schedules it.
 *
 * Fills in the gendisk (major/minor numbers, fops, private data, queue),
 * resets the per-disk state to defaults, revalidates the disk, installs
 * sd_prep_fn/sd_unprep_fn on the device's request queue, registers the disk
 * with add_disk() and revalidates once more.
 */
static void sd_probe_async(void *data, async_cookie_t cookie)
{
	struct scsi_disk *sdkp = data;
	struct scsi_device *sdp;
	struct gendisk *gd;
	u32 index;
	struct device *dev;

	sdp = sdkp->device;
	gd = sdkp->disk;
	index = sdkp->index;
	dev = &sdp->sdev_gendev;

	/*
	 * Bits 4-7 of the index pick the major number via sd_major().  The
	 * first minor is built from the low 4 bits of the index (moved to
	 * bits 4-7) combined with bits 8-19 of the index.
	 */
	gd->major = sd_major((index & 0xf0) >> 4);
	gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
	gd->minors = SD_MINORS;

	gd->fops = &sd_fops;
	gd->private_data = &sdkp->driver;
	gd->queue = sdkp->device->request_queue;

	/* defaults, until the device tells us otherwise */
	sdp->sector_size = 512;
	sdkp->capacity = 0;
	sdkp->media_present = 1;
	sdkp->write_prot = 0;
	sdkp->cache_override = 0;
	sdkp->WCE = 0;
	sdkp->RCD = 0;
	sdkp->ATO = 0;
	sdkp->first_scan = 1;
	sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;

	sd_revalidate_disk(gd);

	/*
	 * Replace the queue's prepare callback with sd_prep_fn and install
	 * sd_unprep_fn as the matching unprepare callback.
	 */
	blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);
	blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn);

	gd->driverfs_dev = &sdp->sdev_gendev;
	gd->flags = GENHD_FL_EXT_DEVT;
	if (sdp->removable) {
		gd->flags |= GENHD_FL_REMOVABLE;
		gd->events |= DISK_EVENT_MEDIA_CHANGE;
	}

	add_disk(gd);
	/* only done when the disk reported a non-zero capacity */
	if (sdkp->capacity)
		sd_dif_config_host(sdkp);

	sd_revalidate_disk(gd);

	sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
		  sdp->removable ? "removable " : "");
	blk_pm_runtime_init(sdp->request_queue, dev);
	scsi_autopm_put_device(sdp);
	/* presumably balances a reference taken by the caller -- TODO confirm */
	put_device(&sdkp->dev);
}
在前一种情况下,SCSI设备只能处理来自SCSI公共服务层的请求;在后一种情况下,SCSI设备不仅能处理来自SCSI公共服务层的请求,还能够处理来自上层的bio请求,具体分析见下一节。