#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <inttypes.h>
#include <time.h>
#include <fcntl.h>
#include <errno.h>
#include <pthread.h>
#include <sys/utsname.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <xs.h>
#include <xenctrl.h>
#include <xen/io/xenbus.h>

#ifdef HAVE_LIBAIO
# include <libaio.h>
# define IO_CMD_PREADV  7
# define IO_CMD_PWRITEV 8
#endif

#include "blkif.h"
#include "shared.h"
#include "list.h"
#include "hexdump.h"
#include "daemon.h"
#include "xenbackd.h"
#include "evtchnd.h"

#include "qemu/vl.h"
#include "qemu/block_int.h"

/* ------------------------------------------------------------- */

/* Sector size in bytes used for offset and length arithmetic in this file. */
#define BLOCK_SIZE  512
/*
 * iocbs per request: one per segment (non-vectored aio emulation) plus
 * two for the optional fsync iocbs queued before and after the data.
 */
#define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)

/*
 * One block request pulled from the shared ring, together with everything
 * needed to execute it and to send the response.  An ioreq is always linked
 * into exactly one of the owning device's lists (inflight, finished,
 * freelist) through 'list'.
 */
struct ioreq {
    blkif_request_t     req;     /* copy of the request taken from the ring */
    int16_t             status;  /* BLKIF_RSP_* value sent in the response */

    /* parsed request */
    off_t               start, end; /* byte range in the backing file, end exclusive */
    struct iovec        vec[BLKIF_MAX_SEGMENTS_PER_REQUEST];
                                 /* holds in-page offsets after parsing,
                                  * real addresses after ioreq_map() */
    int                 vecs;    /* number of valid entries in vec[] */
    int                 presync; /* flush before the data transfer */
    int                 postsync; /* flush after the data transfer */

    /* grant mapping */
    uint32_t            domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    uint32_t            refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int                 prot;    /* PROT_WRITE for reads, PROT_READ for writes */
    void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
                                 /* per-segment mappings (batch_maps off) */
    void                *pages;  /* single contiguous mapping (batch_maps on) */

#ifdef HAVE_LIBAIO
    struct iocb         iocb[IOCB_COUNT];
    int                 iocbs;      /* iocbs prepared for this request */
    int                 iocbs_done; /* completion events received so far */
    int                 iocbs_ok;   /* completions that succeeded */
#endif

    struct blkdev       *blkdev; /* owning device */
    struct list_head    list;    /* link in inflight / finished / freelist */
};

/*
 * Per-device counters.  blk_rates() walks this struct as a flat array of
 * uint64_t, so every member must remain a uint64_t.
 */
struct blkstats {
    uint64_t            req_read;      /* read requests          */
    uint64_t            req_write;     /* write requests (incl barriers) */
    uint64_t            req_barrier;   /* barrier writes         */
    uint64_t            req_full;      /* # of queue full events */
    uint64_t            req_err_parse; /* parse errors           */
    uint64_t            req_err_io;    /* I/O errors             */
    uint64_t            bytes_read;    /* bytes read             */
    uint64_t            bytes_write;   /* bytes written          */
    uint64_t            wakeups;       /* successful wait_for_event() returns */
    uint64_t            aio_wait;      /* blocking waits for aio completion */
    uint64_t            aio_full;      /* io_submit() did not take all iocbs */
};

/*
 * State of one virtual block device: xenstore configuration, the shared
 * ring, the backing store (plain file descriptor or qemu block driver),
 * request bookkeeping and statistics.
 */
struct blkdev {
    struct xendev       xendev;  /* must be first */
    char                *params; /* xenstore "params"; split in place at the first ':' */
    char                *mode;   /* xenstore "mode"; "w" selects read-write */
    char                *type;   /* xenstore "type" */
    char                *dev;    /* xenstore "dev"; passed to bdrv_new() */
    char                *fileproto; /* part of params before ':', or "<unset>" */
    char                *filename;  /* part of params after ':', or all of params */
    int                 ring_ref;   /* frontend "ring-ref" grant reference */
    void                *sring;     /* mapped shared ring page */
    int                 file;       /* fd for direct file access; -1 once closed */
    int64_t             file_blk;   /* sector size in bytes */
    int64_t             file_size;  /* size of the backing store in bytes */
    int                 protocol;   /* BLKIF_PROTOCOL_* ring ABI */
    blkif_back_rings_t  rings;
    int                 more_work;  /* non-zero: worker skips sleeping */
    int                 cnt_map;    /* grant mappings currently held */

    /* worker thread */
    pthread_t           mainthread;

    /* request lists */
    struct list_head    inflight;
    struct list_head    finished;
    struct list_head    freelist;
    int                 requests;   /* ioreq structs allocated so far */

    /* qemu block driver */
    BlockDriverState    *bs;        /* non-NULL when qemu block layer is used */

    int                 use_aio;     /* submit I/O through libaio */
    int                 use_aio_vec; /* use vectored aio opcodes */
#ifdef HAVE_LIBAIO
    int                 cnt_aio;     /* iocbs submitted and not yet completed */
    io_context_t        ctx;
#endif

    /* statistics */
    struct blkstats     st;         /* running counters */
    /* NOTE(review): the fields below are presumably maintained by
     * blk_stats(), which is not fully visible here -- confirm. */
    struct blkstats     st_prev;
    struct timeval      tv_prev;
    struct blkstats     st_rate;
    struct blkstats     st_peak;
    int                 st_count;
};

/* non-zero: every write is followed by a flush (see ioreq_parse) */
static int syncwrite;
/* non-zero: use libaio for directly opened files (see blk_setup_backend) */
static int use_aio      = 0;
/* initial per-device setting for vectored aio opcodes */
static int use_aio_vec  = 0;
/* non-zero: map all grants of a request with a single call */
static int batch_maps   = 0;
/* NOTE(review): presumably the statistics interval in seconds; not used
 * in the part of the file visible here -- confirm. */
static int stat_secs    = 10;
/* cap on ioreq structs per device; also the io_setup() queue depth */
static int max_requests = 32;
/* NOTE(review): presumably the backend name used at registration; not
 * used in the part of the file visible here -- confirm. */
static char *be_name = "blkbackd";

/* ------------------------------------------------------------- */

/*
 * Obtain a request structure for blkdev and link it into the inflight list.
 *
 * A structure is recycled from the freelist when one is available;
 * otherwise a fresh zeroed one is allocated, unless the device already
 * owns max_requests structures.
 *
 * Returns the request, or NULL if the limit is reached or allocation fails.
 */
static struct ioreq *ioreq_start(struct blkdev *blkdev)
{
    struct ioreq *fresh;

    if (!list_empty(&blkdev->freelist)) {
        /* recycle the oldest entry of the freelist */
        fresh = list_entry(blkdev->freelist.next, struct ioreq, list);
        list_del(&fresh->list);
    } else {
        if (blkdev->requests >= max_requests) {
            d2printf("%s: request limit (%d) reached\n", __FUNCTION__, max_requests);
            blkdev->st.req_full++;
            return NULL;
        }
        /* nothing to recycle: allocate a zeroed structure */
        fresh = calloc(1, sizeof(*fresh));
        if (fresh == NULL)
            return NULL;
        fresh->blkdev = blkdev;
        blkdev->requests++;
    }

    list_add_tail(&fresh->list, &blkdev->inflight);
    return fresh;
}

/*
 * Move a request from whatever list it is on to the tail of its
 * device's finished list.
 */
static void ioreq_finish(struct ioreq *ioreq)
{
    struct list_head *done_list = &ioreq->blkdev->finished;

    list_del(&ioreq->list);
    list_add_tail(&ioreq->list, done_list);
}

/*
 * Return a request to its device's freelist.  The structure is wiped so
 * the next user starts from a clean state; only the back pointer to the
 * owning device survives.
 */
static void ioreq_release(struct ioreq *ioreq)
{
    struct blkdev *owner = ioreq->blkdev;

    /* unlink before wiping: the memset also clears the list node */
    list_del(&ioreq->list);

    memset(ioreq, 0, sizeof(*ioreq));
    ioreq->blkdev = owner;

    list_add_tail(&ioreq->list, &owner->freelist);
}

/*
 * translate request into iovec + start offset + end offset
 * do sanity checks along the way
 */
static int ioreq_parse(struct ioreq *ioreq)
{
    struct blkdev *blkdev = ioreq->blkdev;
    uintptr_t mem;
    size_t len;
    int i;

    d3printf("%s: op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
	     __FUNCTION__, ioreq->req.operation, ioreq->req.nr_segments,
	     ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
	blkdev->st.req_read++;
	ioreq->prot = PROT_WRITE; /* to memory */
	if (BLKIF_OP_READ != ioreq->req.operation && blkdev->mode[0] != 'w') {
	    d1printf("%s: error: write req for ro device\n", __FUNCTION__);
	    goto err;
	}
	break;
    case BLKIF_OP_WRITE_BARRIER:
	blkdev->st.req_barrier++;
	if (!syncwrite)
	    ioreq->presync = ioreq->postsync = 1;
	/* fall through */
    case BLKIF_OP_WRITE:
	blkdev->st.req_write++;
	ioreq->prot = PROT_READ; /* from memory */
	if (syncwrite)
	    ioreq->postsync = 1;
	break;
    default:
	d1printf("%s: error: unknown operation (%d)\n",
		 __FUNCTION__, ioreq->req.operation);
	goto err;
    };

    ioreq->start = ioreq->end = ioreq->req.sector_number * blkdev->file_blk;
    for (i = 0; i < ioreq->req.nr_segments; i++) {
	if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
	    d1printf("%s: error: nr_segments too big\n", __FUNCTION__);
	    goto err;
	}
	d3printf("%s:   %d: gref %d sects %d-%d\n", __FUNCTION__,
		 i, ioreq->req.seg[i].gref, ioreq->req.seg[i].first_sect, ioreq->req.seg[i].last_sect);
	if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
	    d1printf("%s: first > last sector\n", __FUNCTION__);
	    goto err;
	}
	if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= PAGE_SIZE) {
	    d1printf("%s: page crossing\n", __FUNCTION__);
	    goto err;
	}
	len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;

	ioreq->domids[i] = blkdev->xendev.dom;
	ioreq->refs[i]   = ioreq->req.seg[i].gref;
	mem = ioreq->req.seg[i].first_sect * blkdev->file_blk;
	d3printf("%s:   %d: op %d, mem %" PRIxPTR ", len %zd\n",
		 __FUNCTION__, i, ioreq->req.operation, mem, len);

	ioreq->vec[i].iov_base = (void*)mem;
	ioreq->vec[i].iov_len  = len;
	ioreq->end += len;
    }
    if (ioreq->end > blkdev->file_size) {
	d1printf("%s: access beyond end of file\n", __FUNCTION__);
	goto err;
    }
    ioreq->vecs = i;
    return 0;

err:
    blkdev->st.req_err_parse++;
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

static void ioreq_unmap(struct ioreq *ioreq)
{
    int gnt = ioreq->blkdev->xendev.gnttabdev;
    int i;

    if (batch_maps) {
	if (!ioreq->pages)
	    return;
	d3printf("%s: unmap %d grants %p\n", __FUNCTION__,
		 ioreq->vecs, ioreq->pages);
	if (0 != xc_gnttab_munmap(gnt, ioreq->pages, ioreq->vecs))
	    d1printf("%s: xc_gnttab_munmap failed: %s\n",
		     __FUNCTION__, strerror(errno));
	ioreq->blkdev->cnt_map -= ioreq->vecs;
	ioreq->pages = NULL;
    } else {
	for (i = 0; i < ioreq->vecs; i++) {
	    if (!ioreq->page[i])
		continue;
	    d3printf("%s: unmap grant %p\n", __FUNCTION__, ioreq->page[i]);
	    if (0 != xc_gnttab_munmap(gnt, ioreq->page[i], 1))
		d1printf("%s: xc_gnttab_munmap failed: %s\n",
			 __FUNCTION__, strerror(errno));
	    ioreq->blkdev->cnt_map--;
	    ioreq->page[i] = NULL;
	}
    }
}

static int ioreq_map(struct ioreq *ioreq)
{
    int gnt = ioreq->blkdev->xendev.gnttabdev;
    int i;

    if (batch_maps) {
	ioreq->pages = xc_gnttab_map_grant_refs
	    (gnt, ioreq->vecs, ioreq->domids, ioreq->refs, ioreq->prot);
	d3printf("%s: map %d grant refs -> %p\n", __FUNCTION__,
		 ioreq->vecs, ioreq->pages);
	if (NULL == ioreq->pages) {
	    d1printf("%s: can't map %d grant refs (%s, %d maps)\n", __FUNCTION__,
		     ioreq->vecs, strerror(errno), ioreq->blkdev->cnt_map);
	    return -1;
	}
	for (i = 0; i < ioreq->vecs; i++)
	    ioreq->vec[i].iov_base = ioreq->pages + i * PAGE_SIZE +
		(uintptr_t)ioreq->vec[i].iov_base;
	ioreq->blkdev->cnt_map += ioreq->vecs;
    } else  {
	for (i = 0; i < ioreq->vecs; i++) {
	    ioreq->page[i] = xc_gnttab_map_grant_ref
		(gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
	    d3printf("%s: map grant ref %d -> %p\n", __FUNCTION__,
		     ioreq->refs[i], ioreq->page[i]);
	    if (NULL == ioreq->page[i]) {
		d1printf("%s: can't map grant ref %d (%s, %d maps)\n", __FUNCTION__,
			 ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
		ioreq_unmap(ioreq);
		return -1;
	    }
	    ioreq->vec[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->vec[i].iov_base;
	    ioreq->blkdev->cnt_map++;
	}
    }
    return 0;
}

/*
 * Run I/O using readv / writev.
 *
 * Maps the request's grants, flushes first when presync is set, seeks
 * to ioreq->start, transfers all segments with one readv()/writev(),
 * flushes afterwards when postsync is set, and unmaps the grants again.
 *
 * Returns 0 with ioreq->status = BLKIF_RSP_OKAY on success.  On any
 * failure (mapping, seek, short or failed transfer) returns -1, counts
 * an I/O error and sets ioreq->status = BLKIF_RSP_ERROR.
 */
static int ioreq_runio_serial(struct ioreq *ioreq)
{
    struct blkdev *blkdev = ioreq->blkdev;
    int len;

    if (-1 == ioreq_map(ioreq))
        goto err;
    if (ioreq->presync)
        fdatasync(blkdev->file);

    /* a failed seek would make the transfer hit the wrong offset */
    if ((off_t)-1 == lseek(blkdev->file, ioreq->start, SEEK_SET)) {
        d1printf("%s: lseek to %" PRId64 " failed: %s\n", __FUNCTION__,
                 (int64_t)ioreq->start, strerror(errno));
        goto err;
    }
    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        len = readv(blkdev->file, ioreq->vec, ioreq->vecs);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_WRITE_BARRIER:
        len = writev(blkdev->file, ioreq->vec, ioreq->vecs);
        break;
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }
    /* short transfers count as errors as well */
    if (len != ioreq->end - ioreq->start) {
        d1printf("%s: I/O error (len %ld, rc %d, %s)\n", __FUNCTION__,
                 (long)(ioreq->end - ioreq->start), len, strerror(errno));
        goto err; /* I/O error */
    }

    if (ioreq->postsync)
        fdatasync(blkdev->file);

    if (BLKIF_OP_READ == ioreq->req.operation)
        blkdev->st.bytes_read += len;
    else
        blkdev->st.bytes_write += len;
    ioreq->status = BLKIF_RSP_OKAY;

    ioreq_unmap(ioreq);
    return 0;

err:
    blkdev->st.req_err_io++;
    ioreq->status = BLKIF_RSP_ERROR;
    /* safe even if mapping failed: unmap skips unmapped entries */
    ioreq_unmap(ioreq);
    return -1;
}

#ifdef HAVE_LIBAIO
/*
 * Queue an fsync (used as write barrier): fill the next free iocb slot
 * of the request with an IO_CMD_FSYNC on the device's file.
 */
static void aio_sync(struct ioreq *ioreq)
{
    struct iocb *slot = &ioreq->iocb[ioreq->iocbs];

    slot->data = ioreq;
    slot->aio_fildes = ioreq->blkdev->file;
    slot->aio_lio_opcode = IO_CMD_FSYNC;
    ioreq->iocbs++;
}

/*
 * Queue the whole data transfer of a request as one vectored iocb
 * (IO_CMD_PREADV for reads, IO_CMD_PWRITEV for everything else).
 */
static void aio_iov(struct ioreq *ioreq)
{
    struct iocb *slot = &ioreq->iocb[ioreq->iocbs];

    slot->data = ioreq;
    slot->aio_fildes = ioreq->blkdev->file;
    if (BLKIF_OP_READ == ioreq->req.operation)
        slot->aio_lio_opcode = IO_CMD_PREADV;
    else
        slot->aio_lio_opcode = IO_CMD_PWRITEV;
    slot->u.v.vec    = ioreq->vec;
    slot->u.v.nr     = ioreq->vecs;
    slot->u.v.offset = ioreq->start;
    ioreq->iocbs++;
}

/*
 * Emulate a vectored transfer with plain aio: queue one IO_CMD_PREAD or
 * IO_CMD_PWRITE iocb per segment, at consecutive file offsets starting
 * at ioreq->start.
 */
static void aio_iov_emu(struct ioreq *ioreq)
{
    int fd = ioreq->blkdev->file;
    off_t pos = ioreq->start;
    int seg;

    for (seg = 0; seg < ioreq->vecs; seg++) {
        struct iocb *slot = &ioreq->iocb[ioreq->iocbs];

        slot->data = ioreq;
        slot->aio_fildes = fd;
        if (BLKIF_OP_READ == ioreq->req.operation)
            slot->aio_lio_opcode = IO_CMD_PREAD;
        else
            slot->aio_lio_opcode = IO_CMD_PWRITE;
        slot->u.c.buf    = ioreq->vec[seg].iov_base;
        slot->u.c.nbytes = ioreq->vec[seg].iov_len;
        slot->u.c.offset = pos;

        pos += ioreq->vec[seg].iov_len;
        ioreq->iocbs++;
    }
}

/*
 * Reap aio completion events for blkdev.
 *
 * allow_block: non-zero waits for at least one event on the first
 *              io_getevents() call; later calls within the same
 *              invocation never block.
 *
 * Every event is accounted to its request.  Once all iocbs of a request
 * have completed, the request status is set (OKAY only if every iocb
 * succeeded), its grants are unmapped and it moves to the finished list.
 *
 * Returns the number of events processed; also returns what was
 * processed so far when io_getevents() fails.
 */
static int ioreq_runio_async_getevents(struct blkdev *blkdev, int allow_block)
{
    struct ioreq *ioreq;
    struct io_event events[4];
    static const int max = sizeof(events)/sizeof(events[0]);
    int i, min, len, cnt, ok, is_read;
    int total = 0;

    min = allow_block ? 1 : 0;

getmore:
    cnt = io_getevents(blkdev->ctx, min, max, events, NULL);
    if (cnt < 0) {
#if 1 /* Huh? raw syscall or what? */
        /* io_getevents may hand back -errno directly; normalise it */
        if (cnt < -1) {
            errno = -cnt;
            cnt = -1;
        }
#endif
        d1printf("%s: io_getevents failed (%d/%d-%d): %s\n", __FUNCTION__,
                 cnt, min, max, strerror(errno));
        return total;
    }
    if (0 == cnt)
        return total;


    total += cnt;
    blkdev->cnt_aio -= cnt;
    d2printf("%s: %d -> %d\n", __FUNCTION__, cnt, blkdev->cnt_aio);
    for (i = 0; i < cnt; i++) {
        ioreq = events[i].data;
        len = 0;
        is_read = 0;
        switch (events[i].obj->aio_lio_opcode) {
        case IO_CMD_FSYNC:
            /* res carries the result of the operation: require 0 here so
             * a failed fsync is not reported as success */
            ok = (events[i].res2 == 0) && (events[i].res == 0);
            break;
        case IO_CMD_PREAD:
            is_read = 1;
            /* fall through */
        case IO_CMD_PWRITE:
            len = events[i].obj->u.c.nbytes;
            ok = (events[i].res2 == 0) && (events[i].res == len);
            break;
        case IO_CMD_PREADV:
            is_read = 1;
            /* fall through */
        case IO_CMD_PWRITEV:
            len = ioreq->end - ioreq->start;
            ok = (events[i].res2 == 0) && (events[i].res == len);
            break;
        default:
            ok = 0;
            break;
        }

        ioreq->iocbs_done++;
        if (ok) {
            ioreq->iocbs_ok++;
            if (is_read)
                blkdev->st.bytes_read += len;
            else
                blkdev->st.bytes_write += len;
        } else {
            d1printf("%s: aio failure: op=%d\n", __FUNCTION__,
                     events[i].obj->aio_lio_opcode);
        }

        /* request still has iocbs outstanding */
        if (ioreq->iocbs_done < ioreq->iocbs)
            continue;

        if (ioreq->iocbs_done == ioreq->iocbs_ok) {
            ioreq->status = BLKIF_RSP_OKAY;
        } else {
            blkdev->st.req_err_io++;
            ioreq->status = BLKIF_RSP_ERROR;
        }
        ioreq_unmap(ioreq);
        ioreq_finish(ioreq);
    }

    /* a full batch suggests more events are pending */
    if (cnt == max) {
        min = 0; /* block only once */
        goto getmore;
    }
    return total;
}

/*
 * Map a request and hand its I/O to libaio.
 *
 * The iocbs are built in this order: optional fsync (presync), the data
 * transfer (one vectored iocb, or one iocb per segment when vectored
 * aio is off), optional fsync (postsync).  io_submit() is retried while
 * it accepts only part of the list or reports EAGAIN; between attempts
 * the function blocks in ioreq_runio_async_getevents() so running I/O
 * can finish.
 *
 * If vectored submission fails with EINVAL, vectored aio is switched
 * off for this device and the request is rebuilt without it.
 *
 * Returns 0 when all iocbs were submitted; completion is then reported
 * through ioreq_runio_async_getevents().  Returns -1 on failure with
 * ioreq->status set (BLKIF_RSP_EOPNOTSUPP for requests with presync,
 * BLKIF_RSP_ERROR otherwise) and the grants unmapped.
 *
 * NOTE(review): iocbs accepted by a partial submit are added to cnt_aio
 * only after the loop, while the getevents call inside the loop
 * decrements cnt_aio for every event it reaps -- confirm the counter
 * cannot drift when one of this request's own iocbs completes early.
 * NOTE(review): the io_cancel() result is ignored in the error path --
 * confirm that no completion can arrive later for an ioreq that has
 * already been released.
 */
static int ioreq_runio_async_submit(struct ioreq *ioreq)
{
    struct blkdev *blkdev = ioreq->blkdev;
    struct iocb *iocb_list[IOCB_COUNT];
    struct io_event event;
    int i, rc, done = 0, try;

    if (-1 == ioreq_map(ioreq))
	goto err;

restart:
    /* build the iocb list: [fsync] data [fsync] */
    if (ioreq->presync)
	aio_sync(ioreq);
    if (blkdev->use_aio_vec)
	aio_iov(ioreq);
    else
	aio_iov_emu(ioreq);
    if (ioreq->postsync)
	aio_sync(ioreq);
    for (i = 0; i < ioreq->iocbs; i++)
	iocb_list[i] = ioreq->iocb + i;

    /* submit, continuing after the iocbs accepted so far */
    for (done = 0, try = 1; done < ioreq->iocbs && try < IOCB_COUNT*2; try++) {
	rc = io_submit(blkdev->ctx, ioreq->iocbs - done, iocb_list + done);
	if (rc != ioreq->iocbs - done) {
#if 1 /* Huh? raw syscall or what? */
	    /* io_submit may hand back -errno directly; normalise it */
	    if (rc < -1) {
		errno = -rc;
		rc = -1;
	    }
#endif
	    if (try > IOCB_COUNT || -1 == rc)
		d1printf("%s: io_submit failed (%d/%d, try %d, inflight %d%s): %s\n",
			 __FUNCTION__, rc, ioreq->iocbs - done,
			 try, blkdev->cnt_aio,
			 blkdev->use_aio_vec ? ",iov" : "",
			 strerror(errno));
	    if (rc < 0 && EAGAIN != errno) {
		if (blkdev->use_aio_vec && EINVAL == errno) {
		    /* fall back to one iocb per segment and rebuild */
		    d1printf("%s: io_submit: got EINVAL, disable vectored aio\n",
			     __FUNCTION__);
		    blkdev->use_aio_vec = 0;
		    ioreq->iocbs = 0;
		    memset(&ioreq->iocb, 0, sizeof(ioreq->iocb));
		    goto restart;
		}
		goto err;
	    }
	    /* wait for some i/o finish, then try again */
	    ioreq_runio_async_getevents(blkdev, 1);
	    blkdev->st.aio_full++;
	}
	if (rc > 0)
	    done += rc;
    }
    if (done != ioreq->iocbs)
	goto err;

    blkdev->cnt_aio += ioreq->iocbs;
    d2printf("%s: %d -> %d\n", __FUNCTION__, ioreq->iocbs, blkdev->cnt_aio);
    return 0;

err:
    /* try to take back whatever was already submitted */
    while (done > 0) {
	done--;
	io_cancel(blkdev->ctx, iocb_list[done], &event);
    }
    blkdev->st.req_err_io++;
    ioreq->status = ioreq->presync /* barrier request */
	? BLKIF_RSP_EOPNOTSUPP : BLKIF_RSP_ERROR;
    ioreq_unmap(ioreq);
    return -1;
}
#endif

/*
 * Run a request through the qemu block driver.
 *
 * Maps the request's grants, flushes first when presync is set,
 * transfers every segment with bdrv_read()/bdrv_write() at consecutive
 * sector positions starting at ioreq->start, flushes afterwards when
 * postsync is set, and unmaps the grants.
 *
 * Returns 0 with ioreq->status = BLKIF_RSP_OKAY on success.  On failure
 * returns -1, counts an I/O error and sets BLKIF_RSP_ERROR; the grants
 * are unmapped on that path too.
 */
static int ioreq_runio_qemu(struct ioreq *ioreq)
{
    struct blkdev *blkdev = ioreq->blkdev;
    int i, rc, len = 0;
    off_t pos;

    if (-1 == ioreq_map(ioreq))
        goto err;
    if (ioreq->presync)
        bdrv_flush(blkdev->bs);

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        pos = ioreq->start;
        for (i = 0; i < ioreq->vecs; i++) {
            /* the qemu block layer works in BLOCK_SIZE sectors */
            rc = bdrv_read(blkdev->bs, pos / BLOCK_SIZE,
                           ioreq->vec[i].iov_base,
                           ioreq->vec[i].iov_len / BLOCK_SIZE);
            if (rc != 0) {
                d1printf("%s: rd I/O error (%p, len %zd)\n", __FUNCTION__,
                         ioreq->vec[i].iov_base,
                         ioreq->vec[i].iov_len);
                goto err;
            }
            len += ioreq->vec[i].iov_len;
            pos += ioreq->vec[i].iov_len;
        }
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_WRITE_BARRIER:
        pos = ioreq->start;
        for (i = 0; i < ioreq->vecs; i++) {
            rc = bdrv_write(blkdev->bs, pos / BLOCK_SIZE,
                            ioreq->vec[i].iov_base,
                            ioreq->vec[i].iov_len / BLOCK_SIZE);
            if (rc != 0) {
                d1printf("%s: wr I/O error (%p, len %zd)\n", __FUNCTION__,
                         ioreq->vec[i].iov_base,
                         ioreq->vec[i].iov_len);
                goto err;
            }
            len += ioreq->vec[i].iov_len;
            pos += ioreq->vec[i].iov_len;
        }
        break;
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    if (ioreq->postsync)
        bdrv_flush(blkdev->bs);

    if (BLKIF_OP_READ == ioreq->req.operation)
        blkdev->st.bytes_read += len;
    else
        blkdev->st.bytes_write += len;
    ioreq->status = BLKIF_RSP_OKAY;

    ioreq_unmap(ioreq);
    return 0;

err:
    blkdev->st.req_err_io++;
    ioreq->status = BLKIF_RSP_ERROR;
    /*
     * Release the grants on failure as well, otherwise they leak once
     * the request is recycled.  Safe if mapping itself failed: unmap
     * skips entries that are not mapped.
     */
    ioreq_unmap(ioreq);
    return -1;
}

static int blk_send_response_one(struct ioreq *ioreq)
{
    struct blkdev     *blkdev = ioreq->blkdev;
    int               send_notify   = 0;
    int               have_requests = 0;
    blkif_response_t  resp;
    void              *dst;

    resp.id        = ioreq->req.id;
    resp.operation = ioreq->req.operation;
    resp.status    = ioreq->status;

    /* Place on the response ring for the relevant domain. */
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
	dst = RING_GET_RESPONSE(&blkdev->rings.native, blkdev->rings.native.rsp_prod_pvt);
	break;
    case BLKIF_PROTOCOL_X86_32:
	dst = RING_GET_RESPONSE(&blkdev->rings.x86_32, blkdev->rings.x86_32.rsp_prod_pvt);
	break;
    case BLKIF_PROTOCOL_X86_64:
	dst = RING_GET_RESPONSE(&blkdev->rings.x86_64, blkdev->rings.x86_64.rsp_prod_pvt);
	break;
    default:
	dst = NULL;
    }
    memcpy(dst, &resp, sizeof(resp));
    blkdev->rings.common.rsp_prod_pvt++;

    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
	/*
	 * Tail check for pending requests. Allows frontend to avoid
	 * notifications if requests are already in flight (lower
	 * overheads and promotes batching).
	 */
	RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
	have_requests = 1;
    }

    d3printf("%s: send_notify %d, have_requests %d\n",
	     __FUNCTION__, send_notify, have_requests);

    if (have_requests)
	blkdev->more_work++;
    return send_notify;
}

/*
 * Send the responses for everything on the finished list and recycle
 * the request structures.  The frontend is notified at most once, after
 * the whole list has been processed.
 */
static void blk_send_response_all(struct blkdev *blkdev)
{
    struct list_head *pos, *next;
    int notify = 0;

    list_for_each_safe(pos, next, &blkdev->finished) {
        struct ioreq *done = list_entry(pos, struct ioreq, list);

        notify += blk_send_response_one(done);
        ioreq_release(done);
    }

    if (!notify)
        return;
    xc_evtchn_notify(blkdev->xendev.evtchnd, blkdev->xendev.local_port);
}

/*
 * Copy the request at ring index rc into ioreq->req, converting from
 * the frontend's ring ABI where necessary.  Nothing is copied for an
 * unknown protocol.  Always returns 0.
 */
static int blk_get_request(struct blkdev *blkdev, struct ioreq *ioreq, RING_IDX rc)
{
    blkif_request_t *dst = &ioreq->req;

    if (BLKIF_PROTOCOL_NATIVE == blkdev->protocol) {
        /* same layout as ours: plain copy */
        memcpy(dst, RING_GET_REQUEST(&blkdev->rings.native, rc),
               sizeof(ioreq->req));
    } else if (BLKIF_PROTOCOL_X86_32 == blkdev->protocol) {
        blkif_get_x86_32_req(dst, RING_GET_REQUEST(&blkdev->rings.x86_32, rc));
    } else if (BLKIF_PROTOCOL_X86_64 == blkdev->protocol) {
        blkif_get_x86_64_req(dst, RING_GET_REQUEST(&blkdev->rings.x86_64, rc));
    }
    return 0;
}

/*
 * Idle step of the worker thread.
 *
 * While there is no pending work but aio is still in flight, block in
 * the aio completion path and send out whatever finishes.  After that,
 * wait for a frontend event if the device is connected and still has
 * nothing to do.  more_work is reset before returning.
 */
static void blk_thread_main_gosleep(struct blkdev *blkdev)
{
#ifdef HAVE_LIBAIO
    for (;;) {
        if (blkdev->more_work || !blkdev->use_aio || !blkdev->cnt_aio)
            break;
        /* no requests from frontend and aio in flight -> blocking aio wait */
        d2printf("%s: go wait for aio finish (%d in flight)\n",
                 __FUNCTION__, blkdev->cnt_aio);
        blkdev->st.aio_wait++;
        if (ioreq_runio_async_getevents(blkdev, 1) != 0)
            blk_send_response_all(blkdev);
    }
#endif

    if (blkdev->more_work || blkdev->xendev.state != XENDEV_CONNECTED) {
        d2printf("%s: more work (%d), nosleep\n", __FUNCTION__, blkdev->more_work);
    } else {
        d2printf("%s: go wait for events\n", __FUNCTION__);
        if (wait_for_event(&blkdev->xendev) != -1)
            blkdev->st.wakeups++;
    }
    blkdev->more_work = 0;
}

/*
 * Worker thread of one block device.
 *
 * Runs while the device is in state XENDEV_CONNECTED: pulls a bounded
 * batch of requests from the ring, parses them, runs them through the
 * qemu block driver, libaio or plain readv/writev, sends the responses
 * and goes to sleep until there is more to do.  On exit it waits for
 * all aio still in flight and sends the remaining responses.
 *
 * arg is the struct blkdev of the device.  Always returns NULL.
 */
static void *blk_thread_main(void *arg)
{
    struct blkdev *blkdev = arg;
    RING_IDX rc, rp, limit;
    struct ioreq *ioreq;

    d1printf("%s: start\n", __FUNCTION__);
    for (;;) {
        if (blkdev->xendev.state != XENDEV_CONNECTED)
            break;
        rc = blkdev->rings.common.req_cons;
        rp = blkdev->rings.common.sring->req_prod;
        xen_rmb(); /* Ensure we see queued requests up to 'rp'. */

        /* Limit #of requests we queue up for I/O so we ack requests
         * faster if busy.  Improves backend/frontend parallelism and
         * reduces evchn signaling.
         *
         * Ring indices are free-running and wrap around, so compare the
         * distance (rp - rc) rather than the absolute values.  Keep the
         * limit at 1 or more so small max_requests settings still make
         * progress. */
        limit = max_requests >> 2;
        if (limit < 1)
            limit = 1;
        if (rp - rc > limit) {
            rp = rc + limit;
            blkdev->more_work++;
        }

        while ((rc != rp)) {
            /* pull request from ring */
            if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc))
                break;
            ioreq = ioreq_start(blkdev);
            if (NULL == ioreq) {
                /* out of request structs: retry after responses went out */
                blkdev->more_work++;
                break;
            }
            blk_get_request(blkdev, ioreq, rc);
            blkdev->rings.common.req_cons = ++rc;

            /* parse them */
            if (0 != ioreq_parse(ioreq)) {
                /* answer bad requests right away with the error status */
                if (blk_send_response_one(ioreq))
                    xc_evtchn_notify(blkdev->xendev.evtchnd, blkdev->xendev.local_port);
                ioreq_release(ioreq);
                continue;
            }

            if (blkdev->bs) {
                /* run i/o in qemu mode */
                ioreq_runio_qemu(ioreq);
                ioreq_finish(ioreq);
#ifdef HAVE_LIBAIO
            } else if (blkdev->use_aio) {
                /* submit async i/o; on success completion finishes it */
                if (-1 == ioreq_runio_async_submit(ioreq))
                    ioreq_finish(ioreq);
#endif
            } else {
                /* run i/o in sequential mode */
                ioreq_runio_serial(ioreq);
                ioreq_finish(ioreq);
            }
        }
#ifdef HAVE_LIBAIO
        /* collect completed aio without blocking */
        if (blkdev->use_aio)
            ioreq_runio_async_getevents(blkdev, 0);
#endif
        blk_send_response_all(blkdev);
        blk_thread_main_gosleep(blkdev);
    }

    d1printf("%s: cleanup\n", __FUNCTION__);
#ifdef HAVE_LIBAIO
    /* drain everything that is still in flight */
    while (blkdev->use_aio && blkdev->cnt_aio)
        ioreq_runio_async_getevents(blkdev, 1);
#endif
    blk_send_response_all(blkdev);

    d1printf("%s: exit\n", __FUNCTION__);
    return NULL;
}

/* ------------------------------------------------------------- */

#if 0
/*
 * Open 'file' with the effective gid/uid temporarily switched to
 * gid/uid, then switch back to the real ids.  (Currently compiled out
 * by the surrounding #if 0.)
 *
 * Returns the file descriptor, or -1 with errno set to the error of
 * open().  If any of the id switches fails, the descriptor (if already
 * open) is closed and -1 is returned with errno = EPERM.
 */
static int open_as(uid_t uid, gid_t gid, const char *file, int flags)
{
    int fd = -1, err = 0;
    uid_t ruid;
    gid_t rgid;

    ruid = getuid();
    rgid = getgid();

    /* switch the group before the user */
    if (rgid != gid  &&  -1 == setegid(gid)) {
	d1printf("setegid(%d): %s\n", gid, strerror(errno));
	goto fail_id;
    }
    if (ruid != uid  &&  -1 == seteuid(uid)) {
	d1printf("seteuid(%d): %s\n", uid, strerror(errno));
	goto fail_id;
    }

    fd = open(file, flags);
    if (-1 == fd) {
	/* keep errno: the calls below may overwrite it */
	err = errno;
	d1printf("open_as(%d, %d, %s): %s\n", uid, gid, file, strerror(errno));
    }

    /* restore in reverse order: user first, then group */
    if (ruid != uid  &&  -1 == seteuid(ruid)) {
	d1printf("seteuid(%d): %s\n", ruid, strerror(errno));
	goto fail_id;
    }
    if (rgid != gid  &&  -1 == setegid(rgid)) {
	d1printf("setegid(%d): %s\n", rgid, strerror(errno));
	goto fail_id;
    }

    errno = err;
    return fd;

fail_id:
    if (-1 != fd)
	close(fd);
    errno = EPERM;
    return -1;
}
#endif

/*
 * Backend half of device setup.
 *
 * Does nothing unless the device is in state XENDEV_PROBED and online.
 * Reads "params", "mode", "type" and "dev" from the backend xenstore
 * directory; returns quietly while any of them is still missing, so it
 * can simply be called again when xenstore changes.  Once complete it
 * opens the backing store (qemu block driver, or a plain file with
 * optional libaio + O_DIRECT), publishes the disk properties and moves
 * the device to XENDEV_INITIALISED / XenbusStateInitWait.
 *
 * On any failure to open the backing store the function returns with
 * the device still in XENDEV_PROBED.
 */
static void blk_setup_backend(struct blkdev *blkdev)
{
    int mode, qflags, use_qemu, have_barriers, info = 0;
    char *h;

    if (blkdev->xendev.state != XENDEV_PROBED)
        return;
    if (!blkdev->xendev.online)
        return;

    /* read xenstore entries */
    if (NULL == blkdev->params) {
        blkdev->params = read_be_str(&blkdev->xendev, "params");
        /* the node may not exist yet: only split what we actually got */
        if (NULL != blkdev->params) {
            h = strchr(blkdev->params, ':');
            if (NULL != h) {
                /* "<proto>:<filename>": cut in place at the colon */
                blkdev->fileproto = blkdev->params;
                blkdev->filename  = h+1;
                *h = 0;
            } else {
                blkdev->fileproto = "<unset>";
                blkdev->filename  = blkdev->params;
            }
        }
    }
    if (NULL == blkdev->mode)
        blkdev->mode = read_be_str(&blkdev->xendev, "mode");
    if (NULL == blkdev->type)
        blkdev->type = read_be_str(&blkdev->xendev, "type");
    if (NULL == blkdev->dev)
        blkdev->dev  = read_be_str(&blkdev->xendev, "dev");

    /* do we have all we need? */
    if (NULL == blkdev->params ||
        NULL == blkdev->mode   ||
        NULL == blkdev->type   ||
        NULL == blkdev->dev)
        return;

    /* which mode to use? */
    if (0 == strcmp(blkdev->type,      "phy")  ||
        0 == strcmp(blkdev->type,      "file") ||
        0 == strcmp(blkdev->fileproto, "raw")  ||
        0 == strcmp(blkdev->fileproto, "aio"))
        use_qemu = 0;
    else
        use_qemu = 1;

    /* read-only ? */
    if (0 == strcmp(blkdev->mode, "w")) {
        mode   = O_RDWR;
        qflags = BDRV_O_RDWR;
    } else {
        mode   = O_RDONLY;
        qflags = BDRV_O_RDONLY;
        info  |= VDISK_READONLY | VDISK_REMOVABLE | VDISK_CDROM;
    }

    if (use_qemu) {
        /* use qemu block driver */
        blkdev->bs = bdrv_new(blkdev->dev);
        if (blkdev->bs) {
            if (0 != bdrv_open2(blkdev->bs, blkdev->filename, qflags,
                                bdrv_find_format(blkdev->fileproto))) {
                bdrv_delete(blkdev->bs);
                blkdev->bs = NULL;
            }
        }
        if (!blkdev->bs)
            return;
        blkdev->file_blk  = BLOCK_SIZE;
        blkdev->file_size = bdrv_getlength(blkdev->bs);
        have_barriers     = blkdev->bs->drv->bdrv_flush ? 1 : 0;
    } else {
        /* try opening file directly */
#ifdef HAVE_LIBAIO
        if (use_aio) {
            int rc = io_setup(max_requests, &blkdev->ctx);

            if (0 == rc) {
                blkdev->use_aio = 1;
                blkdev->use_aio_vec = use_aio_vec;
                mode |= O_DIRECT;
            } else {
                /* no aio context: fall back to synchronous i/o */
                d1printf("%s: io_setup failed (%d), using synchronous i/o\n",
                         __FUNCTION__, rc);
            }
        }
#endif
        blkdev->file = open(blkdev->filename, mode);
        if (-1 == blkdev->file) {
            d1printf("%s: open %s: %s\n", __FUNCTION__,
                     blkdev->filename, strerror(errno));
#ifdef HAVE_LIBAIO
            /* drop the aio context again so a later retry starts clean */
            if (blkdev->use_aio) {
                io_destroy(blkdev->ctx);
                memset(&blkdev->ctx, 0, sizeof(blkdev->ctx));
                blkdev->use_aio = 0;
            }
#endif
            return;
        }
        blkdev->file_blk  = BLOCK_SIZE;
        blkdev->file_size = lseek(blkdev->file, 0, SEEK_END);
        if (-1 == blkdev->file_size)
            perror("lseek");
        have_barriers     = 1;
    }
    d1printf("%s: type \"%s\", fileproto \"%s\", filename \"%s\","
             " qemu %s, aio %s, size %" PRId64 " (%" PRId64 " MB)\n",
             __FUNCTION__, blkdev->type, blkdev->fileproto, blkdev->filename,
             use_qemu        ? "yes" : "no",
             blkdev->use_aio ? "yes" : "no",
             blkdev->file_size, blkdev->file_size >> 20);

    /* fill info */
    write_be_int(&blkdev->xendev, "feature-barrier", have_barriers);
    write_be_int(&blkdev->xendev, "info",            info);
    write_be_int(&blkdev->xendev, "sector-size",     blkdev->file_blk);
    write_be_int(&blkdev->xendev, "sectors",
                 blkdev->file_size / blkdev->file_blk);
    write_be_str(&blkdev->xendev, "hotplug-status", "connected");

    change_state_xendev(&blkdev->xendev, XenbusStateInitWait);
    blkdev->xendev.state = XENDEV_INITIALISED;
}

/*
 * Frontend half of device setup.
 *
 * Does nothing unless the device is in state XENDEV_INITIALISED.  Reads
 * "ring-ref", "event-channel" and "protocol" from the frontend xenstore
 * directory and returns quietly while any of them is missing.  Then it
 * maps the shared ring, initialises the back ring for the negotiated
 * ABI, binds the event channel, switches to XENDEV_CONNECTED /
 * XenbusStateConnected and starts the worker thread.
 *
 * NOTE(review): "protocol" is required by the early return below, so
 * the later "if (blkdev->xendev.protocol)" test is always true and the
 * BLKIF_PROTOCOL_NATIVE default only applies to unrecognised protocol
 * strings -- confirm that frontends which do not publish "protocol" are
 * meant to be refused.
 * NOTE(review): the results of xc_evtchn_bind_interdomain() and
 * pthread_create() are not checked -- confirm how failures should be
 * handled.
 */
static void blk_connect_frontend(struct blkdev *blkdev)
{
    if (blkdev->xendev.state != XENDEV_INITIALISED)
	return;

    blkdev->ring_ref = read_fe_int(&blkdev->xendev, "ring-ref");
    blkdev->xendev.remote_port = read_fe_int(&blkdev->xendev, "event-channel");
    blkdev->xendev.protocol = read_fe_str(&blkdev->xendev, "protocol");
    /* frontend has not published everything yet */
    if (0 == blkdev->ring_ref ||
	0 == blkdev->xendev.remote_port ||
	NULL == blkdev->xendev.protocol)
	return;

    /* pick the ring ABI from the protocol string */
    blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
    if (blkdev->xendev.protocol) {
        if (0 == strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_32))
            blkdev->protocol = BLKIF_PROTOCOL_X86_32;
        if (0 == strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_64))
            blkdev->protocol = BLKIF_PROTOCOL_X86_64;
    }

    /* map the shared ring page granted by the frontend */
    blkdev->sring = xc_gnttab_map_grant_ref(blkdev->xendev.gnttabdev,
					    blkdev->xendev.dom,
					    blkdev->ring_ref,
					    PROT_READ | PROT_WRITE);
    d2printf("%s: map grant ref %d -> %p\n", __FUNCTION__,
	     blkdev->ring_ref, blkdev->sring);
    if (!blkdev->sring)
	return;
    blkdev->cnt_map++;

    /* set up our side of the ring for the selected ABI */
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
    {
	blkif_sring_t *sring_native = blkdev->sring;
	BACK_RING_INIT(&blkdev->rings.native, sring_native, PAGE_SIZE);
	break;
    }
    case BLKIF_PROTOCOL_X86_32:
    {
	blkif_x86_32_sring_t *sring_x86_32 = blkdev->sring;
	BACK_RING_INIT(&blkdev->rings.x86_32, sring_x86_32, PAGE_SIZE);
	break;
    }
    case BLKIF_PROTOCOL_X86_64:
    {
	blkif_x86_64_sring_t *sring_x86_64 = blkdev->sring;
	BACK_RING_INIT(&blkdev->rings.x86_64, sring_x86_64, PAGE_SIZE);
	break;
    }
    }

    blkdev->xendev.local_port = xc_evtchn_bind_interdomain
	(blkdev->xendev.evtchnd, blkdev->xendev.dom, blkdev->xendev.remote_port);
    xc_evtchn_unmask(blkdev->xendev.evtchnd, blkdev->xendev.local_port);

    d1printf("%s: ok: proto %s, ring-ref %d, "
	     "remote port %d, local port %d\n", __FUNCTION__,
	     blkdev->xendev.protocol, blkdev->ring_ref,
	     blkdev->xendev.remote_port, blkdev->xendev.local_port);

    change_state_xendev(&blkdev->xendev, XenbusStateConnected);
    blkdev->xendev.state = XENDEV_CONNECTED;

    /* the worker loop runs only while the state is XENDEV_CONNECTED,
     * so the state is set before the thread starts */
    pthread_create(&blkdev->mainthread, NULL, blk_thread_main, blkdev);
}

/*
 * Tear down a device: stop the worker thread, destroy the aio context,
 * close the backing store, unbind the event channel and unmap the
 * shared ring.  The teardown is skipped if the device is already in
 * state XENDEV_DISCONNECTED.
 *
 * state: xenbus state to switch to afterwards; 0 leaves the xenbus
 *        state untouched.
 */
static void blk_disconnect(struct blkdev *blkdev, enum xenbus_state state)
{
    if (blkdev->xendev.state == XENDEV_DISCONNECTED)
        goto out;
    /* the worker loop exits once it sees the state change */
    blkdev->xendev.state = XENDEV_DISCONNECTED;

    if (blkdev->mainthread) {
        /* SIGUSR2 is the same wake-up signal blk_check_ring() sends */
        pthread_kill(blkdev->mainthread, SIGUSR2);
        pthread_join(blkdev->mainthread, NULL);
        /*
         * The thread id is dead after the join.  Clear it so that
         * blk_check_ring() and a later disconnect do not signal or
         * join it again.
         */
        memset(&blkdev->mainthread, 0, sizeof(blkdev->mainthread));
    }

#ifdef HAVE_LIBAIO
    if (blkdev->use_aio) {
        io_destroy(blkdev->ctx);
        memset(&blkdev->ctx, 0, sizeof(blkdev->ctx));
        blkdev->use_aio = 0;
    }
#endif
    if (-1 != blkdev->file) {
        close(blkdev->file);
        blkdev->file = -1;
    }
    if (blkdev->bs) {
        bdrv_close(blkdev->bs);
        bdrv_delete(blkdev->bs);
        blkdev->bs = NULL;
    }
    if (blkdev->xendev.local_port) {
        xc_evtchn_unbind(blkdev->xendev.evtchnd, blkdev->xendev.local_port);
        blkdev->xendev.local_port = 0;
    }

    if (blkdev->sring) {
        d2printf("%s: unmap grant %p\n", __FUNCTION__, blkdev->sring);
        xc_gnttab_munmap(blkdev->xendev.gnttabdev, blkdev->sring, 1);
        blkdev->cnt_map--;
        blkdev->sring = NULL;
    }

out:
    if (state)
        change_state_xendev(&blkdev->xendev, state);
}

/*
 * Ask the worker thread to look at the ring: flag pending work and
 * send it SIGUSR2.  No-op when no worker thread is recorded.
 */
static void blk_check_ring(struct blkdev *blkdev)
{
    if (!blkdev->mainthread)
        return;

    blkdev->more_work++;
    pthread_kill(blkdev->mainthread, SIGUSR2);
}

/*
 * Initialise a new device: mark it XENDEV_PROBED, mark the backing file
 * as not open and set up the empty request lists.  Always returns 0.
 */
static int blk_alloc(struct xendev *xendev)
{
    struct blkdev *blkdev = container_of(xendev, struct blkdev, xendev);

    d1printf("%s: %p\n", __FUNCTION__, blkdev);
    blkdev->xendev.state = XENDEV_PROBED;

    /*
     * blk_disconnect() closes any descriptor other than -1.  Start out
     * with -1 so a device that never opened a plain file (qemu mode, or
     * torn down before setup) cannot close an unrelated descriptor.
     */
    blkdev->file = -1;

    INIT_LIST_HEAD(&blkdev->inflight);
    INIT_LIST_HEAD(&blkdev->finished);
    INIT_LIST_HEAD(&blkdev->freelist);
    return 0;
}

/*
 * Run both setup halves (backend, then frontend) and wake the worker
 * thread if there is one.  Always returns 0.
 */
static int blk_setup(struct xendev *xendev)
{
    struct blkdev *dev = container_of(xendev, struct blkdev, xendev);

    d1printf("%s: %p\n", __FUNCTION__, dev);

    blk_setup_backend(dev);
    blk_connect_frontend(dev);
    blk_check_ring(dev);

    return 0;
}

/*
 * Called when a backend xenstore node changed: log the change and retry
 * the backend setup.  Always returns 0.
 */
static int blk_backend(struct xendev *xendev, char *node, char *val)
{
    struct blkdev *dev = container_of(xendev, struct blkdev, xendev);
    const char *shown = val ? val : "<deleted>";

    d2printf("%s: %p \"%s\" = \"%s\"\n", __FUNCTION__, dev, node,
             shown);

    blk_setup_backend(dev);
    return 0;
}

/*
 * devops .xs_fe hook (see blkdev_ops): log the node/value pair, then
 * act on the frontend's current xenbus state (xendev->fe_state).
 * A NULL val is logged as "<deleted>".  Always returns 0.
 */
static int blk_frontend(struct xendev *xendev, char *node, char *val)
{
    struct blkdev *dev = container_of(xendev, struct blkdev, xendev);
    const char *shown = val;

    if (!shown)
        shown = "<deleted>";
    d2printf("%s: %p \"%s\" = \"%s\"\n", __FUNCTION__, dev, node, shown);

    switch (xendev->fe_state) {
    case XenbusStateUnknown:
    case XenbusStateInitWait:
        /* nothing to do; listed explicitly to keep gcc happy */
        break;

    case XenbusStateInitialising:
        /* only a disconnected backend is re-initialised for another frontend */
        if (dev->xendev.state != XENDEV_DISCONNECTED)
            break;
        dev->xendev.state = XENDEV_PROBED;
        blk_setup_backend(dev);
        break;

    case XenbusStateConnected:
    case XenbusStateInitialised:
        blk_connect_frontend(dev);
        blk_check_ring(dev);
        break;

    case XenbusStateClosed:
    case XenbusStateClosing:
        /* hand the frontend's state on to blk_disconnect() */
        blk_disconnect(dev, xendev->fe_state);
        break;
    }

    return 0;
}

/*
 * Derive per-second rates from two counter snapshots and track peaks.
 *
 * Each struct blkstats is walked as a flat array of uint64_t counters
 * (sizeof(struct blkstats) / sizeof(uint64_t) entries).  For every
 * counter:
 *     rate = (now - last) * 1000 / msecs
 * and peak is raised to rate whenever rate exceeds it.
 *
 *   now, last - current and previous counter snapshots (only read)
 *   rate      - receives the computed rates
 *   peak      - running maxima, updated in place
 *   msecs     - time between the two snapshots, in milliseconds
 *
 * A non-positive msecs cannot yield a meaningful rate and a zero one
 * would divide by zero (e.g. two stats calls within the same
 * millisecond), so in that case rate and peak are left untouched.
 */
static void blk_rates(struct blkstats *now, struct blkstats *last,
                      struct blkstats *rate, struct blkstats *peak,
                      int msecs)
{
    uint64_t *n = (uint64_t*) now;
    uint64_t *l = (uint64_t*) last;
    uint64_t *r = (uint64_t*) rate;
    uint64_t *p = (uint64_t*) peak;
    int i, count = sizeof(struct blkstats) / sizeof(uint64_t);

    if (msecs <= 0)
        return;

    for (i = 0; i < count; i++) {
        r[i] = (n[i] - l[i]) * 1000 / msecs;
        if (p[i] < r[i])
            p[i] = r[i];
    }
}

/*
 * Log one line of i/o figures at debug level 1.
 *
 *   type  - label printed after "i/o" (callers in this file pass
 *           "rate" or "PEAK")
 *   st    - figures to print; the byte counters are divided by 1024
 *   msecs - interval the figures refer to, printed as seconds with
 *           millisecond precision
 *
 * The struct blkstats counters are uint64_t, so they are printed with
 * PRIu64 rather than the signed PRId64.
 */
static void blk_print_rate(char *type, struct blkstats *st, int msecs)
{
    d1printf("  i/o %s, kB/s:"
             "  rd %5" PRIu64 ","
             "  wr %5" PRIu64 "  |"
             "  wake %3" PRIu64 ","
             "  aiow %3" PRIu64 ","
             "  aiof %3" PRIu64 "  |"
             "  %2d.%03d sec\n",
             type,
             st->bytes_read  / 1024,
             st->bytes_write / 1024,
             st->wakeups,
             st->aio_wait,
             st->aio_full,
             msecs / 1000, msecs % 1000);
}

/*
 * devops .stats hook (see blkdev_ops): write the device's counters to fp.
 *
 * The cumulative counters are always printed.  Once a previous call has
 * recorded a timestamp (tv_prev.tv_sec != 0), the rates since that call
 * and the peaks are computed via blk_rates(), printed to fp and also
 * logged through blk_print_rate(); every 10th such report additionally
 * logs the peaks.  Finally the current time and counters are saved as
 * the baseline for the next call.
 *
 * All struct blkstats counters are uint64_t and therefore printed with
 * PRIu64 instead of the signed PRId64.
 */
static void blk_stats(struct xendev *xendev, FILE *fp)
{
    struct blkdev *blkdev = container_of(xendev, struct blkdev, xendev);
    struct blkstats *st = &blkdev->st;
    struct timeval tv;

    fprintf(fp,
            "    read : bytes %" PRIu64 " requests %" PRIu64 "\n"
            "    write: bytes %" PRIu64 " requests %" PRIu64 " barrier %" PRIu64 "\n"
            "    error: parse %" PRIu64 " io %" PRIu64 "\n"
            "    queue: max %d/%d full %" PRIu64 "\n",
            st->bytes_read, st->req_read,
            st->bytes_write, st->req_write, st->req_barrier,
            st->req_err_parse, st->req_err_io,
            blkdev->requests, max_requests, st->req_full);

    gettimeofday(&tv, NULL);
    if (blkdev->tv_prev.tv_sec) {
        uint32_t msecs = timediff_msecs(&tv, &blkdev->tv_prev);

        blk_rates(st, &blkdev->st_prev, &blkdev->st_rate, &blkdev->st_peak, msecs);
        fprintf(fp,
                "    rate : read %" PRIu64 " write %" PRIu64 " wake %" PRIu64
                " aiow %" PRIu64 " aiof %" PRIu64 "\n"
                "    peak : read %" PRIu64 " write %" PRIu64 " wake %" PRIu64
                " aiow %" PRIu64 " aiof %" PRIu64 "\n",
                blkdev->st_rate.bytes_read,
                blkdev->st_rate.bytes_write,
                blkdev->st_rate.wakeups,
                blkdev->st_rate.aio_wait,
                blkdev->st_rate.aio_full,
                blkdev->st_peak.bytes_read,
                blkdev->st_peak.bytes_write,
                blkdev->st_peak.wakeups,
                blkdev->st_peak.aio_wait,
                blkdev->st_peak.aio_full);
        blk_print_rate("rate", &blkdev->st_rate, msecs);
        /* peaks go to the debug log on every 10th rate report only */
        if (!(++blkdev->st_count % 10))
            blk_print_rate("PEAK", &blkdev->st_peak, 0);
    }

    /* baseline for the next call */
    blkdev->tv_prev = tv;
    blkdev->st_prev = *st;
}

/*
 * devops .free hook (see blkdev_ops): disconnect the device and release
 * the memory it owns -- every ioreq on the freelist plus the params and
 * mode strings.  Always returns 0.
 */
static int blk_free(struct xendev *xendev)
{
    struct blkdev *blkdev = container_of(xendev, struct blkdev, xendev);
    struct list_head *item, *safe;

    d1printf("%s: %p\n", __FUNCTION__, blkdev);

    /* state 0 makes blk_disconnect() skip its final xenbus state change */
    blk_disconnect(blkdev, 0);

    /* _safe iteration: each entry is unlinked and freed while walking */
    list_for_each_safe(item, safe, &blkdev->freelist) {
        struct ioreq *ioreq = list_entry(item, struct ioreq, list);

        list_del(&ioreq->list);
        free(ioreq);
    }

    /* free(NULL) is a no-op, so no guards; clear to avoid dangling pointers */
    free(blkdev->params);
    blkdev->params = NULL;
    free(blkdev->mode);
    blkdev->mode = NULL;

    return 0;
}

/*
 * Callback table for the block backend; main() hands it to mainloop().
 * NOTE(review): .size is presumably used by the generic backend code to
 * allocate the per-device struct blkdev (which embeds struct xendev) --
 * confirm against mainloop()'s implementation.
 */
static struct devops blkdev_ops = {
    .size  = sizeof(struct blkdev),
    .alloc = blk_alloc,
    .setup = blk_setup,
    .xs_be = blk_backend,
    .xs_fe = blk_frontend,
    .stats = blk_stats,
    .free  = blk_free,
};

/* ------------------------------------------------------------- */
/* for qemu bits                                                 */

/*
 * printf-style output to stdout.  fmt and the variadic arguments follow
 * the usual printf conventions.
 */
void term_printf(const char *fmt, ...)
{
    va_list args;

    va_start(args, fmt);
    vfprintf(stdout, fmt, args);
    va_end(args);
}

/*
 * Print a file name verbatim via term_printf().
 *
 * The name is passed as an argument to a fixed "%s" format, never as
 * the format string itself: a file name may contain '%', which would
 * otherwise be interpreted as a conversion specification.
 */
void term_print_filename(const char *filename)
{
    term_printf("%s", filename);
}

/* ------------------------------------------------------------- */

/*
 * Write the command line help to fp.  The current values of be_name,
 * stat_secs and max_requests are shown in brackets next to the options
 * that set them.
 */
static void usage(FILE *fp)
{
    fprintf(fp,
            "\n"
            "blkbackd --  xenner block backend daemon\n"
            "\n"
            "usage: blkbackd [options]\n"
            "options:\n"
            "   -h            print this text\n"
            "   -d            increase debuglevel\n"
            "   -p <file>     specify pidfile\n"
            "   -l <file>     specify logfile\n"
            "   -n <name>     backend name                  [%s]\n"
            "   -i <secs>     stats interval                [%d]\n"
            "   -s            do synchronous writes\n"
            "   -b            batch grant table mappings\n"
#ifdef HAVE_LIBAIO
            "   -a            enable aio\n"
            "   -A            enable aio, but don't use vectored aio\n"
#else
            "   -a / -A       no effect (compiled without aio)\n"
#endif
            "   -m <nr>       max nr of requests (per dev)  [%d]\n"
            "\n"
            "-- \n"
            "(c) 2007 Gerd Hoffmann <kraxel@redhat.com>\n"
            "\n",
            be_name, stat_secs,
            max_requests);
}

/*
 * Parse a decimal integer option argument.
 *
 * Returns 0 and stores the value in *out on success; returns -1 when
 * the text holds no digits, has trailing characters, or does not fit
 * into an int.  *out is only written on success.
 */
static int parse_int_arg(const char *text, int *out)
{
    char *end;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE)
        return -1;
    /* round-trip through int to reject values outside the int range */
    if ((long)(int)value != value)
        return -1;

    *out = (int)value;
    return 0;
}

/*
 * Entry point: parse the command line into the file-level settings,
 * initialise the qemu block layer and run the backend main loop.
 *
 * Exits with status 0 after -h, with status 1 on an unknown option or
 * a malformed numeric argument; otherwise returns mainloop()'s result.
 */
int main(int argc, char *argv[])
{
    int c;
    int number;

    /* "n:" added: usage() has always advertised -n <name> */
    while ((c = getopt(argc, argv, "hdsaAbp:m:l:i:n:")) != -1) {
        switch (c) {
        case 'd':
            debug++;
            break;
        case 's':
            syncwrite = 1;
            break;
        case 'a':
            use_aio = 1;
            use_aio_vec = 1;
            break;
        case 'A':
            use_aio = 1;
            use_aio_vec = 0;
            break;
        case 'b':
            batch_maps = 1;
            break;

        case 'n':
            /* NOTE(review): assumes be_name is an assignable char pointer */
            be_name = optarg;
            break;
        case 'i':
            if (parse_int_arg(optarg, &number) != 0) {
                fprintf(stderr, "invalid stats interval: %s\n", optarg);
                exit(1);
            }
            stat_secs = number;
            break;
        case 'p':
            pidfile = optarg;
            break;
        case 'l':
            log_setfile(optarg);
            break;
        case 'm':
            if (parse_int_arg(optarg, &number) != 0) {
                fprintf(stderr, "invalid max nr of requests: %s\n", optarg);
                exit(1);
            }
            max_requests = number;
            break;

        case 'h':
            usage(stdout);
            exit(0);
        default:
            usage(stderr);
            exit(1);
        }
    }

    bdrv_init();
    return mainloop(&blkdev_ops, be_name, stat_secs);
}
