Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block: (113 commits) cfq-iosched: Do not access cfqq after freeing it block: include linux/err.h to use ERR_PTR cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit blkio: Allow CFQ group IO scheduling even when CFQ is a module blkio: Implement dynamic io controlling policy registration blkio: Export some symbols from blkio as its user CFQ can be a module block: Fix io_context leak after failure of clone with CLONE_IO block: Fix io_context leak after clone with CLONE_IO cfq-iosched: make nonrot check logic consistent io controller: quick fix for blk-cgroup and modular CFQ cfq-iosched: move IO controller declerations to a header file cfq-iosched: fix compile problem with !CONFIG_CGROUP blkio: Documentation blkio: Wait on sync-noidle queue even if rq_noidle = 1 blkio: Implement group_isolation tunable blkio: Determine async workload length based on total number of queues blkio: Wait for cfq queue to get backlogged if group is empty blkio: Propagate cgroup weight updation to cfq groups blkio: Drop the reference to queue once the task changes cgroup blkio: Provide some isolation between groups ...
This commit is contained in:
@@ -331,4 +331,17 @@ static inline int bdi_sched_wait(void *word)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
|
||||
struct page *page)
|
||||
{
|
||||
if (bdi && bdi->unplug_io_fn)
|
||||
bdi->unplug_io_fn(bdi, page);
|
||||
}
|
||||
|
||||
static inline void blk_run_address_space(struct address_space *mapping)
|
||||
{
|
||||
if (mapping)
|
||||
blk_run_backing_dev(mapping->backing_dev_info, NULL);
|
||||
}
|
||||
|
||||
#endif /* _LINUX_BACKING_DEV_H */
|
||||
|
||||
+14
-6
@@ -391,6 +391,18 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
|
||||
gfp_t, int);
|
||||
extern void bio_set_pages_dirty(struct bio *bio);
|
||||
extern void bio_check_pages_dirty(struct bio *bio);
|
||||
|
||||
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
|
||||
# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
|
||||
#endif
|
||||
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
|
||||
extern void bio_flush_dcache_pages(struct bio *bi);
|
||||
#else
|
||||
static inline void bio_flush_dcache_pages(struct bio *bi)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
|
||||
unsigned long, unsigned int, int, gfp_t);
|
||||
extern struct bio *bio_copy_user_iov(struct request_queue *,
|
||||
@@ -450,11 +462,8 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;
|
||||
/*
|
||||
* remember never ever reenable interrupts between a bvec_kmap_irq and
|
||||
* bvec_kunmap_irq!
|
||||
*
|
||||
* This function MUST be inlined - it plays with the CPU interrupt flags.
|
||||
*/
|
||||
static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
|
||||
unsigned long *flags)
|
||||
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
|
||||
{
|
||||
unsigned long addr;
|
||||
|
||||
@@ -470,8 +479,7 @@ static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
|
||||
return (char *) addr + bvec->bv_offset;
|
||||
}
|
||||
|
||||
static __always_inline void bvec_kunmap_irq(char *buffer,
|
||||
unsigned long *flags)
|
||||
static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
|
||||
{
|
||||
unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
|
||||
|
||||
|
||||
+43
-13
@@ -312,13 +312,17 @@ struct queue_limits {
|
||||
unsigned int io_min;
|
||||
unsigned int io_opt;
|
||||
unsigned int max_discard_sectors;
|
||||
unsigned int discard_granularity;
|
||||
unsigned int discard_alignment;
|
||||
|
||||
unsigned short logical_block_size;
|
||||
unsigned short max_hw_segments;
|
||||
unsigned short max_phys_segments;
|
||||
|
||||
unsigned char misaligned;
|
||||
unsigned char discard_misaligned;
|
||||
unsigned char no_cluster;
|
||||
signed char discard_zeroes_data;
|
||||
};
|
||||
|
||||
struct request_queue
|
||||
@@ -749,6 +753,17 @@ struct req_iterator {
|
||||
#define rq_iter_last(rq, _iter) \
|
||||
(_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1)
|
||||
|
||||
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
|
||||
# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
|
||||
#endif
|
||||
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
|
||||
extern void rq_flush_dcache_pages(struct request *rq);
|
||||
#else
|
||||
static inline void rq_flush_dcache_pages(struct request *rq)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
extern int blk_register_queue(struct gendisk *disk);
|
||||
extern void blk_unregister_queue(struct gendisk *disk);
|
||||
extern void register_disk(struct gendisk *dev);
|
||||
@@ -823,19 +838,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
|
||||
return bdev->bd_disk->queue;
|
||||
}
|
||||
|
||||
static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
|
||||
struct page *page)
|
||||
{
|
||||
if (bdi && bdi->unplug_io_fn)
|
||||
bdi->unplug_io_fn(bdi, page);
|
||||
}
|
||||
|
||||
static inline void blk_run_address_space(struct address_space *mapping)
|
||||
{
|
||||
if (mapping)
|
||||
blk_run_backing_dev(mapping->backing_dev_info, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* blk_rq_pos() : the current sector
|
||||
* blk_rq_bytes() : bytes left in the entire request
|
||||
@@ -1134,6 +1136,34 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
|
||||
return q->limits.alignment_offset;
|
||||
}
|
||||
|
||||
static inline int queue_discard_alignment(struct request_queue *q)
|
||||
{
|
||||
if (q->limits.discard_misaligned)
|
||||
return -1;
|
||||
|
||||
return q->limits.discard_alignment;
|
||||
}
|
||||
|
||||
static inline int queue_sector_discard_alignment(struct request_queue *q,
|
||||
sector_t sector)
|
||||
{
|
||||
return ((sector << 9) - q->limits.discard_alignment)
|
||||
& (q->limits.discard_granularity - 1);
|
||||
}
|
||||
|
||||
static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
|
||||
{
|
||||
if (q->limits.discard_zeroes_data == 1)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
|
||||
{
|
||||
return queue_discard_zeroes_data(bdev_get_queue(bdev));
|
||||
}
|
||||
|
||||
static inline int queue_dma_alignment(struct request_queue *q)
|
||||
{
|
||||
return q ? q->dma_alignment : 511;
|
||||
|
||||
@@ -60,3 +60,9 @@ SUBSYS(net_cls)
|
||||
#endif
|
||||
|
||||
/* */
|
||||
|
||||
#ifdef CONFIG_BLK_CGROUP
|
||||
SUBSYS(blkio)
|
||||
#endif
|
||||
|
||||
/* */
|
||||
|
||||
@@ -43,6 +43,8 @@
|
||||
#define CN_DST_VAL 0x1
|
||||
#define CN_IDX_DM 0x7 /* Device Mapper */
|
||||
#define CN_VAL_DM_USERSPACE_LOG 0x1
|
||||
#define CN_IDX_DRBD 0x8
|
||||
#define CN_VAL_DRBD 0x1
|
||||
|
||||
#define CN_NETLINK_USERS 8
|
||||
|
||||
|
||||
@@ -0,0 +1,343 @@
|
||||
/*
|
||||
drbd.h
|
||||
Kernel module for 2.6.x Kernels
|
||||
|
||||
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
|
||||
|
||||
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
|
||||
Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
|
||||
Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
|
||||
|
||||
drbd is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2, or (at your option)
|
||||
any later version.
|
||||
|
||||
drbd is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with drbd; see the file COPYING. If not, write to
|
||||
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
*/
|
||||
#ifndef DRBD_H
|
||||
#define DRBD_H
|
||||
#include <linux/connector.h>
|
||||
#include <asm/types.h>
|
||||
|
||||
#ifdef __KERNEL__
|
||||
#include <linux/types.h>
|
||||
#include <asm/byteorder.h>
|
||||
#else
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <limits.h>
|
||||
|
||||
/* Although the Linux source code makes a difference between
|
||||
generic endianness and the bitfields' endianness, there is no
|
||||
architecture as of Linux-2.6.24-rc4 where the bitfields' endianness
|
||||
does not match the generic endianness. */
|
||||
|
||||
#if __BYTE_ORDER == __LITTLE_ENDIAN
|
||||
#define __LITTLE_ENDIAN_BITFIELD
|
||||
#elif __BYTE_ORDER == __BIG_ENDIAN
|
||||
#define __BIG_ENDIAN_BITFIELD
|
||||
#else
|
||||
# error "sorry, weird endianness on this box"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
extern const char *drbd_buildtag(void);
|
||||
#define REL_VERSION "8.3.6"
|
||||
#define API_VERSION 88
|
||||
#define PRO_VERSION_MIN 86
|
||||
#define PRO_VERSION_MAX 91
|
||||
|
||||
|
||||
/* Policy choices for reacting to an I/O error.
 * NOTE(review): presumably the value carried by the on_io_error setting,
 * whose default DRBD_ON_IO_ERROR_DEF is EP_PASS_ON - confirm against users. */
enum drbd_io_error_p {
|
||||
EP_PASS_ON, /* FIXME should this better be named "Ignore"? */
|
||||
EP_CALL_HELPER,
|
||||
EP_DETACH
|
||||
};
|
||||
|
||||
enum drbd_fencing_p {
|
||||
FP_DONT_CARE,
|
||||
FP_RESOURCE,
|
||||
FP_STONITH
|
||||
};
|
||||
|
||||
enum drbd_disconnect_p {
|
||||
DP_RECONNECT,
|
||||
DP_DROP_NET_CONF,
|
||||
DP_FREEZE_IO
|
||||
};
|
||||
|
||||
enum drbd_after_sb_p {
|
||||
ASB_DISCONNECT,
|
||||
ASB_DISCARD_YOUNGER_PRI,
|
||||
ASB_DISCARD_OLDER_PRI,
|
||||
ASB_DISCARD_ZERO_CHG,
|
||||
ASB_DISCARD_LEAST_CHG,
|
||||
ASB_DISCARD_LOCAL,
|
||||
ASB_DISCARD_REMOTE,
|
||||
ASB_CONSENSUS,
|
||||
ASB_DISCARD_SECONDARY,
|
||||
ASB_CALL_HELPER,
|
||||
ASB_VIOLENTLY
|
||||
};
|
||||
|
||||
/* KEEP the order, do not delete or insert. Only append. */
|
||||
enum drbd_ret_codes {
|
||||
ERR_CODE_BASE = 100,
|
||||
NO_ERROR = 101,
|
||||
ERR_LOCAL_ADDR = 102,
|
||||
ERR_PEER_ADDR = 103,
|
||||
ERR_OPEN_DISK = 104,
|
||||
ERR_OPEN_MD_DISK = 105,
|
||||
ERR_DISK_NOT_BDEV = 107,
|
||||
ERR_MD_NOT_BDEV = 108,
|
||||
ERR_DISK_TO_SMALL = 111,
|
||||
ERR_MD_DISK_TO_SMALL = 112,
|
||||
ERR_BDCLAIM_DISK = 114,
|
||||
ERR_BDCLAIM_MD_DISK = 115,
|
||||
ERR_MD_IDX_INVALID = 116,
|
||||
ERR_IO_MD_DISK = 118,
|
||||
ERR_MD_INVALID = 119,
|
||||
ERR_AUTH_ALG = 120,
|
||||
ERR_AUTH_ALG_ND = 121,
|
||||
ERR_NOMEM = 122,
|
||||
ERR_DISCARD = 123,
|
||||
ERR_DISK_CONFIGURED = 124,
|
||||
ERR_NET_CONFIGURED = 125,
|
||||
ERR_MANDATORY_TAG = 126,
|
||||
ERR_MINOR_INVALID = 127,
|
||||
ERR_INTR = 129, /* EINTR */
|
||||
ERR_RESIZE_RESYNC = 130,
|
||||
ERR_NO_PRIMARY = 131,
|
||||
ERR_SYNC_AFTER = 132,
|
||||
ERR_SYNC_AFTER_CYCLE = 133,
|
||||
ERR_PAUSE_IS_SET = 134,
|
||||
ERR_PAUSE_IS_CLEAR = 135,
|
||||
ERR_PACKET_NR = 137,
|
||||
ERR_NO_DISK = 138,
|
||||
ERR_NOT_PROTO_C = 139,
|
||||
ERR_NOMEM_BITMAP = 140,
|
||||
ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */
|
||||
ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */
|
||||
ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */
|
||||
ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */
|
||||
ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */
|
||||
ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */
|
||||
ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */
|
||||
ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */
|
||||
ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */
|
||||
ERR_DATA_NOT_CURRENT = 150,
|
||||
ERR_CONNECTED = 151, /* DRBD 8.3 only */
|
||||
ERR_PERM = 152,
|
||||
|
||||
/* insert new ones above this line */
|
||||
AFTER_LAST_ERR_CODE
|
||||
};
|
||||
|
||||
#define DRBD_PROT_A 1
|
||||
#define DRBD_PROT_B 2
|
||||
#define DRBD_PROT_C 3
|
||||
|
||||
enum drbd_role {
|
||||
R_UNKNOWN = 0,
|
||||
R_PRIMARY = 1, /* role */
|
||||
R_SECONDARY = 2, /* role */
|
||||
R_MASK = 3,
|
||||
};
|
||||
|
||||
/* The order of these constants is important.
|
||||
* The lower ones (<C_WF_REPORT_PARAMS) indicate
|
||||
* that there is no socket!
|
||||
* >=C_WF_REPORT_PARAMS ==> There is a socket
|
||||
*/
|
||||
/* Connection states; the enumerators rely on their implicit ordering
 * (range comparisons such as >= C_UNCONNECTED are used on them). */
enum drbd_conns {
|
||||
C_STANDALONE,
|
||||
C_DISCONNECTING, /* Temporary state on the way to StandAlone. */
|
||||
C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */
|
||||
|
||||
/* These temporary states are all used on the way
|
||||
* from >= C_CONNECTED to Unconnected.
|
||||
* The 'disconnect reason' states
|
||||
* I do not allow changing between them. */
|
||||
C_TIMEOUT,
|
||||
C_BROKEN_PIPE,
|
||||
C_NETWORK_FAILURE,
|
||||
C_PROTOCOL_ERROR,
|
||||
C_TEAR_DOWN,
|
||||
|
||||
C_WF_CONNECTION,
|
||||
C_WF_REPORT_PARAMS, /* we have a socket */
|
||||
C_CONNECTED, /* we have introduced each other */
|
||||
C_STARTING_SYNC_S, /* starting full sync by admin request. */
|
||||
C_STARTING_SYNC_T, /* starting full sync by admin request. */
|
||||
C_WF_BITMAP_S,
|
||||
C_WF_BITMAP_T,
|
||||
C_WF_SYNC_UUID,
|
||||
|
||||
/* All SyncStates are tested with this comparison
|
||||
* xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */
|
||||
C_SYNC_SOURCE,
|
||||
C_SYNC_TARGET,
|
||||
C_VERIFY_S,
|
||||
C_VERIFY_T,
|
||||
C_PAUSED_SYNC_S,
|
||||
C_PAUSED_SYNC_T,
|
||||
C_MASK = 31 /* largest 5-bit value; presumably the mask for the 5-bit conn field of union drbd_state - confirm */
|
||||
};
|
||||
|
||||
/* Disk states, in ascending order from D_DISKLESS to D_UP_TO_DATE;
 * the order matters because states are compared with >= (see D_FAILED). */
enum drbd_disk_state {
|
||||
D_DISKLESS,
|
||||
D_ATTACHING, /* In the process of reading the meta-data */
|
||||
D_FAILED, /* Becomes D_DISKLESS as soon as we told the peer about it */
|
||||
/* when >= D_FAILED it is legal to access mdev->bc */
|
||||
D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */
|
||||
D_INCONSISTENT,
|
||||
D_OUTDATED,
|
||||
D_UNKNOWN, /* Only used for the peer, never for myself */
|
||||
D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */
|
||||
D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */
|
||||
D_MASK = 15 /* largest 4-bit value; presumably the mask for the 4-bit disk/pdsk fields of union drbd_state - confirm */
|
||||
};
|
||||
|
||||
/* Device state packed into bit-fields (2+2+5+4+4+1+1+1+1+11 = 32 bits)
 * and overlaid on the plain integer `i`, so the whole state can be read
 * or written as one value. The field order is mirrored between the
 * little-endian and big-endian bit-field layouts. */
union drbd_state {
|
||||
/* According to gcc's docs:
|
||||
* The order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1).
|
||||
* Determined by ABI.
|
||||
* pointed out by Maxim Uvarov <muvarov@ru.mvista.com>
|
||||
* even though we transmit as "cpu_to_be32(state)",
|
||||
* the offsets of the bitfields still need to be swapped
|
||||
* on different endianness.
|
||||
*/
|
||||
struct {
|
||||
#if defined(__LITTLE_ENDIAN_BITFIELD)
|
||||
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
|
||||
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
|
||||
unsigned conn:5 ; /* 17/32 cstates */
|
||||
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
|
||||
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
|
||||
unsigned susp:1 ; /* 2/2 IO suspended no/yes */
|
||||
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
|
||||
unsigned peer_isp:1 ;
|
||||
unsigned user_isp:1 ;
|
||||
unsigned _pad:11; /* 0 unused */
|
||||
#elif defined(__BIG_ENDIAN_BITFIELD)
|
||||
unsigned _pad:11; /* 0 unused */
|
||||
unsigned user_isp:1 ;
|
||||
unsigned peer_isp:1 ;
|
||||
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
|
||||
unsigned susp:1 ; /* 2/2 IO suspended no/yes */
|
||||
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
|
||||
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
|
||||
unsigned conn:5 ; /* 17/32 cstates */
|
||||
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
|
||||
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
|
||||
#else
|
||||
# error "this endianess is not supported"
|
||||
#endif
|
||||
};
|
||||
unsigned int i;
|
||||
};
|
||||
|
||||
enum drbd_state_ret_codes {
|
||||
SS_CW_NO_NEED = 4,
|
||||
SS_CW_SUCCESS = 3,
|
||||
SS_NOTHING_TO_DO = 2,
|
||||
SS_SUCCESS = 1,
|
||||
SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */
|
||||
SS_TWO_PRIMARIES = -1,
|
||||
SS_NO_UP_TO_DATE_DISK = -2,
|
||||
SS_NO_LOCAL_DISK = -4,
|
||||
SS_NO_REMOTE_DISK = -5,
|
||||
SS_CONNECTED_OUTDATES = -6,
|
||||
SS_PRIMARY_NOP = -7,
|
||||
SS_RESYNC_RUNNING = -8,
|
||||
SS_ALREADY_STANDALONE = -9,
|
||||
SS_CW_FAILED_BY_PEER = -10,
|
||||
SS_IS_DISKLESS = -11,
|
||||
SS_DEVICE_IN_USE = -12,
|
||||
SS_NO_NET_CONFIG = -13,
|
||||
SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */
|
||||
SS_NEED_CONNECTION = -15, /* drbd-8.2 only */
|
||||
SS_LOWER_THAN_OUTDATED = -16,
|
||||
SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
|
||||
SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
|
||||
SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
|
||||
SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
|
||||
};
|
||||
|
||||
/* from drbd_strings.c */
|
||||
extern const char *drbd_conn_str(enum drbd_conns);
|
||||
extern const char *drbd_role_str(enum drbd_role);
|
||||
extern const char *drbd_disk_str(enum drbd_disk_state);
|
||||
extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes);
|
||||
|
||||
#define SHARED_SECRET_MAX 64
|
||||
|
||||
#define MDF_CONSISTENT (1 << 0)
|
||||
#define MDF_PRIMARY_IND (1 << 1)
|
||||
#define MDF_CONNECTED_IND (1 << 2)
|
||||
#define MDF_FULL_SYNC (1 << 3)
|
||||
#define MDF_WAS_UP_TO_DATE (1 << 4)
|
||||
#define MDF_PEER_OUT_DATED (1 << 5)
|
||||
#define MDF_CRASHED_PRIMARY (1 << 6)
|
||||
|
||||
enum drbd_uuid_index {
|
||||
UI_CURRENT,
|
||||
UI_BITMAP,
|
||||
UI_HISTORY_START,
|
||||
UI_HISTORY_END,
|
||||
UI_SIZE, /* nl-packet: number of dirty bits */
|
||||
UI_FLAGS, /* nl-packet: flags */
|
||||
UI_EXTENDED_SIZE /* Everything. */
|
||||
};
|
||||
|
||||
enum drbd_timeout_flag {
|
||||
UT_DEFAULT = 0,
|
||||
UT_DEGRADED = 1,
|
||||
UT_PEER_OUTDATED = 2,
|
||||
};
|
||||
|
||||
#define UUID_JUST_CREATED ((__u64)4)
|
||||
|
||||
#define DRBD_MAGIC 0x83740267
|
||||
#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
|
||||
|
||||
/* these are of type "int" */
|
||||
#define DRBD_MD_INDEX_INTERNAL -1
|
||||
#define DRBD_MD_INDEX_FLEX_EXT -2
|
||||
#define DRBD_MD_INDEX_FLEX_INT -3
|
||||
|
||||
/* Start of the new netlink/connector stuff */
|
||||
|
||||
#define DRBD_NL_CREATE_DEVICE 0x01
|
||||
#define DRBD_NL_SET_DEFAULTS 0x02
|
||||
|
||||
|
||||
/* For searching a vacant cn_idx value */
|
||||
#define CN_IDX_STEP 6977
|
||||
|
||||
struct drbd_nl_cfg_req {
|
||||
int packet_type;
|
||||
unsigned int drbd_minor;
|
||||
int flags;
|
||||
unsigned short tag_list[];
|
||||
};
|
||||
|
||||
struct drbd_nl_cfg_reply {
|
||||
int packet_type;
|
||||
unsigned int minor;
|
||||
int ret_code; /* enum ret_code or set_st_err_t */
|
||||
unsigned short tag_list[]; /* only used with get_* calls */
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
drbd_limits.h
|
||||
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Our current limitations.
|
||||
* Some of them are hard limits,
|
||||
* some of them are arbitrary range limits, that make it easier to provide
|
||||
* feedback about nonsense settings for certain configurable values.
|
||||
*/
|
||||
|
||||
#ifndef DRBD_LIMITS_H
|
||||
#define DRBD_LIMITS_H 1
|
||||
|
||||
#define DEBUG_RANGE_CHECK 0
|
||||
|
||||
#define DRBD_MINOR_COUNT_MIN 1
|
||||
#define DRBD_MINOR_COUNT_MAX 255
|
||||
|
||||
#define DRBD_DIALOG_REFRESH_MIN 0
|
||||
#define DRBD_DIALOG_REFRESH_MAX 600
|
||||
|
||||
/* valid port number */
|
||||
#define DRBD_PORT_MIN 1
|
||||
#define DRBD_PORT_MAX 0xffff
|
||||
|
||||
/* startup { */
|
||||
/* if you want more than 3.4 days, disable */
|
||||
#define DRBD_WFC_TIMEOUT_MIN 0
|
||||
#define DRBD_WFC_TIMEOUT_MAX 300000
|
||||
#define DRBD_WFC_TIMEOUT_DEF 0
|
||||
|
||||
#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
|
||||
#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
|
||||
#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
|
||||
|
||||
#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
|
||||
#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
|
||||
#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
|
||||
/* }*/
|
||||
|
||||
/* net { */
|
||||
/* timeout, unit centi seconds
|
||||
* more than one minute timeout is not useful */
|
||||
#define DRBD_TIMEOUT_MIN 1
|
||||
#define DRBD_TIMEOUT_MAX 600
|
||||
#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
|
||||
|
||||
/* active connection retries when C_WF_CONNECTION */
|
||||
#define DRBD_CONNECT_INT_MIN 1
|
||||
#define DRBD_CONNECT_INT_MAX 120
|
||||
#define DRBD_CONNECT_INT_DEF 10 /* seconds */
|
||||
|
||||
/* keep-alive probes when idle */
|
||||
#define DRBD_PING_INT_MIN 1
|
||||
#define DRBD_PING_INT_MAX 120
|
||||
#define DRBD_PING_INT_DEF 10
|
||||
|
||||
/* timeout for the ping packets.*/
|
||||
#define DRBD_PING_TIMEO_MIN 1
|
||||
#define DRBD_PING_TIMEO_MAX 100
|
||||
#define DRBD_PING_TIMEO_DEF 5
|
||||
|
||||
/* max number of write requests between write barriers */
|
||||
#define DRBD_MAX_EPOCH_SIZE_MIN 1
|
||||
#define DRBD_MAX_EPOCH_SIZE_MAX 20000
|
||||
#define DRBD_MAX_EPOCH_SIZE_DEF 2048
|
||||
|
||||
/* I don't think that a tcp send buffer of more than 10M is useful */
|
||||
#define DRBD_SNDBUF_SIZE_MIN 0
|
||||
#define DRBD_SNDBUF_SIZE_MAX (10<<20)
|
||||
#define DRBD_SNDBUF_SIZE_DEF 0
|
||||
|
||||
#define DRBD_RCVBUF_SIZE_MIN 0
|
||||
#define DRBD_RCVBUF_SIZE_MAX (10<<20)
|
||||
#define DRBD_RCVBUF_SIZE_DEF 0
|
||||
|
||||
/* @4k PageSize -> 128kB - 512MB */
|
||||
#define DRBD_MAX_BUFFERS_MIN 32
|
||||
#define DRBD_MAX_BUFFERS_MAX 131072
|
||||
#define DRBD_MAX_BUFFERS_DEF 2048
|
||||
|
||||
/* @4k PageSize -> 4kB - 512MB */
|
||||
#define DRBD_UNPLUG_WATERMARK_MIN 1
|
||||
#define DRBD_UNPLUG_WATERMARK_MAX 131072
|
||||
#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
|
||||
|
||||
/* 0 is disabled.
|
||||
* 200 should be more than enough even for very short timeouts */
|
||||
#define DRBD_KO_COUNT_MIN 0
|
||||
#define DRBD_KO_COUNT_MAX 200
|
||||
#define DRBD_KO_COUNT_DEF 0
|
||||
/* } */
|
||||
|
||||
/* syncer { */
|
||||
/* FIXME allow rate to be zero? */
|
||||
#define DRBD_RATE_MIN 1
|
||||
/* channel bonding 10 GbE, or other hardware */
|
||||
#define DRBD_RATE_MAX (4 << 20)
|
||||
#define DRBD_RATE_DEF 250 /* kb/second */
|
||||
|
||||
/* less than 7 would hit performance unnecessarily.
|
||||
* 3833 is the largest prime that still does fit
|
||||
* into 64 sectors of activity log */
|
||||
#define DRBD_AL_EXTENTS_MIN 7
|
||||
#define DRBD_AL_EXTENTS_MAX 3833
|
||||
#define DRBD_AL_EXTENTS_DEF 127
|
||||
|
||||
#define DRBD_AFTER_MIN -1
|
||||
#define DRBD_AFTER_MAX 255
|
||||
#define DRBD_AFTER_DEF -1
|
||||
|
||||
/* } */
|
||||
|
||||
/* drbdsetup XY resize -d Z
|
||||
* you are free to reduce the device size to nothing, if you want to.
|
||||
* the upper limit with 64bit kernel, enough ram and flexible meta data
|
||||
* is 16 TB, currently. */
|
||||
/* DRBD_MAX_SECTORS */
|
||||
#define DRBD_DISK_SIZE_SECT_MIN 0
|
||||
#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30))
|
||||
#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
|
||||
|
||||
#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
|
||||
#define DRBD_FENCING_DEF FP_DONT_CARE
|
||||
#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
|
||||
#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
|
||||
#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
|
||||
#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
|
||||
|
||||
#define DRBD_MAX_BIO_BVECS_MIN 0
|
||||
#define DRBD_MAX_BIO_BVECS_MAX 128
|
||||
#define DRBD_MAX_BIO_BVECS_DEF 0
|
||||
|
||||
#undef RANGE
|
||||
#endif
|
||||
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
PACKET( name,
|
||||
TYPE ( pn, pr, member )
|
||||
...
|
||||
)
|
||||
|
||||
You may never reissue one of the pn arguments
|
||||
*/
|
||||
|
||||
#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
|
||||
#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
|
||||
#endif
|
||||
|
||||
NL_PACKET(primary, 1,
|
||||
NL_BIT( 1, T_MAY_IGNORE, overwrite_peer)
|
||||
)
|
||||
|
||||
NL_PACKET(secondary, 2, )
|
||||
|
||||
NL_PACKET(disk_conf, 3,
|
||||
NL_INT64( 2, T_MAY_IGNORE, disk_size)
|
||||
NL_STRING( 3, T_MANDATORY, backing_dev, 128)
|
||||
NL_STRING( 4, T_MANDATORY, meta_dev, 128)
|
||||
NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
|
||||
NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
|
||||
NL_INTEGER( 7, T_MAY_IGNORE, fencing)
|
||||
NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
|
||||
NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
|
||||
NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
|
||||
/* 55 max_bio_size was available in 8.2.6rc2 */
|
||||
NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
|
||||
NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
|
||||
NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
|
||||
)
|
||||
|
||||
NL_PACKET(detach, 4, )
|
||||
|
||||
NL_PACKET(net_conf, 5,
|
||||
NL_STRING( 8, T_MANDATORY, my_addr, 128)
|
||||
NL_STRING( 9, T_MANDATORY, peer_addr, 128)
|
||||
NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
|
||||
NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
|
||||
NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
|
||||
NL_INTEGER( 14, T_MAY_IGNORE, timeout)
|
||||
NL_INTEGER( 15, T_MANDATORY, wire_protocol)
|
||||
NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
|
||||
NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
|
||||
NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
|
||||
NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
|
||||
NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
|
||||
NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
|
||||
NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
|
||||
NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
|
||||
NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
|
||||
NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
|
||||
NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
|
||||
NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
|
||||
NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
|
||||
/* 59 addr_family was available in GIT, never released */
|
||||
NL_BIT( 60, T_MANDATORY, mind_af)
|
||||
NL_BIT( 27, T_MAY_IGNORE, want_lose)
|
||||
NL_BIT( 28, T_MAY_IGNORE, two_primaries)
|
||||
NL_BIT( 41, T_MAY_IGNORE, always_asbp)
|
||||
NL_BIT( 61, T_MAY_IGNORE, no_cork)
|
||||
NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
|
||||
)
|
||||
|
||||
NL_PACKET(disconnect, 6, )
|
||||
|
||||
NL_PACKET(resize, 7,
|
||||
NL_INT64( 29, T_MAY_IGNORE, resize_size)
|
||||
)
|
||||
|
||||
NL_PACKET(syncer_conf, 8,
|
||||
NL_INTEGER( 30, T_MAY_IGNORE, rate)
|
||||
NL_INTEGER( 31, T_MAY_IGNORE, after)
|
||||
NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
|
||||
NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
|
||||
NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
|
||||
NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
|
||||
NL_BIT( 65, T_MAY_IGNORE, use_rle)
|
||||
)
|
||||
|
||||
NL_PACKET(invalidate, 9, )
|
||||
NL_PACKET(invalidate_peer, 10, )
|
||||
NL_PACKET(pause_sync, 11, )
|
||||
NL_PACKET(resume_sync, 12, )
|
||||
NL_PACKET(suspend_io, 13, )
|
||||
NL_PACKET(resume_io, 14, )
|
||||
NL_PACKET(outdate, 15, )
|
||||
NL_PACKET(get_config, 16, )
|
||||
NL_PACKET(get_state, 17,
|
||||
NL_INTEGER( 33, T_MAY_IGNORE, state_i)
|
||||
)
|
||||
|
||||
NL_PACKET(get_uuids, 18,
|
||||
NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
|
||||
NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
|
||||
)
|
||||
|
||||
NL_PACKET(get_timeout_flag, 19,
|
||||
NL_BIT( 36, T_MAY_IGNORE, use_degraded)
|
||||
)
|
||||
|
||||
NL_PACKET(call_helper, 20,
|
||||
NL_STRING( 38, T_MAY_IGNORE, helper, 32)
|
||||
)
|
||||
|
||||
/* Tag nr 42 already allocated in drbd-8.1 development. */
|
||||
|
||||
NL_PACKET(sync_progress, 23,
|
||||
NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
|
||||
)
|
||||
|
||||
NL_PACKET(dump_ee, 24,
|
||||
NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
|
||||
NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
|
||||
NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
|
||||
NL_INT64( 48, T_MAY_IGNORE, ee_sector)
|
||||
NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
|
||||
NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
|
||||
)
|
||||
|
||||
NL_PACKET(start_ov, 25,
|
||||
NL_INT64( 66, T_MAY_IGNORE, start_sector)
|
||||
)
|
||||
|
||||
NL_PACKET(new_c_uuid, 26,
|
||||
NL_BIT( 63, T_MANDATORY, clear_bm)
|
||||
)
|
||||
|
||||
#undef NL_PACKET
|
||||
#undef NL_INTEGER
|
||||
#undef NL_INT64
|
||||
#undef NL_BIT
|
||||
#undef NL_STRING
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
#ifndef DRBD_TAG_MAGIC_H
|
||||
#define DRBD_TAG_MAGIC_H
|
||||
|
||||
#define TT_END 0
|
||||
#define TT_REMOVED 0xE000
|
||||
|
||||
/* declare packet_type enums */
|
||||
enum packet_types {
|
||||
#define NL_PACKET(name, number, fields) P_ ## name = number,
|
||||
#define NL_INTEGER(pn, pr, member)
|
||||
#define NL_INT64(pn, pr, member)
|
||||
#define NL_BIT(pn, pr, member)
|
||||
#define NL_STRING(pn, pr, member, len)
|
||||
#include "drbd_nl.h"
|
||||
P_nl_after_last_packet,
|
||||
};
|
||||
|
||||
/* These structs are used to deduce the size of the tag lists: */
|
||||
#define NL_PACKET(name, number, fields) \
|
||||
struct name ## _tag_len_struct { fields };
|
||||
#define NL_INTEGER(pn, pr, member) \
|
||||
int member; int tag_and_len ## member;
|
||||
#define NL_INT64(pn, pr, member) \
|
||||
__u64 member; int tag_and_len ## member;
|
||||
#define NL_BIT(pn, pr, member) \
|
||||
unsigned char member:1; int tag_and_len ## member;
|
||||
#define NL_STRING(pn, pr, member, len) \
|
||||
unsigned char member[len]; int member ## _len; \
|
||||
int tag_and_len ## member;
|
||||
#include "linux/drbd_nl.h"
|
||||
|
||||
/* declare tag-list-sizes */
|
||||
static const int tag_list_sizes[] = {
|
||||
#define NL_PACKET(name, number, fields) 2 fields ,
|
||||
#define NL_INTEGER(pn, pr, member) + 4 + 4
|
||||
#define NL_INT64(pn, pr, member) + 4 + 8
|
||||
#define NL_BIT(pn, pr, member) + 4 + 1
|
||||
#define NL_STRING(pn, pr, member, len) + 4 + (len)
|
||||
#include "drbd_nl.h"
|
||||
};
|
||||
|
||||
/* The two highest bits are used for the tag type */
|
||||
#define TT_MASK 0xC000
|
||||
#define TT_INTEGER 0x0000
|
||||
#define TT_INT64 0x4000
|
||||
#define TT_BIT 0x8000
|
||||
#define TT_STRING 0xC000
|
||||
/* The next bit indicates if processing of the tag is mandatory */
|
||||
#define T_MANDATORY 0x2000
|
||||
#define T_MAY_IGNORE 0x0000
|
||||
#define TN_MASK 0x1fff
|
||||
/* The remaining 13 bits are used to enumerate the tags */
|
||||
|
||||
#define tag_type(T) ((T) & TT_MASK)
|
||||
#define tag_number(T) ((T) & TN_MASK)
|
||||
|
||||
/* declare tag enums */
|
||||
#define NL_PACKET(name, number, fields) fields
|
||||
enum drbd_tags {
|
||||
#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
|
||||
#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
|
||||
#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
|
||||
#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
|
||||
#include "drbd_nl.h"
|
||||
};
|
||||
|
||||
struct tag {
|
||||
const char *name;
|
||||
int type_n_flags;
|
||||
int max_len;
|
||||
};
|
||||
|
||||
/* declare tag names */
|
||||
#define NL_PACKET(name, number, fields) fields
|
||||
static const struct tag tag_descriptions[] = {
|
||||
#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
|
||||
#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
|
||||
#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
|
||||
#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
|
||||
#include "drbd_nl.h"
|
||||
};
|
||||
|
||||
#endif
|
||||
+3
-2
@@ -129,7 +129,7 @@ struct inodes_stat_t {
|
||||
* WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device
|
||||
* immediately after submission. The write equivalent
|
||||
* of READ_SYNC.
|
||||
* WRITE_ODIRECT Special case write for O_DIRECT only.
|
||||
* WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
|
||||
* SWRITE_SYNC
|
||||
* SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
|
||||
* See SWRITE.
|
||||
@@ -151,7 +151,7 @@ struct inodes_stat_t {
|
||||
#define READ_META (READ | (1 << BIO_RW_META))
|
||||
#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
|
||||
#define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
|
||||
#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
|
||||
#define WRITE_ODIRECT_PLUG (WRITE | (1 << BIO_RW_SYNCIO))
|
||||
#define SWRITE_SYNC_PLUG \
|
||||
(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
|
||||
#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
|
||||
@@ -304,6 +304,7 @@ struct inodes_stat_t {
|
||||
#define BLKIOOPT _IO(0x12,121)
|
||||
#define BLKALIGNOFF _IO(0x12,122)
|
||||
#define BLKPBSZGET _IO(0x12,123)
|
||||
#define BLKDISCARDZEROES _IO(0x12,124)
|
||||
|
||||
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
|
||||
#define FIBMAP _IO(0x00,1) /* bmap access */
|
||||
|
||||
@@ -91,6 +91,7 @@ struct hd_struct {
|
||||
sector_t start_sect;
|
||||
sector_t nr_sects;
|
||||
sector_t alignment_offset;
|
||||
unsigned int discard_alignment;
|
||||
struct device __dev;
|
||||
struct kobject *holder_dir;
|
||||
int policy, partno;
|
||||
|
||||
@@ -40,16 +40,11 @@ struct cfq_io_context {
|
||||
struct io_context *ioc;
|
||||
|
||||
unsigned long last_end_request;
|
||||
sector_t last_request_pos;
|
||||
|
||||
unsigned long ttime_total;
|
||||
unsigned long ttime_samples;
|
||||
unsigned long ttime_mean;
|
||||
|
||||
unsigned int seek_samples;
|
||||
u64 seek_total;
|
||||
sector_t seek_mean;
|
||||
|
||||
struct list_head queue_list;
|
||||
struct hlist_node cic_list;
|
||||
|
||||
@@ -73,6 +68,10 @@ struct io_context {
|
||||
unsigned short ioprio;
|
||||
unsigned short ioprio_changed;
|
||||
|
||||
#ifdef CONFIG_BLK_CGROUP
|
||||
unsigned short cgroup_changed;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* For request batching
|
||||
*/
|
||||
@@ -99,14 +98,15 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct task_struct;
|
||||
#ifdef CONFIG_BLOCK
|
||||
int put_io_context(struct io_context *ioc);
|
||||
void exit_io_context(void);
|
||||
void exit_io_context(struct task_struct *task);
|
||||
struct io_context *get_io_context(gfp_t gfp_flags, int node);
|
||||
struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
|
||||
void copy_io_context(struct io_context **pdst, struct io_context **psrc);
|
||||
#else
|
||||
static inline void exit_io_context(void)
|
||||
static inline void exit_io_context(struct task_struct *task)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,294 @@
|
||||
/*
|
||||
lru_cache.h
|
||||
|
||||
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
|
||||
|
||||
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
|
||||
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
|
||||
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
|
||||
|
||||
drbd is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2, or (at your option)
|
||||
any later version.
|
||||
|
||||
drbd is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with drbd; see the file COPYING. If not, write to
|
||||
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef LRU_CACHE_H
|
||||
#define LRU_CACHE_H
|
||||
|
||||
#include <linux/list.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/string.h> /* for memset */
|
||||
#include <linux/seq_file.h>
|
||||
|
||||
/*
|
||||
This header file (and its .c file; kernel-doc of functions see there)
|
||||
define a helper framework to easily keep track of index:label associations,
|
||||
and changes to an "active set" of objects, as well as pending transactions,
|
||||
to persistently record those changes.
|
||||
|
||||
We use an LRU policy if it is necessary to "cool down" a region currently in
|
||||
the active set before we can "heat" a previously unused region.
|
||||
|
||||
Because of this latter property, it is called "lru_cache".
|
||||
As it actually Tracks Objects in an Active SeT, we could also call it
|
||||
toast (incidentally that is what may happen to the data on the
|
||||
backend storage upon next resync, if we don't get it right).
|
||||
|
||||
What for?
|
||||
|
||||
We replicate IO (more or less synchronously) to local and remote disk.
|
||||
|
||||
For crash recovery after replication node failure,
|
||||
we need to resync all regions that have been target of in-flight WRITE IO
|
||||
(in use, or "hot", regions), as we don't know whether or not those WRITEs have
|
||||
made it to stable storage.
|
||||
|
||||
To avoid a "full resync", we need to persistently track these regions.
|
||||
|
||||
This is known as "write intent log", and can be implemented as on-disk
|
||||
(coarse or fine grained) bitmap, or other meta data.
|
||||
|
||||
To avoid the overhead of frequent extra writes to this meta data area,
|
||||
usually the condition is softened to regions that _may_ have been target of
|
||||
in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
|
||||
bitmap, trading frequency of meta data transactions against amount of
|
||||
(possibly unnecessary) resync traffic.
|
||||
|
||||
If we set a hard limit on the area that may be "hot" at any given time, we
|
||||
limit the amount of resync traffic needed for crash recovery.
|
||||
|
||||
For recovery after replication link failure,
|
||||
we need to resync all blocks that have been changed on the other replica
|
||||
in the mean time, or, if both replicas have been changed independently [*],
|
||||
all blocks that have been changed on either replica in the mean time.
|
||||
[*] usually as a result of a cluster split-brain and insufficient protection.
|
||||
but there are valid use cases to do this on purpose.
|
||||
|
||||
Tracking those blocks can be implemented as "dirty bitmap".
|
||||
Having it fine-grained reduces the amount of resync traffic.
|
||||
It should also be persistent, to allow for reboots (or crashes)
|
||||
while the replication link is down.
|
||||
|
||||
There are various possible implementations for persistently storing
|
||||
write intent log information, three of which are mentioned here.
|
||||
|
||||
"Chunk dirtying"
|
||||
The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
|
||||
To reduce the frequency of bitmap updates for write-intent log purposes,
|
||||
one could dirty "chunks" (of some size) at a time of the (fine grained)
|
||||
on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as
|
||||
possible, flushing it to disk again when a previously "hot" (and on-disk
|
||||
dirtied as full chunk) area "cools down" again (no IO in flight anymore,
|
||||
and none expected in the near future either).
|
||||
|
||||
"Explicit (coarse) write intent bitmap"
|
||||
Another implementation could choose a (probably coarse) explicit bitmap,
|
||||
for write-intent log purposes, additionally to the fine grained dirty bitmap.
|
||||
|
||||
"Activity log"
|
||||
Yet another implementation may keep track of the hot regions, by starting
|
||||
with an empty set, and writing down a journal of region numbers that have
|
||||
become "hot", or have "cooled down" again.
|
||||
|
||||
To be able to use a ring buffer for this journal of changes to the active
|
||||
set, we not only record the actual changes to that set, but also record the
|
||||
not changing members of the set in a round robin fashion. To do so, we use a
|
||||
fixed (but configurable) number of slots which we can identify by index, and
|
||||
associate region numbers (labels) with these indices.
|
||||
For each transaction recording a change to the active set, we record the
|
||||
change itself (index: -old_label, +new_label), and which index is associated
|
||||
with which label (index: current_label) within a certain sliding window that
|
||||
is moved further over the available indices with each such transaction.
|
||||
|
||||
Thus, for crash recovery, if the ringbuffer is sufficiently large, we can
|
||||
accurately reconstruct the active set.
|
||||
|
||||
Sufficiently large depends only on maximum number of active objects, and the
|
||||
size of the sliding window recording "index: current_label" associations within
|
||||
each transaction.
|
||||
|
||||
This is what we call the "activity log".
|
||||
|
||||
Currently we need one activity log transaction per single label change, which
|
||||
does not give much benefit over the "dirty chunks of bitmap" approach, other
|
||||
than potentially fewer seeks.
|
||||
|
||||
We plan to change the transaction format to support multiple changes per
|
||||
transaction, which then would reduce several (disjoint, "random") updates to
|
||||
the bitmap into one transaction to the activity log ring buffer.
|
||||
*/
|
||||
|
||||
/* this defines an element in a tracked set
|
||||
* .colision is for hash table lookup.
|
||||
* When we process a new IO request, we know its sector, thus can deduce the
|
||||
* region number (label) easily. To do the label -> object lookup without a
|
||||
* full list walk, we use a simple hash table.
|
||||
*
|
||||
* .list is on one of three lists:
|
||||
* in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
|
||||
* lru: unused but ready to be reused or recycled
|
||||
* (refcnt == 0, lc_number != LC_FREE),
|
||||
* free: unused but ready to be recycled
|
||||
* (refcnt == 0, lc_number == LC_FREE),
|
||||
*
|
||||
* an element is said to be "in the active set",
|
||||
* if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
|
||||
*
|
||||
* DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
|
||||
* (total memory usage 2 pages), and up to 3833 elements on the act_log
|
||||
* lru_cache, totalling ~215 kB for 64bit architecture, ~53 pages.
|
||||
*
|
||||
* We usually do not actually free these objects again, but only "recycle"
|
||||
* them, as the change "index: -old_label, +LC_FREE" would need a transaction
|
||||
* as well. Which also means that using a kmem_cache to allocate the objects
|
||||
* from wastes some resources.
|
||||
* But it avoids high order page allocations in kmalloc.
|
||||
*/
|
||||
struct lc_element {
|
||||
struct hlist_node colision;
|
||||
struct list_head list; /* LRU list or free list */
|
||||
unsigned refcnt;
|
||||
/* back "pointer" into lc->lc_element[index],
|
||||
* for paranoia, and for lc_index_of() */
|
||||
unsigned lc_index;
|
||||
/* if we want to track a larger set of objects,
|
||||
* it needs to become an arch-independent u64 */
|
||||
unsigned lc_number;
|
||||
|
||||
/* special label when on free list */
|
||||
#define LC_FREE (~0U)
|
||||
};
|
||||
|
||||
struct lru_cache {
|
||||
/* the least recently used item is kept at lru->prev */
|
||||
struct list_head lru;
|
||||
struct list_head free;
|
||||
struct list_head in_use;
|
||||
|
||||
/* the pre-created kmem cache to allocate the objects from */
|
||||
struct kmem_cache *lc_cache;
|
||||
|
||||
/* size of tracked objects, used to memset(,0,) them in lc_reset */
|
||||
size_t element_size;
|
||||
/* offset of struct lc_element member in the tracked object */
|
||||
size_t element_off;
|
||||
|
||||
/* number of elements (indices) */
|
||||
unsigned int nr_elements;
|
||||
/* Arbitrary limit on maximum tracked objects. Practical limit is much
|
||||
* lower due to allocation failures, probably. For typical use cases,
|
||||
* nr_elements should be a few thousand at most.
|
||||
* This also limits the maximum value of lc_element.lc_index, allowing the
|
||||
* 8 high bits of .lc_index to be overloaded with flags in the future. */
|
||||
#define LC_MAX_ACTIVE (1<<24)
|
||||
|
||||
/* statistics */
|
||||
unsigned used; /* number of elements currently on in_use list */
|
||||
unsigned long hits, misses, starving, dirty, changed;
|
||||
|
||||
/* see below: flag-bits for lru_cache */
|
||||
unsigned long flags;
|
||||
|
||||
/* when changing the label of an index element */
|
||||
unsigned int new_number;
|
||||
|
||||
/* for paranoia when changing the label of an index element */
|
||||
struct lc_element *changing_element;
|
||||
|
||||
void *lc_private;
|
||||
const char *name;
|
||||
|
||||
/* nr_elements there */
|
||||
struct hlist_head *lc_slot;
|
||||
struct lc_element **lc_element;
|
||||
};
|
||||
|
||||
|
||||
/* flag-bits for lru_cache */
|
||||
enum {
|
||||
/* debugging aid, to catch concurrent access early.
|
||||
* user needs to guarantee exclusive access by proper locking! */
|
||||
__LC_PARANOIA,
|
||||
/* if we need to change the set, but currently there is a changing
|
||||
* transaction pending, we are "dirty", and must defer further
|
||||
* changing requests */
|
||||
__LC_DIRTY,
|
||||
/* if we need to change the set, but currently there is no free nor
|
||||
* unused element available, we are "starving", and must not give out
|
||||
* further references, to guarantee that eventually some refcnt will
|
||||
* drop to zero and we will be able to make progress again, changing
|
||||
* the set, writing the transaction.
|
||||
* if the statistics say we are frequently starving,
|
||||
* nr_elements is too small. */
|
||||
__LC_STARVING,
|
||||
};
|
||||
#define LC_PARANOIA (1<<__LC_PARANOIA)
|
||||
#define LC_DIRTY (1<<__LC_DIRTY)
|
||||
#define LC_STARVING (1<<__LC_STARVING)
|
||||
|
||||
extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
|
||||
unsigned e_count, size_t e_size, size_t e_off);
|
||||
extern void lc_reset(struct lru_cache *lc);
|
||||
extern void lc_destroy(struct lru_cache *lc);
|
||||
extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
|
||||
extern void lc_del(struct lru_cache *lc, struct lc_element *element);
|
||||
|
||||
extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
|
||||
extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
|
||||
extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
|
||||
extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
|
||||
extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
|
||||
|
||||
struct seq_file;
|
||||
extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
|
||||
|
||||
extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
|
||||
void (*detail) (struct seq_file *, struct lc_element *));
|
||||
|
||||
/**
|
||||
* lc_try_lock - can be used to stop lc_get() from changing the tracked set
|
||||
* @lc: the lru cache to operate on
|
||||
*
|
||||
* Note that the reference counts and order on the active and lru lists may
|
||||
* still change. Returns true if we acquired the lock.
|
||||
*/
|
||||
static inline int lc_try_lock(struct lru_cache *lc)
|
||||
{
|
||||
return !test_and_set_bit(__LC_DIRTY, &lc->flags);
|
||||
}
|
||||
|
||||
/**
|
||||
* lc_unlock - unlock @lc, allow lc_get() to change the set again
|
||||
* @lc: the lru cache to operate on
|
||||
*/
|
||||
static inline void lc_unlock(struct lru_cache *lc)
|
||||
{
|
||||
clear_bit(__LC_DIRTY, &lc->flags);
|
||||
smp_mb__after_clear_bit();
|
||||
}
|
||||
|
||||
static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
|
||||
{
|
||||
struct lc_element *e = lc_find(lc, enr);
|
||||
return e && e->refcnt;
|
||||
}
|
||||
|
||||
#define lc_entry(ptr, type, member) \
|
||||
container_of(ptr, type, member)
|
||||
|
||||
extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
|
||||
extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
|
||||
|
||||
#endif
|
||||
@@ -49,6 +49,7 @@ struct writeback_control {
|
||||
unsigned nonblocking:1; /* Don't get stuck on request queues */
|
||||
unsigned encountered_congestion:1; /* An output: a queue is full */
|
||||
unsigned for_kupdate:1; /* A kupdate writeback */
|
||||
unsigned for_background:1; /* A background writeback */
|
||||
unsigned for_reclaim:1; /* Invoked from the page allocator */
|
||||
unsigned range_cyclic:1; /* range_start is cyclic */
|
||||
unsigned more_io:1; /* more io to be dispatched */
|
||||
|
||||
Reference in New Issue
Block a user