Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "There is quite a bit here, including some overdue refactoring and
  cleanup on the mon_client and osd_client code from Ilya, scattered
  writeback support for CephFS and a pile of bug fixes from Zheng, and a
  few random cleanups and fixes from others"

[ I already decided not to pull this because of it having been rebased
  recently, but ended up changing my mind after all.  Next time I'll
  really hold people to it.  Oh well.   - Linus ]

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (34 commits)
  libceph: use KMEM_CACHE macro
  ceph: use kmem_cache_zalloc
  rbd: use KMEM_CACHE macro
  ceph: use lookup request to revalidate dentry
  ceph: kill ceph_get_dentry_parent_inode()
  ceph: fix security xattr deadlock
  ceph: don't request vxattrs from MDS
  ceph: fix mounting same fs multiple times
  ceph: remove unnecessary NULL check
  ceph: avoid updating directory inode's i_size accidentally
  ceph: fix race during filling readdir cache
  libceph: use sizeof_footer() more
  ceph: kill ceph_empty_snapc
  ceph: fix a wrong comparison
  ceph: replace CURRENT_TIME by current_fs_time()
  ceph: scattered page writeback
  libceph: add helper that duplicates last extent operation
  libceph: enable large, variable-sized OSD requests
  libceph: osdc->req_mempool should be backed by a slab pool
  libceph: make r_request msg_size calculation clearer
  ...
Author: Linus Torvalds
Date:   2016-03-26 15:53:16 -07:00

22 changed files with 807 additions and 515 deletions

net/ceph/ceph_common.c

@@ -361,7 +361,6 @@ ceph_parse_options(char *options, const char *dev_name,
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
/* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */
@@ -686,6 +685,9 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
return client->auth_err;
}
pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid);
ceph_debugfs_client_init(client);
return 0;
}
EXPORT_SYMBOL(__ceph_open_session);
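This is where the "client%llu fsid %pU" banner and the debugfs setup land after the mon_client cleanup further down: once __ceph_open_session() has authenticated, both the fsid and the global_id are known, so the old have_debugfs_info() bookkeeping in mon_client.c can go away. For reference, ceph_client_id() simply reports the authenticated global_id; its definition elsewhere in ceph_common.c is roughly:

        u64 ceph_client_id(struct ceph_client *client)
        {
                return client->monc.auth->global_id;
        }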

net/ceph/debugfs.c

@@ -112,15 +112,20 @@ static int monc_show(struct seq_file *s, void *p)
struct ceph_mon_generic_request *req;
struct ceph_mon_client *monc = &client->monc;
struct rb_node *rp;
int i;
mutex_lock(&monc->mutex);
if (monc->have_mdsmap)
seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap);
if (monc->have_osdmap)
seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap);
if (monc->want_next_osdmap)
seq_printf(s, "want next osdmap\n");
for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
seq_printf(s, "have %s %u", ceph_sub_str[i],
monc->subs[i].have);
if (monc->subs[i].want)
seq_printf(s, " want %llu%s",
le64_to_cpu(monc->subs[i].item.start),
(monc->subs[i].item.flags &
CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
seq_putc(s, '\n');
}
for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
__u16 op;

net/ceph/messenger.c

@@ -235,18 +235,12 @@ static struct workqueue_struct *ceph_msgr_wq;
static int ceph_msgr_slab_init(void)
{
BUG_ON(ceph_msg_cache);
ceph_msg_cache = kmem_cache_create("ceph_msg",
sizeof (struct ceph_msg),
__alignof__(struct ceph_msg), 0, NULL);
ceph_msg_cache = KMEM_CACHE(ceph_msg, 0);
if (!ceph_msg_cache)
return -ENOMEM;
BUG_ON(ceph_msg_data_cache);
ceph_msg_data_cache = kmem_cache_create("ceph_msg_data",
sizeof (struct ceph_msg_data),
__alignof__(struct ceph_msg_data),
0, NULL);
ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0);
if (ceph_msg_data_cache)
return 0;
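For context on the KMEM_CACHE conversions in this series: the macro (from <linux/slab.h>) derives the cache name, object size and alignment from the struct type, which is why the open-coded kmem_cache_create() calls above collapse to a single line. As a rough sketch of the equivalence:

        /* KMEM_CACHE(ceph_msg, 0) expands to approximately: */
        ceph_msg_cache = kmem_cache_create("ceph_msg",
                                           sizeof(struct ceph_msg),
                                           __alignof__(struct ceph_msg),
                                           0 /* flags */, NULL /* ctor */);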
@@ -1221,25 +1215,19 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
static void prepare_write_message_footer(struct ceph_connection *con)
{
struct ceph_msg *m = con->out_msg;
int v = con->out_kvec_left;
m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
dout("prepare_write_message_footer %p\n", con);
con->out_kvec[v].iov_base = &m->footer;
con_out_kvec_add(con, sizeof_footer(con), &m->footer);
if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
if (con->ops->sign_message)
con->ops->sign_message(m);
else
m->footer.sig = 0;
con->out_kvec[v].iov_len = sizeof(m->footer);
con->out_kvec_bytes += sizeof(m->footer);
} else {
m->old_footer.flags = m->footer.flags;
con->out_kvec[v].iov_len = sizeof(m->old_footer);
con->out_kvec_bytes += sizeof(m->old_footer);
}
con->out_kvec_left++;
con->out_more = m->more_to_follow;
con->out_msg_done = true;
}
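The sizeof_footer() helper used throughout this file is introduced earlier in messenger.c by this series; it folds the repeated feature check into one place, roughly:

        /* assumed shape: peers without MSG_AUTH still speak the old, shorter footer */
        static int sizeof_footer(struct ceph_connection *con)
        {
                return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
                    sizeof(struct ceph_msg_footer) :
                    sizeof(struct ceph_msg_footer_old);
        }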
@@ -2409,11 +2397,7 @@ static int read_partial_message(struct ceph_connection *con)
}
/* footer */
if (need_sign)
size = sizeof(m->footer);
else
size = sizeof(m->old_footer);
size = sizeof_footer(con);
end += size;
ret = read_partial(con, end, size, &m->footer);
if (ret <= 0)
@@ -3089,10 +3073,7 @@ void ceph_msg_revoke(struct ceph_msg *msg)
con->out_skip += con_out_kvec_skip(con);
} else {
BUG_ON(!msg->data_length);
if (con->peer_features & CEPH_FEATURE_MSG_AUTH)
con->out_skip += sizeof(msg->footer);
else
con->out_skip += sizeof(msg->old_footer);
con->out_skip += sizeof_footer(con);
}
/* data, middle, front */
if (msg->data_length)

net/ceph/mon_client.c

@@ -122,51 +122,91 @@ static void __close_session(struct ceph_mon_client *monc)
ceph_msg_revoke(monc->m_subscribe);
ceph_msg_revoke_incoming(monc->m_subscribe_ack);
ceph_con_close(&monc->con);
monc->cur_mon = -1;
monc->pending_auth = 0;
ceph_auth_reset(monc->auth);
}
/*
* Open a session with a (new) monitor.
* Pick a new monitor at random and set cur_mon. If we are repicking
* (i.e. cur_mon is already set), be sure to pick a different one.
*/
static int __open_session(struct ceph_mon_client *monc)
static void pick_new_mon(struct ceph_mon_client *monc)
{
char r;
int ret;
int old_mon = monc->cur_mon;
if (monc->cur_mon < 0) {
get_random_bytes(&r, 1);
monc->cur_mon = r % monc->monmap->num_mon;
dout("open_session num=%d r=%d -> mon%d\n",
monc->monmap->num_mon, r, monc->cur_mon);
monc->sub_sent = 0;
monc->sub_renew_after = jiffies; /* i.e., expired */
monc->want_next_osdmap = !!monc->want_next_osdmap;
BUG_ON(monc->monmap->num_mon < 1);
dout("open_session mon%d opening\n", monc->cur_mon);
ceph_con_open(&monc->con,
CEPH_ENTITY_TYPE_MON, monc->cur_mon,
&monc->monmap->mon_inst[monc->cur_mon].addr);
/* send an initial keepalive to ensure our timestamp is
* valid by the time we are in an OPENED state */
ceph_con_keepalive(&monc->con);
/* initiatiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth,
monc->m_auth->front.iov_base,
monc->m_auth->front_alloc_len);
__send_prepared_auth_request(monc, ret);
if (monc->monmap->num_mon == 1) {
monc->cur_mon = 0;
} else {
dout("open_session mon%d already open\n", monc->cur_mon);
int max = monc->monmap->num_mon;
int o = -1;
int n;
if (monc->cur_mon >= 0) {
if (monc->cur_mon < monc->monmap->num_mon)
o = monc->cur_mon;
if (o >= 0)
max--;
}
n = prandom_u32() % max;
if (o >= 0 && n >= o)
n++;
monc->cur_mon = n;
}
return 0;
dout("%s mon%d -> mon%d out of %d mons\n", __func__, old_mon,
monc->cur_mon, monc->monmap->num_mon);
}
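The index arithmetic in pick_new_mon() is the usual trick for drawing uniformly from all monitors except the current one: draw from a range that is one smaller, then step over the excluded index. As a standalone sketch (pick_other() is purely illustrative, not a kernel helper):

        /* pick a random index in [0, count) other than cur; requires count >= 2 */
        static int pick_other(int cur, int count)
        {
                int n = prandom_u32() % (count - 1);

                return (n >= cur) ? n + 1 : n;
        }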
static bool __sub_expired(struct ceph_mon_client *monc)
/*
* Open a session with a new monitor.
*/
static void __open_session(struct ceph_mon_client *monc)
{
return time_after_eq(jiffies, monc->sub_renew_after);
int ret;
pick_new_mon(monc);
monc->hunting = true;
if (monc->had_a_connection) {
monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF;
if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT)
monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT;
}
monc->sub_renew_after = jiffies; /* i.e., expired */
monc->sub_renew_sent = 0;
dout("%s opening mon%d\n", __func__, monc->cur_mon);
ceph_con_open(&monc->con, CEPH_ENTITY_TYPE_MON, monc->cur_mon,
&monc->monmap->mon_inst[monc->cur_mon].addr);
/*
* send an initial keepalive to ensure our timestamp is valid
* by the time we are in an OPENED state
*/
ceph_con_keepalive(&monc->con);
/* initiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth,
monc->m_auth->front.iov_base,
monc->m_auth->front_alloc_len);
BUG_ON(ret <= 0);
__send_prepared_auth_request(monc, ret);
}
static void reopen_session(struct ceph_mon_client *monc)
{
if (!monc->hunting)
pr_info("mon%d %s session lost, hunting for new mon\n",
monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr));
__close_session(monc);
__open_session(monc);
}
/*
@@ -174,74 +214,70 @@ static bool __sub_expired(struct ceph_mon_client *monc)
*/
static void __schedule_delayed(struct ceph_mon_client *monc)
{
struct ceph_options *opt = monc->client->options;
unsigned long delay;
if (monc->cur_mon < 0 || __sub_expired(monc)) {
delay = 10 * HZ;
} else {
delay = 20 * HZ;
if (opt->monc_ping_timeout > 0)
delay = min(delay, opt->monc_ping_timeout / 3);
}
if (monc->hunting)
delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult;
else
delay = CEPH_MONC_PING_INTERVAL;
dout("__schedule_delayed after %lu\n", delay);
schedule_delayed_work(&monc->delayed_work,
round_jiffies_relative(delay));
mod_delayed_work(system_wq, &monc->delayed_work,
round_jiffies_relative(delay));
}
const char *ceph_sub_str[] = {
[CEPH_SUB_MDSMAP] = "mdsmap",
[CEPH_SUB_MONMAP] = "monmap",
[CEPH_SUB_OSDMAP] = "osdmap",
};
/*
* Send subscribe request for mdsmap and/or osdmap.
* Send subscribe request for one or more maps, according to
* monc->subs.
*/
static void __send_subscribe(struct ceph_mon_client *monc)
{
dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
(unsigned int)monc->sub_sent, __sub_expired(monc),
monc->want_next_osdmap);
if ((__sub_expired(monc) && !monc->sub_sent) ||
monc->want_next_osdmap == 1) {
struct ceph_msg *msg = monc->m_subscribe;
struct ceph_mon_subscribe_item *i;
void *p, *end;
int num;
struct ceph_msg *msg = monc->m_subscribe;
void *p = msg->front.iov_base;
void *const end = p + msg->front_alloc_len;
int num = 0;
int i;
p = msg->front.iov_base;
end = p + msg->front_alloc_len;
dout("%s sent %lu\n", __func__, monc->sub_renew_sent);
num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
ceph_encode_32(&p, num);
BUG_ON(monc->cur_mon < 0);
if (monc->want_next_osdmap) {
dout("__send_subscribe to 'osdmap' %u\n",
(unsigned int)monc->have_osdmap);
ceph_encode_string(&p, end, "osdmap", 6);
i = p;
i->have = cpu_to_le64(monc->have_osdmap);
i->onetime = 1;
p += sizeof(*i);
monc->want_next_osdmap = 2; /* requested */
}
if (monc->want_mdsmap) {
dout("__send_subscribe to 'mdsmap' %u+\n",
(unsigned int)monc->have_mdsmap);
ceph_encode_string(&p, end, "mdsmap", 6);
i = p;
i->have = cpu_to_le64(monc->have_mdsmap);
i->onetime = 0;
p += sizeof(*i);
}
ceph_encode_string(&p, end, "monmap", 6);
i = p;
i->have = 0;
i->onetime = 0;
p += sizeof(*i);
if (!monc->sub_renew_sent)
monc->sub_renew_sent = jiffies | 1; /* never 0 */
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_msg_revoke(msg);
ceph_con_send(&monc->con, ceph_msg_get(msg));
msg->hdr.version = cpu_to_le16(2);
monc->sub_sent = jiffies | 1; /* never 0 */
for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
if (monc->subs[i].want)
num++;
}
BUG_ON(num < 1); /* monmap sub is always there */
ceph_encode_32(&p, num);
for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
const char *s = ceph_sub_str[i];
if (!monc->subs[i].want)
continue;
dout("%s %s start %llu flags 0x%x\n", __func__, s,
le64_to_cpu(monc->subs[i].item.start),
monc->subs[i].item.flags);
ceph_encode_string(&p, end, s, strlen(s));
memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
p += sizeof(monc->subs[i].item);
}
BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_msg_revoke(msg);
ceph_con_send(&monc->con, ceph_msg_get(msg));
}
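The payload built by __send_subscribe() is a 32-bit entry count followed, for each wanted map, by a length-prefixed name and a ceph_mon_subscribe_item. Since every map name here is six characters long, each entry is 4 + 6 + 9 = 19 bytes, which is where the per-entry constant in the BUG_ON() above comes from (the other constant accounts for the slack left in the preallocated message front when every sub is wanted). The wire struct, with the field names used after this series' rename, is:

        struct ceph_mon_subscribe_item {
                __le64 start;   /* "every map from this epoch on" (0 = just the latest) */
                __u8 flags;     /* CEPH_SUBSCRIBE_ONETIME = send a single map */
        } __attribute__ ((packed));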
static void handle_subscribe_ack(struct ceph_mon_client *monc,
@@ -255,15 +291,16 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
seconds = le32_to_cpu(h->duration);
mutex_lock(&monc->mutex);
if (monc->hunting) {
pr_info("mon%d %s session established\n",
monc->cur_mon,
ceph_pr_addr(&monc->con.peer_addr.in_addr));
monc->hunting = false;
if (monc->sub_renew_sent) {
monc->sub_renew_after = monc->sub_renew_sent +
(seconds >> 1) * HZ - 1;
dout("%s sent %lu duration %d renew after %lu\n", __func__,
monc->sub_renew_sent, seconds, monc->sub_renew_after);
monc->sub_renew_sent = 0;
} else {
dout("%s sent %lu renew after %lu, ignoring\n", __func__,
monc->sub_renew_sent, monc->sub_renew_after);
}
dout("handle_subscribe_ack after %d seconds\n", seconds);
monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
monc->sub_sent = 0;
mutex_unlock(&monc->mutex);
return;
bad:
@@ -272,36 +309,82 @@ bad:
}
/*
* Keep track of which maps we have
* Register interest in a map
*
* @sub: one of CEPH_SUB_*
* @epoch: X for "every map since X", or 0 for "just the latest"
*/
int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub,
u32 epoch, bool continuous)
{
mutex_lock(&monc->mutex);
monc->have_mdsmap = got;
mutex_unlock(&monc->mutex);
return 0;
}
EXPORT_SYMBOL(ceph_monc_got_mdsmap);
__le64 start = cpu_to_le64(epoch);
u8 flags = !continuous ? CEPH_SUBSCRIBE_ONETIME : 0;
int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub],
epoch, continuous);
if (monc->subs[sub].want &&
monc->subs[sub].item.start == start &&
monc->subs[sub].item.flags == flags)
return false;
monc->subs[sub].item.start = start;
monc->subs[sub].item.flags = flags;
monc->subs[sub].want = true;
return true;
}
bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
bool continuous)
{
bool need_request;
mutex_lock(&monc->mutex);
need_request = __ceph_monc_want_map(monc, sub, epoch, continuous);
mutex_unlock(&monc->mutex);
return need_request;
}
EXPORT_SYMBOL(ceph_monc_want_map);
/*
* Keep track of which maps we have
*
* @sub: one of CEPH_SUB_*
*/
static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub,
u32 epoch)
{
dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch);
if (monc->subs[sub].want) {
if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME)
monc->subs[sub].want = false;
else
monc->subs[sub].item.start = cpu_to_le64(epoch + 1);
}
monc->subs[sub].have = epoch;
}
void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
{
mutex_lock(&monc->mutex);
monc->have_osdmap = got;
monc->want_next_osdmap = 0;
__ceph_monc_got_map(monc, sub, epoch);
mutex_unlock(&monc->mutex);
return 0;
}
EXPORT_SYMBOL(ceph_monc_got_map);
/*
* Register interest in the next osdmap
*/
void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
{
dout("request_next_osdmap have %u\n", monc->have_osdmap);
dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
mutex_lock(&monc->mutex);
if (!monc->want_next_osdmap)
monc->want_next_osdmap = 1;
if (monc->want_next_osdmap < 2)
if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
__send_subscribe(monc);
mutex_unlock(&monc->mutex);
}
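Taken together, the new API is a pair of calls keyed by CEPH_SUB_*: ceph_monc_want_map() records interest and returns true when the recorded want actually changed (i.e. a subscribe message should be (re)sent), and ceph_monc_got_map() records the epoch now in hand, dropping one-shot wants. A hedged usage sketch from a map consumer's point of view (monc, have and e are placeholders, not code from this series):

        /* register interest: continuous mdsmap updates, plus the next osdmap */
        bool need_resend = false;

        need_resend |= ceph_monc_want_map(monc, CEPH_SUB_MDSMAP, 0, true);
        need_resend |= ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, have + 1, false);

        /* once an osdmap of epoch e has been decoded and applied: */
        ceph_monc_got_map(monc, CEPH_SUB_OSDMAP, e);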
@@ -320,15 +403,15 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
long ret;
mutex_lock(&monc->mutex);
while (monc->have_osdmap < epoch) {
while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) {
mutex_unlock(&monc->mutex);
if (timeout && time_after_eq(jiffies, started + timeout))
return -ETIMEDOUT;
ret = wait_event_interruptible_timeout(monc->client->auth_wq,
monc->have_osdmap >= epoch,
ceph_timeout_jiffies(timeout));
monc->subs[CEPH_SUB_OSDMAP].have >= epoch,
ceph_timeout_jiffies(timeout));
if (ret < 0)
return ret;
@@ -341,11 +424,14 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
EXPORT_SYMBOL(ceph_monc_wait_osdmap);
/*
*
* Open a session with a random monitor. Request monmap and osdmap,
* which are waited upon in __ceph_open_session().
*/
int ceph_monc_open_session(struct ceph_mon_client *monc)
{
mutex_lock(&monc->mutex);
__ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true);
__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false);
__open_session(monc);
__schedule_delayed(monc);
mutex_unlock(&monc->mutex);
@@ -353,29 +439,15 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
}
EXPORT_SYMBOL(ceph_monc_open_session);
/*
* We require the fsid and global_id in order to initialize our
* debugfs dir.
*/
static bool have_debugfs_info(struct ceph_mon_client *monc)
{
dout("have_debugfs_info fsid %d globalid %lld\n",
(int)monc->client->have_fsid, monc->auth->global_id);
return monc->client->have_fsid && monc->auth->global_id > 0;
}
static void ceph_monc_handle_map(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
struct ceph_client *client = monc->client;
struct ceph_monmap *monmap = NULL, *old = monc->monmap;
void *p, *end;
int had_debugfs_info, init_debugfs = 0;
mutex_lock(&monc->mutex);
had_debugfs_info = have_debugfs_info(monc);
dout("handle_monmap\n");
p = msg->front.iov_base;
end = p + msg->front.iov_len;
@@ -395,29 +467,11 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
client->monc.monmap = monmap;
kfree(old);
if (!client->have_fsid) {
client->have_fsid = true;
if (!had_debugfs_info && have_debugfs_info(monc)) {
pr_info("client%lld fsid %pU\n",
ceph_client_id(monc->client),
&monc->client->fsid);
init_debugfs = 1;
}
mutex_unlock(&monc->mutex);
__ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
client->have_fsid = true;
if (init_debugfs) {
/*
* do debugfs initialization without mutex to avoid
* creating a locking dependency
*/
ceph_debugfs_client_init(monc->client);
}
goto out_unlocked;
}
out:
mutex_unlock(&monc->mutex);
out_unlocked:
wake_up_all(&client->auth_wq);
}
@@ -745,18 +799,15 @@ static void delayed_work(struct work_struct *work)
dout("monc delayed_work\n");
mutex_lock(&monc->mutex);
if (monc->hunting) {
__close_session(monc);
__open_session(monc); /* continue hunting */
dout("%s continuing hunt\n", __func__);
reopen_session(monc);
} else {
struct ceph_options *opt = monc->client->options;
int is_auth = ceph_auth_is_authenticated(monc->auth);
if (ceph_con_keepalive_expired(&monc->con,
opt->monc_ping_timeout)) {
CEPH_MONC_PING_TIMEOUT)) {
dout("monc keepalive timeout\n");
is_auth = 0;
__close_session(monc);
monc->hunting = true;
__open_session(monc);
reopen_session(monc);
}
if (!monc->hunting) {
@@ -764,8 +815,14 @@ static void delayed_work(struct work_struct *work)
__validate_auth(monc);
}
if (is_auth)
__send_subscribe(monc);
if (is_auth) {
unsigned long now = jiffies;
dout("%s renew subs? now %lu renew after %lu\n",
__func__, now, monc->sub_renew_after);
if (time_after_eq(now, monc->sub_renew_after))
__send_subscribe(monc);
}
}
__schedule_delayed(monc);
mutex_unlock(&monc->mutex);
@@ -852,18 +909,14 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
&monc->client->msgr);
monc->cur_mon = -1;
monc->hunting = true;
monc->sub_renew_after = jiffies;
monc->sub_sent = 0;
monc->had_a_connection = false;
monc->hunt_mult = 1;
INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
monc->generic_request_tree = RB_ROOT;
monc->num_generic_requests = 0;
monc->last_tid = 0;
monc->have_mdsmap = 0;
monc->have_osdmap = 0;
monc->want_next_osdmap = 1;
return 0;
out_auth_reply:
@@ -888,7 +941,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
mutex_lock(&monc->mutex);
__close_session(monc);
monc->cur_mon = -1;
mutex_unlock(&monc->mutex);
/*
@@ -910,26 +963,40 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
}
EXPORT_SYMBOL(ceph_monc_stop);
static void finish_hunting(struct ceph_mon_client *monc)
{
if (monc->hunting) {
dout("%s found mon%d\n", __func__, monc->cur_mon);
monc->hunting = false;
monc->had_a_connection = true;
monc->hunt_mult /= 2; /* reduce by 50% */
if (monc->hunt_mult < 1)
monc->hunt_mult = 1;
}
}
static void handle_auth_reply(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
int ret;
int was_auth = 0;
int had_debugfs_info, init_debugfs = 0;
mutex_lock(&monc->mutex);
had_debugfs_info = have_debugfs_info(monc);
was_auth = ceph_auth_is_authenticated(monc->auth);
monc->pending_auth = 0;
ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
msg->front.iov_len,
monc->m_auth->front.iov_base,
monc->m_auth->front_alloc_len);
if (ret > 0) {
__send_prepared_auth_request(monc, ret);
goto out;
}
finish_hunting(monc);
if (ret < 0) {
monc->client->auth_err = ret;
wake_up_all(&monc->client->auth_wq);
} else if (ret > 0) {
__send_prepared_auth_request(monc, ret);
} else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
dout("authenticated, starting session\n");
@@ -939,23 +1006,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
__send_subscribe(monc);
__resend_generic_request(monc);
pr_info("mon%d %s session established\n", monc->cur_mon,
ceph_pr_addr(&monc->con.peer_addr.in_addr));
}
if (!had_debugfs_info && have_debugfs_info(monc)) {
pr_info("client%lld fsid %pU\n",
ceph_client_id(monc->client),
&monc->client->fsid);
init_debugfs = 1;
}
out:
mutex_unlock(&monc->mutex);
if (init_debugfs) {
/*
* do debugfs initialization without mutex to avoid
* creating a locking dependency
*/
ceph_debugfs_client_init(monc->client);
}
if (monc->client->auth_err < 0)
wake_up_all(&monc->client->auth_wq);
}
static int __validate_auth(struct ceph_mon_client *monc)
@@ -1096,29 +1155,17 @@ static void mon_fault(struct ceph_connection *con)
{
struct ceph_mon_client *monc = con->private;
if (!monc)
return;
dout("mon_fault\n");
mutex_lock(&monc->mutex);
if (!con->private)
goto out;
if (!monc->hunting)
pr_info("mon%d %s session lost, "
"hunting for new mon\n", monc->cur_mon,
ceph_pr_addr(&monc->con.peer_addr.in_addr));
__close_session(monc);
if (!monc->hunting) {
/* start hunting */
monc->hunting = true;
__open_session(monc);
} else {
/* already hunting, let's wait a bit */
__schedule_delayed(monc);
dout("%s mon%d\n", __func__, monc->cur_mon);
if (monc->cur_mon >= 0) {
if (!monc->hunting) {
dout("%s hunting for new mon\n", __func__);
reopen_session(monc);
__schedule_delayed(monc);
} else {
dout("%s already hunting\n", __func__);
}
}
out:
mutex_unlock(&monc->mutex);
}
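The hunting changes above replace the old fixed retry delay with exponential backoff: every failed attempt multiplies hunt_mult by CEPH_MONC_HUNT_BACKOFF (capped at CEPH_MONC_HUNT_MAX_MULT), a successfully established session halves it back toward 1, and __schedule_delayed() waits CEPH_MONC_HUNT_INTERVAL * hunt_mult while hunting versus CEPH_MONC_PING_INTERVAL otherwise. For example, if the backoff factor were 2 with a cap of 10, successive failures would space retries at 1x, 2x, 4x, 8x and then 10x the base hunt interval until a monitor answers.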

net/ceph/osd_client.c

@@ -338,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref)
ceph_put_snap_context(req->r_snapc);
if (req->r_mempool)
mempool_free(req, req->r_osdc->req_mempool);
else
else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
kmem_cache_free(ceph_osd_request_cache, req);
else
kfree(req);
}
void ceph_osdc_get_request(struct ceph_osd_request *req)
@@ -369,28 +370,22 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_msg *msg;
size_t msg_size;
BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
BUG_ON(num_ops > CEPH_OSD_MAX_OP);
msg_size = 4 + 4 + 8 + 8 + 4+8;
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pg_t */
msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
msg_size += 4;
if (use_mempool) {
BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
req = mempool_alloc(osdc->req_mempool, gfp_flags);
memset(req, 0, sizeof(*req));
} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
} else {
req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
gfp_flags);
}
if (req == NULL)
if (unlikely(!req))
return NULL;
/* req only, each op is zeroed in _osd_req_op_init() */
memset(req, 0, sizeof(*req));
req->r_osdc = osdc;
req->r_mempool = use_mempool;
req->r_num_ops = num_ops;
@@ -408,18 +403,36 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
req->r_base_oloc.pool = -1;
req->r_target_oloc.pool = -1;
msg_size = OSD_OPREPLY_FRONT_LEN;
if (num_ops > CEPH_OSD_SLAB_OPS) {
/* ceph_osd_op and rval */
msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
(sizeof(struct ceph_osd_op) + 4);
}
/* create reply message */
if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
else
msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
OSD_OPREPLY_FRONT_LEN, gfp_flags, true);
msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
gfp_flags, true);
if (!msg) {
ceph_osdc_put_request(req);
return NULL;
}
req->r_reply = msg;
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
msg_size += 4; /* retry_attempt */
/* create request message; allow space for oid */
if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
@@ -498,7 +511,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
payload_len += length;
op->payload_len = payload_len;
op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_extent_init);
@@ -517,10 +530,32 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
BUG_ON(length > previous);
op->extent.length = length;
op->payload_len -= previous - length;
op->indata_len -= previous - length;
}
EXPORT_SYMBOL(osd_req_op_extent_update);
void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
unsigned int which, u64 offset_inc)
{
struct ceph_osd_req_op *op, *prev_op;
BUG_ON(which + 1 >= osd_req->r_num_ops);
prev_op = &osd_req->r_ops[which];
op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
/* dup previous one */
op->indata_len = prev_op->indata_len;
op->outdata_len = prev_op->outdata_len;
op->extent = prev_op->extent;
/* adjust offset */
op->extent.offset += offset_inc;
op->extent.length -= offset_inc;
if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
op->indata_len -= offset_inc;
}
EXPORT_SYMBOL(osd_req_op_extent_dup_last);
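osd_req_op_extent_dup_last() is what lets the CephFS scattered-writeback path split one object write into several extent ops: it copies the previous op into the next slot and shifts it by offset_inc, shrinking its length (and, for writes, indata_len) by the same amount. A hedged sketch of how it combines with osd_req_op_extent_update() to split op 'which' at 'split' bytes (req, which and split are placeholders, and the request must have been allocated with room for the extra op):

        /* op 'which' currently covers [off, off + len) of the object */
        osd_req_op_extent_dup_last(req, which, split);   /* op which+1 = [off + split, off + len) */
        osd_req_op_extent_update(req, which, split);     /* op which   = [off, off + split)       */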
void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *class, const char *method)
{
@@ -554,7 +589,7 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
op->cls.argc = 0; /* currently unused */
op->payload_len = payload_len;
op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -587,7 +622,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
op->xattr.cmp_mode = cmp_mode;
ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
op->payload_len = payload_len;
op->indata_len = payload_len;
return 0;
}
EXPORT_SYMBOL(osd_req_op_xattr_init);
@@ -707,7 +742,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
dst->cls.indata_len = cpu_to_le32(data_length);
ceph_osdc_msg_data_add(req->r_request, osd_data);
src->payload_len += data_length;
src->indata_len += data_length;
request_data_len += data_length;
}
osd_data = &src->cls.response_data;
@@ -750,7 +785,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->op = cpu_to_le16(src->op);
dst->flags = cpu_to_le32(src->flags);
dst->payload_len = cpu_to_le32(src->payload_len);
dst->payload_len = cpu_to_le32(src->indata_len);
return request_data_len;
}
@@ -1810,7 +1845,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
ceph_decode_need(&p, end, 4, bad_put);
numops = ceph_decode_32(&p);
if (numops > CEPH_OSD_MAX_OP)
if (numops > CEPH_OSD_MAX_OPS)
goto bad_put;
if (numops != req->r_num_ops)
goto bad_put;
@@ -1821,7 +1856,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
int len;
len = le32_to_cpu(op->payload_len);
req->r_reply_op_len[i] = len;
req->r_ops[i].outdata_len = len;
dout(" op %d has %d bytes\n", i, len);
payload_len += len;
p += sizeof(*op);
@@ -1836,7 +1871,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
retry_attempt = ceph_decode_32(&p);
for (i = 0; i < numops; i++)
req->r_reply_op_result[i] = ceph_decode_32(&p);
req->r_ops[i].rval = ceph_decode_32(&p);
if (le16_to_cpu(msg->hdr.version) >= 6) {
p += 8 + 4; /* skip replay_version */
@@ -2187,7 +2222,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
goto bad;
done:
downgrade_write(&osdc->map_sem);
ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
osdc->osdmap->epoch);
/*
* subscribe to subsequent osdmap updates if full to ensure
@@ -2646,8 +2682,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
round_jiffies_relative(osdc->client->options->osd_idle_ttl));
err = -ENOMEM;
osdc->req_mempool = mempool_create_kmalloc_pool(10,
sizeof(struct ceph_osd_request));
osdc->req_mempool = mempool_create_slab_pool(10,
ceph_osd_request_cache);
if (!osdc->req_mempool)
goto out;
@@ -2782,11 +2818,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages);
int ceph_osdc_setup(void)
{
size_t size = sizeof(struct ceph_osd_request) +
CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
BUG_ON(ceph_osd_request_cache);
ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
sizeof (struct ceph_osd_request),
__alignof__(struct ceph_osd_request),
0, NULL);
ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
0, 0, NULL);
return ceph_osd_request_cache ? 0 : -ENOMEM;
}
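Tying the osd_client pieces together: the slab object created here has room for a request plus CEPH_OSD_SLAB_OPS inline ops, so ceph_osdc_alloc_request() takes the mempool for mempool-backed requests, the slab for anything with at most CEPH_OSD_SLAB_OPS ops, and a plain kmalloc sized for the flexible r_ops[] array for anything larger; ceph_osdc_release_request() frees along the same three paths. Condensed from the hunks above (not a verbatim copy):

        /* allocation */
        if (use_mempool)                        /* num_ops must fit the slab object */
                req = mempool_alloc(osdc->req_mempool, gfp_flags);
        else if (num_ops <= CEPH_OSD_SLAB_OPS)
                req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
        else                                    /* large request: size r_ops[] exactly */
                req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
                              gfp_flags);
        if (unlikely(!req))
                return NULL;
        memset(req, 0, sizeof(*req));   /* ops are zeroed later, in _osd_req_op_init() */

        /* release must mirror the allocation */
        if (req->r_mempool)
                mempool_free(req, req->r_osdc->req_mempool);
        else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
                kmem_cache_free(ceph_osd_request_cache, req);
        else
                kfree(req);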