diff --git a/src/client/sysint/sys-osd-io.sm b/src/client/sysint/sys-osd-io.sm index 7395c8c..07c6ae3 100644 --- a/src/client/sysint/sys-osd-io.sm +++ b/src/client/sysint/sys-osd-io.sm @@ -96,7 +96,6 @@ static int osd_io_init(struct PINT_smcb *smcb, job_status_s *js_p) PVFS_object_attr *attr = &sm_p->getattr.attr; int i, ret; - ret = PINT_msgpairarray_init(&sm_p->msgarray, io->datafile_count); if (ret) goto out; @@ -179,65 +178,49 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) struct osd_command *command; struct bsg_iovec *iov; - /* temp space for output from request processing */ + /* clients offset and length pairs */ PVFS_offset offseta[OSD_INIT_MAX_IOVEC]; PVFS_size sizea[OSD_INIT_MAX_IOVEC]; + int csegs_count; + PVFS_size cagg_len; + + /* temporary space for querying offset length pairs */ + PVFS_offset temp_offset[OSD_INIT_MAX_IOVEC]; + PVFS_size temp_size[OSD_INIT_MAX_IOVEC]; - /* output from running server's view of request processing */ + /* targets offset and length pairs */ PVFS_offset target_offset[OSD_INIT_MAX_IOVEC]; PVFS_size target_size[OSD_INIT_MAX_IOVEC]; + /* total bytes to write across all io servers */ PVFS_size sbytemax = io->mem_req->aggregate_size; - /* kernel SCSI layer can only handle so many pages at once */ - if (sbytemax > KERNEL_BUFSIZE) - sbytemax = KERNEL_BUFSIZE; - - PINT_Request_result sresult = { - .offset_array = target_offset, - .size_array = target_size, - .segmax = ARRAY_SIZE(target_offset), - }; - PINT_Request_result cresult = { - .offset_array = offseta, - .size_array = sizea, - .segmax = ARRAY_SIZE(offseta), - }; + PVFS_size this_aggregate_size = 0; + + /* structures to pass into req proc engine */ + PINT_Request_result sresult, cresult; - /* - * Cannot send the distribution to the server on OSDs, unlike with - * normal PVFS servers. So we have to generate file-contiguous - * chunks for each server. We do this in two steps, suggested by Phil: - * 1. run server-side processing with segmax = 1, to generate the - * number of bytes in the contiguous segment. - * 2. run client-side processing with bytemax = what we got from step - * 1. Set segmax to the client-side max. - * Iterate. - * One flaw with this approach is that looking for a single segment - * on the server will not cross distribution chunk boundaries, i.e. - * if you have a contiguous memory region that gets simply striped at - * 64k onto two servers, each write will be exactly 64k, although you - * could have done a gather iovec on the client to write all of each - * servers' chunks in one go, as they end up being contig in file on - * each respective server. - * - * Instead of following the above new plan is to - * 1. run server-side processing with segmax = many to generate as much - * as possible for that server - * 2. run client-side processing with same bytemax as 1 to get multiple - * segments. - * 3. build an OSD CDB with SGL headers and set options byte appropriately - * 4. since we have to process each segment we can watch to see if they are - * the same size/stride for our optimized case - */ - for (i=0; iu.io.datafile_count; i++) { + + /* kernel SCSI layer can only handle so many pages at once */ + if (sbytemax > KERNEL_BUFSIZE) { + gossip_err("%s: Kernel SCSI buffer too small\n", __func__); + ret = -ENOMEM; + goto out; + } + + /* for each datafile/io server/osd write ALL of its data */ + for (i = 0; i < sm_p->u.io.datafile_count; i++) { uint8_t *p; uint32_t len; PVFS_offset cur_offset; - PVFS_handle datafile_handle = attr->u.meta.dfile_array[ - io->datafile_index_array[i]]; + PVFS_handle datafile_handle; + int j; + + datafile_handle = attr->u.meta.dfile_array[io->datafile_index_array[i]]; if (PINT_REQUEST_DONE(&io->file_req_state[i]) || io->short_read[i]) { + gossip_debug(GOSSIP_IO_DEBUG, "%s: Nothing to do for server %d\n", + __func__, i); sm_p->msgarray[i].suppress = 1; /* disable this entry */ continue; } @@ -248,47 +231,99 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) cur_offset = io->file_req_state[i].target_offset; gossip_debug(GOSSIP_IO_DEBUG, - "%s: %d: frs type %lld target %lld final %lld cur %lld\n", + "%s: %d: frs type %lld target %lld final %lld cur %lld bytemax %lld\n", __func__, i, lld(io->file_req_state[i].type_offset), lld(io->file_req_state[i].target_offset), - lld(io->file_req_state[i].final_offset), lld(cur_offset)); + lld(io->file_req_state[i].final_offset), + lld(cur_offset), + lld(sbytemax)); - /* - * Used to: Run the server side to find a single segment. All we care - * about is the resulting size. --- Now: get all segments - */ - sresult.segs = 0; + PINT_REQUEST_STATE_RST(io->temp_req_state); + + /* */ + /* figure out servers offset/lengths should get all in one go */ + /* */ + sresult.segs = 0; sresult.bytes = 0; - sresult.bytemax = sbytemax; - PINT_REQUEST_STATE_RST(io->temp_req_state); - io->temp_req_state->target_offset = cur_offset; + sresult.bytemax = sbytemax; /* request full size */ + sresult.offset_array = target_offset; + sresult.size_array = target_size; + sresult.segmax = OSD_INIT_MAX_IOVEC; + + /* set up the io req state to pass in */ + io->temp_req_state->target_offset = cur_offset; /*may or may not be 0 */ io->temp_req_state->final_offset = io->file_req_state[i].final_offset; - ret = PINT_process_request(io->temp_req_state, NULL, &io->file_data[i], + + ret = PINT_process_request(io->temp_req_state, NULL, &io->file_data[i], &sresult, PINT_SERVER); if (ret) { - gossip_err("%s: server %d process failed\n", __func__, i); + gossip_err("%s: server %d process_request call failed\n", + __func__, i); goto out; } + this_aggregate_size += sresult.bytes; + + #if 0 + printf("SERVER INFO [%d]:\n", i); + printf("Total size %lld \n", lld(sresult.bytes)); + for (j = 0; j < sresult.segs; j++) { + printf("offset= %lld length= %lld\n", lld(target_offset[j]), + lld(target_size[j])); + } + #endif + + /* */ + /* figure out clinets offset/lengths one stripe at a time */ + /* */ + csegs_count = 0; + cagg_len = 0; + + while (cagg_len != sresult.bytes) { + cresult.segs = 0; + cresult.bytes = 0; + cresult.bytemax = sresult.bytes; + cresult.offset_array = temp_offset; + cresult.size_array = temp_size; + cresult.segmax = OSD_INIT_MAX_IOVEC - csegs_count; + + ret = PINT_process_request(&io->file_req_state[i], + &io->mem_req_state[i], &io->file_data[i], + &cresult, PINT_CLIENT); + if (ret) { + gossip_err("%s: client %d process_request call failed\n", + __func__, i); + goto out; + } - /* - * Run the client side to fill the segments. - */ - cresult.segs = 0; - cresult.bytes = 0; - cresult.bytemax = sresult.bytes; - ret = PINT_process_request(&io->file_req_state[i], - &io->mem_req_state[i], &io->file_data[i], - &cresult, PINT_CLIENT); - if (ret) { - gossip_err("%s: client %d process failed\n", __func__, i); - goto out; - } + /* now move the results for this strip to the perm array */ + for (j = 0; j < cresult.segs; j++) { + offseta[csegs_count + j] = temp_offset[j]; + sizea[csegs_count + j] = temp_size[j]; + } + csegs_count += cresult.segs; + cagg_len += cresult.bytes; + + if(cagg_len > sresult.bytes) { + gossip_err("%s: Client Agg len too big\n", __func__); + ret = -EINVAL; + goto out; + } + } + + #if 0 + printf("CLIENT INFO [%d]:\n", i); + printf("Total size %lld\n", lld(cagg_len)); + for( j = 0; j < csegs_count; j++ ) { + printf("offset= %lld length= %lld\n", lld(offseta[j]), + lld(sizea[j])); + } + #endif gossip_debug(GOSSIP_IO_DEBUG, "%s: %d: %d Server Segments, %lld bytes\n", __func__, i, sresult.segs, lld(sresult.bytes)); gossip_debug(GOSSIP_IO_DEBUG, "%s: %d: %d Client Segments, %lld bytes\n", - __func__, i, cresult.segs, lld(cresult.bytes)); + __func__, i, csegs_count, lld(cagg_len)); command = &sm_p->msgarray[i].osd_command; @@ -300,21 +335,22 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) p = io->buffer; if (sresult.segs == 1) { /* contiguous server buff to write to */ - if (cresult.segs == 1) { + if (csegs_count == 1) { p += offseta[0]; len = sizea[0]; - cresult.segs = 0; /* do not build a 1-unit iovec */ - } else if (cresult.segs > 1) { - iov = malloc(cresult.segs * sizeof(*iov)); + csegs_count = 0; /* do not build a 1-unit iovec */ + } else if (csegs_count > 1) { + iov = malloc(csegs_count * sizeof(*iov)); if (iov == NULL) { ret = -ENOMEM; goto out; } len = 0; - for (i=0; imsgarray[i].osd_iov = p; /* free IOV later */ @@ -329,14 +365,14 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) len, 0); command->indata = p; command->inlen_alloc = len; - command->iov_inlen = cresult.segs; + command->iov_inlen = csegs_count; } else if (io->io_type == PVFS_IO_WRITE) { osd_command_set_write(command, PVFS_OSD_DATA_PID, datafile_handle, len, 0); command->outdata = p; command->outlen = len; - command->iov_outlen = cresult.segs; + command->iov_outlen = csegs_count; } } else { /* either need a SGL or optimized SGL */ @@ -374,11 +410,11 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) total_len += segl; } - //flag=0; /* force SGL mode always so we can compare to VEC later */ + //~ flag=0; /* force SGL mode always so we can compare to VEC later */ /* move this into a config file or something -- remove eventually*/ if (io->io_type == PVFS_IO_WRITE) { - iov = malloc((cresult.segs + 1) * sizeof(*iov)); + iov = malloc((csegs_count + 1) * sizeof(*iov)); if (iov == NULL) { ret = -ENOMEM; goto out; @@ -427,7 +463,7 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) iov[0].iov_base = (uintptr_t)sgl; iov[0].iov_len = ddt_size; - for (j=1; j<=cresult.segs; j++) { + for (j=1; j<=csegs_count; j++) { iov[j].iov_base = (uintptr_t)(p + offseta[j-1]); iov[j].iov_len = sizea[j-1]; } @@ -445,7 +481,7 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) len, 0); command->outdata = p; command->outlen = len; - command->iov_outlen = cresult.segs+1; + command->iov_outlen = csegs_count+1; if (flag) osd_command_set_ddt(command, DDT_VEC); @@ -453,12 +489,12 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) osd_command_set_ddt(command, DDT_SGL); } else if (io->io_type == PVFS_IO_READ) { - if (cresult.segs == 1) { + if (csegs_count == 1) { p += offseta[0]; len = sizea[0]; - cresult.segs = 0; /* do not build a 1-unit iovec */ - } else if (cresult.segs > 1) { - iov = malloc(cresult.segs * sizeof(*iov)); + csegs_count = 0; /* do not build a 1-unit iovec */ + } else if (csegs_count > 1) { + iov = malloc(csegs_count * sizeof(*iov)); if (iov == NULL) { ret = -ENOMEM; goto out; @@ -466,7 +502,7 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) len = 0; sm_p->msgarray[i].osd_iov = iov; /* free sgl later */ - for (i=0; iindata = p; command->inlen_alloc = len; - command->iov_inlen = cresult.segs; + command->iov_inlen = csegs_count; len = 0; @@ -535,12 +571,21 @@ static int osd_io_setup_msgpairs(struct PINT_smcb *smcb, job_status_s *js_p) } - } + + + gossip_debug(GOSSIP_IO_DEBUG, "%s: Aggreagte Written: %lld\n", __func__, + lld(this_aggregate_size)); + + if (this_aggregate_size != sbytemax) { + gossip_err("%s: Did not get correct amount of data\n", __func__); + ret = -EINVAL; + } + out: - js_p->error_code = ret; + js_p->error_code = ret; return 1; } @@ -563,7 +608,6 @@ static int osd_io_completion_fn( - /* fixup short read error */ if (io->io_type == PVFS_IO_READ) { if (command->status == SAM_STAT_CHECK_CONDITION) { @@ -653,7 +697,6 @@ static int osd_io_maybe_xfer_more(struct PINT_smcb *smcb, job_status_s *js_p) /* only need to look at the first one */ //~ struct osd_command *command = &sm_p->msgarray[0].osd_command; - gossip_debug(GOSSIP_IO_DEBUG, "%s: total %lld want %lld.\n", __func__, lld(io->total_size), lld(io->mem_req->aggregate_size)); @@ -710,7 +753,6 @@ static int osd_io_analyze_results(struct PINT_smcb *smcb, job_status_s *js_p) struct PINT_client_sm *sm_p = PINT_sm_frame(smcb, PINT_FRAME_CURRENT); PVFS_offset eor, filereq_ub_offset; int ret = 0; - gossip_debug(GOSSIP_IO_DEBUG, "%s: total bytes transferred %lld\n", __func__, lld(sm_p->u.io.total_size)); @@ -768,7 +810,6 @@ static int osd_io_cleanup(struct PINT_smcb *smcb, job_status_s *js_p) { struct PINT_client_sm *sm_p = PINT_sm_frame(smcb, PINT_FRAME_CURRENT); struct PINT_client_io_sm *io = &sm_p->u.io; - free(sm_p->msgarray); sm_p->msgarray = NULL; sm_p->msgarray_count = 0;