/* This program is designed to test mmap file * reads and iovec writes as per cyrus imap's * skiplist database usage. It emulates the * behaviour of a skiplist database (sans * fcntl locking) across a file checkpoint */ #include #include #include #include #include #include #include #include #include #include /* use some definitions out of the cyrus skiplist * code to allow identical behaviour */ #define SKIPLIST_HEADER "\241\002\213\015skiplist file\0\0\0" typedef unsigned int bit32; #define WRITEV_ADD_TO_IOVEC(iov, num_iov, s, len) \ do { (iov)[(num_iov)].iov_base = (s); \ (iov)[(num_iov)++].iov_len = (len); } while (0) /* bump to the next multiple of 4 bytes */ #define ROUNDUP(num) (((num) + 3) & 0xFFFFFFFC) bit32 zero; bit32 inorder; bit32 add; bit32 delete; bit32 commit; bit32 dummy; bit32 max; #define keylen 16 char *key; #define vallen 6506 char *val; void setup_vars() { int i; /* skiplist magic values */ zero = htonl(0); inorder = htonl(1); add = htonl(2); delete = htonl(4); commit = htonl(255); dummy = htonl(257); max = htonl((bit32)-1); /* our key and value */ key = malloc(keylen * sizeof(char)); for (i = 0; i < keylen; i++) { key[i] = 'k'; } val = malloc(vallen * sizeof(char)); for (i = 0; i < vallen; i++) { val[i] = 'b'; } } void write_dummy(int fd) { char buffer[96]; int i; for (i = 0; i < 96; i++) buffer[i] = 0; *(bit32 *)(buffer + 0) = dummy; /* type */ *(bit32 *)(buffer + 92) = max; /* end ptr list */ lseek(fd, 48, 0); write(fd, buffer, 96); } void write_header(int fd) { char buffer[48]; int i; for (i = 0; i < 48; i++) buffer[i] = 0; strncpy(buffer, SKIPLIST_HEADER, 20); *(bit32 *)(buffer + 20) = htonl((bit32) 1); /* version */ *(bit32 *)(buffer + 24) = htonl((bit32) 2); /* minorversion */ *(bit32 *)(buffer + 28) = htonl((bit32) 20); /* maxlevel */ *(bit32 *)(buffer + 32) = htonl((bit32) 1); /* curlevel */ *(bit32 *)(buffer + 36) = htonl((bit32) 0); /* numrecords */ *(bit32 *)(buffer + 40) = htonl((bit32)144); /* length at last checkpoint */ *(bit32 *)(buffer + 44) = htonl((bit32) 0); /* last recovery timestamp */ lseek(fd, 0, 0); write(fd, buffer, 48); } int main(int argc, char **argv) { /* files */ char filename[1024]; int fd; char newfilename[1024]; int newfd; /* io vector */ struct iovec iov[50]; int num_iov; /* mmap stuff */ char *mapbase; int mapsize; int maplen; /* network order values */ bit32 netkeylen; bit32 netvallen; bit32 netoffset; /* misc ints */ int i; int offset; int newoffset; int size; int magic; setup_vars(); if (argc <= 1) { printf("NEED A FILE\n"); exit(1); } strncpy(filename, argv[1], sizeof(filename)); unlink(filename); fd = open(filename, O_RDWR | O_CREAT, 0666); /* STAGE 1 - create a "pre-checkpointed" file that looks just like * a real seen file before it gets updated */ write_header(fd); write_dummy(fd); fdatasync(fd); /* mmap the file now */ maplen = 1024 * 16; mapbase = (char *)mmap((caddr_t)0, maplen, PROT_READ, MAP_SHARED, fd, 0L); /* create the initial checkpointed record */ num_iov = 0; WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&inorder, 4); /* header */ /* key */ netkeylen = htonl(keylen); WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netkeylen, 4); /* length */ WRITEV_ADD_TO_IOVEC(iov, num_iov, key, keylen); /* data */ if (ROUNDUP(keylen) > keylen) { /* need to pad */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, ROUNDUP(keylen) - keylen); } /* value */ netvallen = htonl(vallen); WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netvallen, 4); /* length */ WRITEV_ADD_TO_IOVEC(iov, num_iov, val, vallen); /* data */ if (ROUNDUP(vallen) > vallen) { /* need to pad */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, ROUNDUP(vallen) - vallen); } /* only one pointer, and then finish the record off */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, 4); /* pointer */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&max, 4); /* end record */ /* write the record */ lseek(fd, 0, 2); writev(fd, iov, num_iov); /* update the dummy pointer */ lseek(fd, 60, 0); netoffset = htonl(144); write(fd, (char *)&netoffset, 4); /* update the header */ netoffset = htonl(0x1a20); lseek(fd, 40, 0); write(fd, (char *)&netoffset, 4); /* sync the record to disk */ fdatasync(fd); /* CHECKPOINT 1 - the file on disk is now 100% identical to * the file that would exist after loading the data above * into a cyrus skiplist file and running the checkpoint code * (ok, except the timestamp) */ /* Cyrus would check the MMAP at this point and discover it * still fits all the data within the "slot", so it doesn't * actually remap anything */ /* STAGE 2 - write a new record - starting by finding the * pointers and deleting the old record. Simplified for * a level 1 node here. This appends to the file. */ /* find the pointer record - Cyrus does this, so do we */ for (i = 20; i > 0; i--) { offset = ntohl(*(bit32 *)(mapbase + 60 + 4*(i-1))); if (offset) break; } /* test that the keys match */ if (strncmp(mapbase + offset + 4 + 4, key, keylen)) { printf("ODD, key wasn't found\n"); } num_iov = 0; /* add the delete record */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&delete, 4); /* header */ netoffset = htonl(offset); WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netoffset, 4); /* offset to fix */ /* move offset along - this is "NEXT" of deleted record, will * always be zero because there's only one record */ if (offset) { int klen = ROUNDUP(ntohl(*(bit32 *)(mapbase + offset + 4))); int vlen = ROUNDUP(ntohl(*(bit32 *)(mapbase + offset + 4 + 4 + klen))); offset = ntohl(*(bit32 *)(mapbase + offset + 4 + 4 + klen + 4 + vlen)); } if (offset != 0) { printf("ODD, offset isn't 0 after deleting record\n"); } /* mark the deletion */ lseek(fd, 60, 0); write(fd, (char *)&zero, 4); /* set up the new record */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&add, 4); /* header */ /* key */ netkeylen = htonl(keylen); WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netkeylen, 4); /* length */ WRITEV_ADD_TO_IOVEC(iov, num_iov, key, keylen); /* data */ if (ROUNDUP(keylen) > keylen) { /* need to pad */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, ROUNDUP(keylen) - keylen); } /* value */ netvallen = htonl(vallen); WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&netvallen, 4); /* length */ WRITEV_ADD_TO_IOVEC(iov, num_iov, val, vallen); /* data */ if (ROUNDUP(vallen) > vallen) { /* need to pad */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, ROUNDUP(vallen) - vallen); } /* only one pointer, and then finish the record off */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&zero, 4); /* pointer */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&max, 4); /* end record */ /* write the record */ offset = lseek(fd, 0, 2); writev(fd, iov, num_iov); /* update the dummy pointer */ lseek(fd, 60, 0); netoffset = htonl(offset + 8); /* skip the delete */ write(fd, (char *)&netoffset, 4); fdatasync(fd); /* commit record goes on the end of the file only once all * data changes are fsynced to disk */ lseek(fd, 0, 2); write(fd, (char *)&commit, 4); fdatasync(fd); /* CHECKPOINT 2 - we've finished writing and committing the new record, * and the Cyrus skiplist would checkpoint this file now. * We simulate that here */ /* again - we fit insize the mmap space, so it doesn't get refreshed */ /* STAGE 3 - perform a cyrus checkpoint */ if (argc > 2) { /* PERFORM MAGIC - access the mmap in order */ for (i = 0; i < maplen; i++) { magic ^= mapbase[i]; } } /* stream to a new file */ snprintf(newfilename, sizeof(newfilename), "%s.NEW", filename); newfd = open(newfilename, O_RDWR | O_CREAT, 0666); ftruncate(newfd, 0); /* dummy goes first, header gets written last */ write_dummy(newfd); lseek(newfd, 0, 2); /* 144 */ /* again, Cyrus finds the record first */ for (i = 20; i > 0; i--) { offset = ntohl(*(bit32 *)(mapbase + 60 + 4*(i-1))); if (offset) break; } /* and calculates the length by reading the lengths of the * various components and summing them up, finally walking * the pointers until it hits a "-1" */ if (offset) { int klen = ROUNDUP(ntohl(*(bit32 *)(mapbase + offset + 4))); int vlen = ROUNDUP(ntohl(*(bit32 *)(mapbase + offset + 4 + 4 + klen))); size = 4 + 4 + klen + 4 + vlen; while (ntohl(*(bit32 *)(mapbase + offset + size)) != -1) { size += 4; } size += 4; /* final -1 pointer */ } else { /* didn't find the record */ printf("Failed to find the record, dying\n"); exit(1); } num_iov = 0; /* write the entire record, but change the TYPE field from * whatever it was (ADD or INORDER) to INORDER only */ WRITEV_ADD_TO_IOVEC(iov, num_iov, (char *)&inorder, 4); /* header */ WRITEV_ADD_TO_IOVEC(iov, num_iov, mapbase + offset + 4, size - 4); /* data */ writev(newfd, iov, num_iov); /* write the forward pointer */ lseek(newfd, 60, 0); netoffset = htonl(144); write(newfd, (char *)&netoffset, 4); /* write the zeros - first the one in the new record */ lseek(newfd, mapbase + offset + size - 8, 0); write(newfd, (char *)&zero, 4); /* then the rest into the dummy */ for (i = 1; i < 20; i++) { lseek(newfd, 60 + 4*i, 0); write(newfd, (char *)&zero, 4); } /* write the header last */ write_header(fd); close(newfd); close(fd); exit(0); }