| 1 | /*
|
|---|
| 2 | Unix SMB/CIFS implementation.
|
|---|
| 3 |
|
|---|
| 4 | trivial database library
|
|---|
| 5 |
|
|---|
| 6 | Copyright (C) Andrew Tridgell 2005
|
|---|
| 7 |
|
|---|
| 8 | ** NOTE! The following LGPL license applies to the tdb
|
|---|
| 9 | ** library. This does NOT imply that all of Samba is released
|
|---|
| 10 | ** under the LGPL
|
|---|
| 11 |
|
|---|
| 12 | This library is free software; you can redistribute it and/or
|
|---|
| 13 | modify it under the terms of the GNU Lesser General Public
|
|---|
| 14 | License as published by the Free Software Foundation; either
|
|---|
| 15 | version 3 of the License, or (at your option) any later version.
|
|---|
| 16 |
|
|---|
| 17 | This library is distributed in the hope that it will be useful,
|
|---|
| 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|---|
| 20 | Lesser General Public License for more details.
|
|---|
| 21 |
|
|---|
| 22 | You should have received a copy of the GNU Lesser General Public
|
|---|
| 23 | License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
|---|
| 24 | */
|
|---|
| 25 |
|
|---|
| 26 | #include "tdb_private.h"
|
|---|
| 27 |
|
|---|
| 28 | /*
|
|---|
| 29 | transaction design:
|
|---|
| 30 |
|
|---|
| 31 | - only allow a single transaction at a time per database. This makes
|
|---|
| 32 | using the transaction API simpler, as otherwise the caller would
|
|---|
| 33 | have to cope with temporary failures in transactions that conflict
|
|---|
| 34 | with other current transactions
|
|---|
| 35 |
|
|---|
| 36 | - keep the transaction recovery information in the same file as the
|
|---|
| 37 | database, using a special 'transaction recovery' record pointed at
|
|---|
| 38 | by the header. This removes the need for extra journal files as
|
|---|
| 39 | used by some other databases
|
|---|
| 40 |
|
|---|
| 41 | - dynamically allocate the transaction recovery record, re-using it
|
|---|
| 42 | for subsequent transactions. If a larger record is needed then
|
|---|
| 43 | tdb_free() the old record to place it on the normal tdb freelist
|
|---|
| 44 | before allocating the new record
|
|---|
| 45 |
|
|---|
| 46 | - during transactions, keep a linked list of all writes that have
|
|---|
| 47 | been performed by intercepting all tdb_write() calls. The hooked
|
|---|
| 48 | transaction versions of tdb_read() and tdb_write() check this
|
|---|
| 49 | linked list and try to use the elements of the list in preference
|
|---|
| 50 | to the real database.
|
|---|
| 51 |
|
|---|
| 52 | - don't allow any locks to be held when a transaction starts,
|
|---|
| 53 | otherwise we can end up with deadlock (plus lack of lock nesting
|
|---|
| 54 | in posix locks would mean the lock is lost)
|
|---|
| 55 |
|
|---|
| 56 | - if the caller gains a lock during the transaction but doesn't
|
|---|
| 57 | release it then fail the commit
|
|---|
| 58 |
|
|---|
| 59 | - allow for nested calls to tdb_transaction_start(), re-using the
|
|---|
| 60 | existing transaction record. If the inner transaction is cancelled
|
|---|
| 61 | then a subsequent commit will fail
|
|---|
| 62 |
|
|---|
| 63 | - keep a mirrored copy of the tdb hash chain heads to allow for the
|
|---|
| 64 | fast hash heads scan on traverse, updating the mirrored copy in
|
|---|
| 65 | the transaction version of tdb_write
|
|---|
| 66 |
|
|---|
| 67 | - allow callers to mix transaction and non-transaction use of tdb,
|
|---|
| 68 | although once a transaction is started then an exclusive lock is
|
|---|
| 69 | gained until the transaction is committed or cancelled
|
|---|
| 70 |
|
|---|
| 71 | - the commit strategy involves first saving away all modified data
|
|---|
| 72 | into a linearised buffer in the transaction recovery area, then
|
|---|
| 73 | marking the transaction recovery area with a magic value to
|
|---|
| 74 | indicate a valid recovery record. In total 4 fsync/msync calls are
|
|---|
| 75 | needed per commit to prevent race conditions. It might be possible
|
|---|
| 76 | to reduce this to 3 or even 2 with some more work.
|
|---|
| 77 |
|
|---|
| 78 | - check for a valid recovery record on open of the tdb, while the
|
|---|
| 79 | global lock is held. Automatically recover from the transaction
|
|---|
| 80 | recovery area if needed, then continue with the open as
|
|---|
| 81 | usual. This allows for smooth crash recovery with no administrator
|
|---|
| 82 | intervention.
|
|---|
| 83 |
|
|---|
| 84 | - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
|
|---|
| 85 | still available, but no transaction recovery area is used and no
|
|---|
| 86 | fsync/msync calls are made.
|
|---|
| 87 |
|
|---|
| 88 | - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using
|
|---|
| 89 | tdb_add_flags() transaction nesting is enabled.
|
|---|
| 90 | It resets the TDB_DISALLOW_NESTING flag, as both cannot be used together.
|
|---|
| 91 | The default is that transaction nesting is allowed.
|
|---|
| 92 | Note: this default may change in future versions of tdb.
|
|---|
| 93 |
|
|---|
| 94 | Beware: when transactions are nested, a transaction successfully
|
|---|
| 95 | completed with tdb_transaction_commit() can be silently unrolled later.
|
|---|
| 96 |
|
|---|
| 97 | - if TDB_DISALLOW_NESTING is passed to flags in tdb open, or added using
|
|---|
| 98 | tdb_add_flags() transaction nesting is disabled.
|
|---|
| 99 | It resets the TDB_ALLOW_NESTING flag, as both cannot be used together.
|
|---|
| 100 | An attempt to create a nested transaction will fail with TDB_ERR_NESTING.
|
|---|
| 101 | The default is that transaction nesting is allowed.
|
|---|
| 102 | Note: this default may change in future versions of tdb.
|
|---|
| 103 | */
|
|---|
| 104 |
|
|---|
| 105 |
|
|---|
| 106 | /*
|
|---|
| 107 | hold the context of any current transaction
|
|---|
| 108 | */
|
|---|
| 109 | struct tdb_transaction {
|
|---|
| 110 | /* we keep a mirrored copy of the tdb hash heads here so
|
|---|
| 111 | tdb_next_hash_chain() can operate efficiently */
|
|---|
| 112 | uint32_t *hash_heads;
|
|---|
| 113 |
|
|---|
| 114 | /* the original io methods - used to do IOs to the real db */
|
|---|
| 115 | const struct tdb_methods *io_methods;
|
|---|
| 116 |
|
|---|
| 117 | /* the list of transaction blocks. When a block is first
|
|---|
| 118 | written to, it gets created in this list */
|
|---|
| 119 | uint8_t **blocks;
|
|---|
| 120 | uint32_t num_blocks;
|
|---|
| 121 | uint32_t block_size; /* bytes in each block */
|
|---|
| 122 | uint32_t last_block_size; /* number of valid bytes in the last block */
|
|---|
| 123 |
|
|---|
| 124 | /* non-zero when an internal transaction error has
|
|---|
| 125 | occurred. All write operations will then fail until the
|
|---|
| 126 | transaction is ended */
|
|---|
| 127 | int transaction_error;
|
|---|
| 128 |
|
|---|
| 129 | /* when inside a transaction we need to keep track of any
|
|---|
| 130 | nested tdb_transaction_start() calls, as these are allowed,
|
|---|
| 131 | but don't create a new transaction */
|
|---|
| 132 | int nesting;
|
|---|
| 133 |
|
|---|
| 134 | /* set when a prepare has already occurred */
|
|---|
| 135 | bool prepared;
|
|---|
| 136 | tdb_off_t magic_offset;
|
|---|
| 137 |
|
|---|
| 138 | /* set when the GLOBAL_LOCK has been taken */
|
|---|
| 139 | bool global_lock_taken;
|
|---|
| 140 |
|
|---|
| 141 | /* old file size before transaction */
|
|---|
| 142 | tdb_len_t old_map_size;
|
|---|
| 143 |
|
|---|
| 144 | /* we should re-pack on commit */
|
|---|
| 145 | bool need_repack;
|
|---|
| 146 | };
|
|---|
| 147 |
|
|---|
| 148 |
|
|---|
| 149 | /*
|
|---|
| 150 | read while in a transaction. We need to check first if the data is in our list
|
|---|
| 151 | of transaction elements, then if not do a real read
|
|---|
| 152 | */
|
|---|
| 153 | static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
|
|---|
| 154 | tdb_len_t len, int cv)
|
|---|
| 155 | {
|
|---|
| 156 | uint32_t blk;
|
|---|
| 157 |
|
|---|
| 158 | /* break it down into block sized ops */
|
|---|
| 159 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
|
|---|
| 160 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
|
|---|
| 161 | if (transaction_read(tdb, off, buf, len2, cv) != 0) {
|
|---|
| 162 | return -1;
|
|---|
| 163 | }
|
|---|
| 164 | len -= len2;
|
|---|
| 165 | off += len2;
|
|---|
| 166 | buf = (void *)(len2 + (char *)buf);
|
|---|
| 167 | }
|
|---|
| 168 |
|
|---|
| 169 | if (len == 0) {
|
|---|
| 170 | return 0;
|
|---|
| 171 | }
|
|---|
| 172 |
|
|---|
| 173 | blk = off / tdb->transaction->block_size;
|
|---|
| 174 |
|
|---|
| 175 | /* see if we have it in the block list */
|
|---|
| 176 | if (tdb->transaction->num_blocks <= blk ||
|
|---|
| 177 | tdb->transaction->blocks[blk] == NULL) {
|
|---|
| 178 | /* nope, do a real read */
|
|---|
| 179 | if (tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv) != 0) {
|
|---|
| 180 | goto fail;
|
|---|
| 181 | }
|
|---|
| 182 | return 0;
|
|---|
| 183 | }
|
|---|
| 184 |
|
|---|
| 185 | /* it is in the block list. Now check for the last block */
|
|---|
| 186 | if (blk == tdb->transaction->num_blocks-1) {
|
|---|
| 187 | if (len > tdb->transaction->last_block_size) {
|
|---|
| 188 | goto fail;
|
|---|
| 189 | }
|
|---|
| 190 | }
|
|---|
| 191 |
|
|---|
| 192 | /* now copy it out of this block */
|
|---|
| 193 | memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len);
|
|---|
| 194 | if (cv) {
|
|---|
| 195 | tdb_convert(buf, len);
|
|---|
| 196 | }
|
|---|
| 197 | return 0;
|
|---|
| 198 |
|
|---|
| 199 | fail:
|
|---|
| 200 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%d len=%d\n", off, len));
|
|---|
| 201 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 202 | tdb->transaction->transaction_error = 1;
|
|---|
| 203 | return -1;
|
|---|
| 204 | }
|
|---|
| 205 |
|
|---|
| 206 |
|
|---|
| 207 | /*
|
|---|
| 208 | write while in a transaction
|
|---|
| 209 | */
|
|---|
| 210 | static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
|
|---|
| 211 | const void *buf, tdb_len_t len)
|
|---|
| 212 | {
|
|---|
| 213 | uint32_t blk;
|
|---|
| 214 |
|
|---|
| 215 | /* Only a commit is allowed on a prepared transaction */
|
|---|
| 216 | if (tdb->transaction->prepared) {
|
|---|
| 217 | tdb->ecode = TDB_ERR_EINVAL;
|
|---|
| 218 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: transaction already prepared, write not allowed\n"));
|
|---|
| 219 | tdb->transaction->transaction_error = 1;
|
|---|
| 220 | return -1;
|
|---|
| 221 | }
|
|---|
| 222 |
|
|---|
| 223 | /* if the write is to a hash head, then update the transaction
|
|---|
| 224 | hash heads */
|
|---|
| 225 | if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
|
|---|
| 226 | off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
|
|---|
| 227 | uint32_t chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
|
|---|
| 228 | memcpy(&tdb->transaction->hash_heads[chain], buf, len);
|
|---|
| 229 | }
|
|---|
| 230 |
|
|---|
| 231 | /* break it up into block sized chunks */
|
|---|
| 232 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
|
|---|
| 233 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
|
|---|
| 234 | if (transaction_write(tdb, off, buf, len2) != 0) {
|
|---|
| 235 | return -1;
|
|---|
| 236 | }
|
|---|
| 237 | len -= len2;
|
|---|
| 238 | off += len2;
|
|---|
| 239 | if (buf != NULL) {
|
|---|
| 240 | buf = (const void *)(len2 + (const char *)buf);
|
|---|
| 241 | }
|
|---|
| 242 | }
|
|---|
| 243 |
|
|---|
| 244 | if (len == 0) {
|
|---|
| 245 | return 0;
|
|---|
| 246 | }
|
|---|
| 247 |
|
|---|
| 248 | blk = off / tdb->transaction->block_size;
|
|---|
| 249 | off = off % tdb->transaction->block_size;
|
|---|
| 250 |
|
|---|
| 251 | if (tdb->transaction->num_blocks <= blk) {
|
|---|
| 252 | uint8_t **new_blocks;
|
|---|
| 253 | /* expand the blocks array */
|
|---|
| 254 | if (tdb->transaction->blocks == NULL) {
|
|---|
| 255 | new_blocks = (uint8_t **)malloc(
|
|---|
| 256 | (blk+1)*sizeof(uint8_t *));
|
|---|
| 257 | } else {
|
|---|
| 258 | new_blocks = (uint8_t **)realloc(
|
|---|
| 259 | tdb->transaction->blocks,
|
|---|
| 260 | (blk+1)*sizeof(uint8_t *));
|
|---|
| 261 | }
|
|---|
| 262 | if (new_blocks == NULL) {
|
|---|
| 263 | tdb->ecode = TDB_ERR_OOM;
|
|---|
| 264 | goto fail;
|
|---|
| 265 | }
|
|---|
| 266 | memset(&new_blocks[tdb->transaction->num_blocks], 0,
|
|---|
| 267 | (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
|
|---|
| 268 | tdb->transaction->blocks = new_blocks;
|
|---|
| 269 | tdb->transaction->num_blocks = blk+1;
|
|---|
| 270 | tdb->transaction->last_block_size = 0;
|
|---|
| 271 | }
|
|---|
| 272 |
|
|---|
| 273 | /* allocate and fill a block? */
|
|---|
| 274 | if (tdb->transaction->blocks[blk] == NULL) {
|
|---|
| 275 | tdb->transaction->blocks[blk] = (uint8_t *)calloc(tdb->transaction->block_size, 1);
|
|---|
| 276 | if (tdb->transaction->blocks[blk] == NULL) {
|
|---|
| 277 | tdb->ecode = TDB_ERR_OOM;
|
|---|
| 278 | tdb->transaction->transaction_error = 1;
|
|---|
| 279 | return -1;
|
|---|
| 280 | }
|
|---|
| 281 | if (tdb->transaction->old_map_size > blk * tdb->transaction->block_size) {
|
|---|
| 282 | tdb_len_t len2 = tdb->transaction->block_size;
|
|---|
| 283 | if (len2 + (blk * tdb->transaction->block_size) > tdb->transaction->old_map_size) {
|
|---|
| 284 | len2 = tdb->transaction->old_map_size - (blk * tdb->transaction->block_size);
|
|---|
| 285 | }
|
|---|
| 286 | if (tdb->transaction->io_methods->tdb_read(tdb, blk * tdb->transaction->block_size,
|
|---|
| 287 | tdb->transaction->blocks[blk],
|
|---|
| 288 | len2, 0) != 0) {
|
|---|
| 289 | SAFE_FREE(tdb->transaction->blocks[blk]);
|
|---|
| 290 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 291 | goto fail;
|
|---|
| 292 | }
|
|---|
| 293 | if (blk == tdb->transaction->num_blocks-1) {
|
|---|
| 294 | tdb->transaction->last_block_size = len2;
|
|---|
| 295 | }
|
|---|
| 296 | }
|
|---|
| 297 | }
|
|---|
| 298 |
|
|---|
| 299 | /* overwrite part of an existing block */
|
|---|
| 300 | if (buf == NULL) {
|
|---|
| 301 | memset(tdb->transaction->blocks[blk] + off, 0, len);
|
|---|
| 302 | } else {
|
|---|
| 303 | memcpy(tdb->transaction->blocks[blk] + off, buf, len);
|
|---|
| 304 | }
|
|---|
| 305 | if (blk == tdb->transaction->num_blocks-1) {
|
|---|
| 306 | if (len + off > tdb->transaction->last_block_size) {
|
|---|
| 307 | tdb->transaction->last_block_size = len + off;
|
|---|
| 308 | }
|
|---|
| 309 | }
|
|---|
| 310 |
|
|---|
| 311 | return 0;
|
|---|
| 312 |
|
|---|
| 313 | fail:
|
|---|
| 314 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%d len=%d\n",
|
|---|
| 315 | (blk*tdb->transaction->block_size) + off, len));
|
|---|
| 316 | tdb->transaction->transaction_error = 1;
|
|---|
| 317 | return -1;
|
|---|
| 318 | }
|
|---|
| 319 |
|
|---|
| 320 |
|
|---|
| 321 | /*
|
|---|
| 322 | write while in a transaction - this varient never expands the transaction blocks, it only
|
|---|
| 323 | updates existing blocks. This means it cannot change the recovery size
|
|---|
| 324 | */
|
|---|
| 325 | static int transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
|
|---|
| 326 | const void *buf, tdb_len_t len)
|
|---|
| 327 | {
|
|---|
| 328 | uint32_t blk;
|
|---|
| 329 |
|
|---|
| 330 | /* break it up into block sized chunks */
|
|---|
| 331 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
|
|---|
| 332 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
|
|---|
| 333 | if (transaction_write_existing(tdb, off, buf, len2) != 0) {
|
|---|
| 334 | return -1;
|
|---|
| 335 | }
|
|---|
| 336 | len -= len2;
|
|---|
| 337 | off += len2;
|
|---|
| 338 | if (buf != NULL) {
|
|---|
| 339 | buf = (const void *)(len2 + (const char *)buf);
|
|---|
| 340 | }
|
|---|
| 341 | }
|
|---|
| 342 |
|
|---|
| 343 | if (len == 0) {
|
|---|
| 344 | return 0;
|
|---|
| 345 | }
|
|---|
| 346 |
|
|---|
| 347 | blk = off / tdb->transaction->block_size;
|
|---|
| 348 | off = off % tdb->transaction->block_size;
|
|---|
| 349 |
|
|---|
| 350 | if (tdb->transaction->num_blocks <= blk ||
|
|---|
| 351 | tdb->transaction->blocks[blk] == NULL) {
|
|---|
| 352 | return 0;
|
|---|
| 353 | }
|
|---|
| 354 |
|
|---|
| 355 | if (blk == tdb->transaction->num_blocks-1 &&
|
|---|
| 356 | off + len > tdb->transaction->last_block_size) {
|
|---|
| 357 | if (off >= tdb->transaction->last_block_size) {
|
|---|
| 358 | return 0;
|
|---|
| 359 | }
|
|---|
| 360 | len = tdb->transaction->last_block_size - off;
|
|---|
| 361 | }
|
|---|
| 362 |
|
|---|
| 363 | /* overwrite part of an existing block */
|
|---|
| 364 | memcpy(tdb->transaction->blocks[blk] + off, buf, len);
|
|---|
| 365 |
|
|---|
| 366 | return 0;
|
|---|
| 367 | }
|
|---|
| 368 |
|
|---|
| 369 |
|
|---|
| 370 | /*
|
|---|
| 371 | accelerated hash chain head search, using the cached hash heads
|
|---|
| 372 | */
|
|---|
| 373 | static void transaction_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
|
|---|
| 374 | {
|
|---|
| 375 | uint32_t h = *chain;
|
|---|
| 376 | for (;h < tdb->header.hash_size;h++) {
|
|---|
| 377 | /* the +1 takes account of the freelist */
|
|---|
| 378 | if (0 != tdb->transaction->hash_heads[h+1]) {
|
|---|
| 379 | break;
|
|---|
| 380 | }
|
|---|
| 381 | }
|
|---|
| 382 | (*chain) = h;
|
|---|
| 383 | }
|
|---|
| 384 |
|
|---|
| 385 | /*
|
|---|
| 386 | out of bounds check during a transaction
|
|---|
| 387 | */
|
|---|
| 388 | static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
|
|---|
| 389 | {
|
|---|
| 390 | if (len <= tdb->map_size) {
|
|---|
| 391 | return 0;
|
|---|
| 392 | }
|
|---|
| 393 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 394 | return -1;
|
|---|
| 395 | }
|
|---|
| 396 |
|
|---|
| 397 | /*
|
|---|
| 398 | transaction version of tdb_expand().
|
|---|
| 399 | */
|
|---|
| 400 | static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
|
|---|
| 401 | tdb_off_t addition)
|
|---|
| 402 | {
|
|---|
| 403 | /* add a write to the transaction elements, so subsequent
|
|---|
| 404 | reads see the zero data */
|
|---|
| 405 | if (transaction_write(tdb, size, NULL, addition) != 0) {
|
|---|
| 406 | return -1;
|
|---|
| 407 | }
|
|---|
| 408 |
|
|---|
| 409 | tdb->transaction->need_repack = true;
|
|---|
| 410 |
|
|---|
| 411 | return 0;
|
|---|
| 412 | }
|
|---|
| 413 |
|
|---|
| 414 | /*
|
|---|
| 415 | brlock during a transaction - ignore them
|
|---|
| 416 | */
|
|---|
| 417 | static int transaction_brlock(struct tdb_context *tdb, tdb_off_t offset,
|
|---|
| 418 | int rw_type, int lck_type, int probe, size_t len)
|
|---|
| 419 | {
|
|---|
| 420 | return 0;
|
|---|
| 421 | }
|
|---|
| 422 |
|
|---|
| 423 | static const struct tdb_methods transaction_methods = {
|
|---|
| 424 | transaction_read,
|
|---|
| 425 | transaction_write,
|
|---|
| 426 | transaction_next_hash_chain,
|
|---|
| 427 | transaction_oob,
|
|---|
| 428 | transaction_expand_file,
|
|---|
| 429 | transaction_brlock
|
|---|
| 430 | };
|
|---|
| 431 |
|
|---|
| 432 |
|
|---|
| 433 | /*
|
|---|
| 434 | start a tdb transaction. No token is returned, as only a single
|
|---|
| 435 | transaction is allowed to be pending per tdb_context
|
|---|
| 436 | */
|
|---|
| 437 | int tdb_transaction_start(struct tdb_context *tdb)
|
|---|
| 438 | {
|
|---|
| 439 | /* some sanity checks */
|
|---|
| 440 | if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
|
|---|
| 441 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
|
|---|
| 442 | tdb->ecode = TDB_ERR_EINVAL;
|
|---|
| 443 | return -1;
|
|---|
| 444 | }
|
|---|
| 445 |
|
|---|
| 446 | /* cope with nested tdb_transaction_start() calls */
|
|---|
| 447 | if (tdb->transaction != NULL) {
|
|---|
| 448 | if (!(tdb->flags & TDB_ALLOW_NESTING)) {
|
|---|
| 449 | tdb->ecode = TDB_ERR_NESTING;
|
|---|
| 450 | return -1;
|
|---|
| 451 | }
|
|---|
| 452 | tdb->transaction->nesting++;
|
|---|
| 453 | TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
|
|---|
| 454 | tdb->transaction->nesting));
|
|---|
| 455 | return 0;
|
|---|
| 456 | }
|
|---|
| 457 |
|
|---|
| 458 | if (tdb->num_locks != 0 || tdb->global_lock.count) {
|
|---|
| 459 | /* the caller must not have any locks when starting a
|
|---|
| 460 | transaction as otherwise we'll be screwed by lack
|
|---|
| 461 | of nested locks in posix */
|
|---|
| 462 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
|
|---|
| 463 | tdb->ecode = TDB_ERR_LOCK;
|
|---|
| 464 | return -1;
|
|---|
| 465 | }
|
|---|
| 466 |
|
|---|
| 467 | if (tdb->travlocks.next != NULL) {
|
|---|
| 468 | /* you cannot use transactions inside a traverse (although you can use
|
|---|
| 469 | traverse inside a transaction) as otherwise you can end up with
|
|---|
| 470 | deadlock */
|
|---|
| 471 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
|
|---|
| 472 | tdb->ecode = TDB_ERR_LOCK;
|
|---|
| 473 | return -1;
|
|---|
| 474 | }
|
|---|
| 475 |
|
|---|
| 476 | tdb->transaction = (struct tdb_transaction *)
|
|---|
| 477 | calloc(sizeof(struct tdb_transaction), 1);
|
|---|
| 478 | if (tdb->transaction == NULL) {
|
|---|
| 479 | tdb->ecode = TDB_ERR_OOM;
|
|---|
| 480 | return -1;
|
|---|
| 481 | }
|
|---|
| 482 |
|
|---|
| 483 | /* a page at a time seems like a reasonable compromise between compactness and efficiency */
|
|---|
| 484 | tdb->transaction->block_size = tdb->page_size;
|
|---|
| 485 |
|
|---|
| 486 | /* get the transaction write lock. This is a blocking lock. As
|
|---|
| 487 | discussed with Volker, there are a number of ways we could
|
|---|
| 488 | make this async, which we will probably do in the future */
|
|---|
| 489 | if (tdb_transaction_lock(tdb, F_WRLCK) == -1) {
|
|---|
| 490 | SAFE_FREE(tdb->transaction->blocks);
|
|---|
| 491 | SAFE_FREE(tdb->transaction);
|
|---|
| 492 | return -1;
|
|---|
| 493 | }
|
|---|
| 494 |
|
|---|
| 495 | /* get a read lock from the freelist to the end of file. This
|
|---|
| 496 | is upgraded to a write lock during the commit */
|
|---|
| 497 | if (tdb_brlock(tdb, FREELIST_TOP, F_RDLCK, F_SETLKW, 0, 0) == -1) {
|
|---|
| 498 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
|
|---|
| 499 | tdb->ecode = TDB_ERR_LOCK;
|
|---|
| 500 | goto fail;
|
|---|
| 501 | }
|
|---|
| 502 |
|
|---|
| 503 | /* setup a copy of the hash table heads so the hash scan in
|
|---|
| 504 | traverse can be fast */
|
|---|
| 505 | tdb->transaction->hash_heads = (uint32_t *)
|
|---|
| 506 | calloc(tdb->header.hash_size+1, sizeof(uint32_t));
|
|---|
| 507 | if (tdb->transaction->hash_heads == NULL) {
|
|---|
| 508 | tdb->ecode = TDB_ERR_OOM;
|
|---|
| 509 | goto fail;
|
|---|
| 510 | }
|
|---|
| 511 | if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
|
|---|
| 512 | TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
|
|---|
| 513 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
|
|---|
| 514 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 515 | goto fail;
|
|---|
| 516 | }
|
|---|
| 517 |
|
|---|
| 518 | /* make sure we know about any file expansions already done by
|
|---|
| 519 | anyone else */
|
|---|
| 520 | tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
|
|---|
| 521 | tdb->transaction->old_map_size = tdb->map_size;
|
|---|
| 522 |
|
|---|
| 523 | /* finally hook the io methods, replacing them with
|
|---|
| 524 | transaction specific methods */
|
|---|
| 525 | tdb->transaction->io_methods = tdb->methods;
|
|---|
| 526 | tdb->methods = &transaction_methods;
|
|---|
| 527 |
|
|---|
| 528 | /* Trace at the end, so we get sequence number correct. */
|
|---|
| 529 | tdb_trace(tdb, "tdb_transaction_start");
|
|---|
| 530 | return 0;
|
|---|
| 531 |
|
|---|
| 532 | fail:
|
|---|
| 533 | tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
|
|---|
| 534 | tdb_transaction_unlock(tdb);
|
|---|
| 535 | SAFE_FREE(tdb->transaction->blocks);
|
|---|
| 536 | SAFE_FREE(tdb->transaction->hash_heads);
|
|---|
| 537 | SAFE_FREE(tdb->transaction);
|
|---|
| 538 | return -1;
|
|---|
| 539 | }
|
|---|
| 540 |
|
|---|
| 541 |
|
|---|
| 542 | /*
|
|---|
| 543 | sync to disk
|
|---|
| 544 | */
|
|---|
| 545 | static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
|
|---|
| 546 | {
|
|---|
| 547 | if (tdb->flags & TDB_NOSYNC) {
|
|---|
| 548 | return 0;
|
|---|
| 549 | }
|
|---|
| 550 |
|
|---|
| 551 | if (fsync(tdb->fd) != 0) {
|
|---|
| 552 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 553 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
|
|---|
| 554 | return -1;
|
|---|
| 555 | }
|
|---|
| 556 | #ifdef HAVE_MMAP
|
|---|
| 557 | if (tdb->map_ptr) {
|
|---|
| 558 | tdb_off_t moffset = offset & ~(tdb->page_size-1);
|
|---|
| 559 | if (msync(moffset + (char *)tdb->map_ptr,
|
|---|
| 560 | length + (offset - moffset), MS_SYNC) != 0) {
|
|---|
| 561 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 562 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
|
|---|
| 563 | strerror(errno)));
|
|---|
| 564 | return -1;
|
|---|
| 565 | }
|
|---|
| 566 | }
|
|---|
| 567 | #endif
|
|---|
| 568 | return 0;
|
|---|
| 569 | }
|
|---|
| 570 |
|
|---|
| 571 |
|
|---|
| 572 | int _tdb_transaction_cancel(struct tdb_context *tdb)
|
|---|
| 573 | {
|
|---|
| 574 | int i, ret = 0;
|
|---|
| 575 |
|
|---|
| 576 | if (tdb->transaction == NULL) {
|
|---|
| 577 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
|
|---|
| 578 | return -1;
|
|---|
| 579 | }
|
|---|
| 580 |
|
|---|
| 581 | if (tdb->transaction->nesting != 0) {
|
|---|
| 582 | tdb->transaction->transaction_error = 1;
|
|---|
| 583 | tdb->transaction->nesting--;
|
|---|
| 584 | return 0;
|
|---|
| 585 | }
|
|---|
| 586 |
|
|---|
| 587 | tdb->map_size = tdb->transaction->old_map_size;
|
|---|
| 588 |
|
|---|
| 589 | /* free all the transaction blocks */
|
|---|
| 590 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
|---|
| 591 | if (tdb->transaction->blocks[i] != NULL) {
|
|---|
| 592 | free(tdb->transaction->blocks[i]);
|
|---|
| 593 | }
|
|---|
| 594 | }
|
|---|
| 595 | SAFE_FREE(tdb->transaction->blocks);
|
|---|
| 596 |
|
|---|
| 597 | if (tdb->transaction->magic_offset) {
|
|---|
| 598 | const struct tdb_methods *methods = tdb->transaction->io_methods;
|
|---|
| 599 | uint32_t zero = 0;
|
|---|
| 600 |
|
|---|
| 601 | /* remove the recovery marker */
|
|---|
| 602 | if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &zero, 4) == -1 ||
|
|---|
| 603 | transaction_sync(tdb, tdb->transaction->magic_offset, 4) == -1) {
|
|---|
| 604 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n"));
|
|---|
| 605 | ret = -1;
|
|---|
| 606 | }
|
|---|
| 607 | }
|
|---|
| 608 |
|
|---|
| 609 | if (tdb->transaction->global_lock_taken) {
|
|---|
| 610 | tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
|
|---|
| 611 | tdb->transaction->global_lock_taken = false;
|
|---|
| 612 | }
|
|---|
| 613 |
|
|---|
| 614 | /* remove any global lock created during the transaction */
|
|---|
| 615 | if (tdb->global_lock.count != 0) {
|
|---|
| 616 | tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 4*tdb->header.hash_size);
|
|---|
| 617 | tdb->global_lock.count = 0;
|
|---|
| 618 | }
|
|---|
| 619 |
|
|---|
| 620 | /* remove any locks created during the transaction */
|
|---|
| 621 | if (tdb->num_locks != 0) {
|
|---|
| 622 | for (i=0;i<tdb->num_lockrecs;i++) {
|
|---|
| 623 | tdb_brlock(tdb,FREELIST_TOP+4*tdb->lockrecs[i].list,
|
|---|
| 624 | F_UNLCK,F_SETLKW, 0, 1);
|
|---|
| 625 | }
|
|---|
| 626 | tdb->num_locks = 0;
|
|---|
| 627 | tdb->num_lockrecs = 0;
|
|---|
| 628 | SAFE_FREE(tdb->lockrecs);
|
|---|
| 629 | }
|
|---|
| 630 |
|
|---|
| 631 | /* restore the normal io methods */
|
|---|
| 632 | tdb->methods = tdb->transaction->io_methods;
|
|---|
| 633 |
|
|---|
| 634 | tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
|
|---|
| 635 | tdb_transaction_unlock(tdb);
|
|---|
| 636 | SAFE_FREE(tdb->transaction->hash_heads);
|
|---|
| 637 | SAFE_FREE(tdb->transaction);
|
|---|
| 638 |
|
|---|
| 639 | return ret;
|
|---|
| 640 | }
|
|---|
| 641 |
|
|---|
/*
  Cancel the current transaction.  Emits a trace record and then hands
  over to _tdb_transaction_cancel(), whose result is returned unchanged.
*/
int tdb_transaction_cancel(struct tdb_context *tdb)
{
	int ret;

	tdb_trace(tdb, "tdb_transaction_cancel");
	ret = _tdb_transaction_cancel(tdb);

	return ret;
}
|
|---|
| 650 |
|
|---|
| 651 | /*
|
|---|
| 652 | work out how much space the linearised recovery data will consume
|
|---|
| 653 | */
|
|---|
| 654 | static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
|
|---|
| 655 | {
|
|---|
| 656 | tdb_len_t recovery_size = 0;
|
|---|
| 657 | int i;
|
|---|
| 658 |
|
|---|
| 659 | recovery_size = sizeof(uint32_t);
|
|---|
| 660 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
|---|
| 661 | if (i * tdb->transaction->block_size >= tdb->transaction->old_map_size) {
|
|---|
| 662 | break;
|
|---|
| 663 | }
|
|---|
| 664 | if (tdb->transaction->blocks[i] == NULL) {
|
|---|
| 665 | continue;
|
|---|
| 666 | }
|
|---|
| 667 | recovery_size += 2*sizeof(tdb_off_t);
|
|---|
| 668 | if (i == tdb->transaction->num_blocks-1) {
|
|---|
| 669 | recovery_size += tdb->transaction->last_block_size;
|
|---|
| 670 | } else {
|
|---|
| 671 | recovery_size += tdb->transaction->block_size;
|
|---|
| 672 | }
|
|---|
| 673 | }
|
|---|
| 674 |
|
|---|
| 675 | return recovery_size;
|
|---|
| 676 | }
|
|---|
| 677 |
|
|---|
| 678 | /*
|
|---|
| 679 | allocate the recovery area, or use an existing recovery area if it is
|
|---|
| 680 | large enough
|
|---|
| 681 | */
|
|---|
| 682 | static int tdb_recovery_allocate(struct tdb_context *tdb,
|
|---|
| 683 | tdb_len_t *recovery_size,
|
|---|
| 684 | tdb_off_t *recovery_offset,
|
|---|
| 685 | tdb_len_t *recovery_max_size)
|
|---|
| 686 | {
|
|---|
| 687 | struct tdb_record rec;
|
|---|
| 688 | const struct tdb_methods *methods = tdb->transaction->io_methods;
|
|---|
| 689 | tdb_off_t recovery_head;
|
|---|
| 690 |
|
|---|
| 691 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
|
|---|
| 692 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
|
|---|
| 693 | return -1;
|
|---|
| 694 | }
|
|---|
| 695 |
|
|---|
| 696 | rec.rec_len = 0;
|
|---|
| 697 |
|
|---|
| 698 | if (recovery_head != 0 &&
|
|---|
| 699 | methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
|
|---|
| 700 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery record\n"));
|
|---|
| 701 | return -1;
|
|---|
| 702 | }
|
|---|
| 703 |
|
|---|
| 704 | *recovery_size = tdb_recovery_size(tdb);
|
|---|
| 705 |
|
|---|
| 706 | if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
|
|---|
| 707 | /* it fits in the existing area */
|
|---|
| 708 | *recovery_max_size = rec.rec_len;
|
|---|
| 709 | *recovery_offset = recovery_head;
|
|---|
| 710 | return 0;
|
|---|
| 711 | }
|
|---|
| 712 |
|
|---|
| 713 | /* we need to free up the old recovery area, then allocate a
|
|---|
| 714 | new one at the end of the file. Note that we cannot use
|
|---|
| 715 | tdb_allocate() to allocate the new one as that might return
|
|---|
| 716 | us an area that is being currently used (as of the start of
|
|---|
| 717 | the transaction) */
|
|---|
| 718 | if (recovery_head != 0) {
|
|---|
| 719 | if (tdb_free(tdb, recovery_head, &rec) == -1) {
|
|---|
| 720 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to free previous recovery area\n"));
|
|---|
| 721 | return -1;
|
|---|
| 722 | }
|
|---|
| 723 | }
|
|---|
| 724 |
|
|---|
| 725 | /* the tdb_free() call might have increased the recovery size */
|
|---|
| 726 | *recovery_size = tdb_recovery_size(tdb);
|
|---|
| 727 |
|
|---|
| 728 | /* round up to a multiple of page size */
|
|---|
| 729 | *recovery_max_size = TDB_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
|
|---|
| 730 | *recovery_offset = tdb->map_size;
|
|---|
| 731 | recovery_head = *recovery_offset;
|
|---|
| 732 |
|
|---|
| 733 | if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
|
|---|
| 734 | (tdb->map_size - tdb->transaction->old_map_size) +
|
|---|
| 735 | sizeof(rec) + *recovery_max_size) == -1) {
|
|---|
| 736 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
|
|---|
| 737 | return -1;
|
|---|
| 738 | }
|
|---|
| 739 |
|
|---|
| 740 | /* remap the file (if using mmap) */
|
|---|
| 741 | methods->tdb_oob(tdb, tdb->map_size + 1, 1);
|
|---|
| 742 |
|
|---|
| 743 | /* we have to reset the old map size so that we don't try to expand the file
|
|---|
| 744 | again in the transaction commit, which would destroy the recovery area */
|
|---|
| 745 | tdb->transaction->old_map_size = tdb->map_size;
|
|---|
| 746 |
|
|---|
| 747 | /* write the recovery header offset and sync - we can sync without a race here
|
|---|
| 748 | as the magic ptr in the recovery record has not been set */
|
|---|
| 749 | CONVERT(recovery_head);
|
|---|
| 750 | if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
|
|---|
| 751 | &recovery_head, sizeof(tdb_off_t)) == -1) {
|
|---|
| 752 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
|
|---|
| 753 | return -1;
|
|---|
| 754 | }
|
|---|
| 755 | if (transaction_write_existing(tdb, TDB_RECOVERY_HEAD, &recovery_head, sizeof(tdb_off_t)) == -1) {
|
|---|
| 756 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
|
|---|
| 757 | return -1;
|
|---|
| 758 | }
|
|---|
| 759 |
|
|---|
| 760 | return 0;
|
|---|
| 761 | }
|
|---|
| 762 |
|
|---|
| 763 |
|
|---|
| 764 | /*
|
|---|
| 765 | setup the recovery data that will be used on a crash during commit
|
|---|
| 766 | */
|
|---|
| 767 | static int transaction_setup_recovery(struct tdb_context *tdb,
|
|---|
| 768 | tdb_off_t *magic_offset)
|
|---|
| 769 | {
|
|---|
| 770 | tdb_len_t recovery_size;
|
|---|
| 771 | unsigned char *data, *p;
|
|---|
| 772 | const struct tdb_methods *methods = tdb->transaction->io_methods;
|
|---|
| 773 | struct tdb_record *rec;
|
|---|
| 774 | tdb_off_t recovery_offset, recovery_max_size;
|
|---|
| 775 | tdb_off_t old_map_size = tdb->transaction->old_map_size;
|
|---|
| 776 | uint32_t magic, tailer;
|
|---|
| 777 | int i;
|
|---|
| 778 |
|
|---|
| 779 | /*
|
|---|
| 780 | check that the recovery area has enough space
|
|---|
| 781 | */
|
|---|
| 782 | if (tdb_recovery_allocate(tdb, &recovery_size,
|
|---|
| 783 | &recovery_offset, &recovery_max_size) == -1) {
|
|---|
| 784 | return -1;
|
|---|
| 785 | }
|
|---|
| 786 |
|
|---|
| 787 | data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
|
|---|
| 788 | if (data == NULL) {
|
|---|
| 789 | tdb->ecode = TDB_ERR_OOM;
|
|---|
| 790 | return -1;
|
|---|
| 791 | }
|
|---|
| 792 |
|
|---|
| 793 | rec = (struct tdb_record *)data;
|
|---|
| 794 | memset(rec, 0, sizeof(*rec));
|
|---|
| 795 |
|
|---|
| 796 | rec->magic = 0;
|
|---|
| 797 | rec->data_len = recovery_size;
|
|---|
| 798 | rec->rec_len = recovery_max_size;
|
|---|
| 799 | rec->key_len = old_map_size;
|
|---|
| 800 | CONVERT(rec);
|
|---|
| 801 |
|
|---|
| 802 | /* build the recovery data into a single blob to allow us to do a single
|
|---|
| 803 | large write, which should be more efficient */
|
|---|
| 804 | p = data + sizeof(*rec);
|
|---|
| 805 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
|---|
| 806 | tdb_off_t offset;
|
|---|
| 807 | tdb_len_t length;
|
|---|
| 808 |
|
|---|
| 809 | if (tdb->transaction->blocks[i] == NULL) {
|
|---|
| 810 | continue;
|
|---|
| 811 | }
|
|---|
| 812 |
|
|---|
| 813 | offset = i * tdb->transaction->block_size;
|
|---|
| 814 | length = tdb->transaction->block_size;
|
|---|
| 815 | if (i == tdb->transaction->num_blocks-1) {
|
|---|
| 816 | length = tdb->transaction->last_block_size;
|
|---|
| 817 | }
|
|---|
| 818 |
|
|---|
| 819 | if (offset >= old_map_size) {
|
|---|
| 820 | continue;
|
|---|
| 821 | }
|
|---|
| 822 | if (offset + length > tdb->transaction->old_map_size) {
|
|---|
| 823 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
|
|---|
| 824 | free(data);
|
|---|
| 825 | tdb->ecode = TDB_ERR_CORRUPT;
|
|---|
| 826 | return -1;
|
|---|
| 827 | }
|
|---|
| 828 | memcpy(p, &offset, 4);
|
|---|
| 829 | memcpy(p+4, &length, 4);
|
|---|
| 830 | if (DOCONV()) {
|
|---|
| 831 | tdb_convert(p, 8);
|
|---|
| 832 | }
|
|---|
| 833 | /* the recovery area contains the old data, not the
|
|---|
| 834 | new data, so we have to call the original tdb_read
|
|---|
| 835 | method to get it */
|
|---|
| 836 | if (methods->tdb_read(tdb, offset, p + 8, length, 0) != 0) {
|
|---|
| 837 | free(data);
|
|---|
| 838 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 839 | return -1;
|
|---|
| 840 | }
|
|---|
| 841 | p += 8 + length;
|
|---|
| 842 | }
|
|---|
| 843 |
|
|---|
| 844 | /* and the tailer */
|
|---|
| 845 | tailer = sizeof(*rec) + recovery_max_size;
|
|---|
| 846 | memcpy(p, &tailer, 4);
|
|---|
| 847 | CONVERT(p);
|
|---|
| 848 |
|
|---|
| 849 | /* write the recovery data to the recovery area */
|
|---|
| 850 | if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
|
|---|
| 851 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
|
|---|
| 852 | free(data);
|
|---|
| 853 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 854 | return -1;
|
|---|
| 855 | }
|
|---|
| 856 | if (transaction_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
|
|---|
| 857 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery data\n"));
|
|---|
| 858 | free(data);
|
|---|
| 859 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 860 | return -1;
|
|---|
| 861 | }
|
|---|
| 862 |
|
|---|
| 863 | /* as we don't have ordered writes, we have to sync the recovery
|
|---|
| 864 | data before we update the magic to indicate that the recovery
|
|---|
| 865 | data is present */
|
|---|
| 866 | if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
|
|---|
| 867 | free(data);
|
|---|
| 868 | return -1;
|
|---|
| 869 | }
|
|---|
| 870 |
|
|---|
| 871 | free(data);
|
|---|
| 872 |
|
|---|
| 873 | magic = TDB_RECOVERY_MAGIC;
|
|---|
| 874 | CONVERT(magic);
|
|---|
| 875 |
|
|---|
| 876 | *magic_offset = recovery_offset + offsetof(struct tdb_record, magic);
|
|---|
| 877 |
|
|---|
| 878 | if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
|
|---|
| 879 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
|
|---|
| 880 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 881 | return -1;
|
|---|
| 882 | }
|
|---|
| 883 | if (transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
|
|---|
| 884 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery magic\n"));
|
|---|
| 885 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 886 | return -1;
|
|---|
| 887 | }
|
|---|
| 888 |
|
|---|
| 889 | /* ensure the recovery magic marker is on disk */
|
|---|
| 890 | if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
|
|---|
| 891 | return -1;
|
|---|
| 892 | }
|
|---|
| 893 |
|
|---|
| 894 | return 0;
|
|---|
| 895 | }
|
|---|
| 896 |
|
|---|
| 897 | static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
|
|---|
| 898 | {
|
|---|
| 899 | const struct tdb_methods *methods;
|
|---|
| 900 |
|
|---|
| 901 | if (tdb->transaction == NULL) {
|
|---|
| 902 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: no transaction\n"));
|
|---|
| 903 | return -1;
|
|---|
| 904 | }
|
|---|
| 905 |
|
|---|
| 906 | if (tdb->transaction->prepared) {
|
|---|
| 907 | tdb->ecode = TDB_ERR_EINVAL;
|
|---|
| 908 | _tdb_transaction_cancel(tdb);
|
|---|
| 909 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction already prepared\n"));
|
|---|
| 910 | return -1;
|
|---|
| 911 | }
|
|---|
| 912 |
|
|---|
| 913 | if (tdb->transaction->transaction_error) {
|
|---|
| 914 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 915 | _tdb_transaction_cancel(tdb);
|
|---|
| 916 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction error pending\n"));
|
|---|
| 917 | return -1;
|
|---|
| 918 | }
|
|---|
| 919 |
|
|---|
| 920 |
|
|---|
| 921 | if (tdb->transaction->nesting != 0) {
|
|---|
| 922 | return 0;
|
|---|
| 923 | }
|
|---|
| 924 |
|
|---|
| 925 | /* check for a null transaction */
|
|---|
| 926 | if (tdb->transaction->blocks == NULL) {
|
|---|
| 927 | return 0;
|
|---|
| 928 | }
|
|---|
| 929 |
|
|---|
| 930 | methods = tdb->transaction->io_methods;
|
|---|
| 931 |
|
|---|
| 932 | /* if there are any locks pending then the caller has not
|
|---|
| 933 | nested their locks properly, so fail the transaction */
|
|---|
| 934 | if (tdb->num_locks || tdb->global_lock.count) {
|
|---|
| 935 | tdb->ecode = TDB_ERR_LOCK;
|
|---|
| 936 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n"));
|
|---|
| 937 | _tdb_transaction_cancel(tdb);
|
|---|
| 938 | return -1;
|
|---|
| 939 | }
|
|---|
| 940 |
|
|---|
| 941 | /* upgrade the main transaction lock region to a write lock */
|
|---|
| 942 | if (tdb_brlock_upgrade(tdb, FREELIST_TOP, 0) == -1) {
|
|---|
| 943 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to upgrade hash locks\n"));
|
|---|
| 944 | tdb->ecode = TDB_ERR_LOCK;
|
|---|
| 945 | _tdb_transaction_cancel(tdb);
|
|---|
| 946 | return -1;
|
|---|
| 947 | }
|
|---|
| 948 |
|
|---|
| 949 | /* get the global lock - this prevents new users attaching to the database
|
|---|
| 950 | during the commit */
|
|---|
| 951 | if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
|
|---|
| 952 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get global lock\n"));
|
|---|
| 953 | tdb->ecode = TDB_ERR_LOCK;
|
|---|
| 954 | _tdb_transaction_cancel(tdb);
|
|---|
| 955 | return -1;
|
|---|
| 956 | }
|
|---|
| 957 |
|
|---|
| 958 | tdb->transaction->global_lock_taken = true;
|
|---|
| 959 |
|
|---|
| 960 | if (!(tdb->flags & TDB_NOSYNC)) {
|
|---|
| 961 | /* write the recovery data to the end of the file */
|
|---|
| 962 | if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
|
|---|
| 963 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: failed to setup recovery data\n"));
|
|---|
| 964 | _tdb_transaction_cancel(tdb);
|
|---|
| 965 | return -1;
|
|---|
| 966 | }
|
|---|
| 967 | }
|
|---|
| 968 |
|
|---|
| 969 | tdb->transaction->prepared = true;
|
|---|
| 970 |
|
|---|
| 971 | /* expand the file to the new size if needed */
|
|---|
| 972 | if (tdb->map_size != tdb->transaction->old_map_size) {
|
|---|
| 973 | if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
|
|---|
| 974 | tdb->map_size -
|
|---|
| 975 | tdb->transaction->old_map_size) == -1) {
|
|---|
| 976 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 977 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: expansion failed\n"));
|
|---|
| 978 | _tdb_transaction_cancel(tdb);
|
|---|
| 979 | return -1;
|
|---|
| 980 | }
|
|---|
| 981 | tdb->map_size = tdb->transaction->old_map_size;
|
|---|
| 982 | methods->tdb_oob(tdb, tdb->map_size + 1, 1);
|
|---|
| 983 | }
|
|---|
| 984 |
|
|---|
| 985 | /* Keep the global lock until the actual commit */
|
|---|
| 986 |
|
|---|
| 987 | return 0;
|
|---|
| 988 | }
|
|---|
| 989 |
|
|---|
/*
  prepare to commit the current transaction

  public entry point: records a trace entry and then hands over to
  _tdb_transaction_prepare_commit(), whose result is returned as is
*/
int tdb_transaction_prepare_commit(struct tdb_context *tdb)
{
	int ret;

	tdb_trace(tdb, "tdb_transaction_prepare_commit");

	ret = _tdb_transaction_prepare_commit(tdb);
	return ret;
}
| 998 |
|
|---|
| 999 | /*
|
|---|
| 1000 | commit the current transaction
|
|---|
| 1001 | */
|
|---|
| 1002 | int tdb_transaction_commit(struct tdb_context *tdb)
|
|---|
| 1003 | {
|
|---|
| 1004 | const struct tdb_methods *methods;
|
|---|
| 1005 | int i;
|
|---|
| 1006 | bool need_repack;
|
|---|
| 1007 |
|
|---|
| 1008 | if (tdb->transaction == NULL) {
|
|---|
| 1009 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
|
|---|
| 1010 | return -1;
|
|---|
| 1011 | }
|
|---|
| 1012 |
|
|---|
| 1013 | tdb_trace(tdb, "tdb_transaction_commit");
|
|---|
| 1014 |
|
|---|
| 1015 | if (tdb->transaction->transaction_error) {
|
|---|
| 1016 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1017 | _tdb_transaction_cancel(tdb);
|
|---|
| 1018 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
|
|---|
| 1019 | return -1;
|
|---|
| 1020 | }
|
|---|
| 1021 |
|
|---|
| 1022 |
|
|---|
| 1023 | if (tdb->transaction->nesting != 0) {
|
|---|
| 1024 | tdb->transaction->nesting--;
|
|---|
| 1025 | return 0;
|
|---|
| 1026 | }
|
|---|
| 1027 |
|
|---|
| 1028 | /* check for a null transaction */
|
|---|
| 1029 | if (tdb->transaction->blocks == NULL) {
|
|---|
| 1030 | _tdb_transaction_cancel(tdb);
|
|---|
| 1031 | return 0;
|
|---|
| 1032 | }
|
|---|
| 1033 |
|
|---|
| 1034 | if (!tdb->transaction->prepared) {
|
|---|
| 1035 | int ret = _tdb_transaction_prepare_commit(tdb);
|
|---|
| 1036 | if (ret)
|
|---|
| 1037 | return ret;
|
|---|
| 1038 | }
|
|---|
| 1039 |
|
|---|
| 1040 | methods = tdb->transaction->io_methods;
|
|---|
| 1041 |
|
|---|
| 1042 | /* perform all the writes */
|
|---|
| 1043 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
|---|
| 1044 | tdb_off_t offset;
|
|---|
| 1045 | tdb_len_t length;
|
|---|
| 1046 |
|
|---|
| 1047 | if (tdb->transaction->blocks[i] == NULL) {
|
|---|
| 1048 | continue;
|
|---|
| 1049 | }
|
|---|
| 1050 |
|
|---|
| 1051 | offset = i * tdb->transaction->block_size;
|
|---|
| 1052 | length = tdb->transaction->block_size;
|
|---|
| 1053 | if (i == tdb->transaction->num_blocks-1) {
|
|---|
| 1054 | length = tdb->transaction->last_block_size;
|
|---|
| 1055 | }
|
|---|
| 1056 |
|
|---|
| 1057 | if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) {
|
|---|
| 1058 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
|
|---|
| 1059 |
|
|---|
| 1060 | /* we've overwritten part of the data and
|
|---|
| 1061 | possibly expanded the file, so we need to
|
|---|
| 1062 | run the crash recovery code */
|
|---|
| 1063 | tdb->methods = methods;
|
|---|
| 1064 | tdb_transaction_recover(tdb);
|
|---|
| 1065 |
|
|---|
| 1066 | _tdb_transaction_cancel(tdb);
|
|---|
| 1067 |
|
|---|
| 1068 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
|
|---|
| 1069 | return -1;
|
|---|
| 1070 | }
|
|---|
| 1071 | SAFE_FREE(tdb->transaction->blocks[i]);
|
|---|
| 1072 | }
|
|---|
| 1073 |
|
|---|
| 1074 | SAFE_FREE(tdb->transaction->blocks);
|
|---|
| 1075 | tdb->transaction->num_blocks = 0;
|
|---|
| 1076 |
|
|---|
| 1077 | /* ensure the new data is on disk */
|
|---|
| 1078 | if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
|
|---|
| 1079 | return -1;
|
|---|
| 1080 | }
|
|---|
| 1081 |
|
|---|
| 1082 | /*
|
|---|
| 1083 | TODO: maybe write to some dummy hdr field, or write to magic
|
|---|
| 1084 | offset without mmap, before the last sync, instead of the
|
|---|
| 1085 | utime() call
|
|---|
| 1086 | */
|
|---|
| 1087 |
|
|---|
| 1088 | /* on some systems (like Linux 2.6.x) changes via mmap/msync
|
|---|
| 1089 | don't change the mtime of the file, this means the file may
|
|---|
| 1090 | not be backed up (as tdb rounding to block sizes means that
|
|---|
| 1091 | file size changes are quite rare too). The following forces
|
|---|
| 1092 | mtime changes when a transaction completes */
|
|---|
| 1093 | #ifdef HAVE_UTIME
|
|---|
| 1094 | utime(tdb->name, NULL);
|
|---|
| 1095 | #endif
|
|---|
| 1096 |
|
|---|
| 1097 | need_repack = tdb->transaction->need_repack;
|
|---|
| 1098 |
|
|---|
| 1099 | /* use a transaction cancel to free memory and remove the
|
|---|
| 1100 | transaction locks */
|
|---|
| 1101 | _tdb_transaction_cancel(tdb);
|
|---|
| 1102 |
|
|---|
| 1103 | if (need_repack) {
|
|---|
| 1104 | return tdb_repack(tdb);
|
|---|
| 1105 | }
|
|---|
| 1106 |
|
|---|
| 1107 | return 0;
|
|---|
| 1108 | }
|
|---|
| 1109 |
|
|---|
| 1110 |
|
|---|
| 1111 | /*
|
|---|
| 1112 | recover from an aborted transaction. Must be called with exclusive
|
|---|
| 1113 | database write access already established (including the global
|
|---|
| 1114 | lock to prevent new processes attaching)
|
|---|
| 1115 | */
|
|---|
| 1116 | int tdb_transaction_recover(struct tdb_context *tdb)
|
|---|
| 1117 | {
|
|---|
| 1118 | tdb_off_t recovery_head, recovery_eof;
|
|---|
| 1119 | unsigned char *data, *p;
|
|---|
| 1120 | uint32_t zero = 0;
|
|---|
| 1121 | struct tdb_record rec;
|
|---|
| 1122 |
|
|---|
| 1123 | /* find the recovery area */
|
|---|
| 1124 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
|
|---|
| 1125 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
|
|---|
| 1126 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1127 | return -1;
|
|---|
| 1128 | }
|
|---|
| 1129 |
|
|---|
| 1130 | if (recovery_head == 0) {
|
|---|
| 1131 | /* we have never allocated a recovery record */
|
|---|
| 1132 | return 0;
|
|---|
| 1133 | }
|
|---|
| 1134 |
|
|---|
| 1135 | /* read the recovery record */
|
|---|
| 1136 | if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
|
|---|
| 1137 | sizeof(rec), DOCONV()) == -1) {
|
|---|
| 1138 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
|
|---|
| 1139 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1140 | return -1;
|
|---|
| 1141 | }
|
|---|
| 1142 |
|
|---|
| 1143 | if (rec.magic != TDB_RECOVERY_MAGIC) {
|
|---|
| 1144 | /* there is no valid recovery data */
|
|---|
| 1145 | return 0;
|
|---|
| 1146 | }
|
|---|
| 1147 |
|
|---|
| 1148 | if (tdb->read_only) {
|
|---|
| 1149 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
|
|---|
| 1150 | tdb->ecode = TDB_ERR_CORRUPT;
|
|---|
| 1151 | return -1;
|
|---|
| 1152 | }
|
|---|
| 1153 |
|
|---|
| 1154 | recovery_eof = rec.key_len;
|
|---|
| 1155 |
|
|---|
| 1156 | data = (unsigned char *)malloc(rec.data_len);
|
|---|
| 1157 | if (data == NULL) {
|
|---|
| 1158 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
|
|---|
| 1159 | tdb->ecode = TDB_ERR_OOM;
|
|---|
| 1160 | return -1;
|
|---|
| 1161 | }
|
|---|
| 1162 |
|
|---|
| 1163 | /* read the full recovery data */
|
|---|
| 1164 | if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
|
|---|
| 1165 | rec.data_len, 0) == -1) {
|
|---|
| 1166 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
|
|---|
| 1167 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1168 | return -1;
|
|---|
| 1169 | }
|
|---|
| 1170 |
|
|---|
| 1171 | /* recover the file data */
|
|---|
| 1172 | p = data;
|
|---|
| 1173 | while (p+8 < data + rec.data_len) {
|
|---|
| 1174 | uint32_t ofs, len;
|
|---|
| 1175 | if (DOCONV()) {
|
|---|
| 1176 | tdb_convert(p, 8);
|
|---|
| 1177 | }
|
|---|
| 1178 | memcpy(&ofs, p, 4);
|
|---|
| 1179 | memcpy(&len, p+4, 4);
|
|---|
| 1180 |
|
|---|
| 1181 | if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
|
|---|
| 1182 | free(data);
|
|---|
| 1183 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %d bytes at offset %d\n", len, ofs));
|
|---|
| 1184 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1185 | return -1;
|
|---|
| 1186 | }
|
|---|
| 1187 | p += 8 + len;
|
|---|
| 1188 | }
|
|---|
| 1189 |
|
|---|
| 1190 | free(data);
|
|---|
| 1191 |
|
|---|
| 1192 | if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
|
|---|
| 1193 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
|
|---|
| 1194 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1195 | return -1;
|
|---|
| 1196 | }
|
|---|
| 1197 |
|
|---|
| 1198 | /* if the recovery area is after the recovered eof then remove it */
|
|---|
| 1199 | if (recovery_eof <= recovery_head) {
|
|---|
| 1200 | if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
|
|---|
| 1201 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
|
|---|
| 1202 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1203 | return -1;
|
|---|
| 1204 | }
|
|---|
| 1205 | }
|
|---|
| 1206 |
|
|---|
| 1207 | /* remove the recovery magic */
|
|---|
| 1208 | if (tdb_ofs_write(tdb, recovery_head + offsetof(struct tdb_record, magic),
|
|---|
| 1209 | &zero) == -1) {
|
|---|
| 1210 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
|
|---|
| 1211 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1212 | return -1;
|
|---|
| 1213 | }
|
|---|
| 1214 |
|
|---|
| 1215 | /* reduce the file size to the old size */
|
|---|
| 1216 | tdb_munmap(tdb);
|
|---|
| 1217 | if (ftruncate(tdb->fd, recovery_eof) != 0) {
|
|---|
| 1218 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to reduce to recovery size\n"));
|
|---|
| 1219 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1220 | return -1;
|
|---|
| 1221 | }
|
|---|
| 1222 | tdb->map_size = recovery_eof;
|
|---|
| 1223 | tdb_mmap(tdb);
|
|---|
| 1224 |
|
|---|
| 1225 | if (transaction_sync(tdb, 0, recovery_eof) == -1) {
|
|---|
| 1226 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
|
|---|
| 1227 | tdb->ecode = TDB_ERR_IO;
|
|---|
| 1228 | return -1;
|
|---|
| 1229 | }
|
|---|
| 1230 |
|
|---|
| 1231 | TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %d byte database\n",
|
|---|
| 1232 | recovery_eof));
|
|---|
| 1233 |
|
|---|
| 1234 | /* all done */
|
|---|
| 1235 | return 0;
|
|---|
| 1236 | }
|
|---|