1 | /*
|
---|
2 | Unix SMB/CIFS implementation.
|
---|
3 |
|
---|
4 | trivial database library
|
---|
5 |
|
---|
6 | Copyright (C) Andrew Tridgell 2005
|
---|
7 |
|
---|
8 | ** NOTE! The following LGPL license applies to the tdb
|
---|
9 | ** library. This does NOT imply that all of Samba is released
|
---|
10 | ** under the LGPL
|
---|
11 |
|
---|
12 | This library is free software; you can redistribute it and/or
|
---|
13 | modify it under the terms of the GNU Lesser General Public
|
---|
14 | License as published by the Free Software Foundation; either
|
---|
15 | version 3 of the License, or (at your option) any later version.
|
---|
16 |
|
---|
17 | This library is distributed in the hope that it will be useful,
|
---|
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
---|
20 | Lesser General Public License for more details.
|
---|
21 |
|
---|
22 | You should have received a copy of the GNU Lesser General Public
|
---|
23 | License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
---|
24 | */
|
---|
25 |
|
---|
26 | #include "tdb_private.h"
|
---|
27 |
|
---|
28 | /*
|
---|
29 | transaction design:
|
---|
30 |
|
---|
31 | - only allow a single transaction at a time per database. This makes
|
---|
32 | using the transaction API simpler, as otherwise the caller would
|
---|
33 | have to cope with temporary failures in transactions that conflict
|
---|
34 | with other current transactions
|
---|
35 |
|
---|
36 | - keep the transaction recovery information in the same file as the
|
---|
37 | database, using a special 'transaction recovery' record pointed at
|
---|
38 | by the header. This removes the need for extra journal files as
|
---|
39 | used by some other databases
|
---|
40 |
|
---|
41 | - dynamically allocate the transaction recovery record, re-using it
|
---|
42 | for subsequent transactions. If a larger record is needed then
|
---|
43 | tdb_free() the old record to place it on the normal tdb freelist
|
---|
44 | before allocating the new record
|
---|
45 |
|
---|
46 | - during transactions, keep a linked list of all writes that have
|
---|
47 | been performed by intercepting all tdb_write() calls. The hooked
|
---|
48 | transaction versions of tdb_read() and tdb_write() check this
|
---|
49 | linked list and try to use the elements of the list in preference
|
---|
50 | to the real database.
|
---|
51 |
|
---|
52 | - don't allow any locks to be held when a transaction starts,
|
---|
53 | otherwise we can end up with deadlock (plus lack of lock nesting
|
---|
54 | in posix locks would mean the lock is lost)
|
---|
55 |
|
---|
56 | - if the caller gains a lock during the transaction but doesn't
|
---|
57 | release it then fail the commit
|
---|
58 |
|
---|
59 | - allow for nested calls to tdb_transaction_start(), re-using the
|
---|
60 | existing transaction record. If the inner transaction is cancelled
|
---|
61 | then a subsequent commit will fail
|
---|
62 |
|
---|
63 | - keep a mirrored copy of the tdb hash chain heads to allow for the
|
---|
64 | fast hash heads scan on traverse, updating the mirrored copy in
|
---|
65 | the transaction version of tdb_write
|
---|
66 |
|
---|
67 | - allow callers to mix transaction and non-transaction use of tdb,
|
---|
68 | although once a transaction is started then an exclusive lock is
|
---|
69 | gained until the transaction is committed or cancelled
|
---|
70 |
|
---|
71 | - the commit strategy involves first saving away all modified data
|
---|
72 | into a linearised buffer in the transaction recovery area, then
|
---|
73 | marking the transaction recovery area with a magic value to
|
---|
74 | indicate a valid recovery record. In total 4 fsync/msync calls are
|
---|
75 | needed per commit to prevent race conditions. It might be possible
|
---|
76 | to reduce this to 3 or even 2 with some more work.
|
---|
77 |
|
---|
78 | - check for a valid recovery record on open of the tdb, while the
|
---|
79 | global lock is held. Automatically recover from the transaction
|
---|
80 | recovery area if needed, then continue with the open as
|
---|
81 | usual. This allows for smooth crash recovery with no administrator
|
---|
82 | intervention.
|
---|
83 |
|
---|
84 | - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
|
---|
85 | still available, but no transaction recovery area is used and no
|
---|
86 | fsync/msync calls are made.
|
---|
87 |
|
---|
88 | - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using
|
---|
89 | tdb_add_flags() transaction nesting is enabled.
|
---|
90 | It resets the TDB_DISALLOW_NESTING flag, as both cannot be used together.
|
---|
91 | The default is that transaction nesting is allowed.
|
---|
92 | Note: this default may change in future versions of tdb.
|
---|
93 |
|
---|
94 | Beware: when transactions are nested a transaction successfully
|
---|
95 | completed with tdb_transaction_commit() can be silently unrolled later.
|
---|
96 |
|
---|
97 | - if TDB_DISALLOW_NESTING is passed to flags in tdb open, or added using
|
---|
98 | tdb_add_flags() transaction nesting is disabled.
|
---|
99 | It resets the TDB_ALLOW_NESTING flag, as both cannot be used together.
|
---|
100 | An attempt to create a nested transaction will fail with TDB_ERR_NESTING.
|
---|
101 | The default is that transaction nesting is allowed.
|
---|
102 | Note: this default may change in future versions of tdb.
|
---|
103 | */
|
---|
104 |
|
---|
105 |
|
---|
106 | /*
|
---|
107 | hold the context of any current transaction
|
---|
108 | */
|
---|
109 | struct tdb_transaction {
|
---|
110 | /* we keep a mirrored copy of the tdb hash heads here so
|
---|
111 | tdb_next_hash_chain() can operate efficiently */
|
---|
112 | uint32_t *hash_heads;
|
---|
113 |
|
---|
114 | /* the original io methods - used to do IOs to the real db */
|
---|
115 | const struct tdb_methods *io_methods;
|
---|
116 |
|
---|
117 | /* the list of transaction blocks. When a block is first
|
---|
118 | written to, it gets created in this list */
|
---|
119 | uint8_t **blocks;
|
---|
120 | uint32_t num_blocks;
|
---|
121 | uint32_t block_size; /* bytes in each block */
|
---|
122 | uint32_t last_block_size; /* number of valid bytes in the last block */
|
---|
123 |
|
---|
124 | /* non-zero when an internal transaction error has
|
---|
125 | occurred. All write operations will then fail until the
|
---|
126 | transaction is ended */
|
---|
127 | int transaction_error;
|
---|
128 |
|
---|
129 | /* when inside a transaction we need to keep track of any
|
---|
130 | nested tdb_transaction_start() calls, as these are allowed,
|
---|
131 | but don't create a new transaction */
|
---|
132 | int nesting;
|
---|
133 |
|
---|
134 | /* set when a prepare has already occurred */
|
---|
135 | bool prepared;
|
---|
136 | tdb_off_t magic_offset;
|
---|
137 |
|
---|
138 | /* set when the GLOBAL_LOCK has been taken */
|
---|
139 | bool global_lock_taken;
|
---|
140 |
|
---|
141 | /* old file size before transaction */
|
---|
142 | tdb_len_t old_map_size;
|
---|
143 |
|
---|
144 | /* we should re-pack on commit */
|
---|
145 | bool need_repack;
|
---|
146 | };
|
---|
147 |
|
---|
148 |
|
---|
149 | /*
|
---|
150 | read while in a transaction. We need to check first if the data is in our list
|
---|
151 | of transaction elements, then if not do a real read
|
---|
152 | */
|
---|
153 | static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
|
---|
154 | tdb_len_t len, int cv)
|
---|
155 | {
|
---|
156 | uint32_t blk;
|
---|
157 |
|
---|
158 | /* break it down into block sized ops */
|
---|
159 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
|
---|
160 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
|
---|
161 | if (transaction_read(tdb, off, buf, len2, cv) != 0) {
|
---|
162 | return -1;
|
---|
163 | }
|
---|
164 | len -= len2;
|
---|
165 | off += len2;
|
---|
166 | buf = (void *)(len2 + (char *)buf);
|
---|
167 | }
|
---|
168 |
|
---|
169 | if (len == 0) {
|
---|
170 | return 0;
|
---|
171 | }
|
---|
172 |
|
---|
173 | blk = off / tdb->transaction->block_size;
|
---|
174 |
|
---|
175 | /* see if we have it in the block list */
|
---|
176 | if (tdb->transaction->num_blocks <= blk ||
|
---|
177 | tdb->transaction->blocks[blk] == NULL) {
|
---|
178 | /* nope, do a real read */
|
---|
179 | if (tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv) != 0) {
|
---|
180 | goto fail;
|
---|
181 | }
|
---|
182 | return 0;
|
---|
183 | }
|
---|
184 |
|
---|
185 | /* it is in the block list. Now check for the last block */
|
---|
186 | if (blk == tdb->transaction->num_blocks-1) {
|
---|
187 | if (len > tdb->transaction->last_block_size) {
|
---|
188 | goto fail;
|
---|
189 | }
|
---|
190 | }
|
---|
191 |
|
---|
192 | /* now copy it out of this block */
|
---|
193 | memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len);
|
---|
194 | if (cv) {
|
---|
195 | tdb_convert(buf, len);
|
---|
196 | }
|
---|
197 | return 0;
|
---|
198 |
|
---|
199 | fail:
|
---|
200 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%d len=%d\n", off, len));
|
---|
201 | tdb->ecode = TDB_ERR_IO;
|
---|
202 | tdb->transaction->transaction_error = 1;
|
---|
203 | return -1;
|
---|
204 | }
|
---|
205 |
|
---|
206 |
|
---|
207 | /*
|
---|
208 | write while in a transaction
|
---|
209 | */
|
---|
210 | static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
|
---|
211 | const void *buf, tdb_len_t len)
|
---|
212 | {
|
---|
213 | uint32_t blk;
|
---|
214 |
|
---|
215 | /* Only a commit is allowed on a prepared transaction */
|
---|
216 | if (tdb->transaction->prepared) {
|
---|
217 | tdb->ecode = TDB_ERR_EINVAL;
|
---|
218 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: transaction already prepared, write not allowed\n"));
|
---|
219 | tdb->transaction->transaction_error = 1;
|
---|
220 | return -1;
|
---|
221 | }
|
---|
222 |
|
---|
223 | /* if the write is to a hash head, then update the transaction
|
---|
224 | hash heads */
|
---|
225 | if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
|
---|
226 | off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
|
---|
227 | uint32_t chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
|
---|
228 | memcpy(&tdb->transaction->hash_heads[chain], buf, len);
|
---|
229 | }
|
---|
230 |
|
---|
231 | /* break it up into block sized chunks */
|
---|
232 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
|
---|
233 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
|
---|
234 | if (transaction_write(tdb, off, buf, len2) != 0) {
|
---|
235 | return -1;
|
---|
236 | }
|
---|
237 | len -= len2;
|
---|
238 | off += len2;
|
---|
239 | if (buf != NULL) {
|
---|
240 | buf = (const void *)(len2 + (const char *)buf);
|
---|
241 | }
|
---|
242 | }
|
---|
243 |
|
---|
244 | if (len == 0) {
|
---|
245 | return 0;
|
---|
246 | }
|
---|
247 |
|
---|
248 | blk = off / tdb->transaction->block_size;
|
---|
249 | off = off % tdb->transaction->block_size;
|
---|
250 |
|
---|
251 | if (tdb->transaction->num_blocks <= blk) {
|
---|
252 | uint8_t **new_blocks;
|
---|
253 | /* expand the blocks array */
|
---|
254 | if (tdb->transaction->blocks == NULL) {
|
---|
255 | new_blocks = (uint8_t **)malloc(
|
---|
256 | (blk+1)*sizeof(uint8_t *));
|
---|
257 | } else {
|
---|
258 | new_blocks = (uint8_t **)realloc(
|
---|
259 | tdb->transaction->blocks,
|
---|
260 | (blk+1)*sizeof(uint8_t *));
|
---|
261 | }
|
---|
262 | if (new_blocks == NULL) {
|
---|
263 | tdb->ecode = TDB_ERR_OOM;
|
---|
264 | goto fail;
|
---|
265 | }
|
---|
266 | memset(&new_blocks[tdb->transaction->num_blocks], 0,
|
---|
267 | (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
|
---|
268 | tdb->transaction->blocks = new_blocks;
|
---|
269 | tdb->transaction->num_blocks = blk+1;
|
---|
270 | tdb->transaction->last_block_size = 0;
|
---|
271 | }
|
---|
272 |
|
---|
273 | /* allocate and fill a block? */
|
---|
274 | if (tdb->transaction->blocks[blk] == NULL) {
|
---|
275 | tdb->transaction->blocks[blk] = (uint8_t *)calloc(tdb->transaction->block_size, 1);
|
---|
276 | if (tdb->transaction->blocks[blk] == NULL) {
|
---|
277 | tdb->ecode = TDB_ERR_OOM;
|
---|
278 | tdb->transaction->transaction_error = 1;
|
---|
279 | return -1;
|
---|
280 | }
|
---|
281 | if (tdb->transaction->old_map_size > blk * tdb->transaction->block_size) {
|
---|
282 | tdb_len_t len2 = tdb->transaction->block_size;
|
---|
283 | if (len2 + (blk * tdb->transaction->block_size) > tdb->transaction->old_map_size) {
|
---|
284 | len2 = tdb->transaction->old_map_size - (blk * tdb->transaction->block_size);
|
---|
285 | }
|
---|
286 | if (tdb->transaction->io_methods->tdb_read(tdb, blk * tdb->transaction->block_size,
|
---|
287 | tdb->transaction->blocks[blk],
|
---|
288 | len2, 0) != 0) {
|
---|
289 | SAFE_FREE(tdb->transaction->blocks[blk]);
|
---|
290 | tdb->ecode = TDB_ERR_IO;
|
---|
291 | goto fail;
|
---|
292 | }
|
---|
293 | if (blk == tdb->transaction->num_blocks-1) {
|
---|
294 | tdb->transaction->last_block_size = len2;
|
---|
295 | }
|
---|
296 | }
|
---|
297 | }
|
---|
298 |
|
---|
299 | /* overwrite part of an existing block */
|
---|
300 | if (buf == NULL) {
|
---|
301 | memset(tdb->transaction->blocks[blk] + off, 0, len);
|
---|
302 | } else {
|
---|
303 | memcpy(tdb->transaction->blocks[blk] + off, buf, len);
|
---|
304 | }
|
---|
305 | if (blk == tdb->transaction->num_blocks-1) {
|
---|
306 | if (len + off > tdb->transaction->last_block_size) {
|
---|
307 | tdb->transaction->last_block_size = len + off;
|
---|
308 | }
|
---|
309 | }
|
---|
310 |
|
---|
311 | return 0;
|
---|
312 |
|
---|
313 | fail:
|
---|
314 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%d len=%d\n",
|
---|
315 | (blk*tdb->transaction->block_size) + off, len));
|
---|
316 | tdb->transaction->transaction_error = 1;
|
---|
317 | return -1;
|
---|
318 | }
|
---|
319 |
|
---|
320 |
|
---|
321 | /*
|
---|
322 | write while in a transaction - this varient never expands the transaction blocks, it only
|
---|
323 | updates existing blocks. This means it cannot change the recovery size
|
---|
324 | */
|
---|
325 | static int transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
|
---|
326 | const void *buf, tdb_len_t len)
|
---|
327 | {
|
---|
328 | uint32_t blk;
|
---|
329 |
|
---|
330 | /* break it up into block sized chunks */
|
---|
331 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
|
---|
332 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
|
---|
333 | if (transaction_write_existing(tdb, off, buf, len2) != 0) {
|
---|
334 | return -1;
|
---|
335 | }
|
---|
336 | len -= len2;
|
---|
337 | off += len2;
|
---|
338 | if (buf != NULL) {
|
---|
339 | buf = (const void *)(len2 + (const char *)buf);
|
---|
340 | }
|
---|
341 | }
|
---|
342 |
|
---|
343 | if (len == 0) {
|
---|
344 | return 0;
|
---|
345 | }
|
---|
346 |
|
---|
347 | blk = off / tdb->transaction->block_size;
|
---|
348 | off = off % tdb->transaction->block_size;
|
---|
349 |
|
---|
350 | if (tdb->transaction->num_blocks <= blk ||
|
---|
351 | tdb->transaction->blocks[blk] == NULL) {
|
---|
352 | return 0;
|
---|
353 | }
|
---|
354 |
|
---|
355 | if (blk == tdb->transaction->num_blocks-1 &&
|
---|
356 | off + len > tdb->transaction->last_block_size) {
|
---|
357 | if (off >= tdb->transaction->last_block_size) {
|
---|
358 | return 0;
|
---|
359 | }
|
---|
360 | len = tdb->transaction->last_block_size - off;
|
---|
361 | }
|
---|
362 |
|
---|
363 | /* overwrite part of an existing block */
|
---|
364 | memcpy(tdb->transaction->blocks[blk] + off, buf, len);
|
---|
365 |
|
---|
366 | return 0;
|
---|
367 | }
|
---|
368 |
|
---|
369 |
|
---|
370 | /*
|
---|
371 | accelerated hash chain head search, using the cached hash heads
|
---|
372 | */
|
---|
373 | static void transaction_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
|
---|
374 | {
|
---|
375 | uint32_t h = *chain;
|
---|
376 | for (;h < tdb->header.hash_size;h++) {
|
---|
377 | /* the +1 takes account of the freelist */
|
---|
378 | if (0 != tdb->transaction->hash_heads[h+1]) {
|
---|
379 | break;
|
---|
380 | }
|
---|
381 | }
|
---|
382 | (*chain) = h;
|
---|
383 | }
|
---|
384 |
|
---|
385 | /*
|
---|
386 | out of bounds check during a transaction
|
---|
387 | */
|
---|
388 | static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
|
---|
389 | {
|
---|
390 | if (len <= tdb->map_size) {
|
---|
391 | return 0;
|
---|
392 | }
|
---|
393 | tdb->ecode = TDB_ERR_IO;
|
---|
394 | return -1;
|
---|
395 | }
|
---|
396 |
|
---|
397 | /*
|
---|
398 | transaction version of tdb_expand().
|
---|
399 | */
|
---|
400 | static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
|
---|
401 | tdb_off_t addition)
|
---|
402 | {
|
---|
403 | /* add a write to the transaction elements, so subsequent
|
---|
404 | reads see the zero data */
|
---|
405 | if (transaction_write(tdb, size, NULL, addition) != 0) {
|
---|
406 | return -1;
|
---|
407 | }
|
---|
408 |
|
---|
409 | tdb->transaction->need_repack = true;
|
---|
410 |
|
---|
411 | return 0;
|
---|
412 | }
|
---|
413 |
|
---|
414 | /*
|
---|
415 | brlock during a transaction - ignore them
|
---|
416 | */
|
---|
417 | static int transaction_brlock(struct tdb_context *tdb, tdb_off_t offset,
|
---|
418 | int rw_type, int lck_type, int probe, size_t len)
|
---|
419 | {
|
---|
420 | return 0;
|
---|
421 | }
|
---|
422 |
|
---|
423 | static const struct tdb_methods transaction_methods = {
|
---|
424 | transaction_read,
|
---|
425 | transaction_write,
|
---|
426 | transaction_next_hash_chain,
|
---|
427 | transaction_oob,
|
---|
428 | transaction_expand_file,
|
---|
429 | transaction_brlock
|
---|
430 | };
|
---|
431 |
|
---|
432 |
|
---|
433 | /*
|
---|
434 | start a tdb transaction. No token is returned, as only a single
|
---|
435 | transaction is allowed to be pending per tdb_context
|
---|
436 | */
|
---|
437 | int tdb_transaction_start(struct tdb_context *tdb)
|
---|
438 | {
|
---|
439 | /* some sanity checks */
|
---|
440 | if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
|
---|
441 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
|
---|
442 | tdb->ecode = TDB_ERR_EINVAL;
|
---|
443 | return -1;
|
---|
444 | }
|
---|
445 |
|
---|
446 | /* cope with nested tdb_transaction_start() calls */
|
---|
447 | if (tdb->transaction != NULL) {
|
---|
448 | if (!(tdb->flags & TDB_ALLOW_NESTING)) {
|
---|
449 | tdb->ecode = TDB_ERR_NESTING;
|
---|
450 | return -1;
|
---|
451 | }
|
---|
452 | tdb->transaction->nesting++;
|
---|
453 | TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
|
---|
454 | tdb->transaction->nesting));
|
---|
455 | return 0;
|
---|
456 | }
|
---|
457 |
|
---|
458 | if (tdb->num_locks != 0 || tdb->global_lock.count) {
|
---|
459 | /* the caller must not have any locks when starting a
|
---|
460 | transaction as otherwise we'll be screwed by lack
|
---|
461 | of nested locks in posix */
|
---|
462 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
|
---|
463 | tdb->ecode = TDB_ERR_LOCK;
|
---|
464 | return -1;
|
---|
465 | }
|
---|
466 |
|
---|
467 | if (tdb->travlocks.next != NULL) {
|
---|
468 | /* you cannot use transactions inside a traverse (although you can use
|
---|
469 | traverse inside a transaction) as otherwise you can end up with
|
---|
470 | deadlock */
|
---|
471 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
|
---|
472 | tdb->ecode = TDB_ERR_LOCK;
|
---|
473 | return -1;
|
---|
474 | }
|
---|
475 |
|
---|
476 | tdb->transaction = (struct tdb_transaction *)
|
---|
477 | calloc(sizeof(struct tdb_transaction), 1);
|
---|
478 | if (tdb->transaction == NULL) {
|
---|
479 | tdb->ecode = TDB_ERR_OOM;
|
---|
480 | return -1;
|
---|
481 | }
|
---|
482 |
|
---|
483 | /* a page at a time seems like a reasonable compromise between compactness and efficiency */
|
---|
484 | tdb->transaction->block_size = tdb->page_size;
|
---|
485 |
|
---|
486 | /* get the transaction write lock. This is a blocking lock. As
|
---|
487 | discussed with Volker, there are a number of ways we could
|
---|
488 | make this async, which we will probably do in the future */
|
---|
489 | if (tdb_transaction_lock(tdb, F_WRLCK) == -1) {
|
---|
490 | SAFE_FREE(tdb->transaction->blocks);
|
---|
491 | SAFE_FREE(tdb->transaction);
|
---|
492 | return -1;
|
---|
493 | }
|
---|
494 |
|
---|
495 | /* get a read lock from the freelist to the end of file. This
|
---|
496 | is upgraded to a write lock during the commit */
|
---|
497 | if (tdb_brlock(tdb, FREELIST_TOP, F_RDLCK, F_SETLKW, 0, 0) == -1) {
|
---|
498 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
|
---|
499 | tdb->ecode = TDB_ERR_LOCK;
|
---|
500 | goto fail;
|
---|
501 | }
|
---|
502 |
|
---|
503 | /* setup a copy of the hash table heads so the hash scan in
|
---|
504 | traverse can be fast */
|
---|
505 | tdb->transaction->hash_heads = (uint32_t *)
|
---|
506 | calloc(tdb->header.hash_size+1, sizeof(uint32_t));
|
---|
507 | if (tdb->transaction->hash_heads == NULL) {
|
---|
508 | tdb->ecode = TDB_ERR_OOM;
|
---|
509 | goto fail;
|
---|
510 | }
|
---|
511 | if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
|
---|
512 | TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
|
---|
513 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
|
---|
514 | tdb->ecode = TDB_ERR_IO;
|
---|
515 | goto fail;
|
---|
516 | }
|
---|
517 |
|
---|
518 | /* make sure we know about any file expansions already done by
|
---|
519 | anyone else */
|
---|
520 | tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
|
---|
521 | tdb->transaction->old_map_size = tdb->map_size;
|
---|
522 |
|
---|
523 | /* finally hook the io methods, replacing them with
|
---|
524 | transaction specific methods */
|
---|
525 | tdb->transaction->io_methods = tdb->methods;
|
---|
526 | tdb->methods = &transaction_methods;
|
---|
527 |
|
---|
528 | /* Trace at the end, so we get sequence number correct. */
|
---|
529 | tdb_trace(tdb, "tdb_transaction_start");
|
---|
530 | return 0;
|
---|
531 |
|
---|
532 | fail:
|
---|
533 | tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
|
---|
534 | tdb_transaction_unlock(tdb);
|
---|
535 | SAFE_FREE(tdb->transaction->blocks);
|
---|
536 | SAFE_FREE(tdb->transaction->hash_heads);
|
---|
537 | SAFE_FREE(tdb->transaction);
|
---|
538 | return -1;
|
---|
539 | }
|
---|
540 |
|
---|
541 |
|
---|
542 | /*
|
---|
543 | sync to disk
|
---|
544 | */
|
---|
545 | static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
|
---|
546 | {
|
---|
547 | if (tdb->flags & TDB_NOSYNC) {
|
---|
548 | return 0;
|
---|
549 | }
|
---|
550 |
|
---|
551 | if (fsync(tdb->fd) != 0) {
|
---|
552 | tdb->ecode = TDB_ERR_IO;
|
---|
553 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
|
---|
554 | return -1;
|
---|
555 | }
|
---|
556 | #ifdef HAVE_MMAP
|
---|
557 | if (tdb->map_ptr) {
|
---|
558 | tdb_off_t moffset = offset & ~(tdb->page_size-1);
|
---|
559 | if (msync(moffset + (char *)tdb->map_ptr,
|
---|
560 | length + (offset - moffset), MS_SYNC) != 0) {
|
---|
561 | tdb->ecode = TDB_ERR_IO;
|
---|
562 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
|
---|
563 | strerror(errno)));
|
---|
564 | return -1;
|
---|
565 | }
|
---|
566 | }
|
---|
567 | #endif
|
---|
568 | return 0;
|
---|
569 | }
|
---|
570 |
|
---|
571 |
|
---|
572 | int _tdb_transaction_cancel(struct tdb_context *tdb)
|
---|
573 | {
|
---|
574 | int i, ret = 0;
|
---|
575 |
|
---|
576 | if (tdb->transaction == NULL) {
|
---|
577 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
|
---|
578 | return -1;
|
---|
579 | }
|
---|
580 |
|
---|
581 | if (tdb->transaction->nesting != 0) {
|
---|
582 | tdb->transaction->transaction_error = 1;
|
---|
583 | tdb->transaction->nesting--;
|
---|
584 | return 0;
|
---|
585 | }
|
---|
586 |
|
---|
587 | tdb->map_size = tdb->transaction->old_map_size;
|
---|
588 |
|
---|
589 | /* free all the transaction blocks */
|
---|
590 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
---|
591 | if (tdb->transaction->blocks[i] != NULL) {
|
---|
592 | free(tdb->transaction->blocks[i]);
|
---|
593 | }
|
---|
594 | }
|
---|
595 | SAFE_FREE(tdb->transaction->blocks);
|
---|
596 |
|
---|
597 | if (tdb->transaction->magic_offset) {
|
---|
598 | const struct tdb_methods *methods = tdb->transaction->io_methods;
|
---|
599 | uint32_t zero = 0;
|
---|
600 |
|
---|
601 | /* remove the recovery marker */
|
---|
602 | if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &zero, 4) == -1 ||
|
---|
603 | transaction_sync(tdb, tdb->transaction->magic_offset, 4) == -1) {
|
---|
604 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n"));
|
---|
605 | ret = -1;
|
---|
606 | }
|
---|
607 | }
|
---|
608 |
|
---|
609 | if (tdb->transaction->global_lock_taken) {
|
---|
610 | tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
|
---|
611 | tdb->transaction->global_lock_taken = false;
|
---|
612 | }
|
---|
613 |
|
---|
614 | /* remove any global lock created during the transaction */
|
---|
615 | if (tdb->global_lock.count != 0) {
|
---|
616 | tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 4*tdb->header.hash_size);
|
---|
617 | tdb->global_lock.count = 0;
|
---|
618 | }
|
---|
619 |
|
---|
620 | /* remove any locks created during the transaction */
|
---|
621 | if (tdb->num_locks != 0) {
|
---|
622 | for (i=0;i<tdb->num_lockrecs;i++) {
|
---|
623 | tdb_brlock(tdb,FREELIST_TOP+4*tdb->lockrecs[i].list,
|
---|
624 | F_UNLCK,F_SETLKW, 0, 1);
|
---|
625 | }
|
---|
626 | tdb->num_locks = 0;
|
---|
627 | tdb->num_lockrecs = 0;
|
---|
628 | SAFE_FREE(tdb->lockrecs);
|
---|
629 | }
|
---|
630 |
|
---|
631 | /* restore the normal io methods */
|
---|
632 | tdb->methods = tdb->transaction->io_methods;
|
---|
633 |
|
---|
634 | tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
|
---|
635 | tdb_transaction_unlock(tdb);
|
---|
636 | SAFE_FREE(tdb->transaction->hash_heads);
|
---|
637 | SAFE_FREE(tdb->transaction);
|
---|
638 |
|
---|
639 | return ret;
|
---|
640 | }
|
---|
641 |
|
---|
/*
  cancel the current transaction. Records a trace entry, then does the
  actual work in _tdb_transaction_cancel() and returns its result.
*/
int tdb_transaction_cancel(struct tdb_context *tdb)
{
	int ret;

	tdb_trace(tdb, "tdb_transaction_cancel");
	ret = _tdb_transaction_cancel(tdb);
	return ret;
}
|
---|
650 |
|
---|
651 | /*
|
---|
652 | work out how much space the linearised recovery data will consume
|
---|
653 | */
|
---|
654 | static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
|
---|
655 | {
|
---|
656 | tdb_len_t recovery_size = 0;
|
---|
657 | int i;
|
---|
658 |
|
---|
659 | recovery_size = sizeof(uint32_t);
|
---|
660 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
---|
661 | if (i * tdb->transaction->block_size >= tdb->transaction->old_map_size) {
|
---|
662 | break;
|
---|
663 | }
|
---|
664 | if (tdb->transaction->blocks[i] == NULL) {
|
---|
665 | continue;
|
---|
666 | }
|
---|
667 | recovery_size += 2*sizeof(tdb_off_t);
|
---|
668 | if (i == tdb->transaction->num_blocks-1) {
|
---|
669 | recovery_size += tdb->transaction->last_block_size;
|
---|
670 | } else {
|
---|
671 | recovery_size += tdb->transaction->block_size;
|
---|
672 | }
|
---|
673 | }
|
---|
674 |
|
---|
675 | return recovery_size;
|
---|
676 | }
|
---|
677 |
|
---|
678 | /*
|
---|
679 | allocate the recovery area, or use an existing recovery area if it is
|
---|
680 | large enough
|
---|
681 | */
|
---|
682 | static int tdb_recovery_allocate(struct tdb_context *tdb,
|
---|
683 | tdb_len_t *recovery_size,
|
---|
684 | tdb_off_t *recovery_offset,
|
---|
685 | tdb_len_t *recovery_max_size)
|
---|
686 | {
|
---|
687 | struct tdb_record rec;
|
---|
688 | const struct tdb_methods *methods = tdb->transaction->io_methods;
|
---|
689 | tdb_off_t recovery_head;
|
---|
690 |
|
---|
691 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
|
---|
692 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
|
---|
693 | return -1;
|
---|
694 | }
|
---|
695 |
|
---|
696 | rec.rec_len = 0;
|
---|
697 |
|
---|
698 | if (recovery_head != 0 &&
|
---|
699 | methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
|
---|
700 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery record\n"));
|
---|
701 | return -1;
|
---|
702 | }
|
---|
703 |
|
---|
704 | *recovery_size = tdb_recovery_size(tdb);
|
---|
705 |
|
---|
706 | if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
|
---|
707 | /* it fits in the existing area */
|
---|
708 | *recovery_max_size = rec.rec_len;
|
---|
709 | *recovery_offset = recovery_head;
|
---|
710 | return 0;
|
---|
711 | }
|
---|
712 |
|
---|
713 | /* we need to free up the old recovery area, then allocate a
|
---|
714 | new one at the end of the file. Note that we cannot use
|
---|
715 | tdb_allocate() to allocate the new one as that might return
|
---|
716 | us an area that is being currently used (as of the start of
|
---|
717 | the transaction) */
|
---|
718 | if (recovery_head != 0) {
|
---|
719 | if (tdb_free(tdb, recovery_head, &rec) == -1) {
|
---|
720 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to free previous recovery area\n"));
|
---|
721 | return -1;
|
---|
722 | }
|
---|
723 | }
|
---|
724 |
|
---|
725 | /* the tdb_free() call might have increased the recovery size */
|
---|
726 | *recovery_size = tdb_recovery_size(tdb);
|
---|
727 |
|
---|
728 | /* round up to a multiple of page size */
|
---|
729 | *recovery_max_size = TDB_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
|
---|
730 | *recovery_offset = tdb->map_size;
|
---|
731 | recovery_head = *recovery_offset;
|
---|
732 |
|
---|
733 | if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
|
---|
734 | (tdb->map_size - tdb->transaction->old_map_size) +
|
---|
735 | sizeof(rec) + *recovery_max_size) == -1) {
|
---|
736 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
|
---|
737 | return -1;
|
---|
738 | }
|
---|
739 |
|
---|
740 | /* remap the file (if using mmap) */
|
---|
741 | methods->tdb_oob(tdb, tdb->map_size + 1, 1);
|
---|
742 |
|
---|
743 | /* we have to reset the old map size so that we don't try to expand the file
|
---|
744 | again in the transaction commit, which would destroy the recovery area */
|
---|
745 | tdb->transaction->old_map_size = tdb->map_size;
|
---|
746 |
|
---|
747 | /* write the recovery header offset and sync - we can sync without a race here
|
---|
748 | as the magic ptr in the recovery record has not been set */
|
---|
749 | CONVERT(recovery_head);
|
---|
750 | if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
|
---|
751 | &recovery_head, sizeof(tdb_off_t)) == -1) {
|
---|
752 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
|
---|
753 | return -1;
|
---|
754 | }
|
---|
755 | if (transaction_write_existing(tdb, TDB_RECOVERY_HEAD, &recovery_head, sizeof(tdb_off_t)) == -1) {
|
---|
756 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
|
---|
757 | return -1;
|
---|
758 | }
|
---|
759 |
|
---|
760 | return 0;
|
---|
761 | }
|
---|
762 |
|
---|
763 |
|
---|
764 | /*
|
---|
765 | setup the recovery data that will be used on a crash during commit
|
---|
766 | */
|
---|
767 | static int transaction_setup_recovery(struct tdb_context *tdb,
|
---|
768 | tdb_off_t *magic_offset)
|
---|
769 | {
|
---|
770 | tdb_len_t recovery_size;
|
---|
771 | unsigned char *data, *p;
|
---|
772 | const struct tdb_methods *methods = tdb->transaction->io_methods;
|
---|
773 | struct tdb_record *rec;
|
---|
774 | tdb_off_t recovery_offset, recovery_max_size;
|
---|
775 | tdb_off_t old_map_size = tdb->transaction->old_map_size;
|
---|
776 | uint32_t magic, tailer;
|
---|
777 | int i;
|
---|
778 |
|
---|
779 | /*
|
---|
780 | check that the recovery area has enough space
|
---|
781 | */
|
---|
782 | if (tdb_recovery_allocate(tdb, &recovery_size,
|
---|
783 | &recovery_offset, &recovery_max_size) == -1) {
|
---|
784 | return -1;
|
---|
785 | }
|
---|
786 |
|
---|
787 | data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
|
---|
788 | if (data == NULL) {
|
---|
789 | tdb->ecode = TDB_ERR_OOM;
|
---|
790 | return -1;
|
---|
791 | }
|
---|
792 |
|
---|
793 | rec = (struct tdb_record *)data;
|
---|
794 | memset(rec, 0, sizeof(*rec));
|
---|
795 |
|
---|
796 | rec->magic = 0;
|
---|
797 | rec->data_len = recovery_size;
|
---|
798 | rec->rec_len = recovery_max_size;
|
---|
799 | rec->key_len = old_map_size;
|
---|
800 | CONVERT(rec);
|
---|
801 |
|
---|
802 | /* build the recovery data into a single blob to allow us to do a single
|
---|
803 | large write, which should be more efficient */
|
---|
804 | p = data + sizeof(*rec);
|
---|
805 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
---|
806 | tdb_off_t offset;
|
---|
807 | tdb_len_t length;
|
---|
808 |
|
---|
809 | if (tdb->transaction->blocks[i] == NULL) {
|
---|
810 | continue;
|
---|
811 | }
|
---|
812 |
|
---|
813 | offset = i * tdb->transaction->block_size;
|
---|
814 | length = tdb->transaction->block_size;
|
---|
815 | if (i == tdb->transaction->num_blocks-1) {
|
---|
816 | length = tdb->transaction->last_block_size;
|
---|
817 | }
|
---|
818 |
|
---|
819 | if (offset >= old_map_size) {
|
---|
820 | continue;
|
---|
821 | }
|
---|
822 | if (offset + length > tdb->transaction->old_map_size) {
|
---|
823 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
|
---|
824 | free(data);
|
---|
825 | tdb->ecode = TDB_ERR_CORRUPT;
|
---|
826 | return -1;
|
---|
827 | }
|
---|
828 | memcpy(p, &offset, 4);
|
---|
829 | memcpy(p+4, &length, 4);
|
---|
830 | if (DOCONV()) {
|
---|
831 | tdb_convert(p, 8);
|
---|
832 | }
|
---|
833 | /* the recovery area contains the old data, not the
|
---|
834 | new data, so we have to call the original tdb_read
|
---|
835 | method to get it */
|
---|
836 | if (methods->tdb_read(tdb, offset, p + 8, length, 0) != 0) {
|
---|
837 | free(data);
|
---|
838 | tdb->ecode = TDB_ERR_IO;
|
---|
839 | return -1;
|
---|
840 | }
|
---|
841 | p += 8 + length;
|
---|
842 | }
|
---|
843 |
|
---|
844 | /* and the tailer */
|
---|
845 | tailer = sizeof(*rec) + recovery_max_size;
|
---|
846 | memcpy(p, &tailer, 4);
|
---|
847 | CONVERT(p);
|
---|
848 |
|
---|
849 | /* write the recovery data to the recovery area */
|
---|
850 | if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
|
---|
851 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
|
---|
852 | free(data);
|
---|
853 | tdb->ecode = TDB_ERR_IO;
|
---|
854 | return -1;
|
---|
855 | }
|
---|
856 | if (transaction_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
|
---|
857 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery data\n"));
|
---|
858 | free(data);
|
---|
859 | tdb->ecode = TDB_ERR_IO;
|
---|
860 | return -1;
|
---|
861 | }
|
---|
862 |
|
---|
863 | /* as we don't have ordered writes, we have to sync the recovery
|
---|
864 | data before we update the magic to indicate that the recovery
|
---|
865 | data is present */
|
---|
866 | if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
|
---|
867 | free(data);
|
---|
868 | return -1;
|
---|
869 | }
|
---|
870 |
|
---|
871 | free(data);
|
---|
872 |
|
---|
873 | magic = TDB_RECOVERY_MAGIC;
|
---|
874 | CONVERT(magic);
|
---|
875 |
|
---|
876 | *magic_offset = recovery_offset + offsetof(struct tdb_record, magic);
|
---|
877 |
|
---|
878 | if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
|
---|
879 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
|
---|
880 | tdb->ecode = TDB_ERR_IO;
|
---|
881 | return -1;
|
---|
882 | }
|
---|
883 | if (transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
|
---|
884 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery magic\n"));
|
---|
885 | tdb->ecode = TDB_ERR_IO;
|
---|
886 | return -1;
|
---|
887 | }
|
---|
888 |
|
---|
889 | /* ensure the recovery magic marker is on disk */
|
---|
890 | if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
|
---|
891 | return -1;
|
---|
892 | }
|
---|
893 |
|
---|
894 | return 0;
|
---|
895 | }
|
---|
896 |
|
---|
897 | static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
|
---|
898 | {
|
---|
899 | const struct tdb_methods *methods;
|
---|
900 |
|
---|
901 | if (tdb->transaction == NULL) {
|
---|
902 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: no transaction\n"));
|
---|
903 | return -1;
|
---|
904 | }
|
---|
905 |
|
---|
906 | if (tdb->transaction->prepared) {
|
---|
907 | tdb->ecode = TDB_ERR_EINVAL;
|
---|
908 | _tdb_transaction_cancel(tdb);
|
---|
909 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction already prepared\n"));
|
---|
910 | return -1;
|
---|
911 | }
|
---|
912 |
|
---|
913 | if (tdb->transaction->transaction_error) {
|
---|
914 | tdb->ecode = TDB_ERR_IO;
|
---|
915 | _tdb_transaction_cancel(tdb);
|
---|
916 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction error pending\n"));
|
---|
917 | return -1;
|
---|
918 | }
|
---|
919 |
|
---|
920 |
|
---|
921 | if (tdb->transaction->nesting != 0) {
|
---|
922 | return 0;
|
---|
923 | }
|
---|
924 |
|
---|
925 | /* check for a null transaction */
|
---|
926 | if (tdb->transaction->blocks == NULL) {
|
---|
927 | return 0;
|
---|
928 | }
|
---|
929 |
|
---|
930 | methods = tdb->transaction->io_methods;
|
---|
931 |
|
---|
932 | /* if there are any locks pending then the caller has not
|
---|
933 | nested their locks properly, so fail the transaction */
|
---|
934 | if (tdb->num_locks || tdb->global_lock.count) {
|
---|
935 | tdb->ecode = TDB_ERR_LOCK;
|
---|
936 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n"));
|
---|
937 | _tdb_transaction_cancel(tdb);
|
---|
938 | return -1;
|
---|
939 | }
|
---|
940 |
|
---|
941 | /* upgrade the main transaction lock region to a write lock */
|
---|
942 | if (tdb_brlock_upgrade(tdb, FREELIST_TOP, 0) == -1) {
|
---|
943 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to upgrade hash locks\n"));
|
---|
944 | tdb->ecode = TDB_ERR_LOCK;
|
---|
945 | _tdb_transaction_cancel(tdb);
|
---|
946 | return -1;
|
---|
947 | }
|
---|
948 |
|
---|
949 | /* get the global lock - this prevents new users attaching to the database
|
---|
950 | during the commit */
|
---|
951 | if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
|
---|
952 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get global lock\n"));
|
---|
953 | tdb->ecode = TDB_ERR_LOCK;
|
---|
954 | _tdb_transaction_cancel(tdb);
|
---|
955 | return -1;
|
---|
956 | }
|
---|
957 |
|
---|
958 | tdb->transaction->global_lock_taken = true;
|
---|
959 |
|
---|
960 | if (!(tdb->flags & TDB_NOSYNC)) {
|
---|
961 | /* write the recovery data to the end of the file */
|
---|
962 | if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
|
---|
963 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: failed to setup recovery data\n"));
|
---|
964 | _tdb_transaction_cancel(tdb);
|
---|
965 | return -1;
|
---|
966 | }
|
---|
967 | }
|
---|
968 |
|
---|
969 | tdb->transaction->prepared = true;
|
---|
970 |
|
---|
971 | /* expand the file to the new size if needed */
|
---|
972 | if (tdb->map_size != tdb->transaction->old_map_size) {
|
---|
973 | if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
|
---|
974 | tdb->map_size -
|
---|
975 | tdb->transaction->old_map_size) == -1) {
|
---|
976 | tdb->ecode = TDB_ERR_IO;
|
---|
977 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: expansion failed\n"));
|
---|
978 | _tdb_transaction_cancel(tdb);
|
---|
979 | return -1;
|
---|
980 | }
|
---|
981 | tdb->map_size = tdb->transaction->old_map_size;
|
---|
982 | methods->tdb_oob(tdb, tdb->map_size + 1, 1);
|
---|
983 | }
|
---|
984 |
|
---|
985 | /* Keep the global lock until the actual commit */
|
---|
986 |
|
---|
987 | return 0;
|
---|
988 | }
|
---|
989 |
|
---|
/*
  prepare to commit the current transaction

  Public entry point: records a trace entry and then hands over to
  _tdb_transaction_prepare_commit(), whose result is returned unchanged.
*/
int tdb_transaction_prepare_commit(struct tdb_context *tdb)
{
	int ret;

	tdb_trace(tdb, "tdb_transaction_prepare_commit");
	ret = _tdb_transaction_prepare_commit(tdb);

	return ret;
}
|
---|
998 |
|
---|
999 | /*
|
---|
1000 | commit the current transaction
|
---|
1001 | */
|
---|
1002 | int tdb_transaction_commit(struct tdb_context *tdb)
|
---|
1003 | {
|
---|
1004 | const struct tdb_methods *methods;
|
---|
1005 | int i;
|
---|
1006 | bool need_repack;
|
---|
1007 |
|
---|
1008 | if (tdb->transaction == NULL) {
|
---|
1009 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
|
---|
1010 | return -1;
|
---|
1011 | }
|
---|
1012 |
|
---|
1013 | tdb_trace(tdb, "tdb_transaction_commit");
|
---|
1014 | if (tdb->transaction->transaction_error) {
|
---|
1015 | tdb->ecode = TDB_ERR_IO;
|
---|
1016 | _tdb_transaction_cancel(tdb);
|
---|
1017 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
|
---|
1018 | return -1;
|
---|
1019 | }
|
---|
1020 |
|
---|
1021 | if (tdb->transaction->nesting != 0) {
|
---|
1022 | tdb->transaction->nesting--;
|
---|
1023 | return 0;
|
---|
1024 | }
|
---|
1025 |
|
---|
1026 | /* check for a null transaction */
|
---|
1027 | if (tdb->transaction->blocks == NULL) {
|
---|
1028 | _tdb_transaction_cancel(tdb);
|
---|
1029 | return 0;
|
---|
1030 | }
|
---|
1031 |
|
---|
1032 | if (!tdb->transaction->prepared) {
|
---|
1033 | int ret = _tdb_transaction_prepare_commit(tdb);
|
---|
1034 | if (ret)
|
---|
1035 | return ret;
|
---|
1036 | }
|
---|
1037 |
|
---|
1038 | methods = tdb->transaction->io_methods;
|
---|
1039 |
|
---|
1040 | /* perform all the writes */
|
---|
1041 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
---|
1042 | tdb_off_t offset;
|
---|
1043 | tdb_len_t length;
|
---|
1044 |
|
---|
1045 | if (tdb->transaction->blocks[i] == NULL) {
|
---|
1046 | continue;
|
---|
1047 | }
|
---|
1048 |
|
---|
1049 | offset = i * tdb->transaction->block_size;
|
---|
1050 | length = tdb->transaction->block_size;
|
---|
1051 | if (i == tdb->transaction->num_blocks-1) {
|
---|
1052 | length = tdb->transaction->last_block_size;
|
---|
1053 | }
|
---|
1054 | if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) {
|
---|
1055 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
|
---|
1056 |
|
---|
1057 | /* we've overwritten part of the data and
|
---|
1058 | possibly expanded the file, so we need to
|
---|
1059 | run the crash recovery code */
|
---|
1060 | tdb->methods = methods;
|
---|
1061 | tdb_transaction_recover(tdb);
|
---|
1062 |
|
---|
1063 | _tdb_transaction_cancel(tdb);
|
---|
1064 |
|
---|
1065 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
|
---|
1066 | return -1;
|
---|
1067 | }
|
---|
1068 | SAFE_FREE(tdb->transaction->blocks[i]);
|
---|
1069 | }
|
---|
1070 |
|
---|
1071 | SAFE_FREE(tdb->transaction->blocks);
|
---|
1072 | tdb->transaction->num_blocks = 0;
|
---|
1073 |
|
---|
1074 | /* ensure the new data is on disk */
|
---|
1075 | if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
|
---|
1076 | return -1;
|
---|
1077 | }
|
---|
1078 |
|
---|
1079 | /*
|
---|
1080 | TODO: maybe write to some dummy hdr field, or write to magic
|
---|
1081 | offset without mmap, before the last sync, instead of the
|
---|
1082 | utime() call
|
---|
1083 | */
|
---|
1084 |
|
---|
1085 | /* on some systems (like Linux 2.6.x) changes via mmap/msync
|
---|
1086 | don't change the mtime of the file, this means the file may
|
---|
1087 | not be backed up (as tdb rounding to block sizes means that
|
---|
1088 | file size changes are quite rare too). The following forces
|
---|
1089 | mtime changes when a transaction completes */
|
---|
1090 | #ifdef HAVE_UTIME
|
---|
1091 | utime(tdb->name, NULL);
|
---|
1092 | #endif
|
---|
1093 |
|
---|
1094 | need_repack = tdb->transaction->need_repack;
|
---|
1095 |
|
---|
1096 | /* use a transaction cancel to free memory and remove the
|
---|
1097 | transaction locks */
|
---|
1098 | _tdb_transaction_cancel(tdb);
|
---|
1099 |
|
---|
1100 | if (need_repack) {
|
---|
1101 | return tdb_repack(tdb);
|
---|
1102 | }
|
---|
1103 |
|
---|
1104 | return 0;
|
---|
1105 | }
|
---|
1106 |
|
---|
1107 |
|
---|
1108 | /*
|
---|
1109 | recover from an aborted transaction. Must be called with exclusive
|
---|
1110 | database write access already established (including the global
|
---|
1111 | lock to prevent new processes attaching)
|
---|
1112 | */
|
---|
1113 | int tdb_transaction_recover(struct tdb_context *tdb)
|
---|
1114 | {
|
---|
1115 | tdb_off_t recovery_head, recovery_eof;
|
---|
1116 | unsigned char *data, *p;
|
---|
1117 | uint32_t zero = 0;
|
---|
1118 | struct tdb_record rec;
|
---|
1119 |
|
---|
1120 | /* find the recovery area */
|
---|
1121 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
|
---|
1122 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
|
---|
1123 | tdb->ecode = TDB_ERR_IO;
|
---|
1124 | return -1;
|
---|
1125 | }
|
---|
1126 |
|
---|
1127 | if (recovery_head == 0) {
|
---|
1128 | /* we have never allocated a recovery record */
|
---|
1129 | return 0;
|
---|
1130 | }
|
---|
1131 |
|
---|
1132 | /* read the recovery record */
|
---|
1133 | if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
|
---|
1134 | sizeof(rec), DOCONV()) == -1) {
|
---|
1135 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
|
---|
1136 | tdb->ecode = TDB_ERR_IO;
|
---|
1137 | return -1;
|
---|
1138 | }
|
---|
1139 |
|
---|
1140 | if (rec.magic != TDB_RECOVERY_MAGIC) {
|
---|
1141 | /* there is no valid recovery data */
|
---|
1142 | return 0;
|
---|
1143 | }
|
---|
1144 |
|
---|
1145 | if (tdb->read_only) {
|
---|
1146 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
|
---|
1147 | tdb->ecode = TDB_ERR_CORRUPT;
|
---|
1148 | return -1;
|
---|
1149 | }
|
---|
1150 |
|
---|
1151 | recovery_eof = rec.key_len;
|
---|
1152 |
|
---|
1153 | data = (unsigned char *)malloc(rec.data_len);
|
---|
1154 | if (data == NULL) {
|
---|
1155 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
|
---|
1156 | tdb->ecode = TDB_ERR_OOM;
|
---|
1157 | return -1;
|
---|
1158 | }
|
---|
1159 |
|
---|
1160 | /* read the full recovery data */
|
---|
1161 | if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
|
---|
1162 | rec.data_len, 0) == -1) {
|
---|
1163 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
|
---|
1164 | tdb->ecode = TDB_ERR_IO;
|
---|
1165 | return -1;
|
---|
1166 | }
|
---|
1167 |
|
---|
1168 | /* recover the file data */
|
---|
1169 | p = data;
|
---|
1170 | while (p+8 < data + rec.data_len) {
|
---|
1171 | uint32_t ofs, len;
|
---|
1172 | if (DOCONV()) {
|
---|
1173 | tdb_convert(p, 8);
|
---|
1174 | }
|
---|
1175 | memcpy(&ofs, p, 4);
|
---|
1176 | memcpy(&len, p+4, 4);
|
---|
1177 |
|
---|
1178 | if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
|
---|
1179 | free(data);
|
---|
1180 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %d bytes at offset %d\n", len, ofs));
|
---|
1181 | tdb->ecode = TDB_ERR_IO;
|
---|
1182 | return -1;
|
---|
1183 | }
|
---|
1184 | p += 8 + len;
|
---|
1185 | }
|
---|
1186 |
|
---|
1187 | free(data);
|
---|
1188 |
|
---|
1189 | if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
|
---|
1190 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
|
---|
1191 | tdb->ecode = TDB_ERR_IO;
|
---|
1192 | return -1;
|
---|
1193 | }
|
---|
1194 |
|
---|
1195 | /* if the recovery area is after the recovered eof then remove it */
|
---|
1196 | if (recovery_eof <= recovery_head) {
|
---|
1197 | if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
|
---|
1198 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
|
---|
1199 | tdb->ecode = TDB_ERR_IO;
|
---|
1200 | return -1;
|
---|
1201 | }
|
---|
1202 | }
|
---|
1203 |
|
---|
1204 | /* remove the recovery magic */
|
---|
1205 | if (tdb_ofs_write(tdb, recovery_head + offsetof(struct tdb_record, magic),
|
---|
1206 | &zero) == -1) {
|
---|
1207 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
|
---|
1208 | tdb->ecode = TDB_ERR_IO;
|
---|
1209 | return -1;
|
---|
1210 | }
|
---|
1211 |
|
---|
1212 | /* reduce the file size to the old size */
|
---|
1213 | tdb_munmap(tdb);
|
---|
1214 | if (ftruncate(tdb->fd, recovery_eof) != 0) {
|
---|
1215 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to reduce to recovery size\n"));
|
---|
1216 | tdb->ecode = TDB_ERR_IO;
|
---|
1217 | return -1;
|
---|
1218 | }
|
---|
1219 | tdb->map_size = recovery_eof;
|
---|
1220 | tdb_mmap(tdb);
|
---|
1221 |
|
---|
1222 | if (transaction_sync(tdb, 0, recovery_eof) == -1) {
|
---|
1223 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
|
---|
1224 | tdb->ecode = TDB_ERR_IO;
|
---|
1225 | return -1;
|
---|
1226 | }
|
---|
1227 |
|
---|
1228 | TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %d byte database\n",
|
---|
1229 | recovery_eof));
|
---|
1230 |
|
---|
1231 | /* all done */
|
---|
1232 | return 0;
|
---|
1233 | }
|
---|