1 | /*
|
---|
2 | Unix SMB/CIFS implementation.
|
---|
3 |
|
---|
4 | trivial database library
|
---|
5 |
|
---|
6 | Copyright (C) Andrew Tridgell 2005
|
---|
7 |
|
---|
8 | ** NOTE! The following LGPL license applies to the tdb
|
---|
9 | ** library. This does NOT imply that all of Samba is released
|
---|
10 | ** under the LGPL
|
---|
11 |
|
---|
12 | This library is free software; you can redistribute it and/or
|
---|
13 | modify it under the terms of the GNU Lesser General Public
|
---|
14 | License as published by the Free Software Foundation; either
|
---|
15 | version 3 of the License, or (at your option) any later version.
|
---|
16 |
|
---|
17 | This library is distributed in the hope that it will be useful,
|
---|
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
---|
20 | Lesser General Public License for more details.
|
---|
21 |
|
---|
22 | You should have received a copy of the GNU Lesser General Public
|
---|
23 | License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
---|
24 | */
|
---|
25 |
|
---|
26 | #include "tdb_private.h"
|
---|
27 |
|
---|
28 | /*
|
---|
29 | transaction design:
|
---|
30 |
|
---|
31 | - only allow a single transaction at a time per database. This makes
|
---|
32 | using the transaction API simpler, as otherwise the caller would
|
---|
33 | have to cope with temporary failures in transactions that conflict
|
---|
34 | with other current transactions
|
---|
35 |
|
---|
36 | - keep the transaction recovery information in the same file as the
|
---|
37 | database, using a special 'transaction recovery' record pointed at
|
---|
38 | by the header. This removes the need for extra journal files as
|
---|
39 | used by some other databases
|
---|
40 |
|
---|
41 | - dynamically allocate the transaction recovery record, re-using it
|
---|
42 | for subsequent transactions. If a larger record is needed then
|
---|
43 | tdb_free() the old record to place it on the normal tdb freelist
|
---|
44 | before allocating the new record
|
---|
45 |
|
---|
46 | - during transactions, keep a linked list of all writes that have
|
---|
47 | been performed by intercepting all tdb_write() calls. The hooked
|
---|
48 | transaction versions of tdb_read() and tdb_write() check this
|
---|
49 | linked list and try to use the elements of the list in preference
|
---|
50 | to the real database.
|
---|
51 |
|
---|
52 | - don't allow any locks to be held when a transaction starts,
|
---|
53 | otherwise we can end up with deadlock (plus lack of lock nesting
|
---|
54 | in posix locks would mean the lock is lost)
|
---|
55 |
|
---|
56 | - if the caller gains a lock during the transaction but doesn't
|
---|
57 | release it then fail the commit
|
---|
58 |
|
---|
59 | - allow for nested calls to tdb_transaction_start(), re-using the
|
---|
60 | existing transaction record. If the inner transaction is cancelled
|
---|
61 | then a subsequent commit will fail
|
---|
62 |
|
---|
63 | - keep a mirrored copy of the tdb hash chain heads to allow for the
|
---|
64 | fast hash heads scan on traverse, updating the mirrored copy in
|
---|
65 | the transaction version of tdb_write
|
---|
66 |
|
---|
67 | - allow callers to mix transaction and non-transaction use of tdb,
|
---|
68 | although once a transaction is started then an exclusive lock is
|
---|
69 | gained until the transaction is committed or cancelled
|
---|
70 |
|
---|
71 | - the commit strategy involves first saving away all modified data
|
---|
72 | into a linearised buffer in the transaction recovery area, then
|
---|
73 | marking the transaction recovery area with a magic value to
|
---|
74 | indicate a valid recovery record. In total 4 fsync/msync calls are
|
---|
75 | needed per commit to prevent race conditions. It might be possible
|
---|
76 | to reduce this to 3 or even 2 with some more work.
|
---|
77 |
|
---|
78 | - check for a valid recovery record on open of the tdb, while the
|
---|
79 | open lock is held. Automatically recover from the transaction
|
---|
80 | recovery area if needed, then continue with the open as
|
---|
81 | usual. This allows for smooth crash recovery with no administrator
|
---|
82 | intervention.
|
---|
83 |
|
---|
84 | - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
|
---|
85 | still available, but no transaction recovery area is used and no
|
---|
86 | fsync/msync calls are made.
|
---|
87 |
|
---|
88 | - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using
|
---|
89 | tdb_add_flags() transaction nesting is enabled.
|
---|
90 | It resets the TDB_DISALLOW_NESTING flag, as both cannot be used together.
|
---|
91 | The default is that transaction nesting is allowed.
|
---|
92 | Note: this default may change in future versions of tdb.
|
---|
93 |
|
---|
94 | Beware. when transactions are nested a transaction successfully
|
---|
95 | completed with tdb_transaction_commit() can be silently unrolled later.
|
---|
96 |
|
---|
97 | - if TDB_DISALLOW_NESTING is passed to flags in tdb open, or added using
|
---|
98 | tdb_add_flags() transaction nesting is disabled.
|
---|
99 | It resets the TDB_ALLOW_NESTING flag, as both cannot be used together.
|
---|
100 | An attempt to create a nested transaction will fail with TDB_ERR_NESTING.
|
---|
101 | The default is that transaction nesting is allowed.
|
---|
102 | Note: this default may change in future versions of tdb.
|
---|
103 | */
|
---|
104 |
|
---|
105 |
|
---|
106 | /*
|
---|
107 | hold the context of any current transaction
|
---|
108 | */
|
---|
109 | struct tdb_transaction {
|
---|
110 | /* we keep a mirrored copy of the tdb hash heads here so
|
---|
111 | tdb_next_hash_chain() can operate efficiently */
|
---|
112 | uint32_t *hash_heads;
|
---|
113 |
|
---|
114 | /* the original io methods - used to do IOs to the real db */
|
---|
115 | const struct tdb_methods *io_methods;
|
---|
116 |
|
---|
117 | /* the list of transaction blocks. When a block is first
|
---|
118 | written to, it gets created in this list */
|
---|
119 | uint8_t **blocks;
|
---|
120 | uint32_t num_blocks;
|
---|
121 | uint32_t block_size; /* bytes in each block */
|
---|
122 | uint32_t last_block_size; /* number of valid bytes in the last block */
|
---|
123 |
|
---|
124 | /* non-zero when an internal transaction error has
|
---|
125 | occurred. All write operations will then fail until the
|
---|
126 | transaction is ended */
|
---|
127 | int transaction_error;
|
---|
128 |
|
---|
129 | /* when inside a transaction we need to keep track of any
|
---|
130 | nested tdb_transaction_start() calls, as these are allowed,
|
---|
131 | but don't create a new transaction */
|
---|
132 | int nesting;
|
---|
133 |
|
---|
134 | /* set when a prepare has already occurred */
|
---|
135 | bool prepared;
|
---|
136 | tdb_off_t magic_offset;
|
---|
137 |
|
---|
138 | /* old file size before transaction */
|
---|
139 | tdb_len_t old_map_size;
|
---|
140 |
|
---|
141 | /* did we expand in this transaction */
|
---|
142 | bool expanded;
|
---|
143 | };
|
---|
144 |
|
---|
145 |
|
---|
146 | /*
|
---|
147 | read while in a transaction. We need to check first if the data is in our list
|
---|
148 | of transaction elements, then if not do a real read
|
---|
149 | */
|
---|
150 | static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
|
---|
151 | tdb_len_t len, int cv)
|
---|
152 | {
|
---|
153 | uint32_t blk;
|
---|
154 |
|
---|
155 | /* break it down into block sized ops */
|
---|
156 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
|
---|
157 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
|
---|
158 | if (transaction_read(tdb, off, buf, len2, cv) != 0) {
|
---|
159 | return -1;
|
---|
160 | }
|
---|
161 | len -= len2;
|
---|
162 | off += len2;
|
---|
163 | buf = (void *)(len2 + (char *)buf);
|
---|
164 | }
|
---|
165 |
|
---|
166 | if (len == 0) {
|
---|
167 | return 0;
|
---|
168 | }
|
---|
169 |
|
---|
170 | blk = off / tdb->transaction->block_size;
|
---|
171 |
|
---|
172 | /* see if we have it in the block list */
|
---|
173 | if (tdb->transaction->num_blocks <= blk ||
|
---|
174 | tdb->transaction->blocks[blk] == NULL) {
|
---|
175 | /* nope, do a real read */
|
---|
176 | if (tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv) != 0) {
|
---|
177 | goto fail;
|
---|
178 | }
|
---|
179 | return 0;
|
---|
180 | }
|
---|
181 |
|
---|
182 | /* it is in the block list. Now check for the last block */
|
---|
183 | if (blk == tdb->transaction->num_blocks-1) {
|
---|
184 | if (len > tdb->transaction->last_block_size) {
|
---|
185 | goto fail;
|
---|
186 | }
|
---|
187 | }
|
---|
188 |
|
---|
189 | /* now copy it out of this block */
|
---|
190 | memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len);
|
---|
191 | if (cv) {
|
---|
192 | tdb_convert(buf, len);
|
---|
193 | }
|
---|
194 | return 0;
|
---|
195 |
|
---|
196 | fail:
|
---|
197 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%d len=%d\n", off, len));
|
---|
198 | tdb->ecode = TDB_ERR_IO;
|
---|
199 | tdb->transaction->transaction_error = 1;
|
---|
200 | return -1;
|
---|
201 | }
|
---|
202 |
|
---|
203 |
|
---|
204 | /*
|
---|
205 | write while in a transaction
|
---|
206 | */
|
---|
207 | static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
|
---|
208 | const void *buf, tdb_len_t len)
|
---|
209 | {
|
---|
210 | uint32_t blk;
|
---|
211 |
|
---|
212 | /* Only a commit is allowed on a prepared transaction */
|
---|
213 | if (tdb->transaction->prepared) {
|
---|
214 | tdb->ecode = TDB_ERR_EINVAL;
|
---|
215 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: transaction already prepared, write not allowed\n"));
|
---|
216 | tdb->transaction->transaction_error = 1;
|
---|
217 | return -1;
|
---|
218 | }
|
---|
219 |
|
---|
220 | /* if the write is to a hash head, then update the transaction
|
---|
221 | hash heads */
|
---|
222 | if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
|
---|
223 | off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
|
---|
224 | uint32_t chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
|
---|
225 | memcpy(&tdb->transaction->hash_heads[chain], buf, len);
|
---|
226 | }
|
---|
227 |
|
---|
228 | /* break it up into block sized chunks */
|
---|
229 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
|
---|
230 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
|
---|
231 | if (transaction_write(tdb, off, buf, len2) != 0) {
|
---|
232 | return -1;
|
---|
233 | }
|
---|
234 | len -= len2;
|
---|
235 | off += len2;
|
---|
236 | if (buf != NULL) {
|
---|
237 | buf = (const void *)(len2 + (const char *)buf);
|
---|
238 | }
|
---|
239 | }
|
---|
240 |
|
---|
241 | if (len == 0) {
|
---|
242 | return 0;
|
---|
243 | }
|
---|
244 |
|
---|
245 | blk = off / tdb->transaction->block_size;
|
---|
246 | off = off % tdb->transaction->block_size;
|
---|
247 |
|
---|
248 | if (tdb->transaction->num_blocks <= blk) {
|
---|
249 | uint8_t **new_blocks;
|
---|
250 | /* expand the blocks array */
|
---|
251 | if (tdb->transaction->blocks == NULL) {
|
---|
252 | new_blocks = (uint8_t **)malloc(
|
---|
253 | (blk+1)*sizeof(uint8_t *));
|
---|
254 | } else {
|
---|
255 | new_blocks = (uint8_t **)realloc(
|
---|
256 | tdb->transaction->blocks,
|
---|
257 | (blk+1)*sizeof(uint8_t *));
|
---|
258 | }
|
---|
259 | if (new_blocks == NULL) {
|
---|
260 | tdb->ecode = TDB_ERR_OOM;
|
---|
261 | goto fail;
|
---|
262 | }
|
---|
263 | memset(&new_blocks[tdb->transaction->num_blocks], 0,
|
---|
264 | (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
|
---|
265 | tdb->transaction->blocks = new_blocks;
|
---|
266 | tdb->transaction->num_blocks = blk+1;
|
---|
267 | tdb->transaction->last_block_size = 0;
|
---|
268 | }
|
---|
269 |
|
---|
270 | /* allocate and fill a block? */
|
---|
271 | if (tdb->transaction->blocks[blk] == NULL) {
|
---|
272 | tdb->transaction->blocks[blk] = (uint8_t *)calloc(tdb->transaction->block_size, 1);
|
---|
273 | if (tdb->transaction->blocks[blk] == NULL) {
|
---|
274 | tdb->ecode = TDB_ERR_OOM;
|
---|
275 | tdb->transaction->transaction_error = 1;
|
---|
276 | return -1;
|
---|
277 | }
|
---|
278 | if (tdb->transaction->old_map_size > blk * tdb->transaction->block_size) {
|
---|
279 | tdb_len_t len2 = tdb->transaction->block_size;
|
---|
280 | if (len2 + (blk * tdb->transaction->block_size) > tdb->transaction->old_map_size) {
|
---|
281 | len2 = tdb->transaction->old_map_size - (blk * tdb->transaction->block_size);
|
---|
282 | }
|
---|
283 | if (tdb->transaction->io_methods->tdb_read(tdb, blk * tdb->transaction->block_size,
|
---|
284 | tdb->transaction->blocks[blk],
|
---|
285 | len2, 0) != 0) {
|
---|
286 | SAFE_FREE(tdb->transaction->blocks[blk]);
|
---|
287 | tdb->ecode = TDB_ERR_IO;
|
---|
288 | goto fail;
|
---|
289 | }
|
---|
290 | if (blk == tdb->transaction->num_blocks-1) {
|
---|
291 | tdb->transaction->last_block_size = len2;
|
---|
292 | }
|
---|
293 | }
|
---|
294 | }
|
---|
295 |
|
---|
296 | /* overwrite part of an existing block */
|
---|
297 | if (buf == NULL) {
|
---|
298 | memset(tdb->transaction->blocks[blk] + off, 0, len);
|
---|
299 | } else {
|
---|
300 | memcpy(tdb->transaction->blocks[blk] + off, buf, len);
|
---|
301 | }
|
---|
302 | if (blk == tdb->transaction->num_blocks-1) {
|
---|
303 | if (len + off > tdb->transaction->last_block_size) {
|
---|
304 | tdb->transaction->last_block_size = len + off;
|
---|
305 | }
|
---|
306 | }
|
---|
307 |
|
---|
308 | return 0;
|
---|
309 |
|
---|
310 | fail:
|
---|
311 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%d len=%d\n",
|
---|
312 | (blk*tdb->transaction->block_size) + off, len));
|
---|
313 | tdb->transaction->transaction_error = 1;
|
---|
314 | return -1;
|
---|
315 | }
|
---|
316 |
|
---|
317 |
|
---|
318 | /*
|
---|
319 | write while in a transaction - this varient never expands the transaction blocks, it only
|
---|
320 | updates existing blocks. This means it cannot change the recovery size
|
---|
321 | */
|
---|
322 | static int transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
|
---|
323 | const void *buf, tdb_len_t len)
|
---|
324 | {
|
---|
325 | uint32_t blk;
|
---|
326 |
|
---|
327 | /* break it up into block sized chunks */
|
---|
328 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
|
---|
329 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
|
---|
330 | if (transaction_write_existing(tdb, off, buf, len2) != 0) {
|
---|
331 | return -1;
|
---|
332 | }
|
---|
333 | len -= len2;
|
---|
334 | off += len2;
|
---|
335 | if (buf != NULL) {
|
---|
336 | buf = (const void *)(len2 + (const char *)buf);
|
---|
337 | }
|
---|
338 | }
|
---|
339 |
|
---|
340 | if (len == 0) {
|
---|
341 | return 0;
|
---|
342 | }
|
---|
343 |
|
---|
344 | blk = off / tdb->transaction->block_size;
|
---|
345 | off = off % tdb->transaction->block_size;
|
---|
346 |
|
---|
347 | if (tdb->transaction->num_blocks <= blk ||
|
---|
348 | tdb->transaction->blocks[blk] == NULL) {
|
---|
349 | return 0;
|
---|
350 | }
|
---|
351 |
|
---|
352 | if (blk == tdb->transaction->num_blocks-1 &&
|
---|
353 | off + len > tdb->transaction->last_block_size) {
|
---|
354 | if (off >= tdb->transaction->last_block_size) {
|
---|
355 | return 0;
|
---|
356 | }
|
---|
357 | len = tdb->transaction->last_block_size - off;
|
---|
358 | }
|
---|
359 |
|
---|
360 | /* overwrite part of an existing block */
|
---|
361 | memcpy(tdb->transaction->blocks[blk] + off, buf, len);
|
---|
362 |
|
---|
363 | return 0;
|
---|
364 | }
|
---|
365 |
|
---|
366 |
|
---|
367 | /*
|
---|
368 | accelerated hash chain head search, using the cached hash heads
|
---|
369 | */
|
---|
370 | static void transaction_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
|
---|
371 | {
|
---|
372 | uint32_t h = *chain;
|
---|
373 | for (;h < tdb->header.hash_size;h++) {
|
---|
374 | /* the +1 takes account of the freelist */
|
---|
375 | if (0 != tdb->transaction->hash_heads[h+1]) {
|
---|
376 | break;
|
---|
377 | }
|
---|
378 | }
|
---|
379 | (*chain) = h;
|
---|
380 | }
|
---|
381 |
|
---|
382 | /*
|
---|
383 | out of bounds check during a transaction
|
---|
384 | */
|
---|
385 | static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
|
---|
386 | {
|
---|
387 | if (len <= tdb->map_size) {
|
---|
388 | return 0;
|
---|
389 | }
|
---|
390 | tdb->ecode = TDB_ERR_IO;
|
---|
391 | return -1;
|
---|
392 | }
|
---|
393 |
|
---|
394 | /*
|
---|
395 | transaction version of tdb_expand().
|
---|
396 | */
|
---|
397 | static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
|
---|
398 | tdb_off_t addition)
|
---|
399 | {
|
---|
400 | /* add a write to the transaction elements, so subsequent
|
---|
401 | reads see the zero data */
|
---|
402 | if (transaction_write(tdb, size, NULL, addition) != 0) {
|
---|
403 | return -1;
|
---|
404 | }
|
---|
405 |
|
---|
406 | tdb->transaction->expanded = true;
|
---|
407 |
|
---|
408 | return 0;
|
---|
409 | }
|
---|
410 |
|
---|
411 | static const struct tdb_methods transaction_methods = {
|
---|
412 | transaction_read,
|
---|
413 | transaction_write,
|
---|
414 | transaction_next_hash_chain,
|
---|
415 | transaction_oob,
|
---|
416 | transaction_expand_file,
|
---|
417 | };
|
---|
418 |
|
---|
419 |
|
---|
420 | /*
|
---|
421 | start a tdb transaction. No token is returned, as only a single
|
---|
422 | transaction is allowed to be pending per tdb_context
|
---|
423 | */
|
---|
424 | static int _tdb_transaction_start(struct tdb_context *tdb,
|
---|
425 | enum tdb_lock_flags lockflags)
|
---|
426 | {
|
---|
427 | /* some sanity checks */
|
---|
428 | if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
|
---|
429 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
|
---|
430 | tdb->ecode = TDB_ERR_EINVAL;
|
---|
431 | return -1;
|
---|
432 | }
|
---|
433 |
|
---|
434 | /* cope with nested tdb_transaction_start() calls */
|
---|
435 | if (tdb->transaction != NULL) {
|
---|
436 | if (!(tdb->flags & TDB_ALLOW_NESTING)) {
|
---|
437 | tdb->ecode = TDB_ERR_NESTING;
|
---|
438 | return -1;
|
---|
439 | }
|
---|
440 | tdb->transaction->nesting++;
|
---|
441 | TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
|
---|
442 | tdb->transaction->nesting));
|
---|
443 | return 0;
|
---|
444 | }
|
---|
445 |
|
---|
446 | if (tdb_have_extra_locks(tdb)) {
|
---|
447 | /* the caller must not have any locks when starting a
|
---|
448 | transaction as otherwise we'll be screwed by lack
|
---|
449 | of nested locks in posix */
|
---|
450 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
|
---|
451 | tdb->ecode = TDB_ERR_LOCK;
|
---|
452 | return -1;
|
---|
453 | }
|
---|
454 |
|
---|
455 | if (tdb->travlocks.next != NULL) {
|
---|
456 | /* you cannot use transactions inside a traverse (although you can use
|
---|
457 | traverse inside a transaction) as otherwise you can end up with
|
---|
458 | deadlock */
|
---|
459 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
|
---|
460 | tdb->ecode = TDB_ERR_LOCK;
|
---|
461 | return -1;
|
---|
462 | }
|
---|
463 |
|
---|
464 | tdb->transaction = (struct tdb_transaction *)
|
---|
465 | calloc(sizeof(struct tdb_transaction), 1);
|
---|
466 | if (tdb->transaction == NULL) {
|
---|
467 | tdb->ecode = TDB_ERR_OOM;
|
---|
468 | return -1;
|
---|
469 | }
|
---|
470 |
|
---|
471 | /* a page at a time seems like a reasonable compromise between compactness and efficiency */
|
---|
472 | tdb->transaction->block_size = tdb->page_size;
|
---|
473 |
|
---|
474 | /* get the transaction write lock. This is a blocking lock. As
|
---|
475 | discussed with Volker, there are a number of ways we could
|
---|
476 | make this async, which we will probably do in the future */
|
---|
477 | if (tdb_transaction_lock(tdb, F_WRLCK, lockflags) == -1) {
|
---|
478 | SAFE_FREE(tdb->transaction->blocks);
|
---|
479 | SAFE_FREE(tdb->transaction);
|
---|
480 | if ((lockflags & TDB_LOCK_WAIT) == 0) {
|
---|
481 | tdb->ecode = TDB_ERR_NOLOCK;
|
---|
482 | }
|
---|
483 | return -1;
|
---|
484 | }
|
---|
485 |
|
---|
486 | /* get a read lock from the freelist to the end of file. This
|
---|
487 | is upgraded to a write lock during the commit */
|
---|
488 | if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true) == -1) {
|
---|
489 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
|
---|
490 | goto fail_allrecord_lock;
|
---|
491 | }
|
---|
492 |
|
---|
493 | /* setup a copy of the hash table heads so the hash scan in
|
---|
494 | traverse can be fast */
|
---|
495 | tdb->transaction->hash_heads = (uint32_t *)
|
---|
496 | calloc(tdb->header.hash_size+1, sizeof(uint32_t));
|
---|
497 | if (tdb->transaction->hash_heads == NULL) {
|
---|
498 | tdb->ecode = TDB_ERR_OOM;
|
---|
499 | goto fail;
|
---|
500 | }
|
---|
501 | if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
|
---|
502 | TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
|
---|
503 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
|
---|
504 | tdb->ecode = TDB_ERR_IO;
|
---|
505 | goto fail;
|
---|
506 | }
|
---|
507 |
|
---|
508 | /* make sure we know about any file expansions already done by
|
---|
509 | anyone else */
|
---|
510 | tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
|
---|
511 | tdb->transaction->old_map_size = tdb->map_size;
|
---|
512 |
|
---|
513 | /* finally hook the io methods, replacing them with
|
---|
514 | transaction specific methods */
|
---|
515 | tdb->transaction->io_methods = tdb->methods;
|
---|
516 | tdb->methods = &transaction_methods;
|
---|
517 |
|
---|
518 | /* Trace at the end, so we get sequence number correct. */
|
---|
519 | tdb_trace(tdb, "tdb_transaction_start");
|
---|
520 | return 0;
|
---|
521 |
|
---|
522 | fail:
|
---|
523 | tdb_allrecord_unlock(tdb, F_RDLCK, false);
|
---|
524 | fail_allrecord_lock:
|
---|
525 | tdb_transaction_unlock(tdb, F_WRLCK);
|
---|
526 | SAFE_FREE(tdb->transaction->blocks);
|
---|
527 | SAFE_FREE(tdb->transaction->hash_heads);
|
---|
528 | SAFE_FREE(tdb->transaction);
|
---|
529 | return -1;
|
---|
530 | }
|
---|
531 |
|
---|
532 | _PUBLIC_ int tdb_transaction_start(struct tdb_context *tdb)
|
---|
533 | {
|
---|
534 | return _tdb_transaction_start(tdb, TDB_LOCK_WAIT);
|
---|
535 | }
|
---|
536 |
|
---|
537 | _PUBLIC_ int tdb_transaction_start_nonblock(struct tdb_context *tdb)
|
---|
538 | {
|
---|
539 | return _tdb_transaction_start(tdb, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
|
---|
540 | }
|
---|
541 |
|
---|
542 | /*
|
---|
543 | sync to disk
|
---|
544 | */
|
---|
545 | static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
|
---|
546 | {
|
---|
547 | if (tdb->flags & TDB_NOSYNC) {
|
---|
548 | return 0;
|
---|
549 | }
|
---|
550 |
|
---|
551 | #ifdef HAVE_FDATASYNC
|
---|
552 | if (fdatasync(tdb->fd) != 0) {
|
---|
553 | #else
|
---|
554 | if (fsync(tdb->fd) != 0) {
|
---|
555 | #endif
|
---|
556 | tdb->ecode = TDB_ERR_IO;
|
---|
557 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
|
---|
558 | return -1;
|
---|
559 | }
|
---|
560 | #ifdef HAVE_MMAP
|
---|
561 | if (tdb->map_ptr) {
|
---|
562 | tdb_off_t moffset = offset & ~(tdb->page_size-1);
|
---|
563 | if (msync(moffset + (char *)tdb->map_ptr,
|
---|
564 | length + (offset - moffset), MS_SYNC) != 0) {
|
---|
565 | tdb->ecode = TDB_ERR_IO;
|
---|
566 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
|
---|
567 | strerror(errno)));
|
---|
568 | return -1;
|
---|
569 | }
|
---|
570 | }
|
---|
571 | #endif
|
---|
572 | return 0;
|
---|
573 | }
|
---|
574 |
|
---|
575 |
|
---|
576 | static int _tdb_transaction_cancel(struct tdb_context *tdb)
|
---|
577 | {
|
---|
578 | int i, ret = 0;
|
---|
579 |
|
---|
580 | if (tdb->transaction == NULL) {
|
---|
581 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
|
---|
582 | return -1;
|
---|
583 | }
|
---|
584 |
|
---|
585 | if (tdb->transaction->nesting != 0) {
|
---|
586 | tdb->transaction->transaction_error = 1;
|
---|
587 | tdb->transaction->nesting--;
|
---|
588 | return 0;
|
---|
589 | }
|
---|
590 |
|
---|
591 | tdb->map_size = tdb->transaction->old_map_size;
|
---|
592 |
|
---|
593 | /* free all the transaction blocks */
|
---|
594 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
---|
595 | if (tdb->transaction->blocks[i] != NULL) {
|
---|
596 | free(tdb->transaction->blocks[i]);
|
---|
597 | }
|
---|
598 | }
|
---|
599 | SAFE_FREE(tdb->transaction->blocks);
|
---|
600 |
|
---|
601 | if (tdb->transaction->magic_offset) {
|
---|
602 | const struct tdb_methods *methods = tdb->transaction->io_methods;
|
---|
603 | const uint32_t invalid = TDB_RECOVERY_INVALID_MAGIC;
|
---|
604 |
|
---|
605 | /* remove the recovery marker */
|
---|
606 | if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &invalid, 4) == -1 ||
|
---|
607 | transaction_sync(tdb, tdb->transaction->magic_offset, 4) == -1) {
|
---|
608 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n"));
|
---|
609 | ret = -1;
|
---|
610 | }
|
---|
611 | }
|
---|
612 |
|
---|
613 | /* This also removes the OPEN_LOCK, if we have it. */
|
---|
614 | tdb_release_transaction_locks(tdb);
|
---|
615 |
|
---|
616 | /* restore the normal io methods */
|
---|
617 | tdb->methods = tdb->transaction->io_methods;
|
---|
618 |
|
---|
619 | SAFE_FREE(tdb->transaction->hash_heads);
|
---|
620 | SAFE_FREE(tdb->transaction);
|
---|
621 |
|
---|
622 | return ret;
|
---|
623 | }
|
---|
624 |
|
---|
625 | /*
|
---|
626 | cancel the current transaction
|
---|
627 | */
|
---|
628 | _PUBLIC_ int tdb_transaction_cancel(struct tdb_context *tdb)
|
---|
629 | {
|
---|
630 | tdb_trace(tdb, "tdb_transaction_cancel");
|
---|
631 | return _tdb_transaction_cancel(tdb);
|
---|
632 | }
|
---|
633 |
|
---|
634 | /*
|
---|
635 | work out how much space the linearised recovery data will consume
|
---|
636 | */
|
---|
637 | static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
|
---|
638 | {
|
---|
639 | tdb_len_t recovery_size = 0;
|
---|
640 | int i;
|
---|
641 |
|
---|
642 | recovery_size = sizeof(uint32_t);
|
---|
643 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
---|
644 | if (i * tdb->transaction->block_size >= tdb->transaction->old_map_size) {
|
---|
645 | break;
|
---|
646 | }
|
---|
647 | if (tdb->transaction->blocks[i] == NULL) {
|
---|
648 | continue;
|
---|
649 | }
|
---|
650 | recovery_size += 2*sizeof(tdb_off_t);
|
---|
651 | if (i == tdb->transaction->num_blocks-1) {
|
---|
652 | recovery_size += tdb->transaction->last_block_size;
|
---|
653 | } else {
|
---|
654 | recovery_size += tdb->transaction->block_size;
|
---|
655 | }
|
---|
656 | }
|
---|
657 |
|
---|
658 | return recovery_size;
|
---|
659 | }
|
---|
660 |
|
---|
661 | int tdb_recovery_area(struct tdb_context *tdb,
|
---|
662 | const struct tdb_methods *methods,
|
---|
663 | tdb_off_t *recovery_offset,
|
---|
664 | struct tdb_record *rec)
|
---|
665 | {
|
---|
666 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, recovery_offset) == -1) {
|
---|
667 | return -1;
|
---|
668 | }
|
---|
669 |
|
---|
670 | if (*recovery_offset == 0) {
|
---|
671 | rec->rec_len = 0;
|
---|
672 | return 0;
|
---|
673 | }
|
---|
674 |
|
---|
675 | if (methods->tdb_read(tdb, *recovery_offset, rec, sizeof(*rec),
|
---|
676 | DOCONV()) == -1) {
|
---|
677 | return -1;
|
---|
678 | }
|
---|
679 |
|
---|
680 | /* ignore invalid recovery regions: can happen in crash */
|
---|
681 | if (rec->magic != TDB_RECOVERY_MAGIC &&
|
---|
682 | rec->magic != TDB_RECOVERY_INVALID_MAGIC) {
|
---|
683 | *recovery_offset = 0;
|
---|
684 | rec->rec_len = 0;
|
---|
685 | }
|
---|
686 | return 0;
|
---|
687 | }
|
---|
688 |
|
---|
689 | /*
|
---|
690 | allocate the recovery area, or use an existing recovery area if it is
|
---|
691 | large enough
|
---|
692 | */
|
---|
693 | static int tdb_recovery_allocate(struct tdb_context *tdb,
|
---|
694 | tdb_len_t *recovery_size,
|
---|
695 | tdb_off_t *recovery_offset,
|
---|
696 | tdb_len_t *recovery_max_size)
|
---|
697 | {
|
---|
698 | struct tdb_record rec;
|
---|
699 | const struct tdb_methods *methods = tdb->transaction->io_methods;
|
---|
700 | tdb_off_t recovery_head;
|
---|
701 |
|
---|
702 | if (tdb_recovery_area(tdb, methods, &recovery_head, &rec) == -1) {
|
---|
703 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
|
---|
704 | return -1;
|
---|
705 | }
|
---|
706 |
|
---|
707 | *recovery_size = tdb_recovery_size(tdb);
|
---|
708 |
|
---|
709 | if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
|
---|
710 | /* it fits in the existing area */
|
---|
711 | *recovery_max_size = rec.rec_len;
|
---|
712 | *recovery_offset = recovery_head;
|
---|
713 | return 0;
|
---|
714 | }
|
---|
715 |
|
---|
716 | /* we need to free up the old recovery area, then allocate a
|
---|
717 | new one at the end of the file. Note that we cannot use
|
---|
718 | tdb_allocate() to allocate the new one as that might return
|
---|
719 | us an area that is being currently used (as of the start of
|
---|
720 | the transaction) */
|
---|
721 | if (recovery_head != 0) {
|
---|
722 | if (tdb_free(tdb, recovery_head, &rec) == -1) {
|
---|
723 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to free previous recovery area\n"));
|
---|
724 | return -1;
|
---|
725 | }
|
---|
726 | }
|
---|
727 |
|
---|
728 | /* the tdb_free() call might have increased the recovery size */
|
---|
729 | *recovery_size = tdb_recovery_size(tdb);
|
---|
730 |
|
---|
731 | /* round up to a multiple of page size */
|
---|
732 | *recovery_max_size = TDB_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
|
---|
733 | *recovery_offset = tdb->map_size;
|
---|
734 | recovery_head = *recovery_offset;
|
---|
735 |
|
---|
736 | if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
|
---|
737 | (tdb->map_size - tdb->transaction->old_map_size) +
|
---|
738 | sizeof(rec) + *recovery_max_size) == -1) {
|
---|
739 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
|
---|
740 | return -1;
|
---|
741 | }
|
---|
742 |
|
---|
743 | /* remap the file (if using mmap) */
|
---|
744 | methods->tdb_oob(tdb, tdb->map_size + 1, 1);
|
---|
745 |
|
---|
746 | /* we have to reset the old map size so that we don't try to expand the file
|
---|
747 | again in the transaction commit, which would destroy the recovery area */
|
---|
748 | tdb->transaction->old_map_size = tdb->map_size;
|
---|
749 |
|
---|
750 | /* write the recovery header offset and sync - we can sync without a race here
|
---|
751 | as the magic ptr in the recovery record has not been set */
|
---|
752 | CONVERT(recovery_head);
|
---|
753 | if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
|
---|
754 | &recovery_head, sizeof(tdb_off_t)) == -1) {
|
---|
755 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
|
---|
756 | return -1;
|
---|
757 | }
|
---|
758 | if (transaction_write_existing(tdb, TDB_RECOVERY_HEAD, &recovery_head, sizeof(tdb_off_t)) == -1) {
|
---|
759 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
|
---|
760 | return -1;
|
---|
761 | }
|
---|
762 |
|
---|
763 | return 0;
|
---|
764 | }
|
---|
765 |
|
---|
766 |
|
---|
767 | /*
|
---|
768 | setup the recovery data that will be used on a crash during commit
|
---|
769 | */
|
---|
770 | static int transaction_setup_recovery(struct tdb_context *tdb,
|
---|
771 | tdb_off_t *magic_offset)
|
---|
772 | {
|
---|
773 | tdb_len_t recovery_size;
|
---|
774 | unsigned char *data, *p;
|
---|
775 | const struct tdb_methods *methods = tdb->transaction->io_methods;
|
---|
776 | struct tdb_record *rec;
|
---|
777 | tdb_off_t recovery_offset, recovery_max_size;
|
---|
778 | tdb_off_t old_map_size = tdb->transaction->old_map_size;
|
---|
779 | uint32_t magic, tailer;
|
---|
780 | int i;
|
---|
781 |
|
---|
782 | /*
|
---|
783 | check that the recovery area has enough space
|
---|
784 | */
|
---|
785 | if (tdb_recovery_allocate(tdb, &recovery_size,
|
---|
786 | &recovery_offset, &recovery_max_size) == -1) {
|
---|
787 | return -1;
|
---|
788 | }
|
---|
789 |
|
---|
790 | data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
|
---|
791 | if (data == NULL) {
|
---|
792 | tdb->ecode = TDB_ERR_OOM;
|
---|
793 | return -1;
|
---|
794 | }
|
---|
795 |
|
---|
796 | rec = (struct tdb_record *)data;
|
---|
797 | memset(rec, 0, sizeof(*rec));
|
---|
798 |
|
---|
799 | rec->magic = TDB_RECOVERY_INVALID_MAGIC;
|
---|
800 | rec->data_len = recovery_size;
|
---|
801 | rec->rec_len = recovery_max_size;
|
---|
802 | rec->key_len = old_map_size;
|
---|
803 | CONVERT(*rec);
|
---|
804 |
|
---|
805 | /* build the recovery data into a single blob to allow us to do a single
|
---|
806 | large write, which should be more efficient */
|
---|
807 | p = data + sizeof(*rec);
|
---|
808 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
---|
809 | tdb_off_t offset;
|
---|
810 | tdb_len_t length;
|
---|
811 |
|
---|
812 | if (tdb->transaction->blocks[i] == NULL) {
|
---|
813 | continue;
|
---|
814 | }
|
---|
815 |
|
---|
816 | offset = i * tdb->transaction->block_size;
|
---|
817 | length = tdb->transaction->block_size;
|
---|
818 | if (i == tdb->transaction->num_blocks-1) {
|
---|
819 | length = tdb->transaction->last_block_size;
|
---|
820 | }
|
---|
821 |
|
---|
822 | if (offset >= old_map_size) {
|
---|
823 | continue;
|
---|
824 | }
|
---|
825 | if (offset + length > tdb->transaction->old_map_size) {
|
---|
826 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
|
---|
827 | free(data);
|
---|
828 | tdb->ecode = TDB_ERR_CORRUPT;
|
---|
829 | return -1;
|
---|
830 | }
|
---|
831 | memcpy(p, &offset, 4);
|
---|
832 | memcpy(p+4, &length, 4);
|
---|
833 | if (DOCONV()) {
|
---|
834 | tdb_convert(p, 8);
|
---|
835 | }
|
---|
836 | /* the recovery area contains the old data, not the
|
---|
837 | new data, so we have to call the original tdb_read
|
---|
838 | method to get it */
|
---|
839 | if (methods->tdb_read(tdb, offset, p + 8, length, 0) != 0) {
|
---|
840 | free(data);
|
---|
841 | tdb->ecode = TDB_ERR_IO;
|
---|
842 | return -1;
|
---|
843 | }
|
---|
844 | p += 8 + length;
|
---|
845 | }
|
---|
846 |
|
---|
847 | /* and the tailer */
|
---|
848 | tailer = sizeof(*rec) + recovery_max_size;
|
---|
849 | memcpy(p, &tailer, 4);
|
---|
850 | if (DOCONV()) {
|
---|
851 | tdb_convert(p, 4);
|
---|
852 | }
|
---|
853 |
|
---|
854 | /* write the recovery data to the recovery area */
|
---|
855 | if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
|
---|
856 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
|
---|
857 | free(data);
|
---|
858 | tdb->ecode = TDB_ERR_IO;
|
---|
859 | return -1;
|
---|
860 | }
|
---|
861 | if (transaction_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
|
---|
862 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery data\n"));
|
---|
863 | free(data);
|
---|
864 | tdb->ecode = TDB_ERR_IO;
|
---|
865 | return -1;
|
---|
866 | }
|
---|
867 |
|
---|
868 | /* as we don't have ordered writes, we have to sync the recovery
|
---|
869 | data before we update the magic to indicate that the recovery
|
---|
870 | data is present */
|
---|
871 | if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
|
---|
872 | free(data);
|
---|
873 | return -1;
|
---|
874 | }
|
---|
875 |
|
---|
876 | free(data);
|
---|
877 |
|
---|
878 | magic = TDB_RECOVERY_MAGIC;
|
---|
879 | CONVERT(magic);
|
---|
880 |
|
---|
881 | *magic_offset = recovery_offset + offsetof(struct tdb_record, magic);
|
---|
882 |
|
---|
883 | if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
|
---|
884 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
|
---|
885 | tdb->ecode = TDB_ERR_IO;
|
---|
886 | return -1;
|
---|
887 | }
|
---|
888 | if (transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
|
---|
889 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery magic\n"));
|
---|
890 | tdb->ecode = TDB_ERR_IO;
|
---|
891 | return -1;
|
---|
892 | }
|
---|
893 |
|
---|
894 | /* ensure the recovery magic marker is on disk */
|
---|
895 | if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
|
---|
896 | return -1;
|
---|
897 | }
|
---|
898 |
|
---|
899 | return 0;
|
---|
900 | }
|
---|
901 |
|
---|
902 | static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
|
---|
903 | {
|
---|
904 | const struct tdb_methods *methods;
|
---|
905 |
|
---|
906 | if (tdb->transaction == NULL) {
|
---|
907 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: no transaction\n"));
|
---|
908 | return -1;
|
---|
909 | }
|
---|
910 |
|
---|
911 | if (tdb->transaction->prepared) {
|
---|
912 | tdb->ecode = TDB_ERR_EINVAL;
|
---|
913 | _tdb_transaction_cancel(tdb);
|
---|
914 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction already prepared\n"));
|
---|
915 | return -1;
|
---|
916 | }
|
---|
917 |
|
---|
918 | if (tdb->transaction->transaction_error) {
|
---|
919 | tdb->ecode = TDB_ERR_IO;
|
---|
920 | _tdb_transaction_cancel(tdb);
|
---|
921 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction error pending\n"));
|
---|
922 | return -1;
|
---|
923 | }
|
---|
924 |
|
---|
925 |
|
---|
926 | if (tdb->transaction->nesting != 0) {
|
---|
927 | return 0;
|
---|
928 | }
|
---|
929 |
|
---|
930 | /* check for a null transaction */
|
---|
931 | if (tdb->transaction->blocks == NULL) {
|
---|
932 | return 0;
|
---|
933 | }
|
---|
934 |
|
---|
935 | methods = tdb->transaction->io_methods;
|
---|
936 |
|
---|
937 | /* if there are any locks pending then the caller has not
|
---|
938 | nested their locks properly, so fail the transaction */
|
---|
939 | if (tdb_have_extra_locks(tdb)) {
|
---|
940 | tdb->ecode = TDB_ERR_LOCK;
|
---|
941 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n"));
|
---|
942 | _tdb_transaction_cancel(tdb);
|
---|
943 | return -1;
|
---|
944 | }
|
---|
945 |
|
---|
946 | /* upgrade the main transaction lock region to a write lock */
|
---|
947 | if (tdb_allrecord_upgrade(tdb) == -1) {
|
---|
948 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to upgrade hash locks\n"));
|
---|
949 | _tdb_transaction_cancel(tdb);
|
---|
950 | return -1;
|
---|
951 | }
|
---|
952 |
|
---|
953 | /* get the open lock - this prevents new users attaching to the database
|
---|
954 | during the commit */
|
---|
955 | if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
|
---|
956 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get open lock\n"));
|
---|
957 | _tdb_transaction_cancel(tdb);
|
---|
958 | return -1;
|
---|
959 | }
|
---|
960 |
|
---|
961 | if (!(tdb->flags & TDB_NOSYNC)) {
|
---|
962 | /* write the recovery data to the end of the file */
|
---|
963 | if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
|
---|
964 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: failed to setup recovery data\n"));
|
---|
965 | _tdb_transaction_cancel(tdb);
|
---|
966 | return -1;
|
---|
967 | }
|
---|
968 | }
|
---|
969 |
|
---|
970 | tdb->transaction->prepared = true;
|
---|
971 |
|
---|
972 | /* expand the file to the new size if needed */
|
---|
973 | if (tdb->map_size != tdb->transaction->old_map_size) {
|
---|
974 | if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
|
---|
975 | tdb->map_size -
|
---|
976 | tdb->transaction->old_map_size) == -1) {
|
---|
977 | tdb->ecode = TDB_ERR_IO;
|
---|
978 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: expansion failed\n"));
|
---|
979 | _tdb_transaction_cancel(tdb);
|
---|
980 | return -1;
|
---|
981 | }
|
---|
982 | tdb->map_size = tdb->transaction->old_map_size;
|
---|
983 | methods->tdb_oob(tdb, tdb->map_size + 1, 1);
|
---|
984 | }
|
---|
985 |
|
---|
986 | /* Keep the open lock until the actual commit */
|
---|
987 |
|
---|
988 | return 0;
|
---|
989 | }
|
---|
990 |
|
---|
991 | /*
|
---|
992 | prepare to commit the current transaction
|
---|
993 | */
|
---|
994 | _PUBLIC_ int tdb_transaction_prepare_commit(struct tdb_context *tdb)
|
---|
995 | {
|
---|
996 | tdb_trace(tdb, "tdb_transaction_prepare_commit");
|
---|
997 | return _tdb_transaction_prepare_commit(tdb);
|
---|
998 | }
|
---|
999 |
|
---|
1000 | /* A repack is worthwhile if the largest is less than half total free. */
|
---|
1001 | static bool repack_worthwhile(struct tdb_context *tdb)
|
---|
1002 | {
|
---|
1003 | tdb_off_t ptr;
|
---|
1004 | struct tdb_record rec;
|
---|
1005 | tdb_len_t total = 0, largest = 0;
|
---|
1006 |
|
---|
1007 | if (tdb_ofs_read(tdb, FREELIST_TOP, &ptr) == -1) {
|
---|
1008 | return false;
|
---|
1009 | }
|
---|
1010 |
|
---|
1011 | while (ptr != 0 && tdb_rec_free_read(tdb, ptr, &rec) == 0) {
|
---|
1012 | total += rec.rec_len;
|
---|
1013 | if (rec.rec_len > largest) {
|
---|
1014 | largest = rec.rec_len;
|
---|
1015 | }
|
---|
1016 | ptr = rec.next;
|
---|
1017 | }
|
---|
1018 |
|
---|
1019 | return total > largest * 2;
|
---|
1020 | }
|
---|
1021 |
|
---|
1022 | /*
|
---|
1023 | commit the current transaction
|
---|
1024 | */
|
---|
1025 | _PUBLIC_ int tdb_transaction_commit(struct tdb_context *tdb)
|
---|
1026 | {
|
---|
1027 | const struct tdb_methods *methods;
|
---|
1028 | int i;
|
---|
1029 | bool need_repack = false;
|
---|
1030 |
|
---|
1031 | if (tdb->transaction == NULL) {
|
---|
1032 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
|
---|
1033 | return -1;
|
---|
1034 | }
|
---|
1035 |
|
---|
1036 | tdb_trace(tdb, "tdb_transaction_commit");
|
---|
1037 |
|
---|
1038 | if (tdb->transaction->transaction_error) {
|
---|
1039 | tdb->ecode = TDB_ERR_IO;
|
---|
1040 | _tdb_transaction_cancel(tdb);
|
---|
1041 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
|
---|
1042 | return -1;
|
---|
1043 | }
|
---|
1044 |
|
---|
1045 |
|
---|
1046 | if (tdb->transaction->nesting != 0) {
|
---|
1047 | tdb->transaction->nesting--;
|
---|
1048 | return 0;
|
---|
1049 | }
|
---|
1050 |
|
---|
1051 | /* check for a null transaction */
|
---|
1052 | if (tdb->transaction->blocks == NULL) {
|
---|
1053 | _tdb_transaction_cancel(tdb);
|
---|
1054 | return 0;
|
---|
1055 | }
|
---|
1056 |
|
---|
1057 | if (!tdb->transaction->prepared) {
|
---|
1058 | int ret = _tdb_transaction_prepare_commit(tdb);
|
---|
1059 | if (ret)
|
---|
1060 | return ret;
|
---|
1061 | }
|
---|
1062 |
|
---|
1063 | methods = tdb->transaction->io_methods;
|
---|
1064 |
|
---|
1065 | /* perform all the writes */
|
---|
1066 | for (i=0;i<tdb->transaction->num_blocks;i++) {
|
---|
1067 | tdb_off_t offset;
|
---|
1068 | tdb_len_t length;
|
---|
1069 |
|
---|
1070 | if (tdb->transaction->blocks[i] == NULL) {
|
---|
1071 | continue;
|
---|
1072 | }
|
---|
1073 |
|
---|
1074 | offset = i * tdb->transaction->block_size;
|
---|
1075 | length = tdb->transaction->block_size;
|
---|
1076 | if (i == tdb->transaction->num_blocks-1) {
|
---|
1077 | length = tdb->transaction->last_block_size;
|
---|
1078 | }
|
---|
1079 |
|
---|
1080 | if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) {
|
---|
1081 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
|
---|
1082 |
|
---|
1083 | /* we've overwritten part of the data and
|
---|
1084 | possibly expanded the file, so we need to
|
---|
1085 | run the crash recovery code */
|
---|
1086 | tdb->methods = methods;
|
---|
1087 | tdb_transaction_recover(tdb);
|
---|
1088 |
|
---|
1089 | _tdb_transaction_cancel(tdb);
|
---|
1090 |
|
---|
1091 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
|
---|
1092 | return -1;
|
---|
1093 | }
|
---|
1094 | SAFE_FREE(tdb->transaction->blocks[i]);
|
---|
1095 | }
|
---|
1096 |
|
---|
1097 | /* Do this before we drop lock or blocks. */
|
---|
1098 | if (tdb->transaction->expanded) {
|
---|
1099 | need_repack = repack_worthwhile(tdb);
|
---|
1100 | }
|
---|
1101 |
|
---|
1102 | SAFE_FREE(tdb->transaction->blocks);
|
---|
1103 | tdb->transaction->num_blocks = 0;
|
---|
1104 |
|
---|
1105 | /* ensure the new data is on disk */
|
---|
1106 | if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
|
---|
1107 | return -1;
|
---|
1108 | }
|
---|
1109 |
|
---|
1110 | /*
|
---|
1111 | TODO: maybe write to some dummy hdr field, or write to magic
|
---|
1112 | offset without mmap, before the last sync, instead of the
|
---|
1113 | utime() call
|
---|
1114 | */
|
---|
1115 |
|
---|
1116 | /* on some systems (like Linux 2.6.x) changes via mmap/msync
|
---|
1117 | don't change the mtime of the file, this means the file may
|
---|
1118 | not be backed up (as tdb rounding to block sizes means that
|
---|
1119 | file size changes are quite rare too). The following forces
|
---|
1120 | mtime changes when a transaction completes */
|
---|
1121 | #ifdef HAVE_UTIME
|
---|
1122 | utime(tdb->name, NULL);
|
---|
1123 | #endif
|
---|
1124 |
|
---|
1125 | /* use a transaction cancel to free memory and remove the
|
---|
1126 | transaction locks */
|
---|
1127 | _tdb_transaction_cancel(tdb);
|
---|
1128 |
|
---|
1129 | if (need_repack) {
|
---|
1130 | return tdb_repack(tdb);
|
---|
1131 | }
|
---|
1132 |
|
---|
1133 | return 0;
|
---|
1134 | }
|
---|
1135 |
|
---|
1136 |
|
---|
1137 | /*
|
---|
1138 | recover from an aborted transaction. Must be called with exclusive
|
---|
1139 | database write access already established (including the open
|
---|
1140 | lock to prevent new processes attaching)
|
---|
1141 | */
|
---|
1142 | int tdb_transaction_recover(struct tdb_context *tdb)
|
---|
1143 | {
|
---|
1144 | tdb_off_t recovery_head, recovery_eof;
|
---|
1145 | unsigned char *data, *p;
|
---|
1146 | uint32_t zero = 0;
|
---|
1147 | struct tdb_record rec;
|
---|
1148 |
|
---|
1149 | /* find the recovery area */
|
---|
1150 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
|
---|
1151 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
|
---|
1152 | tdb->ecode = TDB_ERR_IO;
|
---|
1153 | return -1;
|
---|
1154 | }
|
---|
1155 |
|
---|
1156 | if (recovery_head == 0) {
|
---|
1157 | /* we have never allocated a recovery record */
|
---|
1158 | return 0;
|
---|
1159 | }
|
---|
1160 |
|
---|
1161 | /* read the recovery record */
|
---|
1162 | if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
|
---|
1163 | sizeof(rec), DOCONV()) == -1) {
|
---|
1164 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
|
---|
1165 | tdb->ecode = TDB_ERR_IO;
|
---|
1166 | return -1;
|
---|
1167 | }
|
---|
1168 |
|
---|
1169 | if (rec.magic != TDB_RECOVERY_MAGIC) {
|
---|
1170 | /* there is no valid recovery data */
|
---|
1171 | return 0;
|
---|
1172 | }
|
---|
1173 |
|
---|
1174 | if (tdb->read_only) {
|
---|
1175 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
|
---|
1176 | tdb->ecode = TDB_ERR_CORRUPT;
|
---|
1177 | return -1;
|
---|
1178 | }
|
---|
1179 |
|
---|
1180 | recovery_eof = rec.key_len;
|
---|
1181 |
|
---|
1182 | data = (unsigned char *)malloc(rec.data_len);
|
---|
1183 | if (data == NULL) {
|
---|
1184 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
|
---|
1185 | tdb->ecode = TDB_ERR_OOM;
|
---|
1186 | return -1;
|
---|
1187 | }
|
---|
1188 |
|
---|
1189 | /* read the full recovery data */
|
---|
1190 | if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
|
---|
1191 | rec.data_len, 0) == -1) {
|
---|
1192 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
|
---|
1193 | tdb->ecode = TDB_ERR_IO;
|
---|
1194 | return -1;
|
---|
1195 | }
|
---|
1196 |
|
---|
1197 | /* recover the file data */
|
---|
1198 | p = data;
|
---|
1199 | while (p+8 < data + rec.data_len) {
|
---|
1200 | uint32_t ofs, len;
|
---|
1201 | if (DOCONV()) {
|
---|
1202 | tdb_convert(p, 8);
|
---|
1203 | }
|
---|
1204 | memcpy(&ofs, p, 4);
|
---|
1205 | memcpy(&len, p+4, 4);
|
---|
1206 |
|
---|
1207 | if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
|
---|
1208 | free(data);
|
---|
1209 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %d bytes at offset %d\n", len, ofs));
|
---|
1210 | tdb->ecode = TDB_ERR_IO;
|
---|
1211 | return -1;
|
---|
1212 | }
|
---|
1213 | p += 8 + len;
|
---|
1214 | }
|
---|
1215 |
|
---|
1216 | free(data);
|
---|
1217 |
|
---|
1218 | if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
|
---|
1219 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
|
---|
1220 | tdb->ecode = TDB_ERR_IO;
|
---|
1221 | return -1;
|
---|
1222 | }
|
---|
1223 |
|
---|
1224 | /* if the recovery area is after the recovered eof then remove it */
|
---|
1225 | if (recovery_eof <= recovery_head) {
|
---|
1226 | if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
|
---|
1227 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
|
---|
1228 | tdb->ecode = TDB_ERR_IO;
|
---|
1229 | return -1;
|
---|
1230 | }
|
---|
1231 | }
|
---|
1232 |
|
---|
1233 | /* remove the recovery magic */
|
---|
1234 | if (tdb_ofs_write(tdb, recovery_head + offsetof(struct tdb_record, magic),
|
---|
1235 | &zero) == -1) {
|
---|
1236 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
|
---|
1237 | tdb->ecode = TDB_ERR_IO;
|
---|
1238 | return -1;
|
---|
1239 | }
|
---|
1240 |
|
---|
1241 | if (transaction_sync(tdb, 0, recovery_eof) == -1) {
|
---|
1242 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
|
---|
1243 | tdb->ecode = TDB_ERR_IO;
|
---|
1244 | return -1;
|
---|
1245 | }
|
---|
1246 |
|
---|
1247 | TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %d byte database\n",
|
---|
1248 | recovery_eof));
|
---|
1249 |
|
---|
1250 | /* all done */
|
---|
1251 | return 0;
|
---|
1252 | }
|
---|
1253 |
|
---|
1254 | /* Any I/O failures we say "needs recovery". */
|
---|
1255 | bool tdb_needs_recovery(struct tdb_context *tdb)
|
---|
1256 | {
|
---|
1257 | tdb_off_t recovery_head;
|
---|
1258 | struct tdb_record rec;
|
---|
1259 |
|
---|
1260 | /* find the recovery area */
|
---|
1261 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
|
---|
1262 | return true;
|
---|
1263 | }
|
---|
1264 |
|
---|
1265 | if (recovery_head == 0) {
|
---|
1266 | /* we have never allocated a recovery record */
|
---|
1267 | return false;
|
---|
1268 | }
|
---|
1269 |
|
---|
1270 | /* read the recovery record */
|
---|
1271 | if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
|
---|
1272 | sizeof(rec), DOCONV()) == -1) {
|
---|
1273 | return true;
|
---|
1274 | }
|
---|
1275 |
|
---|
1276 | return (rec.magic == TDB_RECOVERY_MAGIC);
|
---|
1277 | }
|
---|